ruff linting fixes (#343)

* ruff linting fixes

* move imports back down to where they're re-exported
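
For reference, the noqa codes added throughout map to ruff rules E402 (module-level import not at top of file), F401 (imported but unused), E722 (bare except), and E741 (ambiguous variable name). The second bullet refers to the package __init__, where a few imports deliberately stay near the bottom so they can be re-exported; a minimal sketch of that pattern (illustrative, not the literal file contents):

    # kept below module-level setup on purpose; re-exported via __all__
    from brozzler.browser import Browser, BrowserPool, BrowsingException  # noqa: E402
    from brozzler.robots import is_permitted_by_robots  # noqa: E402

    __all__ = ["is_permitted_by_robots", "Browser", "BrowserPool", "BrowsingException"]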
Authored by Gretchen Leigh Miller on 2025-03-07 16:03:35 -08:00, committed by GitHub
parent 6f011cc6c8
commit f64db214d4
18 changed files with 155 additions and 190 deletions

View File

@ -17,10 +17,13 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
import datetime
import logging
import threading
from importlib.metadata import version as _version
import structlog
import urlcanon
__version__ = _version("brozzler")
@ -91,7 +94,7 @@ def _logging_handler_handle(self, record):
finally:
try:
self.release()
except:
except: # noqa: E722
pass
return rv
@ -108,7 +111,6 @@ def behaviors(behaviors_dir=None):
`js-templates/`. Defaults to brozzler dir.
"""
import os
import string
import yaml
@ -125,7 +127,6 @@ def behavior_script(url, template_parameters=None, behaviors_dir=None):
"""
Returns the javascript behavior string populated with template_parameters.
"""
import json
import re
logger = structlog.get_logger(logger_name=__name__)
@ -194,8 +195,6 @@ class ThreadExceptionGate:
return "<ThreadExceptionGate(%s)>" % self.thread
import threading
_thread_exception_gates = {}
_thread_exception_gates_lock = threading.Lock()
@ -225,7 +224,7 @@ def thread_exception_gate(thread=None):
thread = threading.current_thread()
with _thread_exception_gates_lock:
if not thread in _thread_exception_gates:
if thread not in _thread_exception_gates:
_thread_exception_gates[thread] = ThreadExceptionGate(thread)
return _thread_exception_gates[thread]
@ -252,7 +251,6 @@ def thread_raise(thread, exctype):
"""
import ctypes
import inspect
import threading
import structlog
@ -322,9 +320,6 @@ def jinja2_environment(behaviors_dir=None):
return _jinja2_env
import urlcanon
def _remove_query(url):
url.question_mark = b""
url.query = b""
@ -403,13 +398,10 @@ def suggest_default_chrome_exe():
return "chromium-browser"
import datetime
EPOCH_UTC = datetime.datetime.fromtimestamp(0.0, tz=datetime.timezone.utc)
from brozzler.browser import Browser, BrowserPool, BrowsingException
from brozzler.robots import is_permitted_by_robots
from brozzler.browser import Browser, BrowserPool, BrowsingException # noqa: E402
from brozzler.robots import is_permitted_by_robots # noqa: E402
__all__ = [
"is_permitted_by_robots",
@ -422,22 +414,25 @@ __all__ = [
"suggest_default_chrome_exe",
]
# TODO try using importlib.util.find_spec to test for dependency presence
# rather than try/except on import.
# See https://docs.astral.sh/ruff/rules/unused-import/#example
try:
import doublethink
import doublethink # noqa: F401
# All of these imports use doublethink for real and are unsafe
# to do if doublethink is unavailable.
from brozzler.frontier import RethinkDbFrontier
from brozzler.frontier import RethinkDbFrontier # noqa: F401
from brozzler.model import (
InvalidJobConf,
Job,
Page,
Site,
new_job,
new_job_file,
new_site,
InvalidJobConf, # noqa: F401
Job, # noqa: F401
Page, # noqa: F401
Site, # noqa: F401
new_job, # noqa: F401
new_job_file, # noqa: F401
new_site, # noqa: F401
)
from brozzler.worker import BrozzlerWorker
from brozzler.worker import BrozzlerWorker # noqa: F401
__all__.extend(
[
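
The TODO near the top of this hunk mentions importlib.util.find_spec as an alternative to try/except importing. A rough sketch of what that could look like, assuming the same doublethink-gated imports (not part of this commit):

    import importlib.util

    # probe for the optional doublethink dependency without importing it
    if importlib.util.find_spec("doublethink") is not None:
        from brozzler.frontier import RethinkDbFrontier  # noqa: F401
        __all__.append("RethinkDbFrontier")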

View File

@ -18,7 +18,6 @@ limitations under the License.
import base64
import datetime
import itertools
import json
import logging
import socket
@ -213,7 +212,7 @@ class WebsockReceiverThread(threading.Thread):
def _on_message(self, websock, message):
try:
self._handle_message(websock, message)
except:
except: # noqa: E722
self.logger.exception(
"uncaught exception in _handle_message",
message=message,
@ -430,7 +429,7 @@ class Browser:
self.logger.info("shutting down websocket connection")
try:
self.websock.close()
except BaseException as e:
except BaseException:
self.logger.exception(
"exception closing websocket", websocket=self.websock
)
@ -458,7 +457,7 @@ class Browser:
)
self.websock_url = None
except:
except: # noqa: E722
self.logger.exception("problem stopping")
def is_running(self):
@ -628,7 +627,7 @@ class Browser:
jpeg_bytes = self.screenshot(full_page)
on_screenshot(jpeg_bytes)
return
except BrowsingTimeout as e:
except BrowsingTimeout:
self.logger.exception("attempt %s/3", i + 1)
def visit_hashtags(self, page_url, hashtags, outlinks):
@ -807,12 +806,12 @@ class Browser:
if (
msg
and "result" in msg
and not ("exceptionDetails" in msg["result"])
and "exceptionDetails" not in msg["result"]
and not (
"wasThrown" in msg["result"] and msg["result"]["wasThrown"]
)
and "result" in msg["result"]
and type(msg["result"]["result"]["value"]) == bool
and isinstance(msg["result"]["result"]["value"], bool)
and msg["result"]["result"]["value"]
):
self.logger.info("behavior decided it has finished")

View File

@ -265,7 +265,7 @@ class Chrome:
return url
except brozzler.ShutdownRequested:
raise
except Exception as e:
except Exception:
if time.time() - self._last_warning > 30:
url_logger.warning(
"problem accessing url (will keep trying until timeout)",
@ -325,7 +325,7 @@ class Chrome:
self.logger.debug(
"chrome pid %s STDERR %s", self.chrome_process.pid, buf
)
except:
except: # noqa: E722
self.logger.exception("unexpected exception")
def stop(self):
@ -378,7 +378,7 @@ class Chrome:
self.chrome_process.stderr.close()
try:
self._home_tmpdir.cleanup()
except:
except: # noqa: E722
self.logger.exception(
"exception deleting self._home_tmpdir", tmpdir=self._home_tmpdir
)

View File

@ -23,12 +23,10 @@ import datetime
import json
import logging
import os
import re
import signal
import string
import sys
import threading
import time
import traceback
import warnings
@ -397,9 +395,9 @@ def brozzle_page(argv=None):
enable_youtube_dl=not worker._skip_youtube_dl,
)
logger.info("outlinks", outlinks=sorted(outlinks))
except brozzler.ReachedLimit as e:
except brozzler.ReachedLimit:
logger.exception("reached limit")
except brozzler.PageInterstitialShown as e:
except brozzler.PageInterstitialShown:
logger.exception("page interstitial shown")
finally:
browser.stop()
@ -661,7 +659,7 @@ def brozzler_worker(argv=None):
logger.info(
"dumping state (caught signal)\n%s", signal=signum, state=state_strs
)
except BaseException as e:
except BaseException:
logger.exception("exception dumping state")
finally:
signal.signal(signal.SIGQUIT, dump_state)
@ -672,11 +670,11 @@ def brozzler_worker(argv=None):
try:
# make set from seed IDs in SKIP_AV_SEEDS_FILE
with open(SKIP_AV_SEEDS_FILE) as skips:
skip_av_seeds = {int(l) for l in skips.readlines()}
skip_av_seeds = {int(line) for line in skips.readlines()}
logger.info(
"running with skip_av_seeds file", skip_av_seeds=SKIP_AV_SEEDS_FILE
)
except Exception as e:
except Exception:
skip_av_seeds = set()
logger.info("running with empty skip_av_seeds")
return skip_av_seeds
@ -686,13 +684,13 @@ def brozzler_worker(argv=None):
try:
# make list from file
with open(YTDLP_PROXY_ENDPOINTS_FILE) as endpoints:
ytdlp_proxy_endpoints = [l for l in endpoints.readlines()]
ytdlp_proxy_endpoints = [line for line in endpoints.readlines()]
if ytdlp_proxy_endpoints:
logger.info(
"running with ytdlp proxy endpoints file",
ytdlp_proxy_endpoints=YTDLP_PROXY_ENDPOINTS_FILE,
)
except Exception as e:
except Exception:
ytdlp_proxy_endpoints = []
logger.info("running with empty proxy endpoints file")
return ytdlp_proxy_endpoints
@ -1032,7 +1030,7 @@ def brozzler_purge(argv=None):
configure_logging(args)
rr = rethinker(args)
frontier = brozzler.RethinkDbFrontier(rr)
brozzler.RethinkDbFrontier(rr)
if args.job:
try:
job_id = int(args.job)

View File

@ -17,9 +17,14 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
import base64
import os
import sys
import doublethink
import rethinkdb as rdb
import structlog
import yaml
logger = structlog.get_logger(logger_name=__name__)
@ -33,14 +38,6 @@ except ImportError as e:
e,
)
sys.exit(1)
import base64
import importlib
import json
import os
import doublethink
import rethinkdb as rdb
import yaml
r = rdb.RethinkDB()
@ -285,6 +282,8 @@ def root(path):
try:
import logging
import gunicorn.app.base
import gunicorn.glogging
from gunicorn.six import iteritems

View File

@ -18,10 +18,22 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
import argparse
import os
import signal
import socket
import socketserver
import sys
import threading
import time
import traceback
import doublethink
import structlog
import brozzler
import brozzler.cli
logger = structlog.get_logger(logger_name=__name__)
try:
@ -42,19 +54,6 @@ except ImportError as e:
exc_info=True,
)
sys.exit(1)
import argparse
import os
import signal
import socket
import socketserver
import threading
import time
import traceback
import doublethink
import brozzler
import brozzler.cli
def _build_arg_parser(argv=None):

View File

@ -16,10 +16,6 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
import datetime
import random
import time
import doublethink
import rethinkdb as rdb
import structlog
@ -47,11 +43,11 @@ class RethinkDbFrontier:
db_logger = self.logger.bind(dbname=self.rr.dbname)
dbs = self.rr.db_list().run()
if not self.rr.dbname in dbs:
if self.rr.dbname not in dbs:
db_logger.info("creating rethinkdb database")
self.rr.db_create(self.rr.dbname).run()
tables = self.rr.table_list().run()
if not "sites" in tables:
if "sites" not in tables:
db_logger.info("creating rethinkdb table 'sites' in database")
self.rr.table_create(
"sites", shards=self.shards, replicas=self.replicas
@ -60,7 +56,7 @@ class RethinkDbFrontier:
"sites_last_disclaimed", [r.row["status"], r.row["last_disclaimed"]]
).run()
self.rr.table("sites").index_create("job_id").run()
if not "pages" in tables:
if "pages" not in tables:
db_logger.info("creating rethinkdb table 'pages' in database")
self.rr.table_create(
"pages", shards=self.shards, replicas=self.replicas
@ -80,7 +76,7 @@ class RethinkDbFrontier:
"least_hops",
[r.row["site_id"], r.row["brozzle_count"], r.row["hops_from_seed"]],
).run()
if not "jobs" in tables:
if "jobs" not in tables:
db_logger.info("creating rethinkdb table 'jobs' in database")
self.rr.table_create(
"jobs", shards=self.shards, replicas=self.replicas
@ -352,7 +348,6 @@ class RethinkDbFrontier:
site.save()
def _build_fresh_page(self, site, parent_page, url, hops_off=0):
url_for_scoping = urlcanon.semantic(url)
url_for_crawling = urlcanon.whatwg(url)
hashtag = (url_for_crawling.hash_sign + url_for_crawling.fragment).decode(
"utf-8"
@ -461,8 +456,8 @@ class RethinkDbFrontier:
# "rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:"
# there can be many pages and each one can be very large (many videos,
# in and out of scope links, etc)
l = list(pages.values())
for batch in (l[i : i + 50] for i in range(0, len(l), 50)):
pages_list = list(pages.values())
for batch in (pages_list[i : i + 50] for i in range(0, len(pages_list), 50)):
try:
self.logger.debug("inserting/replacing batch of %s pages", len(batch))
reql = self.rr.table("pages").insert(batch, conflict="replace")
@ -471,8 +466,8 @@ class RethinkDbFrontier:
'conflict="replace")',
batch,
)
result = reql.run()
except Exception as e:
reql.run()
except Exception:
self.logger.exception(
"problem inserting/replacing batch of %s pages",
len(batch),

View File

@ -19,12 +19,9 @@ limitations under the License.
import base64
import copy
import datetime
import hashlib
import json
import os
import re
import time
import urllib
import uuid
import zlib
@ -61,7 +58,7 @@ class InvalidJobConf(Exception):
# debugged, I found it here. Maybe there's a better way to see it.
value = validator._errors[0].info[0][0].info[0][0].value
self.errors["bad value"] = value
except:
except: # noqa: E722
value = None
@ -122,10 +119,10 @@ def new_job(frontier, job_conf):
# rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:
for batch in (pages[i : i + 500] for i in range(0, len(pages), 500)):
logger.info("inserting batch of %s pages", len(batch))
result = frontier.rr.table("pages").insert(batch).run()
frontier.rr.table("pages").insert(batch).run()
for batch in (sites[i : i + 100] for i in range(0, len(sites), 100)):
logger.info("inserting batch of %s sites", len(batch))
result = frontier.rr.table("sites").insert(batch).run()
frontier.rr.table("sites").insert(batch).run()
logger.info("job fully started", job_id=job.id)
return job
@ -200,9 +197,9 @@ class Job(doublethink.Document, ElapsedMixIn):
table = "jobs"
def populate_defaults(self):
if not "status" in self:
if "status" not in self:
self.status = "ACTIVE"
if not "starts_and_stops" in self:
if "starts_and_stops" not in self:
if self.get("started"): # backward compatibility
self.starts_and_stops = [
{"start": self.get("started"), "stop": self.get("finished")}
@ -229,28 +226,28 @@ class Site(doublethink.Document, ElapsedMixIn):
table = "sites"
def populate_defaults(self):
if not "status" in self:
if "status" not in self:
self.status = "ACTIVE"
if not "claimed" in self:
if "claimed" not in self:
self.claimed = False
if not "last_disclaimed" in self:
if "last_disclaimed" not in self:
self.last_disclaimed = brozzler.EPOCH_UTC
if not "last_claimed" in self:
if "last_claimed" not in self:
self.last_claimed = brozzler.EPOCH_UTC
if not "scope" in self:
if "scope" not in self:
self.scope = {}
if not "skip_ytdlp" in self:
if "skip_ytdlp" not in self:
self.skip_ytdlp = None
# backward compatibility
if "surt" in self.scope:
if not "accepts" in self.scope:
if "accepts" not in self.scope:
self.scope["accepts"] = []
self.scope["accepts"].append({"surt": self.scope["surt"]})
del self.scope["surt"]
# backward compatibility
if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope:
if "max_hops_off_surt" in self.scope and "max_hops_off" not in self.scope:
self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
if "max_hops_off_surt" in self.scope:
del self.scope["max_hops_off_surt"]
@ -260,7 +257,7 @@ class Site(doublethink.Document, ElapsedMixIn):
brozzler.site_surt_canon(self.seed).ssurt().decode("ascii")
)
if not "starts_and_stops" in self:
if "starts_and_stops" not in self:
if self.get("start_time"): # backward compatibility
self.starts_and_stops = [
{"start": self.get("start_time"), "stop": None}
@ -275,7 +272,7 @@ class Site(doublethink.Document, ElapsedMixIn):
return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)
def _accept_ssurt_if_not_redundant(self, ssurt):
if not "accepts" in self.scope:
if "accepts" not in self.scope:
self.scope["accepts"] = []
simple_rule_ssurts = (
rule["ssurt"]
@ -334,7 +331,7 @@ class Site(doublethink.Document, ElapsedMixIn):
if not isinstance(url, urlcanon.ParsedUrl):
url = urlcanon.semantic(url)
if not url.scheme in (b"http", b"https"):
if url.scheme not in (b"http", b"https"):
# XXX doesn't belong here maybe (where? worker ignores unknown
# schemes?)
return False
@ -390,31 +387,31 @@ class Page(doublethink.Document):
return hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
def populate_defaults(self):
if not "retry_after" in self:
if "retry_after" not in self:
self.retry_after = None
if not "failed_attempts" in self:
if "failed_attempts" not in self:
self.failed_attempts = 0
if not "hops_from_seed" in self:
if "hops_from_seed" not in self:
self.hops_from_seed = 0
if not "hop_path" in self:
if "hop_path" not in self:
self.hop_path = None
if not "via_page_url" in self:
if "via_page_url" not in self:
self.via_page_url = None
if not "brozzle_count" in self:
if "brozzle_count" not in self:
self.brozzle_count = 0
if not "claimed" in self:
if "claimed" not in self:
self.claimed = False
if "hops_off_surt" in self and not "hops_off" in self:
if "hops_off_surt" in self and "hops_off" not in self:
self.hops_off = self.hops_off_surt
if "hops_off_surt" in self:
del self["hops_off_surt"]
if not "hops_off" in self:
if "hops_off" not in self:
self.hops_off = 0
if not "needs_robots_check" in self:
if "needs_robots_check" not in self:
self.needs_robots_check = False
if not "priority" in self:
if "priority" not in self:
self.priority = self._calc_priority()
if not "id" in self:
if "id" not in self:
self.id = self.compute_id(self.site_id, self.url)
def __str__(self):

View File

@ -18,9 +18,16 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
import argparse
import json
import sys
import doublethink
import rethinkdb as rdb
import structlog
import urlcanon
import brozzler
logger = structlog.get_logger(logger_name=__name__)
@ -40,14 +47,7 @@ except ImportError as e:
e,
)
sys.exit(1)
import argparse
import json
import doublethink
import rethinkdb as rdb
import urlcanon
import brozzler
r = rdb.RethinkDB()
@ -137,7 +137,7 @@ class TheGoodUrlCanonicalizer(object):
key = urlcanon.semantic(url).surt().decode("ascii")
# logging.debug('%s -> %s', url, key)
return key
except Exception as e:
except Exception:
return url
def replace_default_canonicalizer():
@ -221,18 +221,9 @@ def support_in_progress_warcs():
class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
def __init__(self, orig_url):
import re
import six
from pywb.rewrite.wburl import WbUrl
from pywb.utils.loaders import to_native_str
from six.moves.urllib.parse import (
quote,
quote_plus,
unquote_plus,
urlsplit,
urlunsplit,
)
from six.moves.urllib.parse import quote
pywb.rewrite.wburl.BaseWbUrl.__init__(self)
@ -320,7 +311,6 @@ def _fuzzy_query_call(self, query):
urlkey = to_native_str(query.key, "utf-8")
url = query.url
filter_ = query.filters
output = query.output
for rule in self.rules.iter_matching(urlkey):
m = rule.regex.search(urlkey)

View File

@ -71,7 +71,7 @@ _robots_caches = {} # {site_id:reppy.cache.RobotsCache}
def _robots_cache(site, proxy=None):
if not site.id in _robots_caches:
if site.id not in _robots_caches:
req_sesh = _SessionRaiseOn420()
req_sesh.verify = False # ignore cert errors
if proxy:

View File

@ -21,9 +21,7 @@ limitations under the License.
import datetime
import io
import json
import random
import socket
import tempfile
import threading
import time
import urllib.request
@ -99,9 +97,13 @@ class BrozzlerWorker:
self._skip_visit_hashtags = skip_visit_hashtags
self._skip_youtube_dl = skip_youtube_dl
# TODO try using importlib.util.find_spec to test for dependency
# presence rather than try/except on import.
# See https://docs.astral.sh/ruff/rules/unused-import/#example
# We definitely shouldn't ytdlp if the optional extra is missing
try:
import yt_dlp
import yt_dlp # noqa: F401
except ImportError:
self.logger.info(
"optional yt-dlp extra not installed; setting skip_youtube_dl to True"
@ -200,7 +202,7 @@ class BrozzlerWorker:
response = requests.get("http://%s/status" % self._proxy)
status = json.loads(response.text)
self._proxy_is_warcprox = status["role"] == "warcprox"
except Exception as e:
except Exception:
self._proxy_is_warcprox = False
self.logger.info(
"%s %s warcprox",
@ -348,13 +350,13 @@ class BrozzlerWorker:
)
metrics.brozzler_ydl_urls_checked.inc(1)
outlinks.update(ydl_outlinks)
except brozzler.ReachedLimit as e:
except brozzler.ReachedLimit:
raise
except brozzler.ShutdownRequested:
raise
except brozzler.ProxyError:
raise
except brozzler.VideoExtractorError as e:
except brozzler.VideoExtractorError:
self.logger.exception("error extracting video info")
except Exception as e:
if (
@ -391,9 +393,9 @@ class BrozzlerWorker:
timeout=self.HEADER_REQUEST_TIMEOUT,
) as r:
return r.headers
except requests.exceptions.Timeout as e:
except requests.exceptions.Timeout:
url_logger.warning("Timed out trying to get headers", exc_info=True)
except requests.exceptions.RequestException as e:
except requests.exceptions.RequestException:
url_logger.warning("Failed to get headers", exc_info=True)
return {}
@ -469,7 +471,7 @@ class BrozzlerWorker:
if "content-range" in response_headers:
video["content-range"] = response_headers["content-range"]
self.logger.debug("embedded video", video=video)
if not "videos" in page:
if "videos" not in page:
page.videos = []
page.videos.append(video)
@ -598,13 +600,13 @@ class BrozzlerWorker:
site_logger.info("no pages left for site")
except brozzler.ReachedLimit as e:
self._frontier.reached_limit(site, e)
except brozzler.ReachedTimeLimit as e:
except brozzler.ReachedTimeLimit:
self._frontier.finished(site, "FINISHED_TIME_LIMIT")
except brozzler.CrawlStopped:
self._frontier.finished(site, "FINISHED_STOP_REQUESTED")
# except brozzler.browser.BrowsingAborted:
# self.logger.info("{} shut down".format(browser))
except brozzler.ProxyError as e:
except brozzler.ProxyError:
if self._warcprox_auto:
self.logger.exception(
"proxy error, will try to choose a "
@ -676,7 +678,7 @@ class BrozzlerWorker:
try:
self.status_info = self._service_registry.heartbeat(status_info)
self.logger.debug("status in service registry", status=self.status_info)
except r.ReqlError as e:
except r.ReqlError:
self.logger.exception(
"failed to send heartbeat and update service registry",
info=status_info,
@ -748,11 +750,11 @@ class BrozzlerWorker:
time.sleep(0.5)
self.logger.warn("shutdown requested")
except r.ReqlError as e:
except r.ReqlError:
self.logger.exception("caught rethinkdb exception, will try to proceed")
except brozzler.ShutdownRequested:
self.logger.info("shutdown requested")
except:
except: # noqa: E722
self.logger.critical(
"thread exiting due to unexpected exception", exc_info=True
)
@ -760,7 +762,7 @@ class BrozzlerWorker:
if self._service_registry and hasattr(self, "status_info"):
try:
self._service_registry.unregister(self.status_info["id"])
except:
except: # noqa: E722
self.logger.exception("failed to unregister from service registry")
self.logger.info(

View File

@ -101,7 +101,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
if result_type in ("url", "url_transparent"):
if "extraction_depth" in extra_info:
self.logger.info(
f"Following redirect",
"Following redirect",
redirect_url=ie_result["url"],
extraction_depth=extra_info["extraction_depth"],
)
@ -136,7 +136,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
# use it later to extract the watch pages as outlinks.
try:
ie_result["entries_no_dl"] = list(ie_result["entries"])
except Exception as e:
except Exception:
extract_context.warning(
"failed to unroll entries ie_result['entries']?",
exc_info=True,
@ -166,7 +166,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
import magic
mimetype = magic.from_file(info_dict["filepath"], mime=True)
except ImportError as e:
except ImportError:
mimetype = "video/%s" % info_dict["ext"]
self.logger.warning(
"guessing mimetype due to error",
@ -236,7 +236,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
)
site.last_claimed = doublethink.utcnow()
site.save()
except:
except: # noqa: E722
worker.logger.debug(
"problem heartbeating site.last_claimed site",
id=site.id,
@ -316,7 +316,7 @@ def _remember_videos(page, pushed_videos=None):
"""
Saves info about videos captured by yt-dlp in `page.videos`.
"""
if not "videos" in page:
if "videos" not in page:
page.videos = []
for pushed_video in pushed_videos or []:
video = {
@ -351,7 +351,7 @@ def _try_youtube_dl(worker, ydl, site, page):
)
metrics.brozzler_ydl_extract_successes.labels(ydl.is_youtube_host).inc(1)
break
except brozzler.ShutdownRequested as e:
except brozzler.ShutdownRequested:
raise
except Exception as e:
if (

View File

@ -110,7 +110,7 @@ def test_httpd(httpd):
of the same url return the same payload, proving it can be used to test
deduplication.
"""
payload1 = content2 = None
payload1 = None
url = "http://localhost:%s/site1/file1.txt" % httpd.server_port
with urllib.request.urlopen(url) as response:
assert response.status == 200
@ -175,7 +175,6 @@ def test_420(httpd):
def test_js_dialogs(httpd):
chrome_exe = brozzler.suggest_default_chrome_exe()
url = "http://localhost:%s/site4/alert.html" % httpd.server_port
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
# before commit d2ed6b97a24 these would hang and eventually raise
# brozzler.browser.BrowsingTimeout, which would cause this test to fail

View File

@ -40,11 +40,11 @@ def cli_commands():
commands = set(console_scripts().keys())
commands.remove("brozzler-wayback")
try:
import gunicorn
import gunicorn # noqa: F401
except ImportError:
commands.remove("brozzler-dashboard")
try:
import pywb
import pywb # noqa: F401
except ImportError:
commands.remove("brozzler-easy")
return commands

View File

@ -23,7 +23,6 @@ import http.server
import os
import socket
import subprocess
import sys
import threading
import time
import urllib.request
@ -47,7 +46,7 @@ def _local_address():
try:
s.connect(("10.255.255.255", 1)) # ip doesn't need to be reachable
return s.getsockname()[0]
except:
except: # noqa: E722
return "127.0.0.1"
finally:
s.close()
@ -148,7 +147,7 @@ def test_httpd(httpd):
of the same url return the same payload, proving it can be used to test
deduplication.
"""
payload1 = content2 = None
payload1 = None
url = make_url(httpd, "/site1/file1.txt")
with urllib.request.urlopen(url) as response:
assert response.status == 200
@ -351,8 +350,8 @@ def test_warcprox_auto(httpd):
def test_proxy_conflict():
with pytest.raises(AssertionError) as excinfo:
worker = brozzler.worker.BrozzlerWorker(
with pytest.raises(AssertionError):
brozzler.worker.BrozzlerWorker(
None, None, warcprox_auto=True, proxy="localhost:12345"
)
@ -523,7 +522,6 @@ def test_login(httpd):
# take a look at the captures table
time.sleep(2) # in case warcprox hasn't finished processing urls
robots_url = make_url(httpd, "/robots.txt")
captures = list(
rr.table("captures").filter({"test_id": test_id}).order_by("timestamp").run()
)
@ -730,7 +728,6 @@ def test_redirect_hashtags(httpd):
def test_stop_crawl(httpd):
test_id = "test_stop_crawl_job-%s" % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker("localhost", db="brozzler")
frontier = brozzler.RethinkDbFrontier(rr)
@ -804,7 +801,6 @@ def test_warcprox_outage_resiliency(httpd):
"""
rr = doublethink.Rethinker("localhost", db="brozzler")
frontier = brozzler.RethinkDbFrontier(rr)
svcreg = doublethink.ServiceRegistry(rr)
# run two instances of warcprox
opts = warcprox.Options()
@ -836,7 +832,7 @@ def test_warcprox_outage_resiliency(httpd):
# the system, if any
try:
stop_service("warcprox")
except Exception as e:
except Exception:
logger.warning("problem stopping warcprox service: %s", exc_info=True)
# queue the site for brozzling
@ -917,7 +913,6 @@ def test_warcprox_outage_resiliency(httpd):
def test_time_limit(httpd):
test_id = "test_time_limit-%s" % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker("localhost", db="brozzler")
frontier = brozzler.RethinkDbFrontier(rr)
@ -928,7 +923,6 @@ def test_time_limit(httpd):
sites = list(frontier.job_sites(job.id))
assert len(sites) == 1
site = sites[0]
# time limit should be enforced pretty soon
start = time.time()
@ -986,7 +980,7 @@ def test_ydl_stitching(httpd):
time.sleep(2) # in case warcprox hasn't finished processing urls
# take a look at the captures table
captures = list(rr.table("captures").filter({"test_id": test_id}).run())
l = [c for c in captures if c["url"] == stitched_url]
l = [c for c in captures if c["url"] == stitched_url] # noqa: E741
assert len(l) == 1
c = l[0]
assert c["filename"].startswith("test_ydl_stitching")

View File

@ -559,7 +559,7 @@ def test_parent_url_scoping():
assert parent_page.outlinks["accepted"] == outlinks
# parent page redirect_url matches accept parent_url_regex
parent_page_c = brozzler.Page(
brozzler.Page(
rr,
{
"site_id": site.id,
@ -606,7 +606,7 @@ def test_parent_url_scoping():
assert parent_page.outlinks["accepted"] == []
# parent page redirect_url matches block parent_url_regex
parent_page_c = brozzler.Page(
brozzler.Page(
rr,
{
"site_id": site.id,
@ -659,10 +659,10 @@ def test_completed_page():
]
}
assert page.brozzle_count == 1
assert page.claimed == False
assert page.claimed is False
page.refresh()
assert page.brozzle_count == 1
assert page.claimed == False
assert page.claimed is False
# redirect that doesn't change scope surt because destination is covered by
# the original surt
@ -686,10 +686,10 @@ def test_completed_page():
site.refresh()
assert site.scope == {"accepts": [{"ssurt": "com,example,//http:/a/"}]}
assert page.brozzle_count == 1
assert page.claimed == False
assert page.claimed is False
page.refresh()
assert page.brozzle_count == 1
assert page.claimed == False
assert page.claimed is False
# redirect that doesn't change scope surt because page is not the seed page
site = brozzler.Site(rr, {"seed": "http://example.com/a/"})
@ -712,10 +712,10 @@ def test_completed_page():
site.refresh()
assert site.scope == {"accepts": [{"ssurt": "com,example,//http:/a/"}]}
assert page.brozzle_count == 1
assert page.claimed == False
assert page.claimed is False
page.refresh()
assert page.brozzle_count == 1
assert page.claimed == False
assert page.claimed is False
def test_seed_page():
@ -931,7 +931,7 @@ def test_max_claimed_sites():
claimed_sites = frontier.claim_sites(3)
assert len(claimed_sites) == 2
with pytest.raises(brozzler.NothingToClaim):
claimed_site = frontier.claim_sites(3)
frontier.claim_sites(3)
# clean slate for the next one
rr.table("jobs").delete().run()
@ -1074,7 +1074,7 @@ def test_max_hops_off():
site.refresh() # get it back from the db
# renamed this param
assert not "max_hops_off_surt" in site.scope
assert "max_hops_off_surt" not in site.scope
assert site.scope["max_hops_off"] == 1
seed_page = frontier.seed_page(site.id)
@ -1109,7 +1109,7 @@ def test_max_hops_off():
assert len(pages) == 4
assert pages[0].url == "http://example.com/"
assert pages[0].hops_off == 0
assert not "hops_off_surt" in pages[0]
assert "hops_off_surt" not in pages[0]
assert set(pages[0].outlinks["accepted"]) == {
"https://example.com/toot",
"http://foo.org/",

View File

@ -21,7 +21,6 @@ import datetime
import http.server
import os
import socket
import sys
import tempfile
import threading
import time
@ -29,7 +28,6 @@ import uuid
from unittest import mock
import pytest
import requests
import yaml
import brozzler
@ -291,7 +289,7 @@ def test_proxy_down():
)
# youtube-dl fetch
with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
with tempfile.TemporaryDirectory(prefix="brzl-ydl-"):
with pytest.raises(brozzler.ProxyError):
brozzler.ydl.do_youtube_dl(worker, site, page)
@ -315,7 +313,7 @@ def test_start_stop_backwards_compat():
assert len(site.starts_and_stops) == 1
assert site.starts_and_stops[0]["start"]
assert site.starts_and_stops[0]["stop"] is None
assert not "start_time" in site
assert "start_time" not in site
site = brozzler.Site(
None,
@ -324,13 +322,13 @@ def test_start_stop_backwards_compat():
assert len(site.starts_and_stops) == 1
assert site.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
assert site.starts_and_stops[0]["stop"] is None
assert not "start_time" in site
assert "start_time" not in site
job = brozzler.Job(None, {"seeds": [{"url": "https://example.com/"}]})
assert job.starts_and_stops[0]["start"]
assert job.starts_and_stops[0]["stop"] is None
assert not "started" in job
assert not "finished" in job
assert "started" not in job
assert "finished" not in job
job = brozzler.Job(
None,
@ -342,8 +340,8 @@ def test_start_stop_backwards_compat():
)
assert job.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
assert job.starts_and_stops[0]["stop"] == datetime.datetime(2017, 1, 2)
assert not "started" in job
assert not "finished" in job
assert "started" not in job
assert "finished" not in job
class Exception1(Exception):
@ -452,9 +450,9 @@ def test_thread_raise_second_with_block():
with brozzler.thread_accept_exceptions():
time.sleep(2)
return # test fails
except Exception1 as e:
except Exception1:
pass
except:
except: # noqa: E722
return # fail test
try:

View File

@ -32,7 +32,7 @@ import sys
try:
from shlex import quote
except:
except: # noqa: E722
from pipes import quote