diff --git a/brozzler/__init__.py b/brozzler/__init__.py
index 4062fbd..2150190 100644
--- a/brozzler/__init__.py
+++ b/brozzler/__init__.py
@@ -17,10 +17,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
+import datetime
 import logging
+import threading
 from importlib.metadata import version as _version
 
 import structlog
+import urlcanon
 
 __version__ = _version("brozzler")
 
@@ -91,7 +94,7 @@ def _logging_handler_handle(self, record):
         finally:
             try:
                 self.release()
-            except:
+            except:  # noqa: E722
                 pass
     return rv
 
@@ -108,7 +111,6 @@ def behaviors(behaviors_dir=None):
     `js-templates/`. Defaults to brozzler dir.
     """
    import os
-    import string
 
     import yaml
 
@@ -125,7 +127,6 @@ def behavior_script(url, template_parameters=None, behaviors_dir=None):
     """
     Returns the javascript behavior string populated with template_parameters.
     """
-    import json
     import re
 
     logger = structlog.get_logger(logger_name=__name__)
 
@@ -194,8 +195,6 @@ class ThreadExceptionGate:
         return "<ThreadExceptionGate(%s)>" % self.thread
 
 
-import threading
-
 _thread_exception_gates = {}
 _thread_exception_gates_lock = threading.Lock()
 
@@ -225,7 +224,7 @@ def thread_exception_gate(thread=None):
         thread = threading.current_thread()
 
     with _thread_exception_gates_lock:
-        if not thread in _thread_exception_gates:
+        if thread not in _thread_exception_gates:
             _thread_exception_gates[thread] = ThreadExceptionGate(thread)
 
     return _thread_exception_gates[thread]
 
@@ -252,7 +251,6 @@ def thread_raise(thread, exctype):
     """
     import ctypes
     import inspect
-    import threading
 
     import structlog
 
@@ -322,9 +320,6 @@ def jinja2_environment(behaviors_dir=None):
     return _jinja2_env
 
 
-import urlcanon
-
-
 def _remove_query(url):
     url.question_mark = b""
     url.query = b""
@@ -403,13 +398,10 @@ def suggest_default_chrome_exe():
     return "chromium-browser"
 
 
-import datetime
-
 EPOCH_UTC = datetime.datetime.fromtimestamp(0.0, tz=datetime.timezone.utc)
 
-
-from brozzler.browser import Browser, BrowserPool, BrowsingException
-from brozzler.robots import is_permitted_by_robots
+from brozzler.browser import Browser, BrowserPool, BrowsingException  # noqa: E402
+from brozzler.robots import is_permitted_by_robots  # noqa: E402
 
 __all__ = [
     "is_permitted_by_robots",
@@ -422,22 +414,25 @@ __all__ = [
     "suggest_default_chrome_exe",
 ]
 
+# TODO try using importlib.util.find_spec to test for dependency presence
+# rather than try/except on import.
+# See https://docs.astral.sh/ruff/rules/unused-import/#example
 try:
-    import doublethink
+    import doublethink  # noqa: F401
 
     # All of these imports use doublethink for real and are unsafe
     # to do if doublethink is unavailable.
-    from brozzler.frontier import RethinkDbFrontier
+    from brozzler.frontier import RethinkDbFrontier  # noqa: F401
     from brozzler.model import (
-        InvalidJobConf,
-        Job,
-        Page,
-        Site,
-        new_job,
-        new_job_file,
-        new_site,
+        InvalidJobConf,  # noqa: F401
+        Job,  # noqa: F401
+        Page,  # noqa: F401
+        Site,  # noqa: F401
+        new_job,  # noqa: F401
+        new_job_file,  # noqa: F401
+        new_site,  # noqa: F401
     )
-    from brozzler.worker import BrozzlerWorker
+    from brozzler.worker import BrozzlerWorker  # noqa: F401
 
     __all__.extend(
         [
diff --git a/brozzler/browser.py b/brozzler/browser.py
index 6b872e4..4e42438 100644
--- a/brozzler/browser.py
+++ b/brozzler/browser.py
@@ -18,7 +18,6 @@ limitations under the License.
 
 import base64
 import datetime
-import itertools
 import json
 import logging
 import socket
@@ -213,7 +212,7 @@ class WebsockReceiverThread(threading.Thread):
     def _on_message(self, websock, message):
         try:
             self._handle_message(websock, message)
-        except:
+        except:  # noqa: E722
             self.logger.exception(
                 "uncaught exception in _handle_message",
                 message=message,
@@ -430,7 +429,7 @@ class Browser:
             self.logger.info("shutting down websocket connection")
             try:
                 self.websock.close()
-            except BaseException as e:
+            except BaseException:
                 self.logger.exception(
                     "exception closing websocket", websocket=self.websock
                 )
@@ -458,7 +457,7 @@ class Browser:
                 )
                 self.websock_url = None
 
-        except:
+        except:  # noqa: E722
             self.logger.exception("problem stopping")
 
     def is_running(self):
@@ -628,7 +627,7 @@ class Browser:
                 jpeg_bytes = self.screenshot(full_page)
                 on_screenshot(jpeg_bytes)
                 return
-            except BrowsingTimeout as e:
+            except BrowsingTimeout:
                 self.logger.exception("attempt %s/3", i + 1)
 
     def visit_hashtags(self, page_url, hashtags, outlinks):
@@ -807,12 +806,12 @@ class Browser:
             if (
                 msg
                 and "result" in msg
-                and not ("exceptionDetails" in msg["result"])
+                and "exceptionDetails" not in msg["result"]
                 and not (
                     "wasThrown" in msg["result"] and msg["result"]["wasThrown"]
                 )
                 and "result" in msg["result"]
-                and type(msg["result"]["result"]["value"]) == bool
+                and isinstance(msg["result"]["result"]["value"], bool)
                 and msg["result"]["result"]["value"]
             ):
                 self.logger.info("behavior decided it has finished")
diff --git a/brozzler/chrome.py b/brozzler/chrome.py
index 7dd6066..3332d71 100644
--- a/brozzler/chrome.py
+++ b/brozzler/chrome.py
@@ -265,7 +265,7 @@ class Chrome:
                     return url
         except brozzler.ShutdownRequested:
             raise
-        except Exception as e:
+        except Exception:
             if time.time() - self._last_warning > 30:
                 url_logger.warning(
                     "problem accessing url (will keep trying until timeout)",
@@ -325,7 +325,7 @@ class Chrome:
                     self.logger.debug(
                         "chrome pid %s STDERR %s", self.chrome_process.pid, buf
                     )
-        except:
+        except:  # noqa: E722
             self.logger.exception("unexpected exception")
 
     def stop(self):
@@ -378,7 +378,7 @@ class Chrome:
                 self.chrome_process.stderr.close()
             try:
                 self._home_tmpdir.cleanup()
-            except:
+            except:  # noqa: E722
                 self.logger.exception(
                     "exception deleting self._home_tmpdir", tmpdir=self._home_tmpdir
                 )
diff --git a/brozzler/cli.py b/brozzler/cli.py
index 1fe5318..f18ad14 100755
--- a/brozzler/cli.py
+++ b/brozzler/cli.py
@@ -23,12 +23,10 @@ import datetime
 import json
 import logging
 import os
-import re
 import signal
 import string
 import sys
 import threading
-import time
 import traceback
 import warnings
 
@@ -397,9 +395,9 @@ def brozzle_page(argv=None):
             enable_youtube_dl=not worker._skip_youtube_dl,
         )
         logger.info("outlinks", outlinks=sorted(outlinks))
-    except brozzler.ReachedLimit as e:
+    except brozzler.ReachedLimit:
         logger.exception("reached limit")
-    except brozzler.PageInterstitialShown as e:
+    except brozzler.PageInterstitialShown:
         logger.exception("page interstitial shown")
     finally:
         browser.stop()
@@ -661,7 +659,7 @@ def brozzler_worker(argv=None):
                 logger.info(
                     "dumping state (caught signal)\n%s", signal=signum, state=state_strs
                 )
-        except BaseException as e:
+        except BaseException:
             logger.exception("exception dumping state")
         finally:
             signal.signal(signal.SIGQUIT, dump_state)
@@ -672,11 +670,11 @@ def brozzler_worker(argv=None):
         try:
             # make set from seed IDs in SKIP_AV_SEEDS_FILE
             with open(SKIP_AV_SEEDS_FILE) as skips:
-                skip_av_seeds = {int(l) for l in skips.readlines()}
+                skip_av_seeds = {int(line) for line in skips.readlines()}
             logger.info(
                 "running with skip_av_seeds file", skip_av_seeds=SKIP_AV_SEEDS_FILE
             )
-        except Exception as e:
+        except Exception:
             skip_av_seeds = set()
             logger.info("running with empty skip_av_seeds")
         return skip_av_seeds
@@ -686,13 +684,13 @@ def brozzler_worker(argv=None):
         try:
             # make list from file
             with open(YTDLP_PROXY_ENDPOINTS_FILE) as endpoints:
-                ytdlp_proxy_endpoints = [l for l in endpoints.readlines()]
+                ytdlp_proxy_endpoints = [line for line in endpoints.readlines()]
                 if ytdlp_proxy_endpoints:
                     logger.info(
                         "running with ytdlp proxy endpoints file",
                         ytdlp_proxy_endpoints=YTDLP_PROXY_ENDPOINTS_FILE,
                     )
-        except Exception as e:
+        except Exception:
             ytdlp_proxy_endpoints = []
             logger.info("running with empty proxy endpoints file")
         return ytdlp_proxy_endpoints
@@ -1032,7 +1030,7 @@ def brozzler_purge(argv=None):
     configure_logging(args)
 
     rr = rethinker(args)
-    frontier = brozzler.RethinkDbFrontier(rr)
+    brozzler.RethinkDbFrontier(rr)
     if args.job:
         try:
             job_id = int(args.job)
diff --git a/brozzler/dashboard/__init__.py b/brozzler/dashboard/__init__.py
index 15a836c..13ea214 100644
--- a/brozzler/dashboard/__init__.py
+++ b/brozzler/dashboard/__init__.py
@@ -17,9 +17,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
+import base64
+import os
 import sys
 
+import doublethink
+import rethinkdb as rdb
 import structlog
+import yaml
 
 logger = structlog.get_logger(logger_name=__name__)
 
@@ -33,14 +38,6 @@ except ImportError as e:
         e,
     )
     sys.exit(1)
-import base64
-import importlib
-import json
-import os
-
-import doublethink
-import rethinkdb as rdb
-import yaml
 
 r = rdb.RethinkDB()
 
@@ -285,6 +282,8 @@ def root(path):
 
 
 try:
+    import logging
+
     import gunicorn.app.base
     import gunicorn.glogging
     from gunicorn.six import iteritems
diff --git a/brozzler/easy.py b/brozzler/easy.py
index fc213f8..d8e8762 100644
--- a/brozzler/easy.py
+++ b/brozzler/easy.py
@@ -18,10 +18,22 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
+import argparse
+import os
+import signal
+import socket
+import socketserver
 import sys
+import threading
+import time
+import traceback
 
+import doublethink
 import structlog
 
+import brozzler
+import brozzler.cli
+
 logger = structlog.get_logger(logger_name=__name__)
 
 try:
@@ -42,19 +54,6 @@ except ImportError as e:
         exc_info=True,
     )
     sys.exit(1)
-import argparse
-import os
-import signal
-import socket
-import socketserver
-import threading
-import time
-import traceback
-
-import doublethink
-
-import brozzler
-import brozzler.cli
 
 
 def _build_arg_parser(argv=None):
diff --git a/brozzler/frontier.py b/brozzler/frontier.py
index b37d5d1..6e1ac63 100644
--- a/brozzler/frontier.py
+++ b/brozzler/frontier.py
@@ -16,10 +16,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
""" -import datetime -import random -import time - import doublethink import rethinkdb as rdb import structlog @@ -47,11 +43,11 @@ class RethinkDbFrontier: db_logger = self.logger.bind(dbname=self.rr.dbname) dbs = self.rr.db_list().run() - if not self.rr.dbname in dbs: + if self.rr.dbname not in dbs: db_logger.info("creating rethinkdb database") self.rr.db_create(self.rr.dbname).run() tables = self.rr.table_list().run() - if not "sites" in tables: + if "sites" not in tables: db_logger.info("creating rethinkdb table 'sites' in database") self.rr.table_create( "sites", shards=self.shards, replicas=self.replicas @@ -60,7 +56,7 @@ class RethinkDbFrontier: "sites_last_disclaimed", [r.row["status"], r.row["last_disclaimed"]] ).run() self.rr.table("sites").index_create("job_id").run() - if not "pages" in tables: + if "pages" not in tables: db_logger.info("creating rethinkdb table 'pages' in database") self.rr.table_create( "pages", shards=self.shards, replicas=self.replicas @@ -80,7 +76,7 @@ class RethinkDbFrontier: "least_hops", [r.row["site_id"], r.row["brozzle_count"], r.row["hops_from_seed"]], ).run() - if not "jobs" in tables: + if "jobs" not in tables: db_logger.info("creating rethinkdb table 'jobs' in database") self.rr.table_create( "jobs", shards=self.shards, replicas=self.replicas @@ -352,7 +348,6 @@ class RethinkDbFrontier: site.save() def _build_fresh_page(self, site, parent_page, url, hops_off=0): - url_for_scoping = urlcanon.semantic(url) url_for_crawling = urlcanon.whatwg(url) hashtag = (url_for_crawling.hash_sign + url_for_crawling.fragment).decode( "utf-8" @@ -461,8 +456,8 @@ class RethinkDbFrontier: # "rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:" # there can be many pages and each one can be very large (many videos, # in and out of scope links, etc) - l = list(pages.values()) - for batch in (l[i : i + 50] for i in range(0, len(l), 50)): + pages_list = list(pages.values()) + for batch in (pages_list[i : i + 50] for i in range(0, len(pages_list), 50)): try: self.logger.debug("inserting/replacing batch of %s pages", len(batch)) reql = self.rr.table("pages").insert(batch, conflict="replace") @@ -471,8 +466,8 @@ class RethinkDbFrontier: 'conflict="replace")', batch, ) - result = reql.run() - except Exception as e: + reql.run() + except Exception: self.logger.exception( "problem inserting/replacing batch of %s pages", len(batch), diff --git a/brozzler/model.py b/brozzler/model.py index 49a9905..38b371a 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -19,12 +19,9 @@ limitations under the License. import base64 import copy -import datetime import hashlib import json import os -import re -import time import urllib import uuid import zlib @@ -61,7 +58,7 @@ class InvalidJobConf(Exception): # debugged, I found it here. Maybe there's a better way to see it. 
             value = validator._errors[0].info[0][0].info[0][0].value
             self.errors["bad value"] = value
-        except:
+        except:  # noqa: E722
             value = None
 
 
@@ -122,10 +119,10 @@ def new_job(frontier, job_conf):
     # rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:
     for batch in (pages[i : i + 500] for i in range(0, len(pages), 500)):
         logger.info("inserting batch of %s pages", len(batch))
-        result = frontier.rr.table("pages").insert(batch).run()
+        frontier.rr.table("pages").insert(batch).run()
     for batch in (sites[i : i + 100] for i in range(0, len(sites), 100)):
         logger.info("inserting batch of %s sites", len(batch))
-        result = frontier.rr.table("sites").insert(batch).run()
+        frontier.rr.table("sites").insert(batch).run()
 
     logger.info("job fully started", job_id=job.id)
     return job
@@ -200,9 +197,9 @@ class Job(doublethink.Document, ElapsedMixIn):
     table = "jobs"
 
     def populate_defaults(self):
-        if not "status" in self:
+        if "status" not in self:
             self.status = "ACTIVE"
-        if not "starts_and_stops" in self:
+        if "starts_and_stops" not in self:
             if self.get("started"):  # backward compatibility
                 self.starts_and_stops = [
                     {"start": self.get("started"), "stop": self.get("finished")}
                 ]
@@ -229,28 +226,28 @@ class Site(doublethink.Document, ElapsedMixIn):
     table = "sites"
 
     def populate_defaults(self):
-        if not "status" in self:
+        if "status" not in self:
             self.status = "ACTIVE"
-        if not "claimed" in self:
+        if "claimed" not in self:
             self.claimed = False
-        if not "last_disclaimed" in self:
+        if "last_disclaimed" not in self:
             self.last_disclaimed = brozzler.EPOCH_UTC
-        if not "last_claimed" in self:
+        if "last_claimed" not in self:
             self.last_claimed = brozzler.EPOCH_UTC
-        if not "scope" in self:
+        if "scope" not in self:
             self.scope = {}
-        if not "skip_ytdlp" in self:
+        if "skip_ytdlp" not in self:
             self.skip_ytdlp = None
 
         # backward compatibility
         if "surt" in self.scope:
-            if not "accepts" in self.scope:
+            if "accepts" not in self.scope:
                 self.scope["accepts"] = []
             self.scope["accepts"].append({"surt": self.scope["surt"]})
             del self.scope["surt"]
 
         # backward compatibility
-        if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope:
+        if "max_hops_off_surt" in self.scope and "max_hops_off" not in self.scope:
             self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
         if "max_hops_off_surt" in self.scope:
             del self.scope["max_hops_off_surt"]
@@ -260,7 +257,7 @@ class Site(doublethink.Document, ElapsedMixIn):
                 brozzler.site_surt_canon(self.seed).ssurt().decode("ascii")
             )
 
-        if not "starts_and_stops" in self:
+        if "starts_and_stops" not in self:
             if self.get("start_time"):  # backward compatibility
                 self.starts_and_stops = [
                     {"start": self.get("start_time"), "stop": None}
@@ -275,7 +272,7 @@ class Site(doublethink.Document, ElapsedMixIn):
         return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)
 
     def _accept_ssurt_if_not_redundant(self, ssurt):
-        if not "accepts" in self.scope:
+        if "accepts" not in self.scope:
             self.scope["accepts"] = []
         simple_rule_ssurts = (
             rule["ssurt"]
@@ -334,7 +331,7 @@ class Site(doublethink.Document, ElapsedMixIn):
         if not isinstance(url, urlcanon.ParsedUrl):
             url = urlcanon.semantic(url)
 
-        if not url.scheme in (b"http", b"https"):
+        if url.scheme not in (b"http", b"https"):
             # XXX doesn't belong here maybe (where? worker ignores unknown
             # schemes?)
             return False
@@ -390,31 +387,31 @@ class Page(doublethink.Document):
         return hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
 
     def populate_defaults(self):
-        if not "retry_after" in self:
+        if "retry_after" not in self:
             self.retry_after = None
-        if not "failed_attempts" in self:
+        if "failed_attempts" not in self:
             self.failed_attempts = 0
-        if not "hops_from_seed" in self:
+        if "hops_from_seed" not in self:
             self.hops_from_seed = 0
-        if not "hop_path" in self:
+        if "hop_path" not in self:
             self.hop_path = None
-        if not "via_page_url" in self:
+        if "via_page_url" not in self:
             self.via_page_url = None
-        if not "brozzle_count" in self:
+        if "brozzle_count" not in self:
             self.brozzle_count = 0
-        if not "claimed" in self:
+        if "claimed" not in self:
             self.claimed = False
-        if "hops_off_surt" in self and not "hops_off" in self:
+        if "hops_off_surt" in self and "hops_off" not in self:
             self.hops_off = self.hops_off_surt
         if "hops_off_surt" in self:
             del self["hops_off_surt"]
-        if not "hops_off" in self:
+        if "hops_off" not in self:
             self.hops_off = 0
-        if not "needs_robots_check" in self:
+        if "needs_robots_check" not in self:
             self.needs_robots_check = False
-        if not "priority" in self:
+        if "priority" not in self:
             self.priority = self._calc_priority()
-        if not "id" in self:
+        if "id" not in self:
             self.id = self.compute_id(self.site_id, self.url)
 
     def __str__(self):
diff --git a/brozzler/pywb.py b/brozzler/pywb.py
index b4b6280..2f70614 100644
--- a/brozzler/pywb.py
+++ b/brozzler/pywb.py
@@ -18,9 +18,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
+import argparse
+import json
 import sys
 
+import doublethink
+import rethinkdb as rdb
 import structlog
+import urlcanon
+
+import brozzler
 
 logger = structlog.get_logger(logger_name=__name__)
 
@@ -40,14 +47,7 @@ except ImportError as e:
         e,
     )
     sys.exit(1)
-import argparse
-import json
-
-import doublethink
-import rethinkdb as rdb
-import urlcanon
-
-import brozzler
 
 r = rdb.RethinkDB()
 
@@ -137,7 +137,7 @@ class TheGoodUrlCanonicalizer(object):
             key = urlcanon.semantic(url).surt().decode("ascii")
             # logging.debug('%s -> %s', url, key)
             return key
-        except Exception as e:
+        except Exception:
             return url
 
     def replace_default_canonicalizer():
@@ -221,18 +221,9 @@ def support_in_progress_warcs():
 
 class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
     def __init__(self, orig_url):
-        import re
-
         import six
         from pywb.rewrite.wburl import WbUrl
-        from pywb.utils.loaders import to_native_str
-        from six.moves.urllib.parse import (
-            quote,
-            quote_plus,
-            unquote_plus,
-            urlsplit,
-            urlunsplit,
-        )
+        from six.moves.urllib.parse import quote
 
         pywb.rewrite.wburl.BaseWbUrl.__init__(self)
 
@@ -320,7 +311,6 @@ def _fuzzy_query_call(self, query):
     urlkey = to_native_str(query.key, "utf-8")
     url = query.url
     filter_ = query.filters
-    output = query.output
 
     for rule in self.rules.iter_matching(urlkey):
         m = rule.regex.search(urlkey)
diff --git a/brozzler/robots.py b/brozzler/robots.py
index 7f249dd..2e0bd67 100644
--- a/brozzler/robots.py
+++ b/brozzler/robots.py
@@ -71,7 +71,7 @@ _robots_caches = {}  # {site_id:reppy.cache.RobotsCache}
 
 
 def _robots_cache(site, proxy=None):
-    if not site.id in _robots_caches:
+    if site.id not in _robots_caches:
         req_sesh = _SessionRaiseOn420()
         req_sesh.verify = False  # ignore cert errors
         if proxy:
diff --git a/brozzler/worker.py b/brozzler/worker.py
index 3a58bfa..807195b 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -21,9 +21,7 @@ limitations under the License.
 import datetime
 import io
 import json
-import random
 import socket
-import tempfile
 import threading
 import time
 import urllib.request
@@ -99,9 +97,13 @@ class BrozzlerWorker:
         self._skip_visit_hashtags = skip_visit_hashtags
         self._skip_youtube_dl = skip_youtube_dl
 
+        # TODO try using importlib.util.find_spec to test for dependency
+        # presence rather than try/except on import.
+        # See https://docs.astral.sh/ruff/rules/unused-import/#example
+        # We definitely shouldn't ytdlp if the optional extra is missing
         try:
-            import yt_dlp
+            import yt_dlp  # noqa: F401
         except ImportError:
             self.logger.info(
                 "optional yt-dlp extra not installed; setting skip_youtube_dl to True"
             )
@@ -200,7 +202,7 @@ class BrozzlerWorker:
                 response = requests.get("http://%s/status" % self._proxy)
                 status = json.loads(response.text)
                 self._proxy_is_warcprox = status["role"] == "warcprox"
-            except Exception as e:
+            except Exception:
                 self._proxy_is_warcprox = False
             self.logger.info(
                 "%s %s warcprox",
@@ -348,13 +350,13 @@ class BrozzlerWorker:
                     )
                     metrics.brozzler_ydl_urls_checked.inc(1)
                     outlinks.update(ydl_outlinks)
-            except brozzler.ReachedLimit as e:
+            except brozzler.ReachedLimit:
                 raise
             except brozzler.ShutdownRequested:
                 raise
             except brozzler.ProxyError:
                 raise
-            except brozzler.VideoExtractorError as e:
+            except brozzler.VideoExtractorError:
                 self.logger.exception("error extracting video info")
             except Exception as e:
                 if (
@@ -391,9 +393,9 @@ class BrozzlerWorker:
                 timeout=self.HEADER_REQUEST_TIMEOUT,
             ) as r:
                 return r.headers
-        except requests.exceptions.Timeout as e:
+        except requests.exceptions.Timeout:
             url_logger.warning("Timed out trying to get headers", exc_info=True)
-        except requests.exceptions.RequestException as e:
+        except requests.exceptions.RequestException:
             url_logger.warning("Failed to get headers", exc_info=True)
         return {}
 
@@ -469,7 +471,7 @@ class BrozzlerWorker:
             if "content-range" in response_headers:
                 video["content-range"] = response_headers["content-range"]
             self.logger.debug("embedded video", video=video)
-            if not "videos" in page:
+            if "videos" not in page:
                 page.videos = []
             page.videos.append(video)
 
@@ -598,13 +600,13 @@ class BrozzlerWorker:
                 site_logger.info("no pages left for site")
         except brozzler.ReachedLimit as e:
             self._frontier.reached_limit(site, e)
-        except brozzler.ReachedTimeLimit as e:
+        except brozzler.ReachedTimeLimit:
             self._frontier.finished(site, "FINISHED_TIME_LIMIT")
         except brozzler.CrawlStopped:
             self._frontier.finished(site, "FINISHED_STOP_REQUESTED")
         # except brozzler.browser.BrowsingAborted:
         #     self.logger.info("{} shut down".format(browser))
-        except brozzler.ProxyError as e:
+        except brozzler.ProxyError:
             if self._warcprox_auto:
                 self.logger.exception(
                     "proxy error, will try to choose a "
@@ -676,7 +678,7 @@ class BrozzlerWorker:
         try:
             self.status_info = self._service_registry.heartbeat(status_info)
             self.logger.debug("status in service registry", status=self.status_info)
-        except r.ReqlError as e:
+        except r.ReqlError:
             self.logger.exception(
                 "failed to send heartbeat and update service registry",
                 info=status_info,
@@ -748,11 +750,11 @@ class BrozzlerWorker:
                     time.sleep(0.5)
             self.logger.warn("shutdown requested")
 
-        except r.ReqlError as e:
+        except r.ReqlError:
             self.logger.exception("caught rethinkdb exception, will try to proceed")
         except brozzler.ShutdownRequested:
             self.logger.info("shutdown requested")
-        except:
+        except:  # noqa: E722
             self.logger.critical(
                 "thread exiting due to unexpected exception", exc_info=True
             )
@@ -760,7 +762,7 @@ class BrozzlerWorker:
             if self._service_registry and hasattr(self, "status_info"):
                 try:
                     self._service_registry.unregister(self.status_info["id"])
-                except:
+                except:  # noqa: E722
                     self.logger.exception("failed to unregister from service registry")
 
             self.logger.info(
diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index f81d551..df2fe82 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -101,7 +101,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
             if result_type in ("url", "url_transparent"):
                 if "extraction_depth" in extra_info:
                     self.logger.info(
-                        f"Following redirect",
+                        "Following redirect",
                         redirect_url=ie_result["url"],
                         extraction_depth=extra_info["extraction_depth"],
                     )
@@ -136,7 +136,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
             # use it later to extract the watch pages as outlinks.
             try:
                 ie_result["entries_no_dl"] = list(ie_result["entries"])
-            except Exception as e:
+            except Exception:
                 extract_context.warning(
                     "failed to unroll entries ie_result['entries']?",
                     exc_info=True,
@@ -166,7 +166,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
                     import magic
 
                     mimetype = magic.from_file(info_dict["filepath"], mime=True)
-                except ImportError as e:
+                except ImportError:
                     mimetype = "video/%s" % info_dict["ext"]
                     self.logger.warning(
                         "guessing mimetype due to error",
@@ -236,7 +236,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
                     )
                     site.last_claimed = doublethink.utcnow()
                     site.save()
-            except:
+            except:  # noqa: E722
                 worker.logger.debug(
                     "problem heartbeating site.last_claimed site",
                     id=site.id,
@@ -316,7 +316,7 @@ def _remember_videos(page, pushed_videos=None):
     """
     Saves info about videos captured by yt-dlp in `page.videos`.
     """
-    if not "videos" in page:
+    if "videos" not in page:
         page.videos = []
     for pushed_video in pushed_videos or []:
         video = {
@@ -351,7 +351,7 @@ def _try_youtube_dl(worker, ydl, site, page):
             )
             metrics.brozzler_ydl_extract_successes.labels(ydl.is_youtube_host).inc(1)
             break
-        except brozzler.ShutdownRequested as e:
+        except brozzler.ShutdownRequested:
             raise
         except Exception as e:
             if (
diff --git a/tests/test_brozzling.py b/tests/test_brozzling.py
index 03124a4..3a62919 100755
--- a/tests/test_brozzling.py
+++ b/tests/test_brozzling.py
@@ -110,7 +110,7 @@ def test_httpd(httpd):
     of the same url return the same payload, proving it can be used to test
     deduplication.
""" - payload1 = content2 = None + payload1 = None url = "http://localhost:%s/site1/file1.txt" % httpd.server_port with urllib.request.urlopen(url) as response: assert response.status == 200 @@ -175,7 +175,6 @@ def test_420(httpd): def test_js_dialogs(httpd): chrome_exe = brozzler.suggest_default_chrome_exe() - url = "http://localhost:%s/site4/alert.html" % httpd.server_port with brozzler.Browser(chrome_exe=chrome_exe) as browser: # before commit d2ed6b97a24 these would hang and eventually raise # brozzler.browser.BrowsingTimeout, which would cause this test to fail diff --git a/tests/test_cli.py b/tests/test_cli.py index 3c8f0c5..3f9c382 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -40,11 +40,11 @@ def cli_commands(): commands = set(console_scripts().keys()) commands.remove("brozzler-wayback") try: - import gunicorn + import gunicorn # noqa: F401 except ImportError: commands.remove("brozzler-dashboard") try: - import pywb + import pywb # noqa: F401 except ImportError: commands.remove("brozzler-easy") return commands diff --git a/tests/test_cluster.py b/tests/test_cluster.py index 5949fd3..58e9d13 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -23,7 +23,6 @@ import http.server import os import socket import subprocess -import sys import threading import time import urllib.request @@ -47,7 +46,7 @@ def _local_address(): try: s.connect(("10.255.255.255", 1)) # ip doesn't need to be reachable return s.getsockname()[0] - except: + except: # noqa: E722 return "127.0.0.1" finally: s.close() @@ -148,7 +147,7 @@ def test_httpd(httpd): of the same url return the same payload, proving it can be used to test deduplication. """ - payload1 = content2 = None + payload1 = None url = make_url(httpd, "/site1/file1.txt") with urllib.request.urlopen(url) as response: assert response.status == 200 @@ -351,8 +350,8 @@ def test_warcprox_auto(httpd): def test_proxy_conflict(): - with pytest.raises(AssertionError) as excinfo: - worker = brozzler.worker.BrozzlerWorker( + with pytest.raises(AssertionError): + brozzler.worker.BrozzlerWorker( None, None, warcprox_auto=True, proxy="localhost:12345" ) @@ -523,7 +522,6 @@ def test_login(httpd): # take a look at the captures table time.sleep(2) # in case warcprox hasn't finished processing urls - robots_url = make_url(httpd, "/robots.txt") captures = list( rr.table("captures").filter({"test_id": test_id}).order_by("timestamp").run() ) @@ -730,7 +728,6 @@ def test_redirect_hashtags(httpd): def test_stop_crawl(httpd): - test_id = "test_stop_crawl_job-%s" % datetime.datetime.utcnow().isoformat() rr = doublethink.Rethinker("localhost", db="brozzler") frontier = brozzler.RethinkDbFrontier(rr) @@ -804,7 +801,6 @@ def test_warcprox_outage_resiliency(httpd): """ rr = doublethink.Rethinker("localhost", db="brozzler") frontier = brozzler.RethinkDbFrontier(rr) - svcreg = doublethink.ServiceRegistry(rr) # run two instances of warcprox opts = warcprox.Options() @@ -836,7 +832,7 @@ def test_warcprox_outage_resiliency(httpd): # the system, if any try: stop_service("warcprox") - except Exception as e: + except Exception: logger.warning("problem stopping warcprox service: %s", exc_info=True) # queue the site for brozzling @@ -917,7 +913,6 @@ def test_warcprox_outage_resiliency(httpd): def test_time_limit(httpd): - test_id = "test_time_limit-%s" % datetime.datetime.utcnow().isoformat() rr = doublethink.Rethinker("localhost", db="brozzler") frontier = brozzler.RethinkDbFrontier(rr) @@ -928,7 +923,6 @@ def test_time_limit(httpd): sites = 
list(frontier.job_sites(job.id)) assert len(sites) == 1 - site = sites[0] # time limit should be enforced pretty soon start = time.time() @@ -986,7 +980,7 @@ def test_ydl_stitching(httpd): time.sleep(2) # in case warcprox hasn't finished processing urls # take a look at the captures table captures = list(rr.table("captures").filter({"test_id": test_id}).run()) - l = [c for c in captures if c["url"] == stitched_url] + l = [c for c in captures if c["url"] == stitched_url] # noqa: E741 assert len(l) == 1 c = l[0] assert c["filename"].startswith("test_ydl_stitching") diff --git a/tests/test_frontier.py b/tests/test_frontier.py index 31628c4..a29e8e2 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -559,7 +559,7 @@ def test_parent_url_scoping(): assert parent_page.outlinks["accepted"] == outlinks # parent page redirect_url matches accept parent_url_regex - parent_page_c = brozzler.Page( + brozzler.Page( rr, { "site_id": site.id, @@ -606,7 +606,7 @@ def test_parent_url_scoping(): assert parent_page.outlinks["accepted"] == [] # parent page redirect_url matches block parent_url_regex - parent_page_c = brozzler.Page( + brozzler.Page( rr, { "site_id": site.id, @@ -659,10 +659,10 @@ def test_completed_page(): ] } assert page.brozzle_count == 1 - assert page.claimed == False + assert page.claimed is False page.refresh() assert page.brozzle_count == 1 - assert page.claimed == False + assert page.claimed is False # redirect that doesn't change scope surt because destination is covered by # the original surt @@ -686,10 +686,10 @@ def test_completed_page(): site.refresh() assert site.scope == {"accepts": [{"ssurt": "com,example,//http:/a/"}]} assert page.brozzle_count == 1 - assert page.claimed == False + assert page.claimed is False page.refresh() assert page.brozzle_count == 1 - assert page.claimed == False + assert page.claimed is False # redirect that doesn't change scope surt because page is not the seed page site = brozzler.Site(rr, {"seed": "http://example.com/a/"}) @@ -712,10 +712,10 @@ def test_completed_page(): site.refresh() assert site.scope == {"accepts": [{"ssurt": "com,example,//http:/a/"}]} assert page.brozzle_count == 1 - assert page.claimed == False + assert page.claimed is False page.refresh() assert page.brozzle_count == 1 - assert page.claimed == False + assert page.claimed is False def test_seed_page(): @@ -931,7 +931,7 @@ def test_max_claimed_sites(): claimed_sites = frontier.claim_sites(3) assert len(claimed_sites) == 2 with pytest.raises(brozzler.NothingToClaim): - claimed_site = frontier.claim_sites(3) + frontier.claim_sites(3) # clean slate for the next one rr.table("jobs").delete().run() @@ -1074,7 +1074,7 @@ def test_max_hops_off(): site.refresh() # get it back from the db # renamed this param - assert not "max_hops_off_surt" in site.scope + assert "max_hops_off_surt" not in site.scope assert site.scope["max_hops_off"] == 1 seed_page = frontier.seed_page(site.id) @@ -1109,7 +1109,7 @@ def test_max_hops_off(): assert len(pages) == 4 assert pages[0].url == "http://example.com/" assert pages[0].hops_off == 0 - assert not "hops_off_surt" in pages[0] + assert "hops_off_surt" not in pages[0] assert set(pages[0].outlinks["accepted"]) == { "https://example.com/toot", "http://foo.org/", diff --git a/tests/test_units.py b/tests/test_units.py index 33979bc..55399de 100644 --- a/tests/test_units.py +++ b/tests/test_units.py @@ -21,7 +21,6 @@ import datetime import http.server import os import socket -import sys import tempfile import threading import time @@ -29,7 +28,6 
@@ import uuid from unittest import mock import pytest -import requests import yaml import brozzler @@ -291,7 +289,7 @@ def test_proxy_down(): ) # youtube-dl fetch - with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir: + with tempfile.TemporaryDirectory(prefix="brzl-ydl-"): with pytest.raises(brozzler.ProxyError): brozzler.ydl.do_youtube_dl(worker, site, page) @@ -315,7 +313,7 @@ def test_start_stop_backwards_compat(): assert len(site.starts_and_stops) == 1 assert site.starts_and_stops[0]["start"] assert site.starts_and_stops[0]["stop"] is None - assert not "start_time" in site + assert "start_time" not in site site = brozzler.Site( None, @@ -324,13 +322,13 @@ def test_start_stop_backwards_compat(): assert len(site.starts_and_stops) == 1 assert site.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1) assert site.starts_and_stops[0]["stop"] is None - assert not "start_time" in site + assert "start_time" not in site job = brozzler.Job(None, {"seeds": [{"url": "https://example.com/"}]}) assert job.starts_and_stops[0]["start"] assert job.starts_and_stops[0]["stop"] is None - assert not "started" in job - assert not "finished" in job + assert "started" not in job + assert "finished" not in job job = brozzler.Job( None, @@ -342,8 +340,8 @@ def test_start_stop_backwards_compat(): ) assert job.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1) assert job.starts_and_stops[0]["stop"] == datetime.datetime(2017, 1, 2) - assert not "started" in job - assert not "finished" in job + assert "started" not in job + assert "finished" not in job class Exception1(Exception): @@ -452,9 +450,9 @@ def test_thread_raise_second_with_block(): with brozzler.thread_accept_exceptions(): time.sleep(2) return # test fails - except Exception1 as e: + except Exception1: pass - except: + except: # noqa: E722 return # fail test try: diff --git a/vagrant/vagrant-brozzler-new-site.py b/vagrant/vagrant-brozzler-new-site.py index b76ff84..a2632b7 100755 --- a/vagrant/vagrant-brozzler-new-site.py +++ b/vagrant/vagrant-brozzler-new-site.py @@ -32,7 +32,7 @@ import sys try: from shlex import quote -except: +except: # noqa: E722 from pipes import quote
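
Note on the TODO comments in brozzler/__init__.py and brozzler/worker.py above:
a minimal sketch of the importlib.util.find_spec approach they suggest, which
probes for an optional dependency without importing it, so there is no unused
import to silence with "# noqa: F401". This is a hypothetical follow-up, not
part of this patch:

    # Sketch only: test for the optional doublethink dependency without
    # importing it. find_spec() returns None when the module is not installed.
    import importlib.util

    if importlib.util.find_spec("doublethink") is not None:
        # doublethink is present, so these doublethink-backed imports are safe
        from brozzler.frontier import RethinkDbFrontier
        from brozzler.worker import BrozzlerWorker

Unlike the try/except pattern, find_spec() does not catch ImportError, so a
genuine import failure inside brozzler.frontier would still surface instead of
being silently treated as "doublethink not installed".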