Mirror of https://github.com/internetarchive/brozzler.git (synced 2025-07-23 06:50:37 -04:00)
ruff linting fixes (#343)

* ruff linting fixes
* move imports back down to where they're re-exported

parent 6f011cc6c8
commit f64db214d4
18 changed files with 155 additions and 190 deletions
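Most of the changes below are a handful of recurring mechanical fixes: membership tests rewritten as "x not in y", unused "as e" bindings dropped from exception handlers that only log, type() comparisons replaced with isinstance(), and intentional bare excepts, re-export imports, and an ambiguous "l" name waived with noqa comments (E722, F401, E402, E741). A small runnable sketch of these patterns, using hypothetical names rather than lines taken from any one file:

# Runnable sketch of the recurring fixes in this commit (hypothetical names).
import logging

logger = logging.getLogger(__name__)
tables = {"sites": True}

# membership tests rewritten as "not in"
if "jobs" not in tables:          # was: if not "jobs" in tables:
    tables["jobs"] = True

# isinstance() instead of comparing types directly
value = True
if isinstance(value, bool):       # was: type(value) == bool
    logger.info("boolean result")

# unused "as e" bindings dropped when the handler only logs
try:
    raise RuntimeError("boom")
except Exception:                 # was: except Exception as e:
    logger.exception("problem, continuing anyway")

# deliberately-broad bare excepts kept, but waived explicitly
try:
    pass
except:  # noqa: E722
    pass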
@@ -17,10 +17,13 @@ See the License for the specific language governing permissions and
limitations under the License.
"""

+import datetime
import logging
+import threading
from importlib.metadata import version as _version

import structlog
+import urlcanon

__version__ = _version("brozzler")

@@ -91,7 +94,7 @@ def _logging_handler_handle(self, record):
finally:
try:
self.release()
-except:
+except: # noqa: E722
pass
return rv

@@ -108,7 +111,6 @@ def behaviors(behaviors_dir=None):
`js-templates/`. Defaults to brozzler dir.
"""
import os
-import string

import yaml

@@ -125,7 +127,6 @@ def behavior_script(url, template_parameters=None, behaviors_dir=None):
"""
Returns the javascript behavior string populated with template_parameters.
"""
-import json
import re

logger = structlog.get_logger(logger_name=__name__)
@@ -194,8 +195,6 @@ class ThreadExceptionGate:
return "<ThreadExceptionGate(%s)>" % self.thread


-import threading

_thread_exception_gates = {}
_thread_exception_gates_lock = threading.Lock()

@@ -225,7 +224,7 @@ def thread_exception_gate(thread=None):
thread = threading.current_thread()

with _thread_exception_gates_lock:
-if not thread in _thread_exception_gates:
+if thread not in _thread_exception_gates:
_thread_exception_gates[thread] = ThreadExceptionGate(thread)

return _thread_exception_gates[thread]
@@ -252,7 +251,6 @@ def thread_raise(thread, exctype):
"""
import ctypes
import inspect
-import threading

import structlog

@@ -322,9 +320,6 @@ def jinja2_environment(behaviors_dir=None):
return _jinja2_env


-import urlcanon


def _remove_query(url):
url.question_mark = b""
url.query = b""
@@ -403,13 +398,10 @@ def suggest_default_chrome_exe():
return "chromium-browser"


-import datetime

EPOCH_UTC = datetime.datetime.fromtimestamp(0.0, tz=datetime.timezone.utc)

-from brozzler.browser import Browser, BrowserPool, BrowsingException
-from brozzler.robots import is_permitted_by_robots
+from brozzler.browser import Browser, BrowserPool, BrowsingException # noqa: E402
+from brozzler.robots import is_permitted_by_robots # noqa: E402

__all__ = [
"is_permitted_by_robots",
@@ -422,22 +414,25 @@ __all__ = [
"suggest_default_chrome_exe",
]

+# TODO try using importlib.util.find_spec to test for dependency presence
+# rather than try/except on import.
+# See https://docs.astral.sh/ruff/rules/unused-import/#example
try:
-import doublethink
+import doublethink # noqa: F401

# All of these imports use doublethink for real and are unsafe
# to do if doublethink is unavailable.
-from brozzler.frontier import RethinkDbFrontier
+from brozzler.frontier import RethinkDbFrontier # noqa: F401
from brozzler.model import (
-InvalidJobConf,
-Job,
-Page,
-Site,
-new_job,
-new_job_file,
-new_site,
+InvalidJobConf, # noqa: F401
+Job, # noqa: F401
+Page, # noqa: F401
+Site, # noqa: F401
+new_job, # noqa: F401
+new_job_file, # noqa: F401
+new_site, # noqa: F401
)
-from brozzler.worker import BrozzlerWorker
+from brozzler.worker import BrozzlerWorker # noqa: F401

__all__.extend(
[
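The TODO added at the bottom of this file suggests probing for optional dependencies with importlib.util.find_spec instead of a try/except import. A rough, hypothetical sketch of that approach (not part of this commit):

# Hypothetical sketch of the TODO's suggestion: probe for the optional
# doublethink dependency with find_spec instead of a try/except import.
import importlib.util

if importlib.util.find_spec("doublethink") is not None:
    # these imports use doublethink for real, so only do them when it's there;
    # the noqa is still needed because the names are re-exported, not used here
    from brozzler.frontier import RethinkDbFrontier  # noqa: F401
    from brozzler.worker import BrozzlerWorker  # noqa: F401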
@@ -18,7 +18,6 @@ limitations under the License.

import base64
import datetime
-import itertools
import json
import logging
import socket
@@ -213,7 +212,7 @@ class WebsockReceiverThread(threading.Thread):
def _on_message(self, websock, message):
try:
self._handle_message(websock, message)
-except:
+except: # noqa: E722
self.logger.exception(
"uncaught exception in _handle_message",
message=message,
@@ -430,7 +429,7 @@ class Browser:
self.logger.info("shutting down websocket connection")
try:
self.websock.close()
-except BaseException as e:
+except BaseException:
self.logger.exception(
"exception closing websocket", websocket=self.websock
)
@@ -458,7 +457,7 @@ class Browser:
)

self.websock_url = None
-except:
+except: # noqa: E722
self.logger.exception("problem stopping")

def is_running(self):
@@ -628,7 +627,7 @@ class Browser:
jpeg_bytes = self.screenshot(full_page)
on_screenshot(jpeg_bytes)
return
-except BrowsingTimeout as e:
+except BrowsingTimeout:
self.logger.exception("attempt %s/3", i + 1)

def visit_hashtags(self, page_url, hashtags, outlinks):
@@ -807,12 +806,12 @@ class Browser:
if (
msg
and "result" in msg
-and not ("exceptionDetails" in msg["result"])
+and "exceptionDetails" not in msg["result"]
and not (
"wasThrown" in msg["result"] and msg["result"]["wasThrown"]
)
and "result" in msg["result"]
-and type(msg["result"]["result"]["value"]) == bool
+and isinstance(msg["result"]["result"]["value"], bool)
and msg["result"]["result"]["value"]
):
self.logger.info("behavior decided it has finished")
@@ -265,7 +265,7 @@ class Chrome:
return url
except brozzler.ShutdownRequested:
raise
-except Exception as e:
+except Exception:
if time.time() - self._last_warning > 30:
url_logger.warning(
"problem accessing url (will keep trying until timeout)",
@@ -325,7 +325,7 @@ class Chrome:
self.logger.debug(
"chrome pid %s STDERR %s", self.chrome_process.pid, buf
)
-except:
+except: # noqa: E722
self.logger.exception("unexpected exception")

def stop(self):
@@ -378,7 +378,7 @@ class Chrome:
self.chrome_process.stderr.close()
try:
self._home_tmpdir.cleanup()
-except:
+except: # noqa: E722
self.logger.exception(
"exception deleting self._home_tmpdir", tmpdir=self._home_tmpdir
)
@@ -23,12 +23,10 @@ import datetime
import json
import logging
import os
-import re
import signal
import string
import sys
import threading
-import time
import traceback
import warnings

@@ -397,9 +395,9 @@ def brozzle_page(argv=None):
enable_youtube_dl=not worker._skip_youtube_dl,
)
logger.info("outlinks", outlinks=sorted(outlinks))
-except brozzler.ReachedLimit as e:
+except brozzler.ReachedLimit:
logger.exception("reached limit")
-except brozzler.PageInterstitialShown as e:
+except brozzler.PageInterstitialShown:
logger.exception("page interstitial shown")
finally:
browser.stop()
@@ -661,7 +659,7 @@ def brozzler_worker(argv=None):
logger.info(
"dumping state (caught signal)\n%s", signal=signum, state=state_strs
)
-except BaseException as e:
+except BaseException:
logger.exception("exception dumping state")
finally:
signal.signal(signal.SIGQUIT, dump_state)
@@ -672,11 +670,11 @@ def brozzler_worker(argv=None):
try:
# make set from seed IDs in SKIP_AV_SEEDS_FILE
with open(SKIP_AV_SEEDS_FILE) as skips:
-skip_av_seeds = {int(l) for l in skips.readlines()}
+skip_av_seeds = {int(line) for line in skips.readlines()}
logger.info(
"running with skip_av_seeds file", skip_av_seeds=SKIP_AV_SEEDS_FILE
)
-except Exception as e:
+except Exception:
skip_av_seeds = set()
logger.info("running with empty skip_av_seeds")
return skip_av_seeds
@@ -686,13 +684,13 @@ def brozzler_worker(argv=None):
try:
# make list from file
with open(YTDLP_PROXY_ENDPOINTS_FILE) as endpoints:
-ytdlp_proxy_endpoints = [l for l in endpoints.readlines()]
+ytdlp_proxy_endpoints = [line for line in endpoints.readlines()]
if ytdlp_proxy_endpoints:
logger.info(
"running with ytdlp proxy endpoints file",
ytdlp_proxy_endpoints=YTDLP_PROXY_ENDPOINTS_FILE,
)
-except Exception as e:
+except Exception:
ytdlp_proxy_endpoints = []
logger.info("running with empty proxy endpoints file")
return ytdlp_proxy_endpoints
@@ -1032,7 +1030,7 @@ def brozzler_purge(argv=None):
configure_logging(args)

rr = rethinker(args)
-frontier = brozzler.RethinkDbFrontier(rr)
+brozzler.RethinkDbFrontier(rr)
if args.job:
try:
job_id = int(args.job)
@@ -17,9 +17,14 @@ See the License for the specific language governing permissions and
limitations under the License.
"""

+import base64
+import os
import sys

+import doublethink
+import rethinkdb as rdb
import structlog
+import yaml

logger = structlog.get_logger(logger_name=__name__)

@@ -33,14 +38,6 @@ except ImportError as e:
e,
)
sys.exit(1)
-import base64
-import importlib
-import json
-import os

-import doublethink
-import rethinkdb as rdb
-import yaml

r = rdb.RethinkDB()

@@ -285,6 +282,8 @@ def root(path):


try:
+import logging

import gunicorn.app.base
import gunicorn.glogging
from gunicorn.six import iteritems
@@ -18,10 +18,22 @@ See the License for the specific language governing permissions and
limitations under the License.
"""

+import argparse
+import os
+import signal
+import socket
+import socketserver
import sys
+import threading
+import time
+import traceback

+import doublethink
import structlog

+import brozzler
+import brozzler.cli

logger = structlog.get_logger(logger_name=__name__)

try:
@@ -42,19 +54,6 @@ except ImportError as e:
exc_info=True,
)
sys.exit(1)
-import argparse
-import os
-import signal
-import socket
-import socketserver
-import threading
-import time
-import traceback

-import doublethink

-import brozzler
-import brozzler.cli


def _build_arg_parser(argv=None):
@@ -16,10 +16,6 @@ See the License for the specific language governing permissions and
limitations under the License.
"""

-import datetime
-import random
-import time

import doublethink
import rethinkdb as rdb
import structlog
@@ -47,11 +43,11 @@ class RethinkDbFrontier:
db_logger = self.logger.bind(dbname=self.rr.dbname)

dbs = self.rr.db_list().run()
-if not self.rr.dbname in dbs:
+if self.rr.dbname not in dbs:
db_logger.info("creating rethinkdb database")
self.rr.db_create(self.rr.dbname).run()
tables = self.rr.table_list().run()
-if not "sites" in tables:
+if "sites" not in tables:
db_logger.info("creating rethinkdb table 'sites' in database")
self.rr.table_create(
"sites", shards=self.shards, replicas=self.replicas
@@ -60,7 +56,7 @@ class RethinkDbFrontier:
"sites_last_disclaimed", [r.row["status"], r.row["last_disclaimed"]]
).run()
self.rr.table("sites").index_create("job_id").run()
-if not "pages" in tables:
+if "pages" not in tables:
db_logger.info("creating rethinkdb table 'pages' in database")
self.rr.table_create(
"pages", shards=self.shards, replicas=self.replicas
@@ -80,7 +76,7 @@ class RethinkDbFrontier:
"least_hops",
[r.row["site_id"], r.row["brozzle_count"], r.row["hops_from_seed"]],
).run()
-if not "jobs" in tables:
+if "jobs" not in tables:
db_logger.info("creating rethinkdb table 'jobs' in database")
self.rr.table_create(
"jobs", shards=self.shards, replicas=self.replicas
@@ -352,7 +348,6 @@ class RethinkDbFrontier:
site.save()

def _build_fresh_page(self, site, parent_page, url, hops_off=0):
-url_for_scoping = urlcanon.semantic(url)
url_for_crawling = urlcanon.whatwg(url)
hashtag = (url_for_crawling.hash_sign + url_for_crawling.fragment).decode(
"utf-8"
@@ -461,8 +456,8 @@ class RethinkDbFrontier:
# "rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:"
# there can be many pages and each one can be very large (many videos,
# in and out of scope links, etc)
-l = list(pages.values())
-for batch in (l[i : i + 50] for i in range(0, len(l), 50)):
+pages_list = list(pages.values())
+for batch in (pages_list[i : i + 50] for i in range(0, len(pages_list), 50)):
try:
self.logger.debug("inserting/replacing batch of %s pages", len(batch))
reql = self.rr.table("pages").insert(batch, conflict="replace")
@@ -471,8 +466,8 @@ class RethinkDbFrontier:
'conflict="replace")',
batch,
)
-result = reql.run()
-except Exception as e:
+reql.run()
+except Exception:
self.logger.exception(
"problem inserting/replacing batch of %s pages",
len(batch),
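The comments in the last two hunks above explain the batching: RethinkDB refuses queries larger than 134217727 bytes (about 128 MiB), so pages are inserted in slices of 50. A self-contained sketch of the same chunking idiom, with stand-in data rather than real page documents:

# Stand-alone sketch of the batching idiom used above (generic names, not
# brozzler code): slice a big list into batches small enough to insert.
def chunks(items, size):
    """Yield successive slices of items, each no longer than size."""
    return (items[i : i + size] for i in range(0, len(items), size))

pages_list = [{"id": i} for i in range(173)]  # stand-in for page documents
for batch in chunks(pages_list, 50):
    # real code would pass each batch to an insert with conflict="replace"
    assert len(batch) <= 50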
@@ -19,12 +19,9 @@ limitations under the License.

import base64
import copy
-import datetime
import hashlib
import json
import os
-import re
-import time
import urllib
import uuid
import zlib
@@ -61,7 +58,7 @@ class InvalidJobConf(Exception):
# debugged, I found it here. Maybe there's a better way to see it.
value = validator._errors[0].info[0][0].info[0][0].value
self.errors["bad value"] = value
-except:
+except: # noqa: E722
value = None


@@ -122,10 +119,10 @@ def new_job(frontier, job_conf):
# rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:
for batch in (pages[i : i + 500] for i in range(0, len(pages), 500)):
logger.info("inserting batch of %s pages", len(batch))
-result = frontier.rr.table("pages").insert(batch).run()
+frontier.rr.table("pages").insert(batch).run()
for batch in (sites[i : i + 100] for i in range(0, len(sites), 100)):
logger.info("inserting batch of %s sites", len(batch))
-result = frontier.rr.table("sites").insert(batch).run()
+frontier.rr.table("sites").insert(batch).run()
logger.info("job fully started", job_id=job.id)

return job
@@ -200,9 +197,9 @@ class Job(doublethink.Document, ElapsedMixIn):
table = "jobs"

def populate_defaults(self):
-if not "status" in self:
+if "status" not in self:
self.status = "ACTIVE"
-if not "starts_and_stops" in self:
+if "starts_and_stops" not in self:
if self.get("started"): # backward compatibility
self.starts_and_stops = [
{"start": self.get("started"), "stop": self.get("finished")}
@@ -229,28 +226,28 @@ class Site(doublethink.Document, ElapsedMixIn):
table = "sites"

def populate_defaults(self):
-if not "status" in self:
+if "status" not in self:
self.status = "ACTIVE"
-if not "claimed" in self:
+if "claimed" not in self:
self.claimed = False
-if not "last_disclaimed" in self:
+if "last_disclaimed" not in self:
self.last_disclaimed = brozzler.EPOCH_UTC
-if not "last_claimed" in self:
+if "last_claimed" not in self:
self.last_claimed = brozzler.EPOCH_UTC
-if not "scope" in self:
+if "scope" not in self:
self.scope = {}
-if not "skip_ytdlp" in self:
+if "skip_ytdlp" not in self:
self.skip_ytdlp = None

# backward compatibility
if "surt" in self.scope:
-if not "accepts" in self.scope:
+if "accepts" not in self.scope:
self.scope["accepts"] = []
self.scope["accepts"].append({"surt": self.scope["surt"]})
del self.scope["surt"]

# backward compatibility
-if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope:
+if "max_hops_off_surt" in self.scope and "max_hops_off" not in self.scope:
self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
if "max_hops_off_surt" in self.scope:
del self.scope["max_hops_off_surt"]
@@ -260,7 +257,7 @@ class Site(doublethink.Document, ElapsedMixIn):
brozzler.site_surt_canon(self.seed).ssurt().decode("ascii")
)

-if not "starts_and_stops" in self:
+if "starts_and_stops" not in self:
if self.get("start_time"): # backward compatibility
self.starts_and_stops = [
{"start": self.get("start_time"), "stop": None}
@@ -275,7 +272,7 @@ class Site(doublethink.Document, ElapsedMixIn):
return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)

def _accept_ssurt_if_not_redundant(self, ssurt):
-if not "accepts" in self.scope:
+if "accepts" not in self.scope:
self.scope["accepts"] = []
simple_rule_ssurts = (
rule["ssurt"]
@@ -334,7 +331,7 @@ class Site(doublethink.Document, ElapsedMixIn):
if not isinstance(url, urlcanon.ParsedUrl):
url = urlcanon.semantic(url)

-if not url.scheme in (b"http", b"https"):
+if url.scheme not in (b"http", b"https"):
# XXX doesn't belong here maybe (where? worker ignores unknown
# schemes?)
return False
@@ -390,31 +387,31 @@ class Page(doublethink.Document):
return hashlib.sha1(digest_this.encode("utf-8")).hexdigest()

def populate_defaults(self):
-if not "retry_after" in self:
+if "retry_after" not in self:
self.retry_after = None
-if not "failed_attempts" in self:
+if "failed_attempts" not in self:
self.failed_attempts = 0
-if not "hops_from_seed" in self:
+if "hops_from_seed" not in self:
self.hops_from_seed = 0
-if not "hop_path" in self:
+if "hop_path" not in self:
self.hop_path = None
-if not "via_page_url" in self:
+if "via_page_url" not in self:
self.via_page_url = None
-if not "brozzle_count" in self:
+if "brozzle_count" not in self:
self.brozzle_count = 0
-if not "claimed" in self:
+if "claimed" not in self:
self.claimed = False
-if "hops_off_surt" in self and not "hops_off" in self:
+if "hops_off_surt" in self and "hops_off" not in self:
self.hops_off = self.hops_off_surt
if "hops_off_surt" in self:
del self["hops_off_surt"]
-if not "hops_off" in self:
+if "hops_off" not in self:
self.hops_off = 0
-if not "needs_robots_check" in self:
+if "needs_robots_check" not in self:
self.needs_robots_check = False
-if not "priority" in self:
+if "priority" not in self:
self.priority = self._calc_priority()
-if not "id" in self:
+if "id" not in self:
self.id = self.compute_id(self.site_id, self.url)

def __str__(self):
@@ -18,9 +18,16 @@ See the License for the specific language governing permissions and
limitations under the License.
"""

+import argparse
+import json
import sys

+import doublethink
+import rethinkdb as rdb
import structlog
+import urlcanon

+import brozzler

logger = structlog.get_logger(logger_name=__name__)

@@ -40,14 +47,7 @@ except ImportError as e:
e,
)
sys.exit(1)
-import argparse
-import json

-import doublethink
-import rethinkdb as rdb
-import urlcanon

-import brozzler

r = rdb.RethinkDB()

@@ -137,7 +137,7 @@ class TheGoodUrlCanonicalizer(object):
key = urlcanon.semantic(url).surt().decode("ascii")
# logging.debug('%s -> %s', url, key)
return key
-except Exception as e:
+except Exception:
return url

def replace_default_canonicalizer():
@@ -221,18 +221,9 @@ def support_in_progress_warcs():

class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
def __init__(self, orig_url):
-import re

import six
from pywb.rewrite.wburl import WbUrl
-from pywb.utils.loaders import to_native_str
-from six.moves.urllib.parse import (
-quote,
-quote_plus,
-unquote_plus,
-urlsplit,
-urlunsplit,
-)
+from six.moves.urllib.parse import quote

pywb.rewrite.wburl.BaseWbUrl.__init__(self)
@@ -320,7 +311,6 @@ def _fuzzy_query_call(self, query):
urlkey = to_native_str(query.key, "utf-8")
url = query.url
filter_ = query.filters
-output = query.output

for rule in self.rules.iter_matching(urlkey):
m = rule.regex.search(urlkey)
@@ -71,7 +71,7 @@ _robots_caches = {} # {site_id:reppy.cache.RobotsCache}


def _robots_cache(site, proxy=None):
-if not site.id in _robots_caches:
+if site.id not in _robots_caches:
req_sesh = _SessionRaiseOn420()
req_sesh.verify = False # ignore cert errors
if proxy:
@@ -21,9 +21,7 @@ limitations under the License.
import datetime
import io
import json
-import random
import socket
-import tempfile
import threading
import time
import urllib.request
@@ -99,9 +97,13 @@ class BrozzlerWorker:
self._skip_visit_hashtags = skip_visit_hashtags
self._skip_youtube_dl = skip_youtube_dl

+# TODO try using importlib.util.find_spec to test for dependency
+# presence rather than try/except on import.
+# See https://docs.astral.sh/ruff/rules/unused-import/#example

# We definitely shouldn't ytdlp if the optional extra is missing
try:
-import yt_dlp
+import yt_dlp # noqa: F401
except ImportError:
self.logger.info(
"optional yt-dlp extra not installed; setting skip_youtube_dl to True"
@@ -200,7 +202,7 @@ class BrozzlerWorker:
response = requests.get("http://%s/status" % self._proxy)
status = json.loads(response.text)
self._proxy_is_warcprox = status["role"] == "warcprox"
-except Exception as e:
+except Exception:
self._proxy_is_warcprox = False
self.logger.info(
"%s %s warcprox",
@@ -348,13 +350,13 @@ class BrozzlerWorker:
)
metrics.brozzler_ydl_urls_checked.inc(1)
outlinks.update(ydl_outlinks)
-except brozzler.ReachedLimit as e:
+except brozzler.ReachedLimit:
raise
except brozzler.ShutdownRequested:
raise
except brozzler.ProxyError:
raise
-except brozzler.VideoExtractorError as e:
+except brozzler.VideoExtractorError:
self.logger.exception("error extracting video info")
except Exception as e:
if (
@@ -391,9 +393,9 @@ class BrozzlerWorker:
timeout=self.HEADER_REQUEST_TIMEOUT,
) as r:
return r.headers
-except requests.exceptions.Timeout as e:
+except requests.exceptions.Timeout:
url_logger.warning("Timed out trying to get headers", exc_info=True)
-except requests.exceptions.RequestException as e:
+except requests.exceptions.RequestException:
url_logger.warning("Failed to get headers", exc_info=True)
return {}

@@ -469,7 +471,7 @@ class BrozzlerWorker:
if "content-range" in response_headers:
video["content-range"] = response_headers["content-range"]
self.logger.debug("embedded video", video=video)
-if not "videos" in page:
+if "videos" not in page:
page.videos = []
page.videos.append(video)

@@ -598,13 +600,13 @@ class BrozzlerWorker:
site_logger.info("no pages left for site")
except brozzler.ReachedLimit as e:
self._frontier.reached_limit(site, e)
-except brozzler.ReachedTimeLimit as e:
+except brozzler.ReachedTimeLimit:
self._frontier.finished(site, "FINISHED_TIME_LIMIT")
except brozzler.CrawlStopped:
self._frontier.finished(site, "FINISHED_STOP_REQUESTED")
# except brozzler.browser.BrowsingAborted:
# self.logger.info("{} shut down".format(browser))
-except brozzler.ProxyError as e:
+except brozzler.ProxyError:
if self._warcprox_auto:
self.logger.exception(
"proxy error, will try to choose a "
@@ -676,7 +678,7 @@ class BrozzlerWorker:
try:
self.status_info = self._service_registry.heartbeat(status_info)
self.logger.debug("status in service registry", status=self.status_info)
-except r.ReqlError as e:
+except r.ReqlError:
self.logger.exception(
"failed to send heartbeat and update service registry",
info=status_info,
@@ -748,11 +750,11 @@ class BrozzlerWorker:
time.sleep(0.5)

self.logger.warn("shutdown requested")
-except r.ReqlError as e:
+except r.ReqlError:
self.logger.exception("caught rethinkdb exception, will try to proceed")
except brozzler.ShutdownRequested:
self.logger.info("shutdown requested")
-except:
+except: # noqa: E722
self.logger.critical(
"thread exiting due to unexpected exception", exc_info=True
)
@@ -760,7 +762,7 @@ class BrozzlerWorker:
if self._service_registry and hasattr(self, "status_info"):
try:
self._service_registry.unregister(self.status_info["id"])
-except:
+except: # noqa: E722
self.logger.exception("failed to unregister from service registry")

self.logger.info(
@@ -101,7 +101,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
if result_type in ("url", "url_transparent"):
if "extraction_depth" in extra_info:
self.logger.info(
-f"Following redirect",
+"Following redirect",
redirect_url=ie_result["url"],
extraction_depth=extra_info["extraction_depth"],
)
@@ -136,7 +136,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
# use it later to extract the watch pages as outlinks.
try:
ie_result["entries_no_dl"] = list(ie_result["entries"])
-except Exception as e:
+except Exception:
extract_context.warning(
"failed to unroll entries ie_result['entries']?",
exc_info=True,
@@ -166,7 +166,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
import magic

mimetype = magic.from_file(info_dict["filepath"], mime=True)
-except ImportError as e:
+except ImportError:
mimetype = "video/%s" % info_dict["ext"]
self.logger.warning(
"guessing mimetype due to error",
@@ -236,7 +236,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
)
site.last_claimed = doublethink.utcnow()
site.save()
-except:
+except: # noqa: E722
worker.logger.debug(
"problem heartbeating site.last_claimed site",
id=site.id,
@@ -316,7 +316,7 @@ def _remember_videos(page, pushed_videos=None):
"""
Saves info about videos captured by yt-dlp in `page.videos`.
"""
-if not "videos" in page:
+if "videos" not in page:
page.videos = []
for pushed_video in pushed_videos or []:
video = {
@@ -351,7 +351,7 @@ def _try_youtube_dl(worker, ydl, site, page):
)
metrics.brozzler_ydl_extract_successes.labels(ydl.is_youtube_host).inc(1)
break
-except brozzler.ShutdownRequested as e:
+except brozzler.ShutdownRequested:
raise
except Exception as e:
if (
@@ -110,7 +110,7 @@ def test_httpd(httpd):
of the same url return the same payload, proving it can be used to test
deduplication.
"""
-payload1 = content2 = None
+payload1 = None
url = "http://localhost:%s/site1/file1.txt" % httpd.server_port
with urllib.request.urlopen(url) as response:
assert response.status == 200
@@ -175,7 +175,6 @@ def test_420(httpd):

def test_js_dialogs(httpd):
chrome_exe = brozzler.suggest_default_chrome_exe()
-url = "http://localhost:%s/site4/alert.html" % httpd.server_port
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
# before commit d2ed6b97a24 these would hang and eventually raise
# brozzler.browser.BrowsingTimeout, which would cause this test to fail
@@ -40,11 +40,11 @@ def cli_commands():
commands = set(console_scripts().keys())
commands.remove("brozzler-wayback")
try:
-import gunicorn
+import gunicorn # noqa: F401
except ImportError:
commands.remove("brozzler-dashboard")
try:
-import pywb
+import pywb # noqa: F401
except ImportError:
commands.remove("brozzler-easy")
return commands
@@ -23,7 +23,6 @@ import http.server
import os
import socket
import subprocess
-import sys
import threading
import time
import urllib.request
@@ -47,7 +46,7 @@ def _local_address():
try:
s.connect(("10.255.255.255", 1)) # ip doesn't need to be reachable
return s.getsockname()[0]
-except:
+except: # noqa: E722
return "127.0.0.1"
finally:
s.close()
@@ -148,7 +147,7 @@ def test_httpd(httpd):
of the same url return the same payload, proving it can be used to test
deduplication.
"""
-payload1 = content2 = None
+payload1 = None
url = make_url(httpd, "/site1/file1.txt")
with urllib.request.urlopen(url) as response:
assert response.status == 200
@@ -351,8 +350,8 @@ def test_warcprox_auto(httpd):


def test_proxy_conflict():
-with pytest.raises(AssertionError) as excinfo:
-worker = brozzler.worker.BrozzlerWorker(
+with pytest.raises(AssertionError):
+brozzler.worker.BrozzlerWorker(
None, None, warcprox_auto=True, proxy="localhost:12345"
)

@@ -523,7 +522,6 @@ def test_login(httpd):

# take a look at the captures table
time.sleep(2) # in case warcprox hasn't finished processing urls
-robots_url = make_url(httpd, "/robots.txt")
captures = list(
rr.table("captures").filter({"test_id": test_id}).order_by("timestamp").run()
)
@@ -730,7 +728,6 @@ def test_redirect_hashtags(httpd):


def test_stop_crawl(httpd):
-test_id = "test_stop_crawl_job-%s" % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker("localhost", db="brozzler")
frontier = brozzler.RethinkDbFrontier(rr)

@@ -804,7 +801,6 @@ def test_warcprox_outage_resiliency(httpd):
"""
rr = doublethink.Rethinker("localhost", db="brozzler")
frontier = brozzler.RethinkDbFrontier(rr)
-svcreg = doublethink.ServiceRegistry(rr)

# run two instances of warcprox
opts = warcprox.Options()
@@ -836,7 +832,7 @@ def test_warcprox_outage_resiliency(httpd):
# the system, if any
try:
stop_service("warcprox")
-except Exception as e:
+except Exception:
logger.warning("problem stopping warcprox service: %s", exc_info=True)

# queue the site for brozzling
@@ -917,7 +913,6 @@ def test_warcprox_outage_resiliency(httpd):


def test_time_limit(httpd):
-test_id = "test_time_limit-%s" % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker("localhost", db="brozzler")
frontier = brozzler.RethinkDbFrontier(rr)

@@ -928,7 +923,6 @@ def test_time_limit(httpd):

sites = list(frontier.job_sites(job.id))
assert len(sites) == 1
-site = sites[0]

# time limit should be enforced pretty soon
start = time.time()
@@ -986,7 +980,7 @@ def test_ydl_stitching(httpd):
time.sleep(2) # in case warcprox hasn't finished processing urls
# take a look at the captures table
captures = list(rr.table("captures").filter({"test_id": test_id}).run())
-l = [c for c in captures if c["url"] == stitched_url]
+l = [c for c in captures if c["url"] == stitched_url] # noqa: E741
assert len(l) == 1
c = l[0]
assert c["filename"].startswith("test_ydl_stitching")
@@ -559,7 +559,7 @@ def test_parent_url_scoping():
assert parent_page.outlinks["accepted"] == outlinks

# parent page redirect_url matches accept parent_url_regex
-parent_page_c = brozzler.Page(
+brozzler.Page(
rr,
{
"site_id": site.id,
@@ -606,7 +606,7 @@ def test_parent_url_scoping():
assert parent_page.outlinks["accepted"] == []

# parent page redirect_url matches block parent_url_regex
-parent_page_c = brozzler.Page(
+brozzler.Page(
rr,
{
"site_id": site.id,
@@ -659,10 +659,10 @@ def test_completed_page():
]
}
assert page.brozzle_count == 1
-assert page.claimed == False
+assert page.claimed is False
page.refresh()
assert page.brozzle_count == 1
-assert page.claimed == False
+assert page.claimed is False

# redirect that doesn't change scope surt because destination is covered by
# the original surt
@@ -686,10 +686,10 @@ def test_completed_page():
site.refresh()
assert site.scope == {"accepts": [{"ssurt": "com,example,//http:/a/"}]}
assert page.brozzle_count == 1
-assert page.claimed == False
+assert page.claimed is False
page.refresh()
assert page.brozzle_count == 1
-assert page.claimed == False
+assert page.claimed is False

# redirect that doesn't change scope surt because page is not the seed page
site = brozzler.Site(rr, {"seed": "http://example.com/a/"})
@@ -712,10 +712,10 @@ def test_completed_page():
site.refresh()
assert site.scope == {"accepts": [{"ssurt": "com,example,//http:/a/"}]}
assert page.brozzle_count == 1
-assert page.claimed == False
+assert page.claimed is False
page.refresh()
assert page.brozzle_count == 1
-assert page.claimed == False
+assert page.claimed is False


def test_seed_page():
@@ -931,7 +931,7 @@ def test_max_claimed_sites():
claimed_sites = frontier.claim_sites(3)
assert len(claimed_sites) == 2
with pytest.raises(brozzler.NothingToClaim):
-claimed_site = frontier.claim_sites(3)
+frontier.claim_sites(3)

# clean slate for the next one
rr.table("jobs").delete().run()
@@ -1074,7 +1074,7 @@ def test_max_hops_off():
site.refresh() # get it back from the db

# renamed this param
-assert not "max_hops_off_surt" in site.scope
+assert "max_hops_off_surt" not in site.scope
assert site.scope["max_hops_off"] == 1

seed_page = frontier.seed_page(site.id)
@@ -1109,7 +1109,7 @@ def test_max_hops_off():
assert len(pages) == 4
assert pages[0].url == "http://example.com/"
assert pages[0].hops_off == 0
-assert not "hops_off_surt" in pages[0]
+assert "hops_off_surt" not in pages[0]
assert set(pages[0].outlinks["accepted"]) == {
"https://example.com/toot",
"http://foo.org/",
@@ -21,7 +21,6 @@ import datetime
import http.server
import os
import socket
-import sys
import tempfile
import threading
import time
@@ -29,7 +28,6 @@ import uuid
from unittest import mock

import pytest
-import requests
import yaml

import brozzler
@@ -291,7 +289,7 @@ def test_proxy_down():
)

# youtube-dl fetch
-with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
+with tempfile.TemporaryDirectory(prefix="brzl-ydl-"):
with pytest.raises(brozzler.ProxyError):
brozzler.ydl.do_youtube_dl(worker, site, page)

@@ -315,7 +313,7 @@ def test_start_stop_backwards_compat():
assert len(site.starts_and_stops) == 1
assert site.starts_and_stops[0]["start"]
assert site.starts_and_stops[0]["stop"] is None
-assert not "start_time" in site
+assert "start_time" not in site

site = brozzler.Site(
None,
@@ -324,13 +322,13 @@ def test_start_stop_backwards_compat():
assert len(site.starts_and_stops) == 1
assert site.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
assert site.starts_and_stops[0]["stop"] is None
-assert not "start_time" in site
+assert "start_time" not in site

job = brozzler.Job(None, {"seeds": [{"url": "https://example.com/"}]})
assert job.starts_and_stops[0]["start"]
assert job.starts_and_stops[0]["stop"] is None
-assert not "started" in job
-assert not "finished" in job
+assert "started" not in job
+assert "finished" not in job

job = brozzler.Job(
None,
@@ -342,8 +340,8 @@ def test_start_stop_backwards_compat():
)
assert job.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
assert job.starts_and_stops[0]["stop"] == datetime.datetime(2017, 1, 2)
-assert not "started" in job
-assert not "finished" in job
+assert "started" not in job
+assert "finished" not in job


class Exception1(Exception):
@@ -452,9 +450,9 @@ def test_thread_raise_second_with_block():
with brozzler.thread_accept_exceptions():
time.sleep(2)
return # test fails
-except Exception1 as e:
+except Exception1:
pass
-except:
+except: # noqa: E722
return # fail test

try:
@@ -32,7 +32,7 @@ import sys

try:
from shlex import quote
-except:
+except: # noqa: E722
from pipes import quote
