mirror of https://github.com/internetarchive/brozzler.git, synced 2025-04-18 14:56:02 -04:00

ruff linting fixes (#343)

* ruff linting fixes
* move imports back down to where they're re-exported

parent 6f011cc6c8
commit f64db214d4
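Most of the diff below is mechanical: the same few ruff rules are fixed over and over across the codebase. A condensed, hypothetical sketch of the recurring before/after patterns (the names and data here are illustrative, not brozzler code):

```python
record = {}

# E722: bare excepts are kept, but marked deliberate with a noqa comment.
try:
    record["status"]
except:  # noqa: E722
    pass

# E713: negated membership tests read better as `not in`.
if "status" not in record:  # was: if not "status" in record:
    record["status"] = "ACTIVE"

# E721 / E712: prefer isinstance() and `is` over type() / == comparisons.
value = True
assert isinstance(value, bool)  # was: type(value) == bool
claimed = False
assert claimed is False  # was: claimed == False

# E741: avoid the ambiguous single-letter name `l`.
words = [line.strip() for line in ["a\n", "b\n"]]  # was: [l for l in ...]

# F841: drop assignments whose result is never used.
sorted(words)  # was: result = sorted(words)
```

Imports that exist only to be re-exported or to probe for optional dependencies are kept and marked `# noqa: F401` rather than deleted, and module-level imports that must stay below other code get `# noqa: E402`.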
@@ -17,10 +17,13 @@ See the License for the specific language governing permissions and
limitations under the License.
"""

+import datetime
import logging
+import threading
from importlib.metadata import version as _version

import structlog
+import urlcanon

__version__ = _version("brozzler")
@@ -91,7 +94,7 @@ def _logging_handler_handle(self, record):
        finally:
            try:
                self.release()
-            except:
+            except:  # noqa: E722
                pass
    return rv
@@ -108,7 +111,6 @@ def behaviors(behaviors_dir=None):
        `js-templates/`. Defaults to brozzler dir.
    """
    import os
    import string

    import yaml
@@ -125,7 +127,6 @@ def behavior_script(url, template_parameters=None, behaviors_dir=None):
    """
    Returns the javascript behavior string populated with template_parameters.
    """
    import json
    import re

    logger = structlog.get_logger(logger_name=__name__)
@@ -194,8 +195,6 @@ class ThreadExceptionGate:
        return "<ThreadExceptionGate(%s)>" % self.thread


-import threading
-
_thread_exception_gates = {}
_thread_exception_gates_lock = threading.Lock()
@@ -225,7 +224,7 @@ def thread_exception_gate(thread=None):
        thread = threading.current_thread()

    with _thread_exception_gates_lock:
-        if not thread in _thread_exception_gates:
+        if thread not in _thread_exception_gates:
            _thread_exception_gates[thread] = ThreadExceptionGate(thread)

    return _thread_exception_gates[thread]
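The hunk above touches brozzler's per-thread exception-gate registry: a module-level dict guarded by a lock, where the E713 fix applies to the membership test. A minimal standalone sketch of the same pattern (`gate_for` and the `object()` placeholder are illustrative, not the real ThreadExceptionGate):

```python
import threading

_gates = {}
_gates_lock = threading.Lock()


def gate_for(thread=None):
    # default to the calling thread, as thread_exception_gate() does
    if thread is None:
        thread = threading.current_thread()
    with _gates_lock:
        if thread not in _gates:  # the E713-corrected membership test
            _gates[thread] = object()  # stands in for ThreadExceptionGate(thread)
        return _gates[thread]


assert gate_for() is gate_for()  # one gate per thread
```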
@@ -252,7 +251,6 @@ def thread_raise(thread, exctype):
    """
    import ctypes
    import inspect
-    import threading

    import structlog
@@ -322,9 +320,6 @@ def jinja2_environment(behaviors_dir=None):
    return _jinja2_env


-import urlcanon
-
-
def _remove_query(url):
    url.question_mark = b""
    url.query = b""
@@ -403,13 +398,10 @@ def suggest_default_chrome_exe():
    return "chromium-browser"


-import datetime
-
EPOCH_UTC = datetime.datetime.fromtimestamp(0.0, tz=datetime.timezone.utc)

-
-from brozzler.browser import Browser, BrowserPool, BrowsingException
-from brozzler.robots import is_permitted_by_robots
+from brozzler.browser import Browser, BrowserPool, BrowsingException  # noqa: E402
+from brozzler.robots import is_permitted_by_robots  # noqa: E402

__all__ = [
    "is_permitted_by_robots",
@@ -422,22 +414,25 @@ __all__ = [
    "suggest_default_chrome_exe",
]

+# TODO try using importlib.util.find_spec to test for dependency presence
+# rather than try/except on import.
+# See https://docs.astral.sh/ruff/rules/unused-import/#example
try:
-    import doublethink
+    import doublethink  # noqa: F401

    # All of these imports use doublethink for real and are unsafe
    # to do if doublethink is unavailable.
-    from brozzler.frontier import RethinkDbFrontier
+    from brozzler.frontier import RethinkDbFrontier  # noqa: F401
    from brozzler.model import (
-        InvalidJobConf,
-        Job,
-        Page,
-        Site,
-        new_job,
-        new_job_file,
-        new_site,
+        InvalidJobConf,  # noqa: F401
+        Job,  # noqa: F401
+        Page,  # noqa: F401
+        Site,  # noqa: F401
+        new_job,  # noqa: F401
+        new_job_file,  # noqa: F401
+        new_site,  # noqa: F401
    )
-    from brozzler.worker import BrozzlerWorker
+    from brozzler.worker import BrozzlerWorker  # noqa: F401

    __all__.extend(
        [
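The TODO added above points at importlib.util.find_spec as a way to test for a dependency without importing it, which would sidestep the unused-import warnings entirely. A minimal sketch of that approach, assuming the same doublethink dependency (the HAVE_DOUBLETHINK flag is made up for illustration):

```python
import importlib.util

# find_spec() returns None when the package is not installed,
# and it does not actually import the package.
HAVE_DOUBLETHINK = importlib.util.find_spec("doublethink") is not None

if HAVE_DOUBLETHINK:
    import doublethink  # noqa: F401
```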
@@ -18,7 +18,6 @@ limitations under the License.

import base64
import datetime
import itertools
import json
import logging
import socket
@@ -213,7 +212,7 @@ class WebsockReceiverThread(threading.Thread):
    def _on_message(self, websock, message):
        try:
            self._handle_message(websock, message)
-        except:
+        except:  # noqa: E722
            self.logger.exception(
                "uncaught exception in _handle_message",
                message=message,
@@ -430,7 +429,7 @@ class Browser:
            self.logger.info("shutting down websocket connection")
            try:
                self.websock.close()
-            except BaseException as e:
+            except BaseException:
                self.logger.exception(
                    "exception closing websocket", websocket=self.websock
                )
@@ -458,7 +457,7 @@ class Browser:
                )

                self.websock_url = None
-        except:
+        except:  # noqa: E722
            self.logger.exception("problem stopping")

    def is_running(self):
@@ -628,7 +627,7 @@ class Browser:
                jpeg_bytes = self.screenshot(full_page)
                on_screenshot(jpeg_bytes)
                return
-            except BrowsingTimeout as e:
+            except BrowsingTimeout:
                self.logger.exception("attempt %s/3", i + 1)

    def visit_hashtags(self, page_url, hashtags, outlinks):
@@ -807,12 +806,12 @@ class Browser:
                if (
                    msg
                    and "result" in msg
-                    and not ("exceptionDetails" in msg["result"])
+                    and "exceptionDetails" not in msg["result"]
                    and not (
                        "wasThrown" in msg["result"] and msg["result"]["wasThrown"]
                    )
                    and "result" in msg["result"]
-                    and type(msg["result"]["result"]["value"]) == bool
+                    and isinstance(msg["result"]["result"]["value"], bool)
                    and msg["result"]["result"]["value"]
                ):
                    self.logger.info("behavior decided it has finished")
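The long condition above walks a Chrome DevTools Protocol evaluate-result several dicts deep; the two fixes in it are E713 (`not in`) and E721 (isinstance). A hypothetical compaction of the same checks with dict.get(), shown only to make the logic easier to follow (`behavior_finished` is not brozzler's actual code):

```python
def behavior_finished(msg):
    # msg mirrors the shape of a CDP Runtime.evaluate response
    result = (msg or {}).get("result", {})
    if "exceptionDetails" in result or result.get("wasThrown"):
        return False
    value = result.get("result", {}).get("value")
    return isinstance(value, bool) and value


assert behavior_finished({"result": {"result": {"value": True}}}) is True
assert behavior_finished({"result": {"wasThrown": True}}) is False
```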
@@ -265,7 +265,7 @@ class Chrome:
                    return url
            except brozzler.ShutdownRequested:
                raise
-            except Exception as e:
+            except Exception:
                if time.time() - self._last_warning > 30:
                    url_logger.warning(
                        "problem accessing url (will keep trying until timeout)",
@@ -325,7 +325,7 @@ class Chrome:
                    self.logger.debug(
                        "chrome pid %s STDERR %s", self.chrome_process.pid, buf
                    )
-        except:
+        except:  # noqa: E722
            self.logger.exception("unexpected exception")

    def stop(self):
@@ -378,7 +378,7 @@ class Chrome:
            self.chrome_process.stderr.close()
        try:
            self._home_tmpdir.cleanup()
-        except:
+        except:  # noqa: E722
            self.logger.exception(
                "exception deleting self._home_tmpdir", tmpdir=self._home_tmpdir
            )
@@ -23,12 +23,10 @@ import datetime
import json
import logging
import os
import re
import signal
import string
import sys
import threading
import time
import traceback
import warnings
@@ -397,9 +395,9 @@ def brozzle_page(argv=None):
            enable_youtube_dl=not worker._skip_youtube_dl,
        )
        logger.info("outlinks", outlinks=sorted(outlinks))
-    except brozzler.ReachedLimit as e:
+    except brozzler.ReachedLimit:
        logger.exception("reached limit")
-    except brozzler.PageInterstitialShown as e:
+    except brozzler.PageInterstitialShown:
        logger.exception("page interstitial shown")
    finally:
        browser.stop()
@@ -661,7 +659,7 @@ def brozzler_worker(argv=None):
            logger.info(
                "dumping state (caught signal)\n%s", signal=signum, state=state_strs
            )
-        except BaseException as e:
+        except BaseException:
            logger.exception("exception dumping state")
        finally:
            signal.signal(signal.SIGQUIT, dump_state)
@@ -672,11 +670,11 @@ def brozzler_worker(argv=None):
        try:
            # make set from seed IDs in SKIP_AV_SEEDS_FILE
            with open(SKIP_AV_SEEDS_FILE) as skips:
-                skip_av_seeds = {int(l) for l in skips.readlines()}
+                skip_av_seeds = {int(line) for line in skips.readlines()}
            logger.info(
                "running with skip_av_seeds file", skip_av_seeds=SKIP_AV_SEEDS_FILE
            )
-        except Exception as e:
+        except Exception:
            skip_av_seeds = set()
            logger.info("running with empty skip_av_seeds")
        return skip_av_seeds
@@ -686,13 +684,13 @@ def brozzler_worker(argv=None):
        try:
            # make list from file
            with open(YTDLP_PROXY_ENDPOINTS_FILE) as endpoints:
-                ytdlp_proxy_endpoints = [l for l in endpoints.readlines()]
+                ytdlp_proxy_endpoints = [line for line in endpoints.readlines()]
            if ytdlp_proxy_endpoints:
                logger.info(
                    "running with ytdlp proxy endpoints file",
                    ytdlp_proxy_endpoints=YTDLP_PROXY_ENDPOINTS_FILE,
                )
-        except Exception as e:
+        except Exception:
            ytdlp_proxy_endpoints = []
            logger.info("running with empty proxy endpoints file")
        return ytdlp_proxy_endpoints
@@ -1032,7 +1030,7 @@ def brozzler_purge(argv=None):
    configure_logging(args)

    rr = rethinker(args)
-    frontier = brozzler.RethinkDbFrontier(rr)
+    brozzler.RethinkDbFrontier(rr)
    if args.job:
        try:
            job_id = int(args.job)
@@ -17,9 +17,14 @@ See the License for the specific language governing permissions and
limitations under the License.
"""

+import base64
+import os
import sys

+import doublethink
+import rethinkdb as rdb
import structlog
+import yaml

logger = structlog.get_logger(logger_name=__name__)
@@ -33,14 +38,6 @@ except ImportError as e:
        e,
    )
    sys.exit(1)
-import base64
-import importlib
-import json
-import os
-
-import doublethink
-import rethinkdb as rdb
-import yaml

r = rdb.RethinkDB()
@@ -285,6 +282,8 @@ def root(path):


try:
+    import logging
+
    import gunicorn.app.base
    import gunicorn.glogging
    from gunicorn.six import iteritems
@@ -18,10 +18,22 @@ See the License for the specific language governing permissions and
limitations under the License.
"""

+import argparse
+import os
+import signal
+import socket
+import socketserver
import sys
+import threading
+import time
+import traceback

+import doublethink
import structlog
+
+import brozzler
+import brozzler.cli

logger = structlog.get_logger(logger_name=__name__)

try:
@@ -42,19 +54,6 @@ except ImportError as e:
        exc_info=True,
    )
    sys.exit(1)
-import argparse
-import os
-import signal
-import socket
-import socketserver
-import threading
-import time
-import traceback
-
-import doublethink
-
-import brozzler
-import brozzler.cli


def _build_arg_parser(argv=None):
@@ -16,10 +16,6 @@ See the License for the specific language governing permissions and
limitations under the License.
"""

-import datetime
-import random
-import time
-
import doublethink
import rethinkdb as rdb
import structlog
@@ -47,11 +43,11 @@ class RethinkDbFrontier:
        db_logger = self.logger.bind(dbname=self.rr.dbname)

        dbs = self.rr.db_list().run()
-        if not self.rr.dbname in dbs:
+        if self.rr.dbname not in dbs:
            db_logger.info("creating rethinkdb database")
            self.rr.db_create(self.rr.dbname).run()
        tables = self.rr.table_list().run()
-        if not "sites" in tables:
+        if "sites" not in tables:
            db_logger.info("creating rethinkdb table 'sites' in database")
            self.rr.table_create(
                "sites", shards=self.shards, replicas=self.replicas
@@ -60,7 +56,7 @@ class RethinkDbFrontier:
            "sites_last_disclaimed", [r.row["status"], r.row["last_disclaimed"]]
        ).run()
        self.rr.table("sites").index_create("job_id").run()
-        if not "pages" in tables:
+        if "pages" not in tables:
            db_logger.info("creating rethinkdb table 'pages' in database")
            self.rr.table_create(
                "pages", shards=self.shards, replicas=self.replicas
@@ -80,7 +76,7 @@ class RethinkDbFrontier:
            "least_hops",
            [r.row["site_id"], r.row["brozzle_count"], r.row["hops_from_seed"]],
        ).run()
-        if not "jobs" in tables:
+        if "jobs" not in tables:
            db_logger.info("creating rethinkdb table 'jobs' in database")
            self.rr.table_create(
                "jobs", shards=self.shards, replicas=self.replicas
@@ -352,7 +348,6 @@ class RethinkDbFrontier:
        site.save()

    def _build_fresh_page(self, site, parent_page, url, hops_off=0):
        url_for_scoping = urlcanon.semantic(url)
        url_for_crawling = urlcanon.whatwg(url)
        hashtag = (url_for_crawling.hash_sign + url_for_crawling.fragment).decode(
            "utf-8"
@@ -461,8 +456,8 @@ class RethinkDbFrontier:
        # "rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:"
        # there can be many pages and each one can be very large (many videos,
        # in and out of scope links, etc)
-        l = list(pages.values())
-        for batch in (l[i : i + 50] for i in range(0, len(l), 50)):
+        pages_list = list(pages.values())
+        for batch in (pages_list[i : i + 50] for i in range(0, len(pages_list), 50)):
            try:
                self.logger.debug("inserting/replacing batch of %s pages", len(batch))
                reql = self.rr.table("pages").insert(batch, conflict="replace")
@@ -471,8 +466,8 @@ class RethinkDbFrontier:
                    'conflict="replace")',
                    batch,
                )
-                result = reql.run()
-            except Exception as e:
+                reql.run()
+            except Exception:
                self.logger.exception(
                    "problem inserting/replacing batch of %s pages",
                    len(batch),
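The comments in these frontier hunks explain the batching: a single insert of every page can exceed RethinkDB's maximum query size (134217727 bytes, per the error quoted above), so pages are written in batches of 50. The slicing idiom as a standalone sketch (the numbers are illustrative):

```python
# Split a list into fixed-size batches, as the frontier code above
# does with pages before inserting them into RethinkDB.
pages_list = list(range(107))  # stand-in for list(pages.values())
batches = [pages_list[i : i + 50] for i in range(0, len(pages_list), 50)]

assert [len(b) for b in batches] == [50, 50, 7]  # last batch is the remainder
```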
@@ -19,12 +19,9 @@ limitations under the License.

import base64
import copy
import datetime
import hashlib
import json
import os
import re
import time
import urllib
import uuid
import zlib
|
||||
# debugged, I found it here. Maybe there's a better way to see it.
|
||||
value = validator._errors[0].info[0][0].info[0][0].value
|
||||
self.errors["bad value"] = value
|
||||
except:
|
||||
except: # noqa: E722
|
||||
value = None
|
||||
|
||||
|
||||
@@ -122,10 +119,10 @@ def new_job(frontier, job_conf):
    # rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:
    for batch in (pages[i : i + 500] for i in range(0, len(pages), 500)):
        logger.info("inserting batch of %s pages", len(batch))
-        result = frontier.rr.table("pages").insert(batch).run()
+        frontier.rr.table("pages").insert(batch).run()
    for batch in (sites[i : i + 100] for i in range(0, len(sites), 100)):
        logger.info("inserting batch of %s sites", len(batch))
-        result = frontier.rr.table("sites").insert(batch).run()
+        frontier.rr.table("sites").insert(batch).run()
    logger.info("job fully started", job_id=job.id)

    return job
@@ -200,9 +197,9 @@ class Job(doublethink.Document, ElapsedMixIn):
    table = "jobs"

    def populate_defaults(self):
-        if not "status" in self:
+        if "status" not in self:
            self.status = "ACTIVE"
-        if not "starts_and_stops" in self:
+        if "starts_and_stops" not in self:
            if self.get("started"):  # backward compatibility
                self.starts_and_stops = [
                    {"start": self.get("started"), "stop": self.get("finished")}
@@ -229,28 +226,28 @@ class Site(doublethink.Document, ElapsedMixIn):
    table = "sites"

    def populate_defaults(self):
-        if not "status" in self:
+        if "status" not in self:
            self.status = "ACTIVE"
-        if not "claimed" in self:
+        if "claimed" not in self:
            self.claimed = False
-        if not "last_disclaimed" in self:
+        if "last_disclaimed" not in self:
            self.last_disclaimed = brozzler.EPOCH_UTC
-        if not "last_claimed" in self:
+        if "last_claimed" not in self:
            self.last_claimed = brozzler.EPOCH_UTC
-        if not "scope" in self:
+        if "scope" not in self:
            self.scope = {}
-        if not "skip_ytdlp" in self:
+        if "skip_ytdlp" not in self:
            self.skip_ytdlp = None

        # backward compatibility
        if "surt" in self.scope:
-            if not "accepts" in self.scope:
+            if "accepts" not in self.scope:
                self.scope["accepts"] = []
            self.scope["accepts"].append({"surt": self.scope["surt"]})
            del self.scope["surt"]

        # backward compatibility
-        if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope:
+        if "max_hops_off_surt" in self.scope and "max_hops_off" not in self.scope:
            self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
        if "max_hops_off_surt" in self.scope:
            del self.scope["max_hops_off_surt"]
@@ -260,7 +257,7 @@ class Site(doublethink.Document, ElapsedMixIn):
                brozzler.site_surt_canon(self.seed).ssurt().decode("ascii")
            )

-        if not "starts_and_stops" in self:
+        if "starts_and_stops" not in self:
            if self.get("start_time"):  # backward compatibility
                self.starts_and_stops = [
                    {"start": self.get("start_time"), "stop": None}
|
||||
return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)
|
||||
|
||||
def _accept_ssurt_if_not_redundant(self, ssurt):
|
||||
if not "accepts" in self.scope:
|
||||
if "accepts" not in self.scope:
|
||||
self.scope["accepts"] = []
|
||||
simple_rule_ssurts = (
|
||||
rule["ssurt"]
|
||||
@ -334,7 +331,7 @@ class Site(doublethink.Document, ElapsedMixIn):
|
||||
if not isinstance(url, urlcanon.ParsedUrl):
|
||||
url = urlcanon.semantic(url)
|
||||
|
||||
if not url.scheme in (b"http", b"https"):
|
||||
if url.scheme not in (b"http", b"https"):
|
||||
# XXX doesn't belong here maybe (where? worker ignores unknown
|
||||
# schemes?)
|
||||
return False
|
||||
@ -390,31 +387,31 @@ class Page(doublethink.Document):
|
||||
return hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
|
||||
|
||||
def populate_defaults(self):
|
||||
if not "retry_after" in self:
|
||||
if "retry_after" not in self:
|
||||
self.retry_after = None
|
||||
if not "failed_attempts" in self:
|
||||
if "failed_attempts" not in self:
|
||||
self.failed_attempts = 0
|
||||
if not "hops_from_seed" in self:
|
||||
if "hops_from_seed" not in self:
|
||||
self.hops_from_seed = 0
|
||||
if not "hop_path" in self:
|
||||
if "hop_path" not in self:
|
||||
self.hop_path = None
|
||||
if not "via_page_url" in self:
|
||||
if "via_page_url" not in self:
|
||||
self.via_page_url = None
|
||||
if not "brozzle_count" in self:
|
||||
if "brozzle_count" not in self:
|
||||
self.brozzle_count = 0
|
||||
if not "claimed" in self:
|
||||
if "claimed" not in self:
|
||||
self.claimed = False
|
||||
if "hops_off_surt" in self and not "hops_off" in self:
|
||||
if "hops_off_surt" in self and "hops_off" not in self:
|
||||
self.hops_off = self.hops_off_surt
|
||||
if "hops_off_surt" in self:
|
||||
del self["hops_off_surt"]
|
||||
if not "hops_off" in self:
|
||||
if "hops_off" not in self:
|
||||
self.hops_off = 0
|
||||
if not "needs_robots_check" in self:
|
||||
if "needs_robots_check" not in self:
|
||||
self.needs_robots_check = False
|
||||
if not "priority" in self:
|
||||
if "priority" not in self:
|
||||
self.priority = self._calc_priority()
|
||||
if not "id" in self:
|
||||
if "id" not in self:
|
||||
self.id = self.compute_id(self.site_id, self.url)
|
||||
|
||||
def __str__(self):
|
||||
|
@@ -18,9 +18,16 @@ See the License for the specific language governing permissions and
limitations under the License.
"""

+import argparse
+import json
import sys

+import doublethink
+import rethinkdb as rdb
import structlog
+import urlcanon
+
+import brozzler

logger = structlog.get_logger(logger_name=__name__)
@@ -40,14 +47,7 @@ except ImportError as e:
        e,
    )
    sys.exit(1)
-import argparse
-import json
-
-import doublethink
-import rethinkdb as rdb
-import urlcanon

-import brozzler

r = rdb.RethinkDB()
@@ -137,7 +137,7 @@ class TheGoodUrlCanonicalizer(object):
            key = urlcanon.semantic(url).surt().decode("ascii")
            # logging.debug('%s -> %s', url, key)
            return key
-        except Exception as e:
+        except Exception:
            return url

    def replace_default_canonicalizer():
@@ -221,18 +221,9 @@ def support_in_progress_warcs():

class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
    def __init__(self, orig_url):
        import re

-        import six
-        from pywb.rewrite.wburl import WbUrl
-        from pywb.utils.loaders import to_native_str
-        from six.moves.urllib.parse import (
-            quote,
-            quote_plus,
-            unquote_plus,
-            urlsplit,
-            urlunsplit,
-        )
+        from six.moves.urllib.parse import quote

        pywb.rewrite.wburl.BaseWbUrl.__init__(self)
@@ -320,7 +311,6 @@ def _fuzzy_query_call(self, query):
    urlkey = to_native_str(query.key, "utf-8")
    url = query.url
    filter_ = query.filters
    output = query.output

    for rule in self.rules.iter_matching(urlkey):
        m = rule.regex.search(urlkey)
@@ -71,7 +71,7 @@ _robots_caches = {}  # {site_id:reppy.cache.RobotsCache}


def _robots_cache(site, proxy=None):
-    if not site.id in _robots_caches:
+    if site.id not in _robots_caches:
        req_sesh = _SessionRaiseOn420()
        req_sesh.verify = False  # ignore cert errors
        if proxy:
@@ -21,9 +21,7 @@ limitations under the License.
import datetime
import io
import json
import random
import socket
import tempfile
import threading
import time
import urllib.request
@@ -99,9 +97,13 @@ class BrozzlerWorker:
        self._skip_visit_hashtags = skip_visit_hashtags
        self._skip_youtube_dl = skip_youtube_dl

+        # TODO try using importlib.util.find_spec to test for dependency
+        # presence rather than try/except on import.
+        # See https://docs.astral.sh/ruff/rules/unused-import/#example
+
        # We definitely shouldn't ytdlp if the optional extra is missing
        try:
-            import yt_dlp
+            import yt_dlp  # noqa: F401
        except ImportError:
            self.logger.info(
                "optional yt-dlp extra not installed; setting skip_youtube_dl to True"
@@ -200,7 +202,7 @@ class BrozzlerWorker:
            response = requests.get("http://%s/status" % self._proxy)
            status = json.loads(response.text)
            self._proxy_is_warcprox = status["role"] == "warcprox"
-        except Exception as e:
+        except Exception:
            self._proxy_is_warcprox = False
        self.logger.info(
            "%s %s warcprox",
@@ -348,13 +350,13 @@ class BrozzlerWorker:
                )
                metrics.brozzler_ydl_urls_checked.inc(1)
                outlinks.update(ydl_outlinks)
-            except brozzler.ReachedLimit as e:
+            except brozzler.ReachedLimit:
                raise
            except brozzler.ShutdownRequested:
                raise
            except brozzler.ProxyError:
                raise
-            except brozzler.VideoExtractorError as e:
+            except brozzler.VideoExtractorError:
                self.logger.exception("error extracting video info")
            except Exception as e:
                if (
@@ -391,9 +393,9 @@ class BrozzlerWorker:
                timeout=self.HEADER_REQUEST_TIMEOUT,
            ) as r:
                return r.headers
-        except requests.exceptions.Timeout as e:
+        except requests.exceptions.Timeout:
            url_logger.warning("Timed out trying to get headers", exc_info=True)
-        except requests.exceptions.RequestException as e:
+        except requests.exceptions.RequestException:
            url_logger.warning("Failed to get headers", exc_info=True)
        return {}
||||
@ -469,7 +471,7 @@ class BrozzlerWorker:
|
||||
if "content-range" in response_headers:
|
||||
video["content-range"] = response_headers["content-range"]
|
||||
self.logger.debug("embedded video", video=video)
|
||||
if not "videos" in page:
|
||||
if "videos" not in page:
|
||||
page.videos = []
|
||||
page.videos.append(video)
|
||||
|
||||
@@ -598,13 +600,13 @@ class BrozzlerWorker:
                site_logger.info("no pages left for site")
        except brozzler.ReachedLimit as e:
            self._frontier.reached_limit(site, e)
-        except brozzler.ReachedTimeLimit as e:
+        except brozzler.ReachedTimeLimit:
            self._frontier.finished(site, "FINISHED_TIME_LIMIT")
        except brozzler.CrawlStopped:
            self._frontier.finished(site, "FINISHED_STOP_REQUESTED")
        # except brozzler.browser.BrowsingAborted:
        #     self.logger.info("{} shut down".format(browser))
-        except brozzler.ProxyError as e:
+        except brozzler.ProxyError:
            if self._warcprox_auto:
                self.logger.exception(
                    "proxy error, will try to choose a "
@@ -676,7 +678,7 @@ class BrozzlerWorker:
        try:
            self.status_info = self._service_registry.heartbeat(status_info)
            self.logger.debug("status in service registry", status=self.status_info)
-        except r.ReqlError as e:
+        except r.ReqlError:
            self.logger.exception(
                "failed to send heartbeat and update service registry",
                info=status_info,
@@ -748,11 +750,11 @@ class BrozzlerWorker:
                    time.sleep(0.5)

                self.logger.warn("shutdown requested")
-        except r.ReqlError as e:
+        except r.ReqlError:
            self.logger.exception("caught rethinkdb exception, will try to proceed")
        except brozzler.ShutdownRequested:
            self.logger.info("shutdown requested")
-        except:
+        except:  # noqa: E722
            self.logger.critical(
                "thread exiting due to unexpected exception", exc_info=True
            )
@@ -760,7 +762,7 @@ class BrozzlerWorker:
        if self._service_registry and hasattr(self, "status_info"):
            try:
                self._service_registry.unregister(self.status_info["id"])
-            except:
+            except:  # noqa: E722
                self.logger.exception("failed to unregister from service registry")

        self.logger.info(
@@ -101,7 +101,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
            if result_type in ("url", "url_transparent"):
                if "extraction_depth" in extra_info:
                    self.logger.info(
-                        f"Following redirect",
+                        "Following redirect",
                        redirect_url=ie_result["url"],
                        extraction_depth=extra_info["extraction_depth"],
                    )
@@ -136,7 +136,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
            # use it later to extract the watch pages as outlinks.
            try:
                ie_result["entries_no_dl"] = list(ie_result["entries"])
-            except Exception as e:
+            except Exception:
                extract_context.warning(
                    "failed to unroll entries ie_result['entries']?",
                    exc_info=True,
@@ -166,7 +166,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
                import magic

                mimetype = magic.from_file(info_dict["filepath"], mime=True)
-            except ImportError as e:
+            except ImportError:
                mimetype = "video/%s" % info_dict["ext"]
                self.logger.warning(
                    "guessing mimetype due to error",
@@ -236,7 +236,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
                )
                site.last_claimed = doublethink.utcnow()
                site.save()
-            except:
+            except:  # noqa: E722
                worker.logger.debug(
                    "problem heartbeating site.last_claimed site",
                    id=site.id,
@@ -316,7 +316,7 @@ def _remember_videos(page, pushed_videos=None):
    """
    Saves info about videos captured by yt-dlp in `page.videos`.
    """
-    if not "videos" in page:
+    if "videos" not in page:
        page.videos = []
    for pushed_video in pushed_videos or []:
        video = {
@@ -351,7 +351,7 @@ def _try_youtube_dl(worker, ydl, site, page):
            )
            metrics.brozzler_ydl_extract_successes.labels(ydl.is_youtube_host).inc(1)
            break
-        except brozzler.ShutdownRequested as e:
+        except brozzler.ShutdownRequested:
            raise
        except Exception as e:
            if (
@@ -110,7 +110,7 @@ def test_httpd(httpd):
    of the same url return the same payload, proving it can be used to test
    deduplication.
    """
-    payload1 = content2 = None
+    payload1 = None
    url = "http://localhost:%s/site1/file1.txt" % httpd.server_port
    with urllib.request.urlopen(url) as response:
        assert response.status == 200
@@ -175,7 +175,6 @@ def test_420(httpd):

def test_js_dialogs(httpd):
    chrome_exe = brozzler.suggest_default_chrome_exe()
    url = "http://localhost:%s/site4/alert.html" % httpd.server_port
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        # before commit d2ed6b97a24 these would hang and eventually raise
        # brozzler.browser.BrowsingTimeout, which would cause this test to fail
@@ -40,11 +40,11 @@ def cli_commands():
    commands = set(console_scripts().keys())
    commands.remove("brozzler-wayback")
    try:
-        import gunicorn
+        import gunicorn  # noqa: F401
    except ImportError:
        commands.remove("brozzler-dashboard")
    try:
-        import pywb
+        import pywb  # noqa: F401
    except ImportError:
        commands.remove("brozzler-easy")
    return commands
|
@ -23,7 +23,6 @@ import http.server
|
||||
import os
|
||||
import socket
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import urllib.request
|
||||
@@ -47,7 +46,7 @@ def _local_address():
    try:
        s.connect(("10.255.255.255", 1))  # ip doesn't need to be reachable
        return s.getsockname()[0]
-    except:
+    except:  # noqa: E722
        return "127.0.0.1"
    finally:
        s.close()
@@ -148,7 +147,7 @@ def test_httpd(httpd):
    of the same url return the same payload, proving it can be used to test
    deduplication.
    """
-    payload1 = content2 = None
+    payload1 = None
    url = make_url(httpd, "/site1/file1.txt")
    with urllib.request.urlopen(url) as response:
        assert response.status == 200
@@ -351,8 +350,8 @@ def test_warcprox_auto(httpd):


def test_proxy_conflict():
-    with pytest.raises(AssertionError) as excinfo:
-        worker = brozzler.worker.BrozzlerWorker(
+    with pytest.raises(AssertionError):
+        brozzler.worker.BrozzlerWorker(
            None, None, warcprox_auto=True, proxy="localhost:12345"
        )
@@ -523,7 +522,6 @@ def test_login(httpd):

    # take a look at the captures table
    time.sleep(2)  # in case warcprox hasn't finished processing urls
-    robots_url = make_url(httpd, "/robots.txt")
    captures = list(
        rr.table("captures").filter({"test_id": test_id}).order_by("timestamp").run()
    )
|
||||
|
||||
|
||||
def test_stop_crawl(httpd):
|
||||
test_id = "test_stop_crawl_job-%s" % datetime.datetime.utcnow().isoformat()
|
||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
|
||||
@@ -804,7 +801,6 @@ def test_warcprox_outage_resiliency(httpd):
    """
    rr = doublethink.Rethinker("localhost", db="brozzler")
    frontier = brozzler.RethinkDbFrontier(rr)
    svcreg = doublethink.ServiceRegistry(rr)

    # run two instances of warcprox
    opts = warcprox.Options()
|
||||
# the system, if any
|
||||
try:
|
||||
stop_service("warcprox")
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
logger.warning("problem stopping warcprox service: %s", exc_info=True)
|
||||
|
||||
# queue the site for brozzling
|
||||
@@ -917,7 +913,6 @@ def test_warcprox_outage_resiliency(httpd):


def test_time_limit(httpd):
    test_id = "test_time_limit-%s" % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker("localhost", db="brozzler")
    frontier = brozzler.RethinkDbFrontier(rr)
@@ -928,7 +923,6 @@ def test_time_limit(httpd):

    sites = list(frontier.job_sites(job.id))
    assert len(sites) == 1
    site = sites[0]

    # time limit should be enforced pretty soon
    start = time.time()
|
||||
time.sleep(2) # in case warcprox hasn't finished processing urls
|
||||
# take a look at the captures table
|
||||
captures = list(rr.table("captures").filter({"test_id": test_id}).run())
|
||||
l = [c for c in captures if c["url"] == stitched_url]
|
||||
l = [c for c in captures if c["url"] == stitched_url] # noqa: E741
|
||||
assert len(l) == 1
|
||||
c = l[0]
|
||||
assert c["filename"].startswith("test_ydl_stitching")
|
||||
|
@@ -559,7 +559,7 @@ def test_parent_url_scoping():
    assert parent_page.outlinks["accepted"] == outlinks

    # parent page redirect_url matches accept parent_url_regex
-    parent_page_c = brozzler.Page(
+    brozzler.Page(
        rr,
        {
            "site_id": site.id,
@@ -606,7 +606,7 @@ def test_parent_url_scoping():
    assert parent_page.outlinks["accepted"] == []

    # parent page redirect_url matches block parent_url_regex
-    parent_page_c = brozzler.Page(
+    brozzler.Page(
        rr,
        {
            "site_id": site.id,
@@ -659,10 +659,10 @@ def test_completed_page():
        ]
    }
    assert page.brozzle_count == 1
-    assert page.claimed == False
+    assert page.claimed is False
    page.refresh()
    assert page.brozzle_count == 1
-    assert page.claimed == False
+    assert page.claimed is False

    # redirect that doesn't change scope surt because destination is covered by
    # the original surt
@@ -686,10 +686,10 @@ def test_completed_page():
    site.refresh()
    assert site.scope == {"accepts": [{"ssurt": "com,example,//http:/a/"}]}
    assert page.brozzle_count == 1
-    assert page.claimed == False
+    assert page.claimed is False
    page.refresh()
    assert page.brozzle_count == 1
-    assert page.claimed == False
+    assert page.claimed is False

    # redirect that doesn't change scope surt because page is not the seed page
    site = brozzler.Site(rr, {"seed": "http://example.com/a/"})
# redirect that doesn't change scope surt because page is not the seed page
|
||||
site = brozzler.Site(rr, {"seed": "http://example.com/a/"})
|
||||
@ -712,10 +712,10 @@ def test_completed_page():
|
||||
site.refresh()
|
||||
assert site.scope == {"accepts": [{"ssurt": "com,example,//http:/a/"}]}
|
||||
assert page.brozzle_count == 1
|
||||
assert page.claimed == False
|
||||
assert page.claimed is False
|
||||
page.refresh()
|
||||
assert page.brozzle_count == 1
|
||||
assert page.claimed == False
|
||||
assert page.claimed is False
|
||||
|
||||
|
||||
def test_seed_page():
|
||||
@@ -931,7 +931,7 @@ def test_max_claimed_sites():
    claimed_sites = frontier.claim_sites(3)
    assert len(claimed_sites) == 2
    with pytest.raises(brozzler.NothingToClaim):
-        claimed_site = frontier.claim_sites(3)
+        frontier.claim_sites(3)

    # clean slate for the next one
    rr.table("jobs").delete().run()
@@ -1074,7 +1074,7 @@ def test_max_hops_off():
    site.refresh()  # get it back from the db

    # renamed this param
-    assert not "max_hops_off_surt" in site.scope
+    assert "max_hops_off_surt" not in site.scope
    assert site.scope["max_hops_off"] == 1

    seed_page = frontier.seed_page(site.id)
@@ -1109,7 +1109,7 @@ def test_max_hops_off():
    assert len(pages) == 4
    assert pages[0].url == "http://example.com/"
    assert pages[0].hops_off == 0
-    assert not "hops_off_surt" in pages[0]
+    assert "hops_off_surt" not in pages[0]
    assert set(pages[0].outlinks["accepted"]) == {
        "https://example.com/toot",
        "http://foo.org/",
@@ -21,7 +21,6 @@ import datetime
import http.server
import os
import socket
import sys
import tempfile
import threading
import time
@@ -29,7 +28,6 @@ import uuid
from unittest import mock

import pytest
import requests
import yaml

import brozzler
@@ -291,7 +289,7 @@ def test_proxy_down():
    )

    # youtube-dl fetch
-    with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
+    with tempfile.TemporaryDirectory(prefix="brzl-ydl-"):
        with pytest.raises(brozzler.ProxyError):
            brozzler.ydl.do_youtube_dl(worker, site, page)
@@ -315,7 +313,7 @@ def test_start_stop_backwards_compat():
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]["start"]
    assert site.starts_and_stops[0]["stop"] is None
-    assert not "start_time" in site
+    assert "start_time" not in site

    site = brozzler.Site(
        None,
@@ -324,13 +322,13 @@ def test_start_stop_backwards_compat():
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
    assert site.starts_and_stops[0]["stop"] is None
-    assert not "start_time" in site
+    assert "start_time" not in site

    job = brozzler.Job(None, {"seeds": [{"url": "https://example.com/"}]})
    assert job.starts_and_stops[0]["start"]
    assert job.starts_and_stops[0]["stop"] is None
-    assert not "started" in job
-    assert not "finished" in job
+    assert "started" not in job
+    assert "finished" not in job

    job = brozzler.Job(
        None,
@@ -342,8 +340,8 @@ def test_start_stop_backwards_compat():
    )
    assert job.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
    assert job.starts_and_stops[0]["stop"] == datetime.datetime(2017, 1, 2)
-    assert not "started" in job
-    assert not "finished" in job
+    assert "started" not in job
+    assert "finished" not in job


class Exception1(Exception):
@@ -452,9 +450,9 @@ def test_thread_raise_second_with_block():
            with brozzler.thread_accept_exceptions():
                time.sleep(2)
            return  # test fails
-        except Exception1 as e:
+        except Exception1:
            pass
-        except:
+        except:  # noqa: E722
            return  # fail test

        try:
|
@ -32,7 +32,7 @@ import sys
|
||||
|
||||
try:
|
||||
from shlex import quote
|
||||
except:
|
||||
except: # noqa: E722
|
||||
from pipes import quote
|
||||
|
||||
|
||||
|