Merge pull request #271 from internetarchive/avdempsey/use-black

Use black, enforce with GitHub Actions
commit 955cae6421
Author: Alex Dempsey (committed by GitHub)
Date: 2024-02-08 22:35:27 -08:00
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)
23 changed files with 4048 additions and 2797 deletions

.github/workflows/python-formatting.yml (new file, 31 lines)

@ -0,0 +1,31 @@
name: Python Formatting Check
on:
push:
branches:
- main
- master
pull_request:
branches:
- main
- master
jobs:
formatting:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Python 3.8
uses: actions/setup-python@v5
with:
python-version: '3.8'
- name: Create virtual environment
run: python -m venv venv
- name: Install black
run: |
./venv/bin/pip install --upgrade pip
./venv/bin/pip install black
- name: Run formatting check
run: make ck-format

.gitignore (2 changes)

@ -2,3 +2,5 @@
*.diff
.*.sw*
/brozzler.egg-info/
venv
.idea

Makefile (new file, 7 lines)

@ -0,0 +1,7 @@
.PHONY: format
format:
venv/bin/black -t py35 -t py36 -t py37 -t py38 -t py39 -t py310 -t py311 -t py312 .
.PHONY: ck-format
ck-format:
venv/bin/black --check .
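The GitHub Actions job above simply delegates to the Makefile's ck-format target, so the local check and the CI check stay identical. As a rough illustration (assuming black is installed and on PATH, for example inside the venv the Makefile expects), the same check can be driven from Python:

# Minimal sketch of the formatting check the CI job performs; the target
# directory "." and the bare "black" executable name are assumptions.
import subprocess
import sys

def check_formatting(path="."):
    # black --check exits nonzero when any file would be reformatted
    return subprocess.run(["black", "--check", path]).returncode == 0

if __name__ == "__main__":
    sys.exit(0 if check_formatting() else 1)

Running "make format" rewrites files in place, while "make ck-format" (and this sketch) only reports whether a rewrite is needed.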

brozzler/__init__.py

@ -19,33 +19,41 @@ limitations under the License.
import logging
from pkg_resources import get_distribution as _get_distribution
__version__ = _get_distribution('brozzler').version
__version__ = _get_distribution("brozzler").version
class ShutdownRequested(Exception):
pass
class NothingToClaim(Exception):
pass
class CrawlStopped(Exception):
pass
class PageInterstitialShown(Exception):
pass
class ProxyError(Exception):
pass
class ReachedTimeLimit(Exception):
pass
class ReachedLimit(Exception):
def __init__(self, http_error=None, warcprox_meta=None, http_payload=None):
import json
if http_error:
if "warcprox-meta" in http_error.headers:
self.warcprox_meta = json.loads(
http_error.headers["warcprox-meta"])
self.warcprox_meta = json.loads(http_error.headers["warcprox-meta"])
else:
self.warcprox_meta = None
self.http_payload = http_error.read()
@ -55,28 +63,39 @@ class ReachedLimit(Exception):
def __repr__(self):
return "ReachedLimit(warcprox_meta=%r,http_payload=%r)" % (
self.warcprox_meta if hasattr(self, 'warcprox_meta') else None,
self.http_payload if hasattr(self, 'http_payload') else None)
self.warcprox_meta if hasattr(self, "warcprox_meta") else None,
self.http_payload if hasattr(self, "http_payload") else None,
)
def __str__(self):
return self.__repr__()
# monkey-patch log levels TRACE and NOTICE
logging.TRACE = (logging.NOTSET + logging.DEBUG) // 2
def _logger_trace(self, msg, *args, **kwargs):
if self.isEnabledFor(logging.TRACE):
self._log(logging.TRACE, msg, args, **kwargs)
logging.Logger.trace = _logger_trace
logging.trace = logging.root.trace
logging.addLevelName(logging.TRACE, 'TRACE')
logging.addLevelName(logging.TRACE, "TRACE")
logging.NOTICE = (logging.INFO + logging.WARN) // 2
def _logger_notice(self, msg, *args, **kwargs):
if self.isEnabledFor(logging.NOTICE):
self._log(logging.NOTICE, msg, args, **kwargs)
logging.Logger.notice = _logger_notice
logging.notice = logging.root.notice
logging.addLevelName(logging.NOTICE, 'NOTICE')
logging.addLevelName(logging.NOTICE, "NOTICE")
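Once brozzler is imported, every logging.Logger instance gains trace() and notice() methods at the new levels; a minimal usage sketch (the logger name is arbitrary):

# Sketch: using the monkey-patched TRACE and NOTICE levels defined above.
import logging
import brozzler  # importing brozzler installs the extra levels

logging.basicConfig(level=logging.TRACE)  # TRACE == 5, between NOTSET and DEBUG
logger = logging.getLogger("example")
logger.trace("very chatty diagnostic detail")
logger.notice("noteworthy, but not a warning")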
# see https://github.com/internetarchive/brozzler/issues/91
def _logging_handler_handle(self, record):
@ -91,9 +110,13 @@ def _logging_handler_handle(self, record):
except:
pass
return rv
logging.Handler.handle = _logging_handler_handle
_behaviors = None
def behaviors(behaviors_dir=None):
"""Return list of JS behaviors loaded from YAML file.
@ -101,35 +124,43 @@ def behaviors(behaviors_dir=None):
`js-templates/`. Defaults to brozzler dir.
"""
import os, yaml, string
global _behaviors
if _behaviors is None:
d = behaviors_dir or os.path.dirname(__file__)
behaviors_yaml = os.path.join(d, 'behaviors.yaml')
behaviors_yaml = os.path.join(d, "behaviors.yaml")
with open(behaviors_yaml) as fin:
_behaviors = yaml.safe_load(fin)
return _behaviors
def behavior_script(url, template_parameters=None, behaviors_dir=None):
'''
"""
Returns the javascript behavior string populated with template_parameters.
'''
"""
import re, logging, json
for behavior in behaviors(behaviors_dir=behaviors_dir):
if re.match(behavior['url_regex'], url):
if re.match(behavior["url_regex"], url):
parameters = dict()
if 'default_parameters' in behavior:
parameters.update(behavior['default_parameters'])
if "default_parameters" in behavior:
parameters.update(behavior["default_parameters"])
if template_parameters:
parameters.update(template_parameters)
template = jinja2_environment(behaviors_dir).get_template(
behavior['behavior_js_template'])
behavior["behavior_js_template"]
)
script = template.render(parameters)
logging.info(
'using template=%r populated with parameters=%r for %r',
behavior['behavior_js_template'], json.dumps(parameters), url)
"using template=%r populated with parameters=%r for %r",
behavior["behavior_js_template"],
json.dumps(parameters),
url,
)
return script
return None
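For illustration, a hedged sketch of how a caller renders a behavior for a page (the URL is a placeholder; which behavior matches, if any, depends on the url_regex entries in behaviors.yaml):

# Sketch: look up and render the JS behavior for a URL.
import brozzler

script = brozzler.behavior_script("https://example.com/some/page")
if script is None:
    print("no behavior url_regex matched this url")
else:
    print("rendered %d characters of behavior javascript" % len(script))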
class ThreadExceptionGate:
logger = logging.getLogger(__module__ + "." + __qualname__)
@ -142,8 +173,7 @@ class ThreadExceptionGate:
def __enter__(self):
assert self.thread == threading.current_thread()
if self.pending_exception:
self.logger.info(
'raising pending exception %s', self.pending_exception)
self.logger.info("raising pending exception %s", self.pending_exception)
tmp = self.pending_exception
self.pending_exception = None
raise tmp
@ -154,25 +184,32 @@ class ThreadExceptionGate:
def __exit__(self, exc_type, exc_value, traceback):
assert self.thread == threading.current_thread()
self.ok_to_raise.clear()
return False # don't swallow exception
return False # don't swallow exception
def queue_exception(self, e):
with self.lock:
if self.pending_exception:
self.logger.warning(
'%r already pending for thread %r, discarding %r',
self.pending_exception, self.thread, e)
"%r already pending for thread %r, discarding %r",
self.pending_exception,
self.thread,
e,
)
else:
self.pending_exception = e
def __repr__(self):
return '<ThreadExceptionGate(%s)>' % self.thread
return "<ThreadExceptionGate(%s)>" % self.thread
import threading
_thread_exception_gates = {}
_thread_exception_gates_lock = threading.Lock()
def thread_exception_gate(thread=None):
'''
"""
Returns a `ThreadExceptionGate` for `thread` (current thread by default).
`ThreadExceptionGate` is a context manager which allows exceptions to be
@ -191,7 +228,7 @@ def thread_exception_gate(thread=None):
is queued, and raised immediately if and when the thread enters the
context. Only one exception will be queued this way at a time, others are
discarded.
'''
"""
if not thread:
thread = threading.current_thread()
@ -201,10 +238,12 @@ def thread_exception_gate(thread=None):
return _thread_exception_gates[thread]
thread_accept_exceptions = thread_exception_gate
def thread_raise(thread, exctype):
'''
"""
Raises or queues the exception `exctype` for the thread `thread`.
See the documentation on the function `thread_exception_gate()` for more
@ -218,40 +257,43 @@ def thread_raise(thread, exctype):
Raises:
TypeError if `exctype` is not a class
ValueError, SystemError in case of unexpected problems
'''
"""
import ctypes, inspect, threading, logging
if not inspect.isclass(exctype):
raise TypeError(
'cannot raise %s, only exception types can be raised (not '
'instances)' % exctype)
"cannot raise %s, only exception types can be raised (not "
"instances)" % exctype
)
gate = thread_exception_gate(thread)
with gate.lock:
if gate.ok_to_raise.is_set() and thread.is_alive():
gate.ok_to_raise.clear()
logging.info('raising %s in thread %s', exctype, thread)
logging.info("raising %s in thread %s", exctype, thread)
res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
ctypes.c_long(thread.ident), ctypes.py_object(exctype))
ctypes.c_long(thread.ident), ctypes.py_object(exctype)
)
if res == 0:
raise ValueError(
'invalid thread id? thread.ident=%s' % thread.ident)
raise ValueError("invalid thread id? thread.ident=%s" % thread.ident)
elif res != 1:
# if it returns a number greater than one, you're in trouble,
# and you should call it again with exc=NULL to revert the effect
ctypes.pythonapi.PyThreadState_SetAsyncExc(thread.ident, 0)
raise SystemError('PyThreadState_SetAsyncExc failed')
raise SystemError("PyThreadState_SetAsyncExc failed")
else:
logging.info('queueing %s for thread %s', exctype, thread)
logging.info("queueing %s for thread %s", exctype, thread)
gate.queue_exception(exctype)
def sleep(duration):
'''
"""
Sleeps for duration seconds in increments of 0.5 seconds.
Use this so that the sleep can be interrupted by thread_raise().
'''
"""
import time
start = time.time()
while True:
elapsed = time.time() - start
@ -259,32 +301,41 @@ def sleep(duration):
break
time.sleep(min(duration - elapsed, 0.5))
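A hedged sketch tying thread_accept_exceptions(), thread_raise() and sleep() together: the worker opts in to asynchronous exceptions by entering the gate, and sleep()'s 0.5-second increments let the raise land promptly.

# Sketch: interrupt a worker thread with thread_raise().
import threading
import brozzler

def worker():
    try:
        with brozzler.thread_accept_exceptions():
            brozzler.sleep(300)  # stand-in for long-running work
    except brozzler.ShutdownRequested:
        print("worker: shutdown requested, cleaning up")

th = threading.Thread(target=worker)
th.start()
brozzler.thread_raise(th, brozzler.ShutdownRequested)  # raised now, or queued
th.join()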
_jinja2_env = None
def jinja2_environment(behaviors_dir=None):
global _jinja2_env
if not _jinja2_env:
import os, jinja2, json
if behaviors_dir:
_loader = jinja2.FileSystemLoader(os.path.join(behaviors_dir,
'js-templates'))
_loader = jinja2.FileSystemLoader(
os.path.join(behaviors_dir, "js-templates")
)
else:
_loader=jinja2.PackageLoader('brozzler', 'js-templates')
_loader = jinja2.PackageLoader("brozzler", "js-templates")
_jinja2_env = jinja2.Environment(loader=_loader, auto_reload=False)
_jinja2_env.filters['json'] = json.dumps
_jinja2_env.filters["json"] = json.dumps
return _jinja2_env
import urlcanon
def _remove_query(url):
url.question_mark = b''
url.query = b''
url.question_mark = b""
url.query = b""
# XXX chop off path after last slash??
site_surt_canon = urlcanon.Canonicalizer(
urlcanon.semantic.steps + [_remove_query])
site_surt_canon = urlcanon.Canonicalizer(urlcanon.semantic.steps + [_remove_query])
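In other words, the site canonicalizer treats URLs that differ only in their query string as the same thing for scoping purposes. A small sketch (the example URL is arbitrary):

# Sketch: query strings are stripped during site-scope canonicalization.
import urlcanon

url = urlcanon.parse_url("https://example.com/path/page?utm_source=feed")
site_surt_canon(url)  # applies urlcanon's semantic steps plus _remove_query
print(url.query)      # b"" -- the query was removed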
import doublethink
import datetime
EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(
tzinfo=doublethink.UTC)
EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=doublethink.UTC)
# we could make this configurable if there's a good reason
MAX_PAGE_FAILURES = 3
@ -294,10 +345,31 @@ from brozzler.robots import is_permitted_by_robots
from brozzler.frontier import RethinkDbFrontier
from brozzler.browser import Browser, BrowserPool, BrowsingException
from brozzler.model import (
new_job, new_job_file, new_site, Job, Page, Site, InvalidJobConf)
new_job,
new_job_file,
new_site,
Job,
Page,
Site,
InvalidJobConf,
)
from brozzler.cli import suggest_default_chrome_exe
__all__ = ['Page', 'Site', 'BrozzlerWorker', 'is_permitted_by_robots',
'RethinkDbFrontier', 'Browser', 'BrowserPool', 'BrowsingException',
'new_job', 'new_site', 'Job', 'new_job_file', 'InvalidJobConf',
'sleep', 'thread_accept_exceptions', 'thread_raise']
__all__ = [
"Page",
"Site",
"BrozzlerWorker",
"is_permitted_by_robots",
"RethinkDbFrontier",
"Browser",
"BrowserPool",
"BrowsingException",
"new_job",
"new_site",
"Job",
"new_job_file",
"InvalidJobConf",
"sleep",
"thread_accept_exceptions",
"thread_raise",
]

File diff suppressed because it is too large

brozzler/chrome.py

@ -1,4 +1,4 @@
'''
"""
brozzler/chrome.py - manages the chrome/chromium browser for brozzler
Copyright (C) 2014-2023 Internet Archive
@ -14,7 +14,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import logging
import urllib.request
@ -31,39 +31,43 @@ import json
import tempfile
import sys
def check_version(chrome_exe):
'''
"""
Raises SystemExit if `chrome_exe` is not a supported browser version.
Must run in the main thread to have the desired effect.
'''
"""
# mac$ /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --version
# Google Chrome 64.0.3282.140
# Google Chrome 64.0.3282.140
# mac$ /Applications/Google\ Chrome\ Canary.app/Contents/MacOS/Google\ Chrome\ Canary --version
# Google Chrome 66.0.3341.0 canary
# linux$ chromium-browser --version
# Using PPAPI flash.
# --ppapi-flash-path=/usr/lib/adobe-flashplugin/libpepflashplayer.so --ppapi-flash-version=
# Chromium 61.0.3163.100 Built on Ubuntu , running on Ubuntu 16.04
cmd = [chrome_exe, '--version']
cmd = [chrome_exe, "--version"]
out = subprocess.check_output(cmd, timeout=60)
m = re.search(br'(Chromium|Google Chrome) ([\d.]+)', out)
m = re.search(rb"(Chromium|Google Chrome) ([\d.]+)", out)
if not m:
sys.exit(
'unable to parse browser version from output of '
'%r: %r' % (subprocess.list2cmdline(cmd), out))
"unable to parse browser version from output of "
"%r: %r" % (subprocess.list2cmdline(cmd), out)
)
version_str = m.group(2).decode()
major_version = int(version_str.split('.')[0])
major_version = int(version_str.split(".")[0])
if major_version < 64:
sys.exit('brozzler requires chrome/chromium version 64 or '
'later but %s reports version %s' % (
chrome_exe, version_str))
sys.exit(
"brozzler requires chrome/chromium version 64 or "
"later but %s reports version %s" % (chrome_exe, version_str)
)
class Chrome:
logger = logging.getLogger(__module__ + '.' + __qualname__)
logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self, chrome_exe, port=9222, ignore_cert_errors=False):
'''
"""
Initializes instance of this class.
Doesn't start the browser, start() does that.
@ -73,7 +77,7 @@ class Chrome:
port: chrome debugging protocol port (default 9222)
ignore_cert_errors: configure chrome to accept all certs (default
False)
'''
"""
self.port = port
self.chrome_exe = chrome_exe
self.ignore_cert_errors = ignore_cert_errors
@ -81,63 +85,72 @@ class Chrome:
self.chrome_process = None
def __enter__(self):
'''
"""
Returns websocket url to chrome window with about:blank loaded.
'''
"""
return self.start()
def __exit__(self, *args):
self.stop()
def _init_cookie_db(self, cookie_db):
cookie_dir = os.path.join(self._chrome_user_data_dir, 'Default')
cookie_location = os.path.join(cookie_dir, 'Cookies')
self.logger.debug('cookie DB provided, writing to %s', cookie_location)
cookie_dir = os.path.join(self._chrome_user_data_dir, "Default")
cookie_location = os.path.join(cookie_dir, "Cookies")
self.logger.debug("cookie DB provided, writing to %s", cookie_location)
os.makedirs(cookie_dir, exist_ok=True)
try:
with open(cookie_location, 'wb') as cookie_file:
with open(cookie_location, "wb") as cookie_file:
cookie_file.write(cookie_db)
except OSError:
self.logger.error(
'exception writing cookie file at %s',
cookie_location, exc_info=True)
"exception writing cookie file at %s", cookie_location, exc_info=True
)
def persist_and_read_cookie_db(self):
cookie_location = os.path.join(
self._chrome_user_data_dir, 'Default', 'Cookies')
cookie_location = os.path.join(self._chrome_user_data_dir, "Default", "Cookies")
self.logger.debug(
'marking cookies persistent then reading file into memory: %s',
cookie_location)
"marking cookies persistent then reading file into memory: %s",
cookie_location,
)
try:
with sqlite3.connect(cookie_location) as conn:
cur = conn.cursor()
cur.execute('UPDATE cookies SET is_persistent = 1')
cur.execute("UPDATE cookies SET is_persistent = 1")
except sqlite3.Error:
try:
# db schema changed around version 66, this is the old schema
with sqlite3.connect(cookie_location) as conn:
cur = conn.cursor()
cur.execute('UPDATE cookies SET persistent = 1')
cur.execute("UPDATE cookies SET persistent = 1")
except sqlite3.Error:
self.logger.error(
'exception updating cookie DB %s', cookie_location,
exc_info=True)
"exception updating cookie DB %s", cookie_location, exc_info=True
)
cookie_db = None
try:
with open(cookie_location, 'rb') as cookie_file:
with open(cookie_location, "rb") as cookie_file:
cookie_db = cookie_file.read()
except OSError:
self.logger.error(
'exception reading from cookie DB file %s',
cookie_location, exc_info=True)
"exception reading from cookie DB file %s",
cookie_location,
exc_info=True,
)
return cookie_db
def start(self, proxy=None, cookie_db=None, disk_cache_dir=None,
disk_cache_size=None, websocket_timeout=60,
window_height=900, window_width=1400):
'''
def start(
self,
proxy=None,
cookie_db=None,
disk_cache_dir=None,
disk_cache_size=None,
websocket_timeout=60,
window_height=900,
window_width=1400,
):
"""
Starts chrome/chromium process.
Args:
@ -154,103 +167,126 @@ class Chrome:
window_height, window_width: window height and width, in pixels
Returns:
websocket url to chrome window with about:blank loaded
'''
"""
# these can raise exceptions
self._home_tmpdir = tempfile.TemporaryDirectory()
self._chrome_user_data_dir = os.path.join(
self._home_tmpdir.name, 'chrome-user-data')
self._home_tmpdir.name, "chrome-user-data"
)
if cookie_db:
self._init_cookie_db(cookie_db)
self._shutdown.clear()
new_env = os.environ.copy()
new_env['HOME'] = self._home_tmpdir.name
new_env["HOME"] = self._home_tmpdir.name
chrome_args = [
self.chrome_exe,
'-v',
'--headless',
'--remote-debugging-port=%s' % self.port,
'--use-mock-keychain', # mac thing
'--user-data-dir=%s' % self._chrome_user_data_dir,
'--disable-background-networking', '--disable-breakpad',
'--disable-renderer-backgrounding', '--disable-hang-monitor',
'--disable-background-timer-throttling', '--mute-audio',
'--disable-web-sockets',
f'--window-size={window_width},{window_height}',
'--no-default-browser-check',
'--disable-first-run-ui', '--no-first-run',
'--homepage=about:blank', '--disable-direct-npapi-requests',
'--disable-web-security', '--disable-notifications',
'--disable-extensions', '--disable-save-password-bubble',
'--disable-sync']
self.chrome_exe,
"-v",
"--headless",
"--remote-debugging-port=%s" % self.port,
"--use-mock-keychain", # mac thing
"--user-data-dir=%s" % self._chrome_user_data_dir,
"--disable-background-networking",
"--disable-breakpad",
"--disable-renderer-backgrounding",
"--disable-hang-monitor",
"--disable-background-timer-throttling",
"--mute-audio",
"--disable-web-sockets",
f"--window-size={window_width},{window_height}",
"--no-default-browser-check",
"--disable-first-run-ui",
"--no-first-run",
"--homepage=about:blank",
"--disable-direct-npapi-requests",
"--disable-web-security",
"--disable-notifications",
"--disable-extensions",
"--disable-save-password-bubble",
"--disable-sync",
]
extra_chrome_args = os.environ.get('BROZZLER_EXTRA_CHROME_ARGS')
extra_chrome_args = os.environ.get("BROZZLER_EXTRA_CHROME_ARGS")
if extra_chrome_args:
chrome_args.extend(extra_chrome_args.split())
if disk_cache_dir:
chrome_args.append('--disk-cache-dir=%s' % disk_cache_dir)
chrome_args.append("--disk-cache-dir=%s" % disk_cache_dir)
if disk_cache_size:
chrome_args.append('--disk-cache-size=%s' % disk_cache_size)
chrome_args.append("--disk-cache-size=%s" % disk_cache_size)
if self.ignore_cert_errors:
chrome_args.append('--ignore-certificate-errors')
chrome_args.append("--ignore-certificate-errors")
if proxy:
chrome_args.append('--proxy-server=%s' % proxy)
chrome_args.append('about:blank')
self.logger.info('running: %r', subprocess.list2cmdline(chrome_args))
chrome_args.append("--proxy-server=%s" % proxy)
chrome_args.append("about:blank")
self.logger.info("running: %r", subprocess.list2cmdline(chrome_args))
# start_new_session - new process group so we can kill the whole group
self.chrome_process = subprocess.Popen(
chrome_args, env=new_env, start_new_session=True,
stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0)
chrome_args,
env=new_env,
start_new_session=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
bufsize=0,
)
self._out_reader_thread = threading.Thread(
target=self._read_stderr_stdout,
name='ChromeOutReaderThread:%s' % self.port, daemon=True)
target=self._read_stderr_stdout,
name="ChromeOutReaderThread:%s" % self.port,
daemon=True,
)
self._out_reader_thread.start()
self.logger.info('chrome running, pid %s' % self.chrome_process.pid)
self.logger.info("chrome running, pid %s" % self.chrome_process.pid)
return self._websocket_url(timeout_sec=websocket_timeout)
def _websocket_url(self, timeout_sec = 60):
json_url = 'http://localhost:%s/json' % self.port
def _websocket_url(self, timeout_sec=60):
json_url = "http://localhost:%s/json" % self.port
# make this a member variable so that kill -QUIT reports it
self._start = time.time()
self._last_warning = self._start
while True:
try:
raw_json = urllib.request.urlopen(json_url, timeout=30).read()
all_debug_info = json.loads(raw_json.decode('utf-8'))
debug_info = [x for x in all_debug_info
if x['url'] == 'about:blank']
all_debug_info = json.loads(raw_json.decode("utf-8"))
debug_info = [x for x in all_debug_info if x["url"] == "about:blank"]
if debug_info and 'webSocketDebuggerUrl' in debug_info[0]:
self.logger.debug('%s returned %s', json_url, raw_json)
url = debug_info[0]['webSocketDebuggerUrl']
if debug_info and "webSocketDebuggerUrl" in debug_info[0]:
self.logger.debug("%s returned %s", json_url, raw_json)
url = debug_info[0]["webSocketDebuggerUrl"]
self.logger.info(
'got chrome window websocket debug url %s from %s',
url, json_url)
"got chrome window websocket debug url %s from %s",
url,
json_url,
)
return url
except brozzler.ShutdownRequested:
raise
except Exception as e:
if time.time() - self._last_warning > 30:
self.logger.warning(
'problem with %s (will keep trying until timeout '
'of %d seconds): %s', json_url, timeout_sec, e)
"problem with %s (will keep trying until timeout "
"of %d seconds): %s",
json_url,
timeout_sec,
e,
)
self._last_warning = time.time()
finally:
e = None
if self.chrome_process:
if time.time() - self._start > timeout_sec:
e = Exception(
'killing chrome, failed to retrieve %s after '
'%s seconds' % (
json_url, time.time() - self._start))
"killing chrome, failed to retrieve %s after "
"%s seconds" % (json_url, time.time() - self._start)
)
elif self.chrome_process.poll() is not None:
e = Exception(
'chrome process died with status %s' % self.chrome_process.poll())
"chrome process died with status %s"
% self.chrome_process.poll()
)
else:
time.sleep(0.5)
else:
e = Exception('??? self.chrome_process is not set ???')
e = Exception("??? self.chrome_process is not set ???")
if e:
self.stop()
raise e
@ -258,11 +294,13 @@ class Chrome:
def _read_stderr_stdout(self):
# XXX select doesn't work on windows
def readline_nonblock(f):
buf = b''
buf = b""
try:
while not self._shutdown.is_set() and (
len(buf) == 0 or buf[-1] != 0xa) and select.select(
[f],[],[],0.5)[0]:
while (
not self._shutdown.is_set()
and (len(buf) == 0 or buf[-1] != 0xA)
and select.select([f], [], [], 0.5)[0]
):
buf += f.read(1)
except (ValueError, OSError):
# When the chrome process crashes, stdout & stderr are closed
@ -276,16 +314,16 @@ class Chrome:
buf = readline_nonblock(self.chrome_process.stdout)
if buf:
self.logger.trace(
'chrome pid %s STDOUT %s',
self.chrome_process.pid, buf)
"chrome pid %s STDOUT %s", self.chrome_process.pid, buf
)
buf = readline_nonblock(self.chrome_process.stderr)
if buf:
self.logger.trace(
'chrome pid %s STDERR %s',
self.chrome_process.pid, buf)
"chrome pid %s STDERR %s", self.chrome_process.pid, buf
)
except:
self.logger.error('unexpected exception', exc_info=True)
self.logger.error("unexpected exception", exc_info=True)
def stop(self):
if not self.chrome_process or self._shutdown.is_set():
@ -294,8 +332,7 @@ class Chrome:
timeout_sec = 300
if self.chrome_process.poll() is None:
self.logger.info(
'terminating chrome pgid %s', self.chrome_process.pid)
self.logger.info("terminating chrome pgid %s", self.chrome_process.pid)
os.killpg(self.chrome_process.pid, signal.SIGTERM)
t0 = time.time()
@ -306,12 +343,14 @@ class Chrome:
if status is not None:
if status == 0:
self.logger.info(
'chrome pid %s exited normally',
self.chrome_process.pid)
"chrome pid %s exited normally", self.chrome_process.pid
)
else:
self.logger.warning(
'chrome pid %s exited with nonzero status %s',
self.chrome_process.pid, status)
"chrome pid %s exited with nonzero status %s",
self.chrome_process.pid,
status,
)
# XXX I would like to forcefully kill the process group
# here to guarantee no orphaned chromium subprocesses hang
@ -321,14 +360,18 @@ class Chrome:
time.sleep(0.5)
self.logger.warning(
'chrome pid %s still alive %.1f seconds after sending '
'SIGTERM, sending SIGKILL', self.chrome_process.pid,
time.time() - t0)
"chrome pid %s still alive %.1f seconds after sending "
"SIGTERM, sending SIGKILL",
self.chrome_process.pid,
time.time() - t0,
)
os.killpg(self.chrome_process.pid, signal.SIGKILL)
status = self.chrome_process.wait()
self.logger.warning(
'chrome pid %s reaped (status=%s) after killing with '
'SIGKILL', self.chrome_process.pid, status)
"chrome pid %s reaped (status=%s) after killing with " "SIGKILL",
self.chrome_process.pid,
status,
)
finally:
self.chrome_process.stdout.close()
@ -337,8 +380,7 @@ class Chrome:
self._home_tmpdir.cleanup()
except:
self.logger.error(
'exception deleting %s', self._home_tmpdir,
exc_info=True)
"exception deleting %s", self._home_tmpdir, exc_info=True
)
self._out_reader_thread.join()
self.chrome_process = None
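Taken together, check_version(), start() and stop() support a simple standalone usage pattern; a hedged sketch (the "chromium-browser" executable name is an assumption, brozzler normally picks one via suggest_default_chrome_exe()):

# Sketch: run a headless chrome and grab its devtools websocket url.
import brozzler.chrome

brozzler.chrome.check_version("chromium-browser")  # SystemExit if too old
with brozzler.chrome.Chrome("chromium-browser", port=9222) as websocket_url:
    print("devtools websocket:", websocket_url)
# leaving the block calls stop(), which SIGTERMs (then SIGKILLs) the process group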

File diff suppressed because it is too large

brozzler/dashboard/__init__.py

@ -1,4 +1,4 @@
'''
"""
brozzler/dashboard/__init__.py - flask app for brozzler dashboard, defines api
endpoints etc
@ -15,17 +15,20 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import logging
import sys
try:
import flask
except ImportError as e:
logging.critical(
'%s: %s\n\nYou might need to run "pip install '
'brozzler[dashboard]".\nSee README.rst for more information.',
type(e).__name__, e)
'%s: %s\n\nYou might need to run "pip install '
'brozzler[dashboard]".\nSee README.rst for more information.',
type(e).__name__,
e,
)
sys.exit(1)
import doublethink
import json
@ -41,33 +44,44 @@ app = flask.Flask(__name__)
# configure with environment variables
SETTINGS = {
'RETHINKDB_SERVERS': os.environ.get(
'BROZZLER_RETHINKDB_SERVERS', 'localhost').split(','),
'RETHINKDB_DB': os.environ.get('BROZZLER_RETHINKDB_DB', 'brozzler'),
'WAYBACK_BASEURL': os.environ.get(
'WAYBACK_BASEURL', 'http://localhost:8880/brozzler'),
'DASHBOARD_PORT': os.environ.get('DASHBOARD_PORT', '8000'),
'DASHBOARD_INTERFACE': os.environ.get('DASHBOARD_INTERFACE', 'localhost')
"RETHINKDB_SERVERS": os.environ.get(
"BROZZLER_RETHINKDB_SERVERS", "localhost"
).split(","),
"RETHINKDB_DB": os.environ.get("BROZZLER_RETHINKDB_DB", "brozzler"),
"WAYBACK_BASEURL": os.environ.get(
"WAYBACK_BASEURL", "http://localhost:8880/brozzler"
),
"DASHBOARD_PORT": os.environ.get("DASHBOARD_PORT", "8000"),
"DASHBOARD_INTERFACE": os.environ.get("DASHBOARD_INTERFACE", "localhost"),
}
rr = doublethink.Rethinker(
SETTINGS['RETHINKDB_SERVERS'], db=SETTINGS['RETHINKDB_DB'])
rr = doublethink.Rethinker(SETTINGS["RETHINKDB_SERVERS"], db=SETTINGS["RETHINKDB_DB"])
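Because every setting is read from the environment when the module loads, configuration happens before brozzler.dashboard is imported; a hedged sketch (hostnames and ports are placeholders):

# Sketch: configure and launch the dashboard from Python.
import os

os.environ["BROZZLER_RETHINKDB_SERVERS"] = "db0.example.org,db1.example.org:38015"
os.environ["BROZZLER_RETHINKDB_DB"] = "brozzler"
os.environ["DASHBOARD_PORT"] = "8000"

import brozzler.dashboard
brozzler.dashboard.main(["brozzler-dashboard"])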
_svc_reg = None
def service_registry():
global _svc_reg
if not _svc_reg:
_svc_reg = doublethink.ServiceRegistry(rr)
return _svc_reg
@app.route("/api/sites/<site_id>/queued_count")
@app.route("/api/site/<site_id>/queued_count")
def queued_count(site_id):
reql = rr.table("pages").between(
[site_id, 0, False, r.minval], [site_id, 0, False, r.maxval],
index="priority_by_site").count()
reql = (
rr.table("pages")
.between(
[site_id, 0, False, r.minval],
[site_id, 0, False, r.maxval],
index="priority_by_site",
)
.count()
)
logging.debug("querying rethinkdb: %s", reql)
count = reql.run()
return flask.jsonify(count=count)
@app.route("/api/sites/<site_id>/queue")
@app.route("/api/site/<site_id>/queue")
def queue(site_id):
@ -75,38 +89,52 @@ def queue(site_id):
start = flask.request.args.get("start", 0)
end = flask.request.args.get("end", start + 90)
reql = rr.table("pages").between(
[site_id, 0, False, r.minval], [site_id, 0, False, r.maxval],
index="priority_by_site")[start:end]
[site_id, 0, False, r.minval],
[site_id, 0, False, r.maxval],
index="priority_by_site",
)[start:end]
logging.debug("querying rethinkdb: %s", reql)
queue_ = reql.run()
return flask.jsonify(queue_=list(queue_))
@app.route("/api/sites/<site_id>/pages_count")
@app.route("/api/site/<site_id>/pages_count")
@app.route("/api/sites/<site_id>/page_count")
@app.route("/api/site/<site_id>/page_count")
def page_count(site_id):
reql = rr.table("pages").between(
reql = (
rr.table("pages")
.between(
[site_id, 1, False, r.minval],
[site_id, r.maxval, False, r.maxval],
index="priority_by_site").count()
index="priority_by_site",
)
.count()
)
logging.debug("querying rethinkdb: %s", reql)
count = reql.run()
return flask.jsonify(count=count)
@app.route("/api/sites/<site_id>/pages")
@app.route("/api/site/<site_id>/pages")
def pages(site_id):
"""Pages already crawled."""
start = int(flask.request.args.get("start", 0))
end = int(flask.request.args.get("end", start + 90))
reql = rr.table("pages").between(
[site_id, 1, r.minval], [site_id, r.maxval, r.maxval],
index="least_hops").order_by(index="least_hops")[start:end]
reql = (
rr.table("pages")
.between(
[site_id, 1, r.minval], [site_id, r.maxval, r.maxval], index="least_hops"
)
.order_by(index="least_hops")[start:end]
)
logging.debug("querying rethinkdb: %s", reql)
pages_ = reql.run()
return flask.jsonify(pages=list(pages_))
@app.route("/api/pages/<page_id>")
@app.route("/api/page/<page_id>")
def page(page_id):
@ -115,6 +143,7 @@ def page(page_id):
page_ = reql.run()
return flask.jsonify(page_)
@app.route("/api/pages/<page_id>/yaml")
@app.route("/api/page/<page_id>/yaml")
def page_yaml(page_id):
@ -122,8 +151,9 @@ def page_yaml(page_id):
logging.debug("querying rethinkdb: %s", reql)
page_ = reql.run()
return app.response_class(
yaml.dump(page_, default_flow_style=False),
mimetype="application/yaml")
yaml.dump(page_, default_flow_style=False), mimetype="application/yaml"
)
@app.route("/api/sites/<site_id>")
@app.route("/api/site/<site_id>")
@ -135,6 +165,7 @@ def site(site_id):
s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
return flask.jsonify(s)
@app.route("/api/sites/<site_id>/yaml")
@app.route("/api/site/<site_id>/yaml")
def site_yaml(site_id):
@ -142,8 +173,9 @@ def site_yaml(site_id):
logging.debug("querying rethinkdb: %s", reql)
site_ = reql.run()
return app.response_class(
yaml.dump(site_, default_flow_style=False),
mimetype="application/yaml")
yaml.dump(site_, default_flow_style=False), mimetype="application/yaml"
)
@app.route("/api/stats/<bucket>")
def stats(bucket):
@ -152,6 +184,7 @@ def stats(bucket):
stats_ = reql.run()
return flask.jsonify(stats_)
@app.route("/api/jobs/<job_id>/sites")
@app.route("/api/job/<job_id>/sites")
def sites(job_id):
@ -168,6 +201,7 @@ def sites(job_id):
s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
return flask.jsonify(sites=sites_)
@app.route("/api/jobless-sites")
def jobless_sites():
# XXX inefficient (unindexed) query
@ -180,6 +214,7 @@ def jobless_sites():
s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
return flask.jsonify(sites=sites_)
@app.route("/api/jobs/<job_id>")
@app.route("/api/job/<job_id>")
def job(job_id):
@ -192,6 +227,7 @@ def job(job_id):
job_ = reql.run()
return flask.jsonify(job_)
@app.route("/api/jobs/<job_id>/yaml")
@app.route("/api/job/<job_id>/yaml")
def job_yaml(job_id):
@ -203,19 +239,22 @@ def job_yaml(job_id):
logging.debug("querying rethinkdb: %s", reql)
job_ = reql.run()
return app.response_class(
yaml.dump(job_, default_flow_style=False),
mimetype="application/yaml")
yaml.dump(job_, default_flow_style=False), mimetype="application/yaml"
)
@app.route("/api/workers")
def workers():
workers_ = service_registry().available_services("brozzler-worker")
return flask.jsonify(workers=list(workers_))
@app.route("/api/services")
def services():
services_ = service_registry().available_services()
return flask.jsonify(services=list(services_))
@app.route("/api/jobs")
def jobs():
reql = rr.table("jobs").order_by(r.desc("id"))
@ -223,20 +262,24 @@ def jobs():
jobs_ = list(reql.run())
return flask.jsonify(jobs=jobs_)
@app.route("/api/config")
def config():
return flask.jsonify(config=SETTINGS)
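Each of these endpoints returns JSON, so they can be scripted against directly; a hedged sketch (assumes a dashboard listening on its default localhost:8000):

# Sketch: list jobs via the dashboard's JSON API.
import json
import urllib.request

with urllib.request.urlopen("http://localhost:8000/api/jobs") as response:
    jobs = json.loads(response.read().decode("utf-8"))["jobs"]
for job in jobs:
    print(job["id"], job.get("status"))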
@app.route("/api/<path:path>")
@app.route("/api", defaults={"path":""})
@app.route("/api", defaults={"path": ""})
def api404(path):
flask.abort(404)
@app.route("/", defaults={"path": ""})
@app.route("/<path:path>")
def root(path):
return flask.render_template("index.html")
try:
import gunicorn.app.base
from gunicorn.six import iteritems
@ -255,8 +298,12 @@ try:
def load_config(self):
config = dict(
[(key, value) for key, value in iteritems(self.options)
if key in self.cfg.settings and value is not None])
[
(key, value)
for key, value in iteritems(self.options)
if key in self.cfg.settings and value is not None
]
)
for key, value in iteritems(config):
self.cfg.set(key.lower(), value)
self.cfg.set("logger_class", BypassGunicornLogging)
@ -270,37 +317,42 @@ try:
GunicornBrozzlerDashboard(app, options).run()
except ImportError:
def run():
logging.info("running brozzler-dashboard using simple flask app.run")
app.run(host=SETTINGS['DASHBOARD_INTERFACE'], port=SETTINGS['DASHBOARD_PORT'])
app.run(host=SETTINGS["DASHBOARD_INTERFACE"], port=SETTINGS["DASHBOARD_PORT"])
def main(argv=None):
import argparse
import brozzler.cli
argv = argv or sys.argv
arg_parser = argparse.ArgumentParser(
prog=os.path.basename(argv[0]),
formatter_class=argparse.RawDescriptionHelpFormatter,
description=(
'brozzler-dashboard - web application for viewing brozzler '
'crawl status'),
epilog=(
'brozzler-dashboard has no command line options, but can be '
'configured using the following environment variables:\n\n'
' BROZZLER_RETHINKDB_SERVERS rethinkdb servers, e.g. '
'db0.foo.org,db0.foo.org:38015,db1.foo.org (default: '
'localhost)\n'
' BROZZLER_RETHINKDB_DB rethinkdb database name '
'(default: brozzler)\n'
' WAYBACK_BASEURL base url for constructing wayback '
'links (default http://localhost:8880/brozzler)'
' DASHBOARD_PORT brozzler-dashboard listening port (default: 8000)\n'
' DASHBOARD_INTERFACE brozzler-dashboard network interface binding (default: localhost)'))
prog=os.path.basename(argv[0]),
formatter_class=argparse.RawDescriptionHelpFormatter,
description=(
"brozzler-dashboard - web application for viewing brozzler " "crawl status"
),
epilog=(
"brozzler-dashboard has no command line options, but can be "
"configured using the following environment variables:\n\n"
" BROZZLER_RETHINKDB_SERVERS rethinkdb servers, e.g. "
"db0.foo.org,db0.foo.org:38015,db1.foo.org (default: "
"localhost)\n"
" BROZZLER_RETHINKDB_DB rethinkdb database name "
"(default: brozzler)\n"
" WAYBACK_BASEURL base url for constructing wayback "
"links (default http://localhost:8880/brozzler)"
" DASHBOARD_PORT brozzler-dashboard listening port (default: 8000)\n"
" DASHBOARD_INTERFACE brozzler-dashboard network interface binding (default: localhost)"
),
)
brozzler.cli.add_common_options(arg_parser, argv)
args = arg_parser.parse_args(args=argv[1:])
brozzler.cli.configure_logging(args)
run()
if __name__ == "__main__":
main()

brozzler/easy.py

@ -1,5 +1,5 @@
#!/usr/bin/env python
'''
"""
brozzler-easy - brozzler-worker, warcprox, pywb, and brozzler-dashboard all
working together in a single process
@ -16,10 +16,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import sys
import logging
try:
import warcprox
import warcprox.main
@ -30,9 +31,11 @@ try:
import brozzler.dashboard
except ImportError as e:
logging.critical(
'%s: %s\n\nYou might need to run "pip install '
'brozzler[easy]".\nSee README.rst for more information.',
type(e).__name__, e)
'%s: %s\n\nYou might need to run "pip install '
'brozzler[easy]".\nSee README.rst for more information.',
type(e).__name__,
e,
)
sys.exit(1)
import argparse
import brozzler
@ -46,76 +49,112 @@ import doublethink
import traceback
import socketserver
def _build_arg_parser(argv=None):
argv = argv or sys.argv
arg_parser = argparse.ArgumentParser(
formatter_class=brozzler.cli.BetterArgumentDefaultsHelpFormatter,
prog=os.path.basename(argv[0]), description=(
'brozzler-easy - easy deployment of brozzler, with '
'brozzler-worker, warcprox, pywb, and brozzler-dashboard all '
'running in a single process'))
formatter_class=brozzler.cli.BetterArgumentDefaultsHelpFormatter,
prog=os.path.basename(argv[0]),
description=(
"brozzler-easy - easy deployment of brozzler, with "
"brozzler-worker, warcprox, pywb, and brozzler-dashboard all "
"running in a single process"
),
)
# common args
brozzler.cli.add_rethinkdb_options(arg_parser)
arg_parser.add_argument(
'-d', '--warcs-dir', dest='warcs_dir', default='./warcs',
help='where to write warcs')
"-d",
"--warcs-dir",
dest="warcs_dir",
default="./warcs",
help="where to write warcs",
)
# warcprox args
arg_parser.add_argument(
'-c', '--cacert', dest='cacert',
default='./%s-warcprox-ca.pem' % socket.gethostname(),
help=(
'warcprox CA certificate file; if file does not exist, it '
'will be created'))
"-c",
"--cacert",
dest="cacert",
default="./%s-warcprox-ca.pem" % socket.gethostname(),
help=(
"warcprox CA certificate file; if file does not exist, it "
"will be created"
),
)
arg_parser.add_argument(
'--certs-dir', dest='certs_dir',
default='./%s-warcprox-ca' % socket.gethostname(),
help='where warcprox will store and load generated certificates')
"--certs-dir",
dest="certs_dir",
default="./%s-warcprox-ca" % socket.gethostname(),
help="where warcprox will store and load generated certificates",
)
arg_parser.add_argument(
'--onion-tor-socks-proxy', dest='onion_tor_socks_proxy',
default=None, help=(
'host:port of tor socks proxy, used only to connect to '
'.onion sites'))
"--onion-tor-socks-proxy",
dest="onion_tor_socks_proxy",
default=None,
help=("host:port of tor socks proxy, used only to connect to " ".onion sites"),
)
# brozzler-worker args
arg_parser.add_argument(
'-e', '--chrome-exe', dest='chrome_exe',
default=brozzler.cli.suggest_default_chrome_exe(),
help='executable to use to invoke chrome')
"-e",
"--chrome-exe",
dest="chrome_exe",
default=brozzler.cli.suggest_default_chrome_exe(),
help="executable to use to invoke chrome",
)
arg_parser.add_argument(
'-n', '--max-browsers', dest='max_browsers',
type=int, default=1, help=(
'max number of chrome instances simultaneously '
'browsing pages'))
"-n",
"--max-browsers",
dest="max_browsers",
type=int,
default=1,
help=("max number of chrome instances simultaneously " "browsing pages"),
)
# pywb args
arg_parser.add_argument(
'--pywb-address', dest='pywb_address',
default='0.0.0.0',
help='pywb wayback address to listen on')
"--pywb-address",
dest="pywb_address",
default="0.0.0.0",
help="pywb wayback address to listen on",
)
arg_parser.add_argument(
'--pywb-port', dest='pywb_port', type=int,
default=8880, help='pywb wayback port')
"--pywb-port",
dest="pywb_port",
type=int,
default=8880,
help="pywb wayback port",
)
# dashboard args
arg_parser.add_argument(
'--dashboard-address', dest='dashboard_address',
default='localhost',
help='brozzler dashboard address to listen on')
"--dashboard-address",
dest="dashboard_address",
default="localhost",
help="brozzler dashboard address to listen on",
)
arg_parser.add_argument(
'--dashboard-port', dest='dashboard_port',
type=int, default=8881, help='brozzler dashboard port')
"--dashboard-port",
dest="dashboard_port",
type=int,
default=8881,
help="brozzler dashboard port",
)
# common at the bottom args
brozzler.cli.add_common_options(arg_parser, argv)
return arg_parser
class ThreadingWSGIServer(
socketserver.ThreadingMixIn, wsgiref.simple_server.WSGIServer):
socketserver.ThreadingMixIn, wsgiref.simple_server.WSGIServer
):
pass
class BrozzlerEasyController:
logger = logging.getLogger(__module__ + "." + __qualname__)
@ -123,25 +162,31 @@ class BrozzlerEasyController:
self.stop = threading.Event()
self.args = args
self.warcprox_controller = warcprox.controller.WarcproxController(
self._warcprox_opts(args))
self._warcprox_opts(args)
)
self.brozzler_worker = self._init_brozzler_worker(args)
self.pywb_httpd = self._init_pywb(args)
self.dashboard_httpd = self._init_brozzler_dashboard(args)
def _init_brozzler_dashboard(self, args):
return wsgiref.simple_server.make_server(
args.dashboard_address, args.dashboard_port,
brozzler.dashboard.app, ThreadingWSGIServer)
args.dashboard_address,
args.dashboard_port,
brozzler.dashboard.app,
ThreadingWSGIServer,
)
def _init_brozzler_worker(self, args):
rr = doublethink.Rethinker(
args.rethinkdb_servers.split(","), args.rethinkdb_db)
rr = doublethink.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db)
frontier = brozzler.RethinkDbFrontier(rr)
service_registry = doublethink.ServiceRegistry(rr)
worker = brozzler.worker.BrozzlerWorker(
frontier, service_registry, chrome_exe=args.chrome_exe,
proxy='%s:%s' % self.warcprox_controller.proxy.server_address,
max_browsers=args.max_browsers)
frontier,
service_registry,
chrome_exe=args.chrome_exe,
proxy="%s:%s" % self.warcprox_controller.proxy.server_address,
max_browsers=args.max_browsers,
)
return worker
def _init_pywb(self, args):
@ -152,66 +197,67 @@ class BrozzlerEasyController:
brozzler.pywb.monkey_patch_fuzzy_query()
brozzler.pywb.monkey_patch_calc_search_range()
if args.warcs_dir.endswith('/'):
if args.warcs_dir.endswith("/"):
warcs_dir = args.warcs_dir
else:
warcs_dir = args.warcs_dir + '/'
warcs_dir = args.warcs_dir + "/"
conf = {
'collections': {
'brozzler': {
'index_paths': brozzler.pywb.RethinkCDXSource(
"collections": {
"brozzler": {
"index_paths": brozzler.pywb.RethinkCDXSource(
servers=args.rethinkdb_servers.split(","),
db=args.rethinkdb_db, table='captures')
db=args.rethinkdb_db,
table="captures",
)
},
},
# 'enable_http_proxy': True,
# 'enable_memento': True,
'archive_paths': warcs_dir,
'enable_cdx_api': True,
'framed_replay': True,
'port': args.pywb_port,
'enable_auto_colls': False,
"archive_paths": warcs_dir,
"enable_cdx_api": True,
"framed_replay": True,
"port": args.pywb_port,
"enable_auto_colls": False,
}
wsgi_app = pywb.framework.wsgi_wrappers.init_app(
pywb.webapp.pywb_init.create_wb_router, config=conf,
load_yaml=False)
pywb.webapp.pywb_init.create_wb_router, config=conf, load_yaml=False
)
# disable is_hop_by_hop restrictions
wsgiref.handlers.is_hop_by_hop = lambda x: False
return wsgiref.simple_server.make_server(
args.pywb_address, args.pywb_port, wsgi_app,
ThreadingWSGIServer)
args.pywb_address, args.pywb_port, wsgi_app, ThreadingWSGIServer
)
def start(self):
self.logger.info('starting warcprox')
self.logger.info("starting warcprox")
self.warcprox_controller.start()
# XXX wait til fully started?
self.logger.info('starting brozzler-worker')
self.logger.info("starting brozzler-worker")
self.brozzler_worker.start()
self.logger.info(
'starting pywb at %s:%s', *self.pywb_httpd.server_address)
self.logger.info("starting pywb at %s:%s", *self.pywb_httpd.server_address)
threading.Thread(target=self.pywb_httpd.serve_forever).start()
self.logger.info(
'starting brozzler-dashboard at %s:%s',
*self.dashboard_httpd.server_address)
"starting brozzler-dashboard at %s:%s", *self.dashboard_httpd.server_address
)
threading.Thread(target=self.dashboard_httpd.serve_forever).start()
def shutdown(self):
self.logger.info('shutting down brozzler-dashboard')
self.logger.info("shutting down brozzler-dashboard")
self.dashboard_httpd.shutdown()
self.logger.info('shutting down brozzler-worker')
self.logger.info("shutting down brozzler-worker")
self.brozzler_worker.shutdown_now()
# brozzler-worker is fully shut down at this point
self.logger.info('shutting down pywb')
self.logger.info("shutting down pywb")
self.pywb_httpd.shutdown()
self.logger.info('shutting down warcprox')
self.logger.info("shutting down warcprox")
self.warcprox_controller.shutdown()
def wait_for_shutdown_request(self):
@ -222,14 +268,14 @@ class BrozzlerEasyController:
self.shutdown()
def _warcprox_opts(self, args):
'''
"""
Takes args as produced by the argument parser built by
_build_arg_parser and builds warcprox arguments object suitable to pass
to warcprox.main.init_controller. Copies some arguments, renames some,
populates some with defaults appropriate for brozzler-easy, etc.
'''
"""
warcprox_opts = warcprox.Options()
warcprox_opts.address = 'localhost'
warcprox_opts.address = "localhost"
# let the OS choose an available port; discover it later using
# sock.getsockname()[1]
warcprox_opts.port = 0
@ -237,17 +283,18 @@ class BrozzlerEasyController:
warcprox_opts.certs_dir = args.certs_dir
warcprox_opts.directory = args.warcs_dir
warcprox_opts.gzip = True
warcprox_opts.prefix = 'brozzler'
warcprox_opts.size = 1000 * 1000* 1000
warcprox_opts.prefix = "brozzler"
warcprox_opts.size = 1000 * 1000 * 1000
warcprox_opts.rollover_idle_time = 3 * 60
warcprox_opts.digest_algorithm = 'sha1'
warcprox_opts.digest_algorithm = "sha1"
warcprox_opts.base32 = True
warcprox_opts.stats_db_file = None
warcprox_opts.playback_port = None
warcprox_opts.playback_index_db_file = None
warcprox_opts.rethinkdb_big_table_url = (
'rethinkdb://%s/%s/captures' % (
args.rethinkdb_servers, args.rethinkdb_db))
warcprox_opts.rethinkdb_big_table_url = "rethinkdb://%s/%s/captures" % (
args.rethinkdb_servers,
args.rethinkdb_db,
)
warcprox_opts.queue_size = 500
warcprox_opts.max_threads = None
warcprox_opts.profile = False
@ -259,9 +306,11 @@ class BrozzlerEasyController:
for th in threading.enumerate():
state_strs.append(str(th))
stack = traceback.format_stack(sys._current_frames()[th.ident])
state_strs.append(''.join(stack))
logging.warning('dumping state (caught signal {})\n{}'.format(
signum, '\n'.join(state_strs)))
state_strs.append("".join(stack))
logging.warning(
"dumping state (caught signal {})\n{}".format(signum, "\n".join(state_strs))
)
def main(argv=None):
argv = argv or sys.argv
@ -271,8 +320,8 @@ def main(argv=None):
brozzler.chrome.check_version(args.chrome_exe)
controller = BrozzlerEasyController(args)
signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())
signal.signal(signal.SIGINT, lambda a,b: controller.stop.set())
signal.signal(signal.SIGTERM, lambda a, b: controller.stop.set())
signal.signal(signal.SIGINT, lambda a, b: controller.stop.set())
signal.signal(signal.SIGQUIT, controller.dump_state)
controller.start()
controller.wait_for_shutdown_request()

brozzler/frontier.py

@ -1,4 +1,4 @@
'''
"""
brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages
Copyright (C) 2014-2018 Internet Archive
@ -14,7 +14,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import logging
import brozzler
@ -27,9 +27,11 @@ import urlcanon
r = rdb.RethinkDB()
class UnexpectedDbResult(Exception):
pass
class RethinkDbFrontier:
logger = logging.getLogger(__module__ + "." + __qualname__)
@ -47,40 +49,49 @@ class RethinkDbFrontier:
tables = self.rr.table_list().run()
if not "sites" in tables:
self.logger.info(
"creating rethinkdb table 'sites' in database %r",
self.rr.dbname)
"creating rethinkdb table 'sites' in database %r", self.rr.dbname
)
self.rr.table_create(
"sites", shards=self.shards, replicas=self.replicas).run()
self.rr.table("sites").index_create("sites_last_disclaimed", [
r.row["status"], r.row["last_disclaimed"]]).run()
"sites", shards=self.shards, replicas=self.replicas
).run()
self.rr.table("sites").index_create(
"sites_last_disclaimed", [r.row["status"], r.row["last_disclaimed"]]
).run()
self.rr.table("sites").index_create("job_id").run()
if not "pages" in tables:
self.logger.info(
"creating rethinkdb table 'pages' in database %r",
self.rr.dbname)
"creating rethinkdb table 'pages' in database %r", self.rr.dbname
)
self.rr.table_create(
"pages", shards=self.shards, replicas=self.replicas).run()
self.rr.table("pages").index_create("priority_by_site", [
r.row["site_id"], r.row["brozzle_count"],
r.row["claimed"], r.row["priority"]]).run()
"pages", shards=self.shards, replicas=self.replicas
).run()
self.rr.table("pages").index_create(
"priority_by_site",
[
r.row["site_id"],
r.row["brozzle_count"],
r.row["claimed"],
r.row["priority"],
],
).run()
# this index is for displaying pages in a sensible order in the web
# console
self.rr.table("pages").index_create("least_hops", [
r.row["site_id"], r.row["brozzle_count"],
r.row["hops_from_seed"]]).run()
self.rr.table("pages").index_create(
"least_hops",
[r.row["site_id"], r.row["brozzle_count"], r.row["hops_from_seed"]],
).run()
if not "jobs" in tables:
self.logger.info(
"creating rethinkdb table 'jobs' in database %r",
self.rr.dbname)
"creating rethinkdb table 'jobs' in database %r", self.rr.dbname
)
self.rr.table_create(
"jobs", shards=self.shards, replicas=self.replicas).run()
"jobs", shards=self.shards, replicas=self.replicas
).run()
def _vet_result(self, result, **kwargs):
# self.logger.debug("vetting expected=%s result=%s", kwargs, result)
# {'replaced': 0, 'errors': 0, 'skipped': 0, 'inserted': 1, 'deleted': 0, 'generated_keys': ['292859c1-4926-4b27-9d87-b2c367667058'], 'unchanged': 0}
for k in [
"replaced", "errors", "skipped", "inserted", "deleted",
"unchanged"]:
for k in ["replaced", "errors", "skipped", "inserted", "deleted", "unchanged"]:
if k in kwargs:
expected = kwargs[k]
else:
@ -88,81 +99,110 @@ class RethinkDbFrontier:
if isinstance(expected, list):
if result.get(k) not in kwargs[k]:
raise UnexpectedDbResult(
"expected %r to be one of %r in %r" % (
k, expected, result))
"expected %r to be one of %r in %r" % (k, expected, result)
)
else:
if result.get(k) != expected:
raise UnexpectedDbResult("expected %r to be %r in %r" % (
k, expected, result))
raise UnexpectedDbResult(
"expected %r to be %r in %r" % (k, expected, result)
)
def claim_sites(self, n=1):
self.logger.trace('claiming up to %s sites to brozzle', n)
self.logger.trace("claiming up to %s sites to brozzle", n)
result = (
self.rr.table('sites').get_all(r.args(
r.db(self.rr.dbname).table('sites', read_mode='majority')
.between(
['ACTIVE', r.minval], ['ACTIVE', r.maxval],
index='sites_last_disclaimed')
.order_by(r.desc('claimed'), 'last_disclaimed')
.fold(
{}, lambda acc, site: acc.merge(
r.branch(
site.has_fields('job_id'),
r.object(
site['job_id'].coerce_to('string'),
acc[site['job_id'].coerce_to('string')].default(0).add(1)),
{})),
emit=lambda acc, site, new_acc: r.branch(
r.and_(
r.or_(
site['claimed'].not_(),
site['last_claimed'].lt(r.now().sub(60*60))),
r.or_(
site.has_fields('max_claimed_sites').not_(),
new_acc[site['job_id'].coerce_to('string')].le(site['max_claimed_sites']))),
[site['id']], []))
.limit(n)))
self.rr.table("sites")
.get_all(
r.args(
r.db(self.rr.dbname)
.table("sites", read_mode="majority")
.between(
["ACTIVE", r.minval],
["ACTIVE", r.maxval],
index="sites_last_disclaimed",
)
.order_by(r.desc("claimed"), "last_disclaimed")
.fold(
{},
lambda acc, site: acc.merge(
r.branch(
site.has_fields("job_id"),
r.object(
site["job_id"].coerce_to("string"),
acc[site["job_id"].coerce_to("string")]
.default(0)
.add(1),
),
{},
)
),
emit=lambda acc, site, new_acc: r.branch(
r.and_(
r.or_(
site["claimed"].not_(),
site["last_claimed"].lt(r.now().sub(60 * 60)),
),
r.or_(
site.has_fields("max_claimed_sites").not_(),
new_acc[site["job_id"].coerce_to("string")].le(
site["max_claimed_sites"]
),
),
),
[site["id"]],
[],
),
)
.limit(n)
)
)
.update(
# try to avoid a race condition resulting in multiple
# brozzler-workers claiming the same site
# see https://github.com/rethinkdb/rethinkdb/issues/3235#issuecomment-60283038
r.branch(
r.or_(
r.row['claimed'].not_(),
r.row['last_claimed'].lt(r.now().sub(60*60))),
{'claimed': True, 'last_claimed': r.now()},
{}),
return_changes=True)).run()
r.row["claimed"].not_(),
r.row["last_claimed"].lt(r.now().sub(60 * 60)),
),
{"claimed": True, "last_claimed": r.now()},
{},
),
return_changes=True,
)
).run()
self._vet_result(
result, replaced=list(range(n+1)),
unchanged=list(range(n+1)))
result, replaced=list(range(n + 1)), unchanged=list(range(n + 1))
)
sites = []
for i in range(result["replaced"]):
if result["changes"][i]["old_val"]["claimed"]:
self.logger.warning(
"re-claimed site that was still marked 'claimed' "
"because it was last claimed a long time ago "
"at %s, and presumably some error stopped it from "
"being disclaimed",
result["changes"][i]["old_val"]["last_claimed"])
"re-claimed site that was still marked 'claimed' "
"because it was last claimed a long time ago "
"at %s, and presumably some error stopped it from "
"being disclaimed",
result["changes"][i]["old_val"]["last_claimed"],
)
site = brozzler.Site(self.rr, result["changes"][i]["new_val"])
sites.append(site)
self.logger.debug('claimed %s sites', len(sites))
self.logger.debug("claimed %s sites", len(sites))
if sites:
return sites
else:
raise brozzler.NothingToClaim
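A hedged sketch of how a worker drives the frontier (the RethinkDB server list is a placeholder):

# Sketch: claim up to three ACTIVE, unclaimed sites, as a brozzler-worker would.
import doublethink
import brozzler

rr = doublethink.Rethinker(["localhost"], db="brozzler")
frontier = brozzler.RethinkDbFrontier(rr)
try:
    for site in frontier.claim_sites(n=3):
        print("claimed site", site.id)
except brozzler.NothingToClaim:
    print("no active, unclaimed sites at the moment")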
def enforce_time_limit(self, site):
'''
"""
Raises `brozzler.ReachedTimeLimit` if appropriate.
'''
if (site.time_limit and site.time_limit > 0
and site.elapsed() > site.time_limit):
"""
if site.time_limit and site.time_limit > 0 and site.elapsed() > site.time_limit:
self.logger.debug(
"site FINISHED_TIME_LIMIT! time_limit=%s "
"elapsed=%s %s", site.time_limit, site.elapsed(), site)
"site FINISHED_TIME_LIMIT! time_limit=%s " "elapsed=%s %s",
site.time_limit,
site.elapsed(),
site,
)
raise brozzler.ReachedTimeLimit
def claim_page(self, site, worker_id):
@ -170,26 +210,37 @@ class RethinkDbFrontier:
# brozzler-worker can be working on a site at a time, and that would
# have to be the worker calling this method, so if something is claimed
# already, it must have been left that way because of some error
result = self.rr.table("pages").between(
result = (
self.rr.table("pages")
.between(
[site.id, 0, r.minval, r.minval],
[site.id, 0, r.maxval, r.maxval],
index="priority_by_site").order_by(
index=r.desc("priority_by_site")).limit(
1).update({
"claimed":True,
"last_claimed_by":worker_id},
return_changes="always").run()
self._vet_result(result, unchanged=[0,1], replaced=[0,1])
index="priority_by_site",
)
.order_by(index=r.desc("priority_by_site"))
.limit(1)
.update(
{"claimed": True, "last_claimed_by": worker_id}, return_changes="always"
)
.run()
)
self._vet_result(result, unchanged=[0, 1], replaced=[0, 1])
if result["unchanged"] == 0 and result["replaced"] == 0:
raise brozzler.NothingToClaim
else:
return brozzler.Page(self.rr, result["changes"][0]["new_val"])
def has_outstanding_pages(self, site):
results_iter = self.rr.table("pages").between(
results_iter = (
self.rr.table("pages")
.between(
[site.id, 0, r.minval, r.minval],
[site.id, 0, r.maxval, r.maxval],
index="priority_by_site").limit(1).run()
index="priority_by_site",
)
.limit(1)
.run()
)
return len(list(results_iter)) > 0
def completed_page(self, site, page):
@ -202,22 +253,24 @@ class RethinkDbFrontier:
site.save()
def active_jobs(self):
results = self.rr.table("jobs").filter({"status":"ACTIVE"}).run()
results = self.rr.table("jobs").filter({"status": "ACTIVE"}).run()
for result in results:
yield brozzler.Job(self.rr, result)
def honor_stop_request(self, site):
"""Raises brozzler.CrawlStopped if stop has been requested."""
site.refresh()
if (site.stop_requested
and site.stop_requested <= doublethink.utcnow()):
if site.stop_requested and site.stop_requested <= doublethink.utcnow():
self.logger.info("stop requested for site %s", site.id)
raise brozzler.CrawlStopped
if site.job_id:
job = brozzler.Job.load(self.rr, site.job_id)
if (job and job.stop_requested
and job.stop_requested <= doublethink.utcnow()):
if (
job
and job.stop_requested
and job.stop_requested <= doublethink.utcnow()
):
self.logger.info("stop requested for job %s", site.job_id)
raise brozzler.CrawlStopped
@ -239,8 +292,7 @@ class RethinkDbFrontier:
return False
n += 1
self.logger.info(
"all %s sites finished, job %s is FINISHED!", n, job.id)
self.logger.info("all %s sites finished, job %s is FINISHED!", n, job.id)
job.finish()
job.save()
return True
@ -270,13 +322,11 @@ class RethinkDbFrontier:
def resume_job(self, job):
job.status = "ACTIVE"
job.stop_requested = None
job.starts_and_stops.append(
{"start":doublethink.utcnow(), "stop":None})
job.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None})
job.save()
for site in self.job_sites(job.id):
site.status = "ACTIVE"
site.starts_and_stops.append(
{"start":doublethink.utcnow(), "stop":None})
site.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None})
site.save()
def resume_site(self, site):
@ -285,51 +335,55 @@ class RethinkDbFrontier:
job = brozzler.Job.load(self.rr, site.job_id)
job.status = "ACTIVE"
site.stop_requested = None
job.starts_and_stops.append(
{"start":doublethink.utcnow(), "stop":None})
job.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None})
job.save()
site.status = "ACTIVE"
site.starts_and_stops.append(
{"start":doublethink.utcnow(), "stop":None})
site.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None})
site.save()
def _build_fresh_page(self, site, parent_page, url, hops_off=0):
url_for_scoping = urlcanon.semantic(url)
url_for_crawling = urlcanon.whatwg(url)
hashtag = (url_for_crawling.hash_sign
+ url_for_crawling.fragment).decode('utf-8')
hashtag = (url_for_crawling.hash_sign + url_for_crawling.fragment).decode(
"utf-8"
)
urlcanon.canon.remove_fragment(url_for_crawling)
page = brozzler.Page(
self.rr,
{
"url": str(url_for_crawling),
"site_id": site.id,
"job_id": site.job_id,
"hops_from_seed": parent_page.hops_from_seed + 1,
"hop_path": str(parent_page.hop_path if parent_page.hop_path else "")
+ "L",
"via_page_id": parent_page.id,
"via_page_url": parent_page.url,
"hops_off_surt": hops_off,
"hashtags": [hashtag] if hashtag else [],
},
)
return page
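# Worked example of the hop_path/hops_from_seed bookkeeping above, with
# hypothetical pages: the seed has hop_path None and hops_from_seed 0, a
# link found on the seed gets hop_path "L" and hops_from_seed 1, a link
# found on that page gets "LL" and 2, and so on.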
def _merge_page(self, existing_page, fresh_page):
'''
"""
Utility method for merging info from `brozzler.Page` instances
representing the same url but with possibly different metadata.
'''
"""
existing_page.priority += fresh_page.priority
existing_page.hashtags = list(
set((existing_page.hashtags or []) + (fresh_page.hashtags or []))
)
existing_page.hops_off = min(existing_page.hops_off, fresh_page.hops_off)
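# Worked example of _merge_page() with hypothetical values: merging
# existing (priority=10, hashtags=["#a"], hops_off=2) with fresh
# (priority=5, hashtags=["#b"], hops_off=1) leaves the existing page with
# priority 15, hashtags {"#a", "#b"} (order not guaranteed), and hops_off 1.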
def _scope_and_enforce_robots(self, site, parent_page, outlinks):
'''
"""
Returns tuple (
dict of {page_id: Page} of fresh `brozzler.Page` representing in
scope links accepted by robots policy,
set of in scope urls (canonicalized) blocked by robots policy,
set of out-of-scope urls (canonicalized)).
'''
"""
pages = {} # {page_id: Page, ...}
blocked = set()
out_of_scope = set()
@ -337,17 +391,18 @@ class RethinkDbFrontier:
url_for_scoping = urlcanon.semantic(url)
url_for_crawling = urlcanon.whatwg(url)
decision = site.accept_reject_or_neither(
url_for_scoping, parent_page=parent_page)
url_for_scoping, parent_page=parent_page
)
if decision is True:
hops_off = 0
elif decision is None:
decision = parent_page.hops_off < site.scope.get(
'max_hops_off', 0)
decision = parent_page.hops_off < site.scope.get("max_hops_off", 0)
hops_off = parent_page.hops_off + 1
if decision is True:
if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
fresh_page = self._build_fresh_page(
site, parent_page, url, hops_off)
site, parent_page, url, hops_off
)
if fresh_page.id in pages:
self._merge_page(pages[fresh_page.id], fresh_page)
else:
@ -359,31 +414,32 @@ class RethinkDbFrontier:
return pages, blocked, out_of_scope
def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
decisions = {"accepted": set(), "blocked": set(), "rejected": set()}
counts = {"added": 0, "updated": 0, "rejected": 0, "blocked": 0}
fresh_pages, blocked, out_of_scope = self._scope_and_enforce_robots(
site, parent_page, outlinks
)
decisions["blocked"] = blocked
decisions["rejected"] = out_of_scope
counts["blocked"] += len(blocked)
counts["rejected"] += len(out_of_scope)
# get existing pages from rethinkdb
results = self.rr.table('pages').get_all(*fresh_pages.keys()).run()
pages = {doc['id']: brozzler.Page(self.rr, doc) for doc in results}
results = self.rr.table("pages").get_all(*fresh_pages.keys()).run()
pages = {doc["id"]: brozzler.Page(self.rr, doc) for doc in results}
# build list of pages to save, consisting of new pages, and existing
# pages updated with higher priority and new hashtags
for fresh_page in fresh_pages.values():
decisions['accepted'].add(fresh_page.url)
decisions["accepted"].add(fresh_page.url)
if fresh_page.id in pages:
page = pages[fresh_page.id]
self._merge_page(page, fresh_page)
counts['updated'] += 1
counts["updated"] += 1
else:
pages[fresh_page.id] = fresh_page
counts['added'] += 1
counts["added"] += 1
# make sure we're not stepping on our own toes in case we have a link
# back to parent_page, which I think happens because of hashtags
@ -396,19 +452,22 @@ class RethinkDbFrontier:
# there can be many pages and each one can be very large (many videos,
# in and out of scope links, etc)
l = list(pages.values())
for batch in (l[i:i+50] for i in range(0, len(l), 50)):
for batch in (l[i : i + 50] for i in range(0, len(l), 50)):
try:
self.logger.debug(
'inserting/replacing batch of %s pages', len(batch))
reql = self.rr.table('pages').insert(batch, conflict='replace')
self.logger.debug("inserting/replacing batch of %s pages", len(batch))
reql = self.rr.table("pages").insert(batch, conflict="replace")
self.logger.trace(
'running query self.rr.table("pages").insert(%r, '
'conflict="replace")', batch)
'running query self.rr.table("pages").insert(%r, '
'conflict="replace")',
batch,
)
result = reql.run()
except Exception as e:
self.logger.error(
'problem inserting/replacing batch of %s pages',
len(batch), exc_info=True)
"problem inserting/replacing batch of %s pages",
len(batch),
exc_info=True,
)
parent_page.outlinks = {}
for k in decisions:
@ -416,43 +475,56 @@ class RethinkDbFrontier:
parent_page.save()
self.logger.info(
'%s new links added, %s existing links updated, %s links '
'rejected, %s links blocked by robots from %s',
counts['added'], counts['updated'], counts['rejected'],
counts['blocked'], parent_page)
"%s new links added, %s existing links updated, %s links "
"rejected, %s links blocked by robots from %s",
counts["added"],
counts["updated"],
counts["rejected"],
counts["blocked"],
parent_page,
)
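# The insert loop above batches pages with a slicing generator; the same
# idiom in isolation (toy example, any list works):
#
#   l = list(range(120))
#   for batch in (l[i : i + 50] for i in range(0, len(l), 50)):
#       ...  # yields batches of 50, 50, and 20 items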
def reached_limit(self, site, e):
self.logger.info("reached_limit site=%s e=%s", site, e)
assert isinstance(e, brozzler.ReachedLimit)
if (site.reached_limit
and site.reached_limit != e.warcprox_meta["reached-limit"]):
if (
site.reached_limit
and site.reached_limit != e.warcprox_meta["reached-limit"]
):
self.logger.warning(
"reached limit %s but site had already reached limit %s",
e.warcprox_meta["reached-limit"], self.reached_limit)
"reached limit %s but site had already reached limit %s",
e.warcprox_meta["reached-limit"],
self.reached_limit,
)
else:
site.reached_limit = e.warcprox_meta["reached-limit"]
self.finished(site, "FINISHED_REACHED_LIMIT")
def job_sites(self, job_id):
results = self.rr.table('sites').get_all(job_id, index="job_id").run()
results = self.rr.table("sites").get_all(job_id, index="job_id").run()
for result in results:
yield brozzler.Site(self.rr, result)
def seed_page(self, site_id):
results = self.rr.table("pages").between(
results = (
self.rr.table("pages")
.between(
[site_id, r.minval, r.minval, r.minval],
[site_id, r.maxval, r.maxval, r.maxval],
index="priority_by_site").filter({"hops_from_seed":0}).run()
index="priority_by_site",
)
.filter({"hops_from_seed": 0})
.run()
)
pages = list(results)
if len(pages) > 1:
self.logger.warning(
"more than one seed page for site_id %s ?", site_id)
self.logger.warning("more than one seed page for site_id %s ?", site_id)
if len(pages) < 1:
return None
return brozzler.Page(self.rr, pages[0])
def site_pages(self, site_id, brozzled=None):
'''
"""
Args:
site_id (str or int):
brozzled (bool): if true, results include only pages that have
@ -460,16 +532,14 @@ class RethinkDbFrontier:
not been brozzled; and if None (the default), all pages
Returns:
iterator of brozzler.Page
'''
"""
query = self.rr.table("pages").between(
[site_id, 1 if brozzled is True else 0, r.minval, r.minval],
[site_id, 0 if brozzled is False else r.maxval, r.maxval, r.maxval],
index="priority_by_site",
)
self.logger.trace("running query: %r", query)
results = query.run()
for result in results:
self.logger.trace("yielding result: %r", result)
yield brozzler.Page(self.rr, result)

@ -1,4 +1,4 @@
'''
"""
brozzler/models.py - model classes representing jobs, sites, and pages, with
related logic
@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import brozzler
import base64
@ -36,15 +36,18 @@ import yaml
import zlib
from typing import Optional
def load_schema():
schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yaml')
schema_file = os.path.join(os.path.dirname(__file__), "job_schema.yaml")
with open(schema_file) as f:
return yaml.safe_load(f)
class JobValidator(cerberus.Validator):
def _validate_type_url(self, value):
url = urllib.parse.urlparse(value)
return url.scheme in ('http', 'https', 'ftp')
return url.scheme in ("http", "https", "ftp")
class InvalidJobConf(Exception):
def __init__(self, validator):
@ -53,15 +56,17 @@ class InvalidJobConf(Exception):
# Cerberus does a nice job hiding the bad value. In the case I
# debugged, I found it here. Maybe there's a better way to see it.
value = validator._errors[0].info[0][0].info[0][0].value
self.errors['bad value'] = value
self.errors["bad value"] = value
except:
value = None
def validate_conf(job_conf, schema=load_schema()):
v = JobValidator(schema)
if not v.validate(job_conf, normalize=False):
raise InvalidJobConf(v)
def merge(a, b):
if isinstance(a, dict) and isinstance(b, dict):
merged = dict(a)
@ -75,19 +80,22 @@ def merge(a, b):
else:
return a
def new_job_file(frontier, job_conf_file):
'''Returns new Job.'''
"""Returns new Job."""
logging.info("loading %s", job_conf_file)
with open(job_conf_file) as f:
job_conf = yaml.safe_load(f)
return new_job(frontier, job_conf)
def new_job(frontier, job_conf):
'''Returns new Job.'''
"""Returns new Job."""
validate_conf(job_conf)
job = Job(
frontier.rr,
{"conf": job_conf, "status": "ACTIVE", "started": doublethink.utcnow()},
)
if "id" in job_conf:
job.id = job_conf["id"]
if "max_claimed_sites" in job_conf:
@ -108,32 +116,40 @@ def new_job(frontier, job_conf):
# insert in batches to avoid this error
# rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:
for batch in (pages[i : i + 500] for i in range(0, len(pages), 500)):
logging.info("inserting batch of %s pages", len(batch))
result = frontier.rr.table("pages").insert(batch).run()
for batch in (sites[i : i + 100] for i in range(0, len(sites), 100)):
logging.info("inserting batch of %s sites", len(batch))
result = frontier.rr.table("sites").insert(batch).run()
logging.info("job %s fully started", job.id)
return job
def new_seed_page(frontier, site):
url = urlcanon.parse_url(site.seed)
hashtag = (url.hash_sign + url.fragment).decode("utf-8")
urlcanon.canon.remove_fragment(url)
page = brozzler.Page(
frontier.rr,
{
"url": str(url),
"site_id": site.get("id"),
"job_id": site.get("job_id"),
"hops_from_seed": 0,
"priority": 1000,
"needs_robots_check": True,
"hop_path": None,
},
)
if hashtag:
page.hashtags = [hashtag,]
page.hashtags = [
hashtag,
]
return page
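# Example of the hashtag handling above (hypothetical seed): for
# "https://example.com/page#comments", url.hash_sign + url.fragment decodes
# to "#comments", the fragment is stripped from the stored page.url, and
# page.hashtags becomes ["#comments"]; a seed without a fragment leaves
# page.hashtags unset.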
def new_site(frontier, site):
logging.info("new site %s", site)
site.id = site.id or str(uuid.uuid4())
@ -148,9 +164,10 @@ def new_site(frontier, site):
# finally block because we want to insert the Site no matter what
site.save()
class ElapsedMixIn(object):
def elapsed(self):
'''
"""
Returns elapsed crawl time as a float in seconds.
This metric includes all the time that a site was in active rotation,
@ -158,21 +175,22 @@ class ElapsedMixIn(object):
In contrast `Site.active_brozzling_time` only counts time when a
brozzler worker claimed the site and was actively brozzling it.
'''
"""
dt = 0
for ss in self.starts_and_stops[:-1]:
if ss['stop']:
dt += (ss['stop'] - ss['start']).total_seconds()
if ss["stop"]:
dt += (ss["stop"] - ss["start"]).total_seconds()
else:
self.logger.warning("missing expected ss['stop']")
dt += (doublethink.utcnow() - ss['start']).total_seconds()
dt += (doublethink.utcnow() - ss["start"]).total_seconds()
ss = self.starts_and_stops[-1]
if ss['stop']:
dt += (ss['stop'] - ss['start']).total_seconds()
else: # crawl is active
dt += (doublethink.utcnow() - ss['start']).total_seconds()
if ss["stop"]:
dt += (ss["stop"] - ss["start"]).total_seconds()
else: # crawl is active
dt += (doublethink.utcnow() - ss["start"]).total_seconds()
return dt
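# Worked example of elapsed() with hypothetical timestamps: for
# starts_and_stops = [{start: 10:00, stop: 10:05}, {start: 11:00, stop: None}]
# and a current time of 11:02, elapsed() returns 300 + 120 = 420 seconds.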
class Job(doublethink.Document, ElapsedMixIn):
logger = logging.getLogger(__module__ + "." + __qualname__)
table = "jobs"
@ -181,29 +199,30 @@ class Job(doublethink.Document, ElapsedMixIn):
if not "status" in self:
self.status = "ACTIVE"
if not "starts_and_stops" in self:
if self.get("started"): # backward compatibility
self.starts_and_stops = [
{"start": self.get("started"), "stop": self.get("finished")}
]
del self["started"]
if "finished" in self:
del self["finished"]
else:
self.starts_and_stops = [
{"start":doublethink.utcnow(),"stop":None}]
self.starts_and_stops = [{"start": doublethink.utcnow(), "stop": None}]
def finish(self):
if self.status == "FINISHED" or self.starts_and_stops[-1]["stop"]:
self.logger.error(
"job is already finished status=%s "
"starts_and_stops[-1]['stop']=%s", self.status,
self.starts_and_stops[-1]["stop"])
"job is already finished status=%s " "starts_and_stops[-1]['stop']=%s",
self.status,
self.starts_and_stops[-1]["stop"],
)
self.status = "FINISHED"
self.starts_and_stops[-1]["stop"] = doublethink.utcnow()
class Site(doublethink.Document, ElapsedMixIn):
logger = logging.getLogger(__module__ + "." + __qualname__)
table = 'sites'
table = "sites"
def populate_defaults(self):
if not "status" in self:
@ -225,26 +244,26 @@ class Site(doublethink.Document, ElapsedMixIn):
del self.scope["surt"]
# backward compatibility
if ("max_hops_off_surt" in self.scope
and not "max_hops_off" in self.scope):
if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope:
self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
if "max_hops_off_surt" in self.scope:
del self.scope["max_hops_off_surt"]
if self.seed:
self._accept_ssurt_if_not_redundant(
brozzler.site_surt_canon(self.seed).ssurt().decode('ascii'))
brozzler.site_surt_canon(self.seed).ssurt().decode("ascii")
)
if not "starts_and_stops" in self:
if self.get("start_time"): # backward compatibility
self.starts_and_stops = [
{"start": self.get("start_time"), "stop": None}
]
if self.get("status") != "ACTIVE":
self.starts_and_stops[0]["stop"] = self.last_disclaimed
del self["start_time"]
else:
self.starts_and_stops = [
{"start":doublethink.utcnow(),"stop":None}]
self.starts_and_stops = [{"start": doublethink.utcnow(), "stop": None}]
def __str__(self):
return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)
@ -253,11 +272,12 @@ class Site(doublethink.Document, ElapsedMixIn):
if not "accepts" in self.scope:
self.scope["accepts"] = []
simple_rule_ssurts = (
rule["ssurt"] for rule in self.scope["accepts"]
if set(rule.keys()) == {'ssurt'})
rule["ssurt"]
for rule in self.scope["accepts"]
if set(rule.keys()) == {"ssurt"}
)
if not any(ssurt.startswith(ss) for ss in simple_rule_ssurts):
self.logger.info(
"adding ssurt %s to scope accept rules", ssurt)
self.logger.info("adding ssurt %s to scope accept rules", ssurt)
self.scope["accepts"].append({"ssurt": ssurt})
def note_seed_redirect(self, url):
@ -266,14 +286,14 @@ class Site(doublethink.Document, ElapsedMixIn):
# if http://foo.com/ redirects to https://foo.com/a/b/c let's also
# put all of https://foo.com/ in scope
if (canon_seed_redirect.authority == canon_seed.authority
and canon_seed_redirect.scheme != canon_seed.scheme):
if (
canon_seed_redirect.authority == canon_seed.authority
and canon_seed_redirect.scheme != canon_seed.scheme
):
canon_seed.scheme = canon_seed_redirect.scheme
self._accept_ssurt_if_not_redundant(
canon_seed.ssurt().decode('ascii'))
self._accept_ssurt_if_not_redundant(canon_seed.ssurt().decode("ascii"))
self._accept_ssurt_if_not_redundant(
canon_seed_redirect.ssurt().decode('ascii'))
self._accept_ssurt_if_not_redundant(canon_seed_redirect.ssurt().decode("ascii"))
def extra_headers(self, page: Optional["Page"] = None):
hdrs = {}
@ -281,28 +301,34 @@ class Site(doublethink.Document, ElapsedMixIn):
temp_warcprox_meta = copy.deepcopy(self.warcprox_meta)
if "blocks" in self.warcprox_meta:
# delete temp_warcprox_meta's 'blocks' (they may be big!)
del temp_warcprox_meta['blocks']
del temp_warcprox_meta["blocks"]
# str-ify blocks
blocks_str = json.dumps(self.warcprox_meta['blocks'], separators=(',', ':'))
blocks_str = json.dumps(
self.warcprox_meta["blocks"], separators=(",", ":")
)
# encode(), compress, b64encode, decode()
temp_warcprox_meta['compressed_blocks'] = base64.b64encode(zlib.compress(blocks_str.encode())).decode()
temp_warcprox_meta["compressed_blocks"] = base64.b64encode(
zlib.compress(blocks_str.encode())
).decode()
if page is not None:
temp_warcprox_meta["metadata"]["hop_path"] = page.hop_path
temp_warcprox_meta["metadata"]["brozzled_url"] = page.url
temp_warcprox_meta["metadata"]["hop_via_url"] = page.via_page_url
hdrs["Warcprox-Meta"] = json.dumps(temp_warcprox_meta, separators=(',', ':'))
hdrs["Warcprox-Meta"] = json.dumps(
temp_warcprox_meta, separators=(",", ":")
)
return hdrs
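# The "compressed_blocks" value built above is json -> zlib -> base64; a
# consumer can reverse it like this (sketch; `compressed` stands for the
# header field value as a str):
#
#   import base64, json, zlib
#   blocks = json.loads(zlib.decompress(base64.b64decode(compressed)))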
def accept_reject_or_neither(self, url, parent_page=None):
'''
"""
Returns `True` (accepted), `False` (rejected), or `None` (no decision).
`None` usually means rejected, unless `max_hops_off` comes into play.
'''
"""
if not isinstance(url, urlcanon.ParsedUrl):
url = urlcanon.semantic(url)
if not url.scheme in (b'http', b'https'):
if not url.scheme in (b"http", b"https"):
# XXX doesn't belong here maybe (where? worker ignores unknown
# schemes?)
return False
@ -311,12 +337,14 @@ class Site(doublethink.Document, ElapsedMixIn):
if parent_page:
try_parent_urls.append(urlcanon.semantic(parent_page.url))
if parent_page.redirect_url:
try_parent_urls.append(
urlcanon.semantic(parent_page.redirect_url))
try_parent_urls.append(urlcanon.semantic(parent_page.redirect_url))
# enforce max_hops
if (parent_page and "max_hops" in self.scope
and parent_page.hops_from_seed >= self.scope["max_hops"]):
if (
parent_page
and "max_hops" in self.scope
and parent_page.hops_from_seed >= self.scope["max_hops"]
):
return False
# enforce reject rules
@ -326,7 +354,7 @@ class Site(doublethink.Document, ElapsedMixIn):
if try_parent_urls:
for parent_url in try_parent_urls:
if rule.applies(url, parent_url):
return False
else:
if rule.applies(url):
return False
@ -337,7 +365,7 @@ class Site(doublethink.Document, ElapsedMixIn):
if try_parent_urls:
for parent_url in try_parent_urls:
if rule.applies(url, parent_url):
return True
else:
if rule.applies(url):
return True
@ -345,6 +373,7 @@ class Site(doublethink.Document, ElapsedMixIn):
# no decision if we reach here
return None
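# Sketch of how the three-valued result is consumed (compare
# _scope_and_enforce_robots in frontier.py): True means in scope, False
# means rejected, and None falls back to the max_hops_off allowance.
#
#   decision = site.accept_reject_or_neither(url, parent_page=parent_page)
#   if decision is None:
#       decision = parent_page.hops_off < site.scope.get("max_hops_off", 0)
#   if decision:
#       ...  # url is in scope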
class Page(doublethink.Document):
logger = logging.getLogger(__module__ + "." + __qualname__)
table = "pages"
@ -398,4 +427,3 @@ class Page(doublethink.Document):
if self._canon_hurl is None:
self._canon_hurl = urlcanon.semantic(self.url)
return str(self._canon_hurl)

@ -1,4 +1,4 @@
'''
"""
brozzler/pywb.py - pywb customizations for brozzler including rethinkdb index,
loading from warcs still being written to, canonicalization rules matching
brozzler conventions, support for screenshot: and thumbnail: urls
@ -16,10 +16,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import sys
import logging
try:
import pywb.apps.cli
import pywb.cdx.cdxdomainspecific
@ -30,9 +31,11 @@ try:
import pywb.rewrite.wburl
except ImportError as e:
logging.critical(
'%s: %s\n\nYou might need to run "pip install '
'brozzler[easy]".\nSee README.rst for more information.',
type(e).__name__,
e,
)
sys.exit(1)
import doublethink
import rethinkdb as rdb
@ -43,6 +46,7 @@ import argparse
r = rdb.RethinkDB()
class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
def __init__(self, servers, db, table):
self.servers = servers
@ -67,70 +71,78 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
# XXX inefficient, it gets parsed later, figure out how to
# short-circuit this step and create the CDXObject directly
blob = {
"url": record["url"],
"status": str(record["response_code"]),
"digest": record["sha1base32"],
"length": str(record.get("record_length", "-")),
"offset": str(record["offset"]),
"filename": record["filename"],
}
if record['warc_type'] != 'revisit':
blob['mime'] = record['content_type'] or '-'
if record["warc_type"] != "revisit":
blob["mime"] = record["content_type"] or "-"
else:
blob['mime'] = 'warc/revisit'
blob["mime"] = "warc/revisit"
# b'org,archive)/ 20160427215530 {"url": "https://archive.org/", "mime": "text/html", "status": "200", "digest": "VILUFXZD232SLUA6XROZQIMEVUPW6EIE", "length": "16001", "offset": "90144", "filename": "ARCHIVEIT-261-ONE_TIME-JOB209607-20160427215508135-00000.warc.gz"}'
cdx_line = "{} {:%Y%m%d%H%M%S} {}".format(
record["canon_surt"], record["timestamp"], json.dumps(blob)
)
yield cdx_line.encode("utf-8")
def _query_rethinkdb(self, cdx_query):
start_key = cdx_query.key.decode('utf-8')
end_key = cdx_query.end_key.decode('utf-8')
start_key = cdx_query.key.decode("utf-8")
end_key = cdx_query.end_key.decode("utf-8")
reql = self.rr.table(self.table).between(
[start_key[:150], r.minval],
[end_key[:150], r.maxval],
index="abbr_canon_surt_timestamp",
right_bound="closed",
)
reql = reql.order_by(index="abbr_canon_surt_timestamp")
# TODO support for POST, etc
# http_method='WARCPROX_WRITE_RECORD' for screenshots, thumbnails
reql = reql.filter(
lambda capture: r.expr(["WARCPROX_WRITE_RECORD", "GET"]).contains(
capture["http_method"]
)
)
reql = reql.filter(
lambda capture: (capture['canon_surt'] >= start_key)
& (capture['canon_surt'] < end_key))
lambda capture: (capture["canon_surt"] >= start_key)
& (capture["canon_surt"] < end_key)
)
if cdx_query.limit:
reql = reql.limit(cdx_query.limit)
logging.debug('rethinkdb query: %s', reql)
logging.debug("rethinkdb query: %s", reql)
results = reql.run()
return results
class TheGoodUrlCanonicalizer(object):
'''
"""
Replacement for pywb.utils.canonicalize.UrlCanonicalizer that produces
surts with scheme and with trailing comma, and does not "massage"
www.foo.org into foo.org.
'''
"""
def __init__(self, surt_ordered=True):
'''We are always surt ordered (surt_ordered param is ignored)'''
"""We are always surt ordered (surt_ordered param is ignored)"""
self.surt_ordered = True
def __call__(self, url):
try:
key = urlcanon.semantic(url).surt().decode('ascii')
key = urlcanon.semantic(url).surt().decode("ascii")
# logging.debug('%s -> %s', url, key)
return key
except Exception as e:
return url
def replace_default_canonicalizer():
'''Replace parent class of CustomUrlCanonicalizer with this class.'''
"""Replace parent class of CustomUrlCanonicalizer with this class."""
pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer.__bases__ = (
TheGoodUrlCanonicalizer,)
TheGoodUrlCanonicalizer,
)
def good_surts_from_default(default_surt):
'''
"""
Takes a standard surt without scheme and without trailing comma, and
returns a list of "good" surts that together match the same set of
urls. For example:
@ -144,59 +156,64 @@ class TheGoodUrlCanonicalizer(object):
'http://(com,example,www,)/path',
'https://(com,example,www,)/path']
'''
if default_surt == '':
return ['']
"""
if default_surt == "":
return [""]
parts = default_surt.split(')', 1)
parts = default_surt.split(")", 1)
if len(parts) == 2:
orig_host_part, path_part = parts
good_surts = [
"http://(%s,)%s" % (orig_host_part, path_part),
"https://(%s,)%s" % (orig_host_part, path_part),
"http://(%s,www,)%s" % (orig_host_part, path_part),
"https://(%s,www,)%s" % (orig_host_part, path_part),
]
else: # no path part
else: # no path part
host_part = parts[0]
good_surts = [
'http://(%s' % host_part,
'https://(%s' % host_part,
"http://(%s" % host_part,
"https://(%s" % host_part,
]
return good_surts
def monkey_patch_dsrules_init():
orig_init = pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__
def cdx_dsrule_init(self, url_prefix, rules):
good_surts = []
url_prefixes = [url_prefix] if isinstance(
url_prefix, str) else url_prefix
url_prefixes = [url_prefix] if isinstance(url_prefix, str) else url_prefix
for bad_surt in url_prefixes:
good_surts.extend(
TheGoodUrlCanonicalizer.good_surts_from_default(bad_surt)
)
if "match" in rules and "regex" in rules["match"]:
rules["match"]["regex"] = r"https?://\(" + rules["match"]["regex"]
orig_init(self, good_surts, rules)
pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__ = cdx_dsrule_init
def support_in_progress_warcs():
'''
"""
Monkey-patch pywb.warc.pathresolvers.PrefixResolver to include warcs still
being written to (warcs having ".open" suffix). This way if a cdx entry
references foo.warc.gz, pywb will try both foo.warc.gz and
foo.warc.gz.open.
'''
"""
_orig_prefix_resolver_call = pywb.warc.pathresolvers.PrefixResolver.__call__
def _prefix_resolver_call(self, filename, cdx=None):
raw_results = _orig_prefix_resolver_call(self, filename, cdx)
results = []
for warc_path in raw_results:
results.append(warc_path)
results.append('%s.open' % warc_path)
results.append("%s.open" % warc_path)
return results
pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call
class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
def __init__(self, orig_url):
import re
@ -211,14 +228,14 @@ class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
pywb.rewrite.wburl.BaseWbUrl.__init__(self)
if six.PY2 and isinstance(orig_url, six.text_type):
orig_url = orig_url.encode('utf-8')
orig_url = orig_url.encode("utf-8")
orig_url = quote(orig_url)
self._original_url = orig_url
if not self._init_query(orig_url):
if not self._init_replay(orig_url):
raise Exception('Invalid WbUrl: ', orig_url)
raise Exception("Invalid WbUrl: ", orig_url)
new_uri = WbUrl.to_uri(self.url)
@ -227,21 +244,24 @@ class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
self.url = new_uri
# begin brozzler changes
if (self.url.startswith('urn:') or self.url.startswith('screenshot:')
or self.url.startswith('thumbnail:')):
if (
self.url.startswith("urn:")
or self.url.startswith("screenshot:")
or self.url.startswith("thumbnail:")
):
return
# end brozzler changes
# protocol agnostic url -> http://
# no protocol -> http://
#inx = self.url.find('://')
# inx = self.url.find('://')
inx = -1
m = self.SCHEME_RX.match(self.url)
if m:
inx = m.span(1)[0]
#if inx < 0:
# check for other partially encoded variants
# if inx < 0:
# check for other partially encoded variants
# m = self.PARTIAL_ENC_RX.match(self.url)
# if m:
# len_ = len(m.group(0))
@ -253,27 +273,31 @@ class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
self.url = self.DEFAULT_SCHEME + self.url
else:
inx += 2
if inx < len(self.url) and self.url[inx] != '/':
self.url = self.url[:inx] + '/' + self.url[inx:]
if inx < len(self.url) and self.url[inx] != "/":
self.url = self.url[:inx] + "/" + self.url[inx:]
def _get_wburl_type(self):
return SomeWbUrl
def monkey_patch_wburl():
pywb.framework.basehandlers.WbUrlHandler.get_wburl_type = _get_wburl_type
class BrozzlerWaybackCli(pywb.apps.cli.WaybackCli):
def _extend_parser(self, arg_parser):
super()._extend_parser(arg_parser)
arg_parser._actions[4].help = argparse.SUPPRESS # --autoindex
arg_parser._actions[4].help = argparse.SUPPRESS # --autoindex
arg_parser.formatter_class = argparse.RawDescriptionHelpFormatter
arg_parser.epilog = '''
arg_parser.epilog = """
Run pywb like so:
$ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback
See README.rst for more information.
'''
"""
# copied and pasted from cdxdomainspecific.py, only changes are commented as
# such below
@ -284,7 +308,7 @@ def _fuzzy_query_call(self, query):
matched_rule = None
urlkey = to_native_str(query.key, 'utf-8')
urlkey = to_native_str(query.key, "utf-8")
url = query.url
filter_ = query.filters
output = query.output
@ -306,42 +330,42 @@ def _fuzzy_query_call(self, query):
if not matched_rule:
return None
repl = '?'
repl = "?"
if matched_rule.replace:
repl = matched_rule.replace
inx = url.find(repl)
if inx > 0:
url = url[:inx + len(repl)]
url = url[: inx + len(repl)]
# begin brozzler changes
if matched_rule.match_type == 'domain':
if matched_rule.match_type == "domain":
orig_split_url = urlsplit(url)
# remove the subdomain, path, query and fragment
host = orig_split_url.netloc.split('.', 1)[1]
new_split_url = (orig_split_url.scheme, host, '', '', '')
host = orig_split_url.netloc.split(".", 1)[1]
new_split_url = (orig_split_url.scheme, host, "", "", "")
url = urlunsplit(new_split_url)
# end brozzler changes
params = query.params
params.update({"url": url, "matchType": matched_rule.match_type, "filter": filter_})
if 'reverse' in params:
del params['reverse']
if "reverse" in params:
del params["reverse"]
if 'closest' in params:
del params['closest']
if "closest" in params:
del params["closest"]
if 'end_key' in params:
del params['end_key']
if "end_key" in params:
del params["end_key"]
return params
def monkey_patch_fuzzy_query():
pywb.cdx.cdxdomainspecific.FuzzyQuery.__call__ = _fuzzy_query_call
# copied and pasted from pywb/utils/canonicalize.py, only changes are commented
# as such
def _calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
@ -361,54 +385,56 @@ def _calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
start_key = url_canon(url)
if match_type == 'exact':
end_key = start_key + '!'
if match_type == "exact":
end_key = start_key + "!"
elif match_type == 'prefix':
elif match_type == "prefix":
# add trailing slash if url has it
if url.endswith('/') and not start_key.endswith('/'):
start_key += '/'
if url.endswith("/") and not start_key.endswith("/"):
start_key += "/"
end_key = inc_last_char(start_key)
elif match_type == 'host':
elif match_type == "host":
if surt_ordered:
host = start_key.split(')/')[0]
host = start_key.split(")/")[0]
start_key = host + ')/'
end_key = host + '*'
start_key = host + ")/"
end_key = host + "*"
else:
host = urlparse.urlsplit(url).netloc
start_key = host + '/'
end_key = host + '0'
start_key = host + "/"
end_key = host + "0"
elif match_type == 'domain':
elif match_type == "domain":
if not surt_ordered:
msg = 'matchType=domain unsupported for non-surt'
msg = "matchType=domain unsupported for non-surt"
raise UrlCanonicalizeException(msg)
host = start_key.split(')/')[0]
host = start_key.split(")/")[0]
# if tld, use com, as start_key
# otherwise, stick with com,example)/
if ',' not in host:
start_key = host + ','
if "," not in host:
start_key = host + ","
else:
start_key = host + ')/'
start_key = host + ")/"
# begin brozzler changes
end_key = host + '~'
end_key = host + "~"
# end brozzler changes
else:
raise UrlCanonicalizeException('Invalid match_type: ' + match_type)
raise UrlCanonicalizeException("Invalid match_type: " + match_type)
return (start_key, end_key)
def monkey_patch_calc_search_range():
pywb.utils.canonicalize.calc_search_range = _calc_search_range
pywb.cdx.query.calc_search_range = _calc_search_range
def main(argv=sys.argv):
brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init()
@ -417,7 +443,10 @@ def main(argv=sys.argv):
brozzler.pywb.monkey_patch_fuzzy_query()
brozzler.pywb.monkey_patch_calc_search_range()
wayback_cli = BrozzlerWaybackCli(
args=argv[1:],
default_port=8880,
desc=(
"brozzler-wayback - pywb wayback (monkey-patched for use " "with brozzler)"
),
)
wayback_cli.run()

@ -1,4 +1,4 @@
'''
"""
brozzler/robots.py - robots.txt support
Uses the reppy library version 0.3.4. Monkey-patches reppy to support substring
@ -20,7 +20,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import json
import logging
@ -34,48 +34,60 @@ __all__ = ["is_permitted_by_robots"]
# monkey-patch reppy to do substring user-agent matching, see top of file
reppy.Utility.short_user_agent = lambda strng: strng
def _reppy_rules_getitem(self, agent):
'''
"""
Find the user-agent token matching the supplied full user-agent, using
a case-insensitive substring search.
'''
"""
lc_agent = agent.lower()
for s in self.agents:
if s in lc_agent:
return self.agents[s]
return self.agents.get('*')
return self.agents.get("*")
reppy.parser.Rules.__getitem__ = _reppy_rules_getitem
class _SessionRaiseOn420(requests.Session):
timeout = 60
def get(self, url, *args, **kwargs):
res = super().get(url, timeout=self.timeout, *args, **kwargs)
if res.status_code == 420 and 'warcprox-meta' in res.headers:
if res.status_code == 420 and "warcprox-meta" in res.headers:
raise brozzler.ReachedLimit(
warcprox_meta=json.loads(res.headers['warcprox-meta']),
http_payload=res.text)
warcprox_meta=json.loads(res.headers["warcprox-meta"]),
http_payload=res.text,
)
else:
return res
_robots_caches = {} # {site_id:reppy.cache.RobotsCache}
def _robots_cache(site, proxy=None):
if not site.id in _robots_caches:
req_sesh = _SessionRaiseOn420()
req_sesh.verify = False # ignore cert errors
req_sesh.verify = False # ignore cert errors
if proxy:
proxie = "http://%s" % proxy
req_sesh.proxies = {"http":proxie,"https":proxie}
req_sesh.proxies = {"http": proxie, "https": proxie}
if site.extra_headers():
req_sesh.headers.update(site.extra_headers())
if site.user_agent:
req_sesh.headers['User-Agent'] = site.user_agent
req_sesh.headers["User-Agent"] = site.user_agent
_robots_caches[site.id] = reppy.cache.RobotsCache(
session=req_sesh, disallow_forbidden=False)
session=req_sesh, disallow_forbidden=False
)
return _robots_caches[site.id]
def is_permitted_by_robots(site, url, proxy=None):
'''
"""
Checks if `url` is permitted by robots.txt.
Treats any kind of error fetching robots.txt as "allow all". See
@ -89,25 +101,28 @@ def is_permitted_by_robots(site, url, proxy=None):
Raises:
brozzler.ReachedLimit: if warcprox responded with 420 Reached Limit
requests.exceptions.ProxyError: if the proxy is down
'''
"""
if site.ignore_robots:
return True
try:
result = _robots_cache(site, proxy).allowed(
url, site.user_agent or "brozzler")
result = _robots_cache(site, proxy).allowed(url, site.user_agent or "brozzler")
return result
except Exception as e:
if isinstance(e, reppy.exceptions.ServerError) and isinstance(
e.args[0], brozzler.ReachedLimit):
e.args[0], brozzler.ReachedLimit
):
raise e.args[0]
elif hasattr(e, 'args') and isinstance(
e.args[0], requests.exceptions.ProxyError):
elif hasattr(e, "args") and isinstance(
e.args[0], requests.exceptions.ProxyError
):
# reppy has wrapped an exception that we want to bubble up
raise brozzler.ProxyError(e)
else:
logging.warning(
"returning true (permitted) after problem fetching "
"robots.txt for %r: %r", url, e)
"returning true (permitted) after problem fetching "
"robots.txt for %r: %r",
url,
e,
)
return True
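# Usage sketch mirroring brozzle_site() in worker.py; `site`, `page`, and
# `proxy` are assumed to already exist:
#
#   if page.needs_robots_check and not brozzler.is_permitted_by_robots(
#       site, page.url, proxy
#   ):
#       page.blocked_by_robots = True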

@ -1,4 +1,4 @@
'''
"""
brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning
it runs yt-dlp on them, browses them and runs behaviors if appropriate,
scopes and adds outlinks to the frontier
@ -16,7 +16,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import logging
import brozzler
@ -39,6 +39,7 @@ from . import ydl
r = rdb.RethinkDB()
class BrozzlerWorker:
logger = logging.getLogger(__module__ + "." + __qualname__)
@ -50,13 +51,26 @@ class BrozzlerWorker:
SITE_SESSION_MINUTES = 15
def __init__(
self,
frontier,
service_registry=None,
max_browsers=1,
chrome_exe="chromium-browser",
warcprox_auto=False,
proxy=None,
skip_extract_outlinks=False,
skip_visit_hashtags=False,
skip_youtube_dl=False,
simpler404=False,
screenshot_full_page=False,
page_timeout=300,
behavior_timeout=900,
extract_outlinks_timeout=60,
download_throughput=-1,
stealth=False,
window_height=900,
window_width=1400,
):
self._frontier = frontier
self._service_registry = service_registry
self._max_browsers = max_browsers
@ -79,7 +93,8 @@ class BrozzlerWorker:
self._stealth = stealth
self._browser_pool = brozzler.browser.BrowserPool(
max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True)
max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True
)
self._browsing_threads = set()
self._browsing_threads_lock = threading.Lock()
@ -88,24 +103,32 @@ class BrozzlerWorker:
self._shutdown = threading.Event()
def _choose_warcprox(self):
warcproxes = self._service_registry.available_services('warcprox')
warcproxes = self._service_registry.available_services("warcprox")
if not warcproxes:
return None
# .group('proxy').count() makes this query about 99% more efficient
reql = (
self._frontier.rr.table("sites")
.between(
["ACTIVE", r.minval],
["ACTIVE", r.maxval],
index="sites_last_disclaimed",
)
.group("proxy")
.count()
)
# returns results like
# {
# "wbgrp-svc030.us.archive.org:8000": 148,
# "wbgrp-svc030.us.archive.org:8001": 145
# }
proxy_scoreboard = dict(reql.run())
for warcprox in warcproxes:
address = "%s:%s" % (warcprox["host"], warcprox["port"])
warcprox["assigned_sites"] = proxy_scoreboard.get(address, 0)
warcproxes.sort(
key=lambda warcprox: (warcprox["assigned_sites"], warcprox["load"])
)
# XXX make this heuristic more advanced?
return warcproxes[0]
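# Worked example of the selection above with hypothetical registry entries:
# given (assigned_sites, load) values of (3, 0.5), (2, 0.9), and (2, 0.4),
# the tuple sort key orders them (2, 0.4), (2, 0.9), (3, 0.5), so the
# warcprox with the fewest assigned sites (ties broken by load) is returned.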
@ -118,13 +141,15 @@ class BrozzlerWorker:
svc = self._choose_warcprox()
if svc is None:
raise brozzler.ProxyError(
"no available instances of warcprox in the service " "registry"
)
site.proxy = "%s:%s" % (svc["host"], svc["port"])
site.save()
self.logger.info(
'chose warcprox instance %r from service registry for %r',
site.proxy, site)
"chose warcprox instance %r from service registry for %r",
site.proxy,
site,
)
return site.proxy
return None
@ -132,14 +157,16 @@ class BrozzlerWorker:
if self._proxy:
if self._proxy_is_warcprox is None:
try:
response = requests.get('http://%s/status' % self._proxy)
response = requests.get("http://%s/status" % self._proxy)
status = json.loads(response.text)
self._proxy_is_warcprox = (status['role'] == 'warcprox')
self._proxy_is_warcprox = status["role"] == "warcprox"
except Exception as e:
self._proxy_is_warcprox = False
logging.info(
'%s %s warcprox', self._proxy,
'IS' if self._proxy_is_warcprox else 'IS NOT')
"%s %s warcprox",
self._proxy,
"IS" if self._proxy_is_warcprox else "IS NOT",
)
return self._proxy_is_warcprox
else:
# I should have commented when I originally wrote this code, but I
@ -148,13 +175,20 @@ class BrozzlerWorker:
return bool(site.proxy or self._warcprox_auto)
def _warcprox_write_record(
self,
warcprox_address,
url,
warc_type,
content_type,
payload,
extra_headers=None,
):
headers = {"Content-Type": content_type, "WARC-Type": warc_type, "Host": "N/A"}
if extra_headers:
headers.update(extra_headers)
request = urllib.request.Request(url, method="WARCPROX_WRITE_RECORD",
headers=headers, data=payload)
request = urllib.request.Request(
url, method="WARCPROX_WRITE_RECORD", headers=headers, data=payload
)
# XXX setting request.type="http" is a hack to stop urllib from trying
# to tunnel if url is https
@ -165,26 +199,31 @@ class BrozzlerWorker:
with urllib.request.urlopen(request, timeout=600) as response:
if response.getcode() != 204:
self.logger.warning(
'got "%s %s" response on warcprox '
'WARCPROX_WRITE_RECORD request (expected 204)',
response.getcode(), response.reason)
'got "%s %s" response on warcprox '
"WARCPROX_WRITE_RECORD request (expected 204)",
response.getcode(),
response.reason,
)
return request, response
except urllib.error.HTTPError as e:
self.logger.warning(
'got "%s %s" response on warcprox '
'WARCPROX_WRITE_RECORD request (expected 204)',
e.getcode(), e.info())
'got "%s %s" response on warcprox '
"WARCPROX_WRITE_RECORD request (expected 204)",
e.getcode(),
e.info(),
)
return request, None
except urllib.error.URLError as e:
raise brozzler.ProxyError(
'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
"proxy error on WARCPROX_WRITE_RECORD %s" % url
) from e
except ConnectionError as e:
raise brozzler.ProxyError(
'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
"proxy error on WARCPROX_WRITE_RECORD %s" % url
) from e
def thumb_jpeg(self, full_jpeg):
"""Create JPEG thumbnail.
"""
"""Create JPEG thumbnail."""
img = PIL.Image.open(io.BytesIO(full_jpeg))
thumb_width = 300
thumb_height = (thumb_width / img.size[0]) * img.size[1]
@ -193,8 +232,15 @@ class BrozzlerWorker:
img.save(out, "jpeg", quality=95)
return out.getbuffer()
def brozzle_page(
self,
browser,
site,
page,
on_screenshot=None,
on_request=None,
enable_youtube_dl=True,
):
self.logger.info("brozzling {}".format(page))
ydl_fetches = None
outlinks = set()
@ -208,31 +254,38 @@ class BrozzlerWorker:
except brozzler.ProxyError:
raise
except Exception as e:
if (
hasattr(e, "exc_info")
and len(e.exc_info) >= 2
and hasattr(e.exc_info[1], "code")
and e.exc_info[1].code == 430
):
self.logger.info(
'youtube-dl got %s %s processing %s',
e.exc_info[1].code, e.exc_info[1].msg, page.url)
"youtube-dl got %s %s processing %s",
e.exc_info[1].code,
e.exc_info[1].msg,
page.url,
)
else:
self.logger.error(
'youtube_dl raised exception on %s', page,
exc_info=True)
"youtube_dl raised exception on %s", page, exc_info=True
)
if self._needs_browsing(page, ydl_fetches):
self.logger.info('needs browsing: %s', page)
self.logger.info("needs browsing: %s", page)
try:
browser_outlinks = self._browse_page(
browser, site, page, on_screenshot, on_request)
browser, site, page, on_screenshot, on_request
)
outlinks.update(browser_outlinks)
except brozzler.PageInterstitialShown:
self.logger.info('page interstitial shown (http auth): %s', page)
self.logger.info("page interstitial shown (http auth): %s", page)
else:
if not self._already_fetched(page, ydl_fetches):
self.logger.info('needs fetch: %s', page)
self.logger.info("needs fetch: %s", page)
self._fetch_url(site, page=page)
else:
self.logger.info('already fetched: %s', page)
self.logger.info("already fetched: %s", page)
return outlinks
@ -242,85 +295,103 @@ class BrozzlerWorker:
on_screenshot(screenshot_jpeg)
if self._using_warcprox(site):
self.logger.info(
"sending WARCPROX_WRITE_RECORD request to %s with "
"screenshot for %s", self._proxy_for(site), page)
"sending WARCPROX_WRITE_RECORD request to %s with "
"screenshot for %s",
self._proxy_for(site),
page,
)
thumbnail_jpeg = self.thumb_jpeg(screenshot_jpeg)
self._warcprox_write_record(
warcprox_address=self._proxy_for(site),
url="screenshot:%s" % str(urlcanon.semantic(page.url)),
warc_type="resource",
content_type="image/jpeg",
payload=screenshot_jpeg,
extra_headers=site.extra_headers(page),
)
self._warcprox_write_record(
warcprox_address=self._proxy_for(site),
url="thumbnail:%s" % str(urlcanon.semantic(page.url)),
warc_type="resource",
content_type="image/jpeg",
payload=thumbnail_jpeg,
extra_headers=site.extra_headers(page),
)
def _on_response(chrome_msg):
if (
"params" in chrome_msg
and "response" in chrome_msg["params"]
and "mimeType" in chrome_msg["params"]["response"]
and chrome_msg["params"]["response"]
.get("mimeType", "")
.startswith("video/")
# skip manifests of DASH segmented video -
# see https://github.com/internetarchive/brozzler/pull/70
and chrome_msg["params"]["response"]["mimeType"]
!= "video/vnd.mpeg.dash.mpd"
and chrome_msg["params"]["response"].get("status") in (200, 206)
):
video = {
"blame": "browser",
"url": chrome_msg["params"]["response"].get("url"),
"response_code": chrome_msg["params"]["response"]["status"],
"content-type": chrome_msg["params"]["response"]["mimeType"],
}
response_headers = CaseInsensitiveDict(
chrome_msg["params"]["response"]["headers"]
)
if "content-length" in response_headers:
video["content-length"] = int(response_headers["content-length"])
if "content-range" in response_headers:
video["content-range"] = response_headers["content-range"]
logging.debug("embedded video %s", video)
if not "videos" in page:
page.videos = []
page.videos.append(video)
sw_fetched = set()
def _on_service_worker_version_updated(chrome_msg):
# https://github.com/internetarchive/brozzler/issues/140
self.logger.trace("%r", chrome_msg)
if chrome_msg.get("params", {}).get("versions"):
url = chrome_msg.get("params", {}).get("versions")[0].get("scriptURL")
if url and url not in sw_fetched:
self.logger.info('fetching service worker script %s', url)
self.logger.info("fetching service worker script %s", url)
self._fetch_url(site, url=url)
sw_fetched.add(url)
if not browser.is_running():
browser.start(
proxy=self._proxy_for(site),
cookie_db=site.get("cookie_db"),
window_height=self._window_height,
window_width=self._window_width,
)
final_page_url, outlinks = browser.browse_page(
page.url,
extra_headers=site.extra_headers(page),
behavior_parameters=site.get("behavior_parameters"),
username=site.get("username"),
password=site.get("password"),
user_agent=site.get("user_agent"),
on_screenshot=_on_screenshot,
on_response=_on_response,
on_request=on_request,
on_service_worker_version_updated=_on_service_worker_version_updated,
hashtags=page.hashtags,
skip_extract_outlinks=self._skip_extract_outlinks,
skip_visit_hashtags=self._skip_visit_hashtags,
skip_youtube_dl=self._skip_youtube_dl,
simpler404=self._simpler404,
screenshot_full_page=self._screenshot_full_page,
page_timeout=self._page_timeout,
behavior_timeout=self._behavior_timeout,
extract_outlinks_timeout=self._extract_outlinks_timeout,
download_throughput=self._download_throughput,
stealth=self._stealth,
)
if final_page_url != page.url:
page.note_redirect(final_page_url)
return outlinks
@ -328,22 +399,21 @@ class BrozzlerWorker:
def _fetch_url(self, site, url=None, page=None):
proxies = None
if page:
url=page.url
url = page.url
if self._proxy_for(site):
proxies = {
'http': 'http://%s' % self._proxy_for(site),
'https': 'http://%s' % self._proxy_for(site),
"http": "http://%s" % self._proxy_for(site),
"https": "http://%s" % self._proxy_for(site),
}
self.logger.info('fetching %s', url)
self.logger.info("fetching %s", url)
try:
# response is ignored
requests.get(
url, proxies=proxies, headers=site.extra_headers(page),
verify=False)
url, proxies=proxies, headers=site.extra_headers(page), verify=False
)
except requests.exceptions.ProxyError as e:
raise brozzler.ProxyError(
'proxy error fetching %s' % url) from e
raise brozzler.ProxyError("proxy error fetching %s" % url) from e
def _needs_browsing(self, page, ydl_fetches):
if ydl_fetches:
@ -351,8 +421,10 @@ class BrozzlerWorker:
if not final_bounces:
return True
for txn in final_bounces:
if txn['response_headers'].get_content_type() in [
'text/html', 'application/xhtml+xml']:
if txn["response_headers"].get_content_type() in [
"text/html",
"application/xhtml+xml",
]:
return True
return False
else:
@ -361,14 +433,13 @@ class BrozzlerWorker:
def _already_fetched(self, page, ydl_fetches):
if ydl_fetches:
for fetch in ydl.final_bounces(ydl_fetches, page.url):
if (fetch['method'] == 'GET' and fetch['response_code'] == 200):
if fetch["method"] == "GET" and fetch["response_code"] == 200:
return True
return False
def brozzle_site(self, browser, site):
try:
site.last_claimed_by = '%s:%s' % (
socket.gethostname(), browser.chrome.port)
site.last_claimed_by = "%s:%s" % (socket.gethostname(), browser.chrome.port)
site.save()
start = time.time()
page = None
@ -377,28 +448,28 @@ class BrozzlerWorker:
# _proxy_for() call in log statement can raise brozzler.ProxyError
# which is why we honor time limit and stop request first☝🏻
self.logger.info(
"brozzling site (proxy=%r) %s",
self._proxy_for(site), site)
"brozzling site (proxy=%r) %s", self._proxy_for(site), site
)
while time.time() - start < self.SITE_SESSION_MINUTES * 60:
site.refresh()
self._frontier.enforce_time_limit(site)
self._frontier.honor_stop_request(site)
page = self._frontier.claim_page(
site, "%s:%s" % (socket.gethostname(), browser.chrome.port)
)
if page.needs_robots_check and not brozzler.is_permitted_by_robots(
site, page.url, self._proxy_for(site)
):
logging.warning("page %s is blocked by robots.txt", page.url)
page.blocked_by_robots = True
self._frontier.completed_page(site, page)
else:
outlinks = self.brozzle_page(
browser, site, page,
enable_youtube_dl=not self._skip_youtube_dl)
browser, site, page, enable_youtube_dl=not self._skip_youtube_dl
)
self._frontier.completed_page(site, page)
self._frontier.scope_and_schedule_outlinks(
site, page, outlinks)
self._frontier.scope_and_schedule_outlinks(site, page, outlinks)
if browser.is_running():
site.cookie_db = browser.chrome.persist_and_read_cookie_db()
@ -418,31 +489,36 @@ class BrozzlerWorker:
except brozzler.ProxyError as e:
if self._warcprox_auto:
logging.error(
'proxy error (site.proxy=%s), will try to choose a '
'healthy instance next time site is brozzled: %s',
site.proxy, e)
"proxy error (site.proxy=%s), will try to choose a "
"healthy instance next time site is brozzled: %s",
site.proxy,
e,
)
site.proxy = None
else:
# using brozzler-worker --proxy, nothing to do but try the
# same proxy again next time
logging.error(
'proxy error (self._proxy=%r)', self._proxy, exc_info=1)
logging.error("proxy error (self._proxy=%r)", self._proxy, exc_info=1)
except:
self.logger.error(
'unexpected exception site=%r page=%r', site, page,
exc_info=True)
"unexpected exception site=%r page=%r", site, page, exc_info=True
)
if page:
page.failed_attempts = (page.failed_attempts or 0) + 1
if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES:
self.logger.info(
'marking page "completed" after %s unexpected '
'exceptions attempting to brozzle %s',
page.failed_attempts, page)
'marking page "completed" after %s unexpected '
"exceptions attempting to brozzle %s",
page.failed_attempts,
page,
)
self._frontier.completed_page(site, page)
page = None
finally:
if start:
site.active_brozzling_time = (site.active_brozzling_time or 0) + time.time() - start
site.active_brozzling_time = (
(site.active_brozzling_time or 0) + time.time() - start
)
self._frontier.disclaim_site(site, page)
def _brozzle_site_thread_target(self, browser, site):
@ -462,21 +538,25 @@ class BrozzlerWorker:
"role": "brozzler-worker",
"ttl": self.HEARTBEAT_INTERVAL * 3,
}
status_info["load"] = 1.0 * self._browser_pool.num_in_use() / self._browser_pool.size
status_info["load"] = (
1.0 * self._browser_pool.num_in_use() / self._browser_pool.size
)
status_info["browser_pool_size"] = self._browser_pool.size
status_info["browsers_in_use"] = self._browser_pool.num_in_use()
try:
self.status_info = self._service_registry.heartbeat(status_info)
self.logger.trace(
"status in service registry: %s", self.status_info)
self.logger.trace("status in service registry: %s", self.status_info)
except r.ReqlError as e:
self.logger.error(
"failed to send heartbeat and update service registry "
"with info %s: %s", status_info, e)
"failed to send heartbeat and update service registry "
"with info %s: %s",
status_info,
e,
)
def _service_heartbeat_if_due(self):
'''Sends service registry heartbeat if due'''
"""Sends service registry heartbeat if due"""
due = False
if self._service_registry:
if not hasattr(self, "status_info"):
@ -489,15 +569,16 @@ class BrozzlerWorker:
self._service_heartbeat()
def _start_browsing_some_sites(self):
'''
"""
Starts browsing some sites.
Raises:
NoBrowsersAvailable if none available
'''
"""
# acquire_multi() raises NoBrowsersAvailable if none available
browsers = self._browser_pool.acquire_multi(
(self._browser_pool.num_available() + 1) // 2)
(self._browser_pool.num_available() + 1) // 2
)
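# i.e. roughly half of the currently idle browsers (rounding up) are claimed
# per call, e.g. 7 idle -> (7 + 1) // 2 == 4 claimed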
try:
sites = self._frontier.claim_sites(len(browsers))
except:
@ -507,10 +588,11 @@ class BrozzlerWorker:
for i in range(len(browsers)):
if i < len(sites):
th = threading.Thread(
target=self._brozzle_site_thread_target,
args=(browsers[i], sites[i]),
name="BrozzlingThread:%s" % browsers[i].chrome.port,
daemon=True)
target=self._brozzle_site_thread_target,
args=(browsers[i], sites[i]),
name="BrozzlingThread:%s" % browsers[i].chrome.port,
daemon=True,
)
with self._browsing_threads_lock:
self._browsing_threads.add(th)
th.start()
@ -519,7 +601,8 @@ class BrozzlerWorker:
def run(self):
self.logger.notice(
'brozzler %s - brozzler-worker starting', brozzler.__version__)
"brozzler %s - brozzler-worker starting", brozzler.__version__
)
last_nothing_to_claim = 0
try:
while not self._shutdown.is_set():
@ -528,39 +611,38 @@ class BrozzlerWorker:
try:
self._start_browsing_some_sites()
except brozzler.browser.NoBrowsersAvailable:
logging.trace(
"all %s browsers are in use",
self._max_browsers)
logging.trace("all %s browsers are in use", self._max_browsers)
except brozzler.NothingToClaim:
last_nothing_to_claim = time.time()
logging.trace(
"nothing to claim, all available active sites "
"are already claimed by a brozzler worker")
"nothing to claim, all available active sites "
"are already claimed by a brozzler worker"
)
time.sleep(0.5)
self.logger.notice("shutdown requested")
except r.ReqlError as e:
self.logger.error(
"caught rethinkdb exception, will try to proceed",
exc_info=True)
"caught rethinkdb exception, will try to proceed", exc_info=True
)
except brozzler.ShutdownRequested:
self.logger.info("shutdown requested")
except:
self.logger.critical(
"thread exiting due to unexpected exception",
exc_info=True)
"thread exiting due to unexpected exception", exc_info=True
)
finally:
if self._service_registry and hasattr(self, "status_info"):
try:
self._service_registry.unregister(self.status_info["id"])
except:
self.logger.error(
"failed to unregister from service registry",
exc_info=True)
"failed to unregister from service registry", exc_info=True
)
self.logger.info(
'shutting down %s brozzling threads',
len(self._browsing_threads))
"shutting down %s brozzling threads", len(self._browsing_threads)
)
with self._browsing_threads_lock:
for th in self._browsing_threads:
if th.is_alive():
@ -575,11 +657,10 @@ class BrozzlerWorker:
with self._start_stop_lock:
if self._thread:
self.logger.warning(
'ignoring start request because self._thread is '
'not None')
"ignoring start request because self._thread is " "not None"
)
return
self._thread = threading.Thread(
target=self.run, name="BrozzlerWorker")
self._thread = threading.Thread(target=self.run, name="BrozzlerWorker")
self._thread.start()
def shutdown_now(self):
@ -590,4 +671,3 @@ class BrozzlerWorker:
def is_alive(self):
return self._thread and self._thread.is_alive()

brozzler/ydl.py

@ -1,4 +1,4 @@
'''
"""
brozzler/ydl.py - youtube-dl / yt-dlp support for brozzler
Copyright (C) 2023 Internet Archive
@ -14,7 +14,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import logging
import yt_dlp
@ -31,6 +31,7 @@ import threading
thread_local = threading.local()
class ExtraHeaderAdder(urllib.request.BaseHandler):
def __init__(self, extra_headers):
self.extra_headers = extra_headers
@ -43,6 +44,7 @@ class ExtraHeaderAdder(urllib.request.BaseHandler):
req.add_header(h, v)
return req
class YoutubeDLSpy(urllib.request.BaseHandler):
logger = logging.getLogger(__module__ + "." + __qualname__)
@ -51,10 +53,10 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
def _http_response(self, request, response):
fetch = {
'url': request.full_url,
'method': request.get_method(),
'response_code': response.code,
'response_headers': response.headers,
"url": request.full_url,
"method": request.get_method(),
"response_code": response.code,
"response_headers": response.headers,
}
self.fetches.append(fetch)
return response
@ -64,6 +66,7 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
def reset(self):
self.fetches = []
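# Illustrative sketch: YoutubeDLSpy is an ordinary urllib handler, so it can be
# exercised outside yt-dlp too. This assumes the parts of the class elided by
# the hunk above (notably __init__ calling reset(), and http_response /
# https_response being aliases for _http_response) match the full
# brozzler/ydl.py.
def _example_spy_fetches(url="http://example.com/"):
    spy = YoutubeDLSpy()
    opener = urllib.request.build_opener(spy)
    opener.open(url)
    # each recorded fetch is a dict like:
    # {"url": "...", "method": "GET", "response_code": 200,
    #  "response_headers": <http.client.HTTPMessage>}
    return spy.fetches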
def final_bounces(fetches, url):
"""
Resolves redirect chains in `fetches` and returns a list of fetches
@ -73,26 +76,28 @@ def final_bounces(fetches, url):
"""
redirects = {}
for fetch in fetches:
# XXX check http status 301,302,303,307? check for "uri" header
# as well as "location"? see urllib.request.HTTPRedirectHandler
if 'location' in fetch['response_headers']:
redirects[fetch['url']] = fetch
# XXX check http status 301,302,303,307? check for "uri" header
# as well as "location"? see urllib.request.HTTPRedirectHandler
if "location" in fetch["response_headers"]:
redirects[fetch["url"]] = fetch
final_url = url
while final_url in redirects:
fetch = redirects.pop(final_url)
final_url = urllib.parse.urljoin(
fetch['url'], fetch['response_headers']['location'])
fetch["url"], fetch["response_headers"]["location"]
)
final_bounces = []
for fetch in fetches:
if fetch['url'] == final_url:
if fetch["url"] == final_url:
final_bounces.append(fetch)
return final_bounces
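# Illustrative sketch: how final_bounces() collapses a redirect chain. The
# fetch dicts below are hypothetical; real response_headers are
# http.client.HTTPMessage objects, but any mapping with a "location" key
# exercises the same lookup.
def _example_final_bounces():
    fetches = [
        {
            "url": "http://example.com/a",
            "method": "GET",
            "response_code": 302,
            "response_headers": {"location": "/b"},
        },
        {
            "url": "http://example.com/b",
            "method": "GET",
            "response_code": 200,
            "response_headers": {},
        },
    ]
    # only the fetch at the end of the redirect chain is returned
    assert final_bounces(fetches, "http://example.com/a") == [fetches[1]]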
def _build_youtube_dl(worker, destdir, site, page):
'''
"""
Builds a yt-dlp `yt_dlp.YoutubeDL` for brozzling `site` with `worker`.
The `YoutubeDL` instance does a few special brozzler-specific things:
@ -109,7 +114,7 @@ def _build_youtube_dl(worker, destdir, site, page):
Returns:
a yt-dlp `yt_dlp.YoutubeDL` instance
'''
"""
class _YoutubeDL(yt_dlp.YoutubeDL):
logger = logging.getLogger(__module__ + "." + __qualname__)
@ -117,31 +122,38 @@ def _build_youtube_dl(worker, destdir, site, page):
def add_default_extra_info(self, ie_result, ie, url):
# hook in some logging
super().add_default_extra_info(ie_result, ie, url)
if ie_result.get('_type') == 'playlist':
self.logger.info(
'extractor %r found playlist in %s', ie.IE_NAME, url)
if ie.IE_NAME in {'youtube:playlist', 'youtube:tab', 'soundcloud:user', 'instagram:user'}:
if ie_result.get("_type") == "playlist":
self.logger.info("extractor %r found playlist in %s", ie.IE_NAME, url)
if ie.IE_NAME in {
"youtube:playlist",
"youtube:tab",
"soundcloud:user",
"instagram:user",
}:
# At this point ie_result['entries'] is an iterator that
# will fetch more metadata from youtube to list all the
# videos. We unroll that iterator here partly because
# otherwise `process_ie_result()` will clobber it, and we
# use it later to extract the watch pages as outlinks.
try:
ie_result['entries_no_dl'] = list(ie_result['entries'])
ie_result["entries_no_dl"] = list(ie_result["entries"])
except Exception as e:
self.logger.warning(
"failed to unroll ie_result['entries']? for %s, %s; exception %s",
ie.IE_NAME, url, e)
ie_result['entries_no_dl'] =[]
ie_result['entries'] = []
"failed to unroll ie_result['entries']? for %s, %s; exception %s",
ie.IE_NAME,
url,
e,
)
ie_result["entries_no_dl"] = []
ie_result["entries"] = []
self.logger.info(
'not downloading %s media files from this '
'playlist because we expect to capture them from '
'individual watch/track/detail pages',
len(ie_result['entries_no_dl']))
"not downloading %s media files from this "
"playlist because we expect to capture them from "
"individual watch/track/detail pages",
len(ie_result["entries_no_dl"]),
)
else:
self.logger.info(
'extractor %r found a download in %s', ie.IE_NAME, url)
self.logger.info("extractor %r found a download in %s", ie.IE_NAME, url)
def _push_video_to_warcprox(self, site, info_dict, postprocessor):
# 220211 update: does yt-dlp supply content-type? no, not as such
@ -150,73 +162,96 @@ def _build_youtube_dl(worker, destdir, site, page):
# youtube-dl produces a stitched-up video that /usr/bin/file fails
# to identify (says "application/octet-stream"). `ffprobe` doesn't
# give us a mimetype.
if info_dict.get('ext') == 'mp4':
mimetype = 'video/mp4'
if info_dict.get("ext") == "mp4":
mimetype = "video/mp4"
else:
try:
import magic
mimetype = magic.from_file(info_dict['filepath'], mime=True)
mimetype = magic.from_file(info_dict["filepath"], mime=True)
except ImportError as e:
mimetype = 'video/%s' % info_dict['ext']
self.logger.warning(
'guessing mimetype %s because %r', mimetype, e)
mimetype = "video/%s" % info_dict["ext"]
self.logger.warning("guessing mimetype %s because %r", mimetype, e)
# youtube watch page postprocessor is MoveFiles
if postprocessor == 'FixupM3u8' or postprocessor == 'Merger':
url = 'youtube-dl:%05d:%s' % (
info_dict.get('playlist_index') or 1,
info_dict['webpage_url'])
if postprocessor == "FixupM3u8" or postprocessor == "Merger":
url = "youtube-dl:%05d:%s" % (
info_dict.get("playlist_index") or 1,
info_dict["webpage_url"],
)
else:
url = info_dict.get('url', '')
url = info_dict.get("url", "")
# skip urls ending .m3u8, to avoid duplicates handled by FixupM3u8
if url.endswith('.m3u8') or url == '':
if url.endswith(".m3u8") or url == "":
return
size = os.path.getsize(info_dict['filepath'])
size = os.path.getsize(info_dict["filepath"])
self.logger.info(
'pushing %r video as %s (%s bytes) to '
'warcprox at %s with url %s', info_dict['format'],
mimetype, size, worker._proxy_for(site), url)
with open(info_dict['filepath'], 'rb') as f:
"pushing %r video as %s (%s bytes) to " "warcprox at %s with url %s",
info_dict["format"],
mimetype,
size,
worker._proxy_for(site),
url,
)
with open(info_dict["filepath"], "rb") as f:
# include content-length header to avoid chunked
# transfer, which warcprox currently rejects
extra_headers = dict(site.extra_headers())
extra_headers['content-length'] = size
extra_headers["content-length"] = size
request, response = worker._warcprox_write_record(
warcprox_address=worker._proxy_for(site), url=url,
warc_type='resource', content_type=mimetype, payload=f,
extra_headers=extra_headers)
warcprox_address=worker._proxy_for(site),
url=url,
warc_type="resource",
content_type=mimetype,
payload=f,
extra_headers=extra_headers,
)
# consulted by _remember_videos()
ydl.pushed_videos.append({
'url': url,
'response_code': response.code,
'content-type': mimetype,
'content-length': size,
})
ydl.pushed_videos.append(
{
"url": url,
"response_code": response.code,
"content-type": mimetype,
"content-length": size,
}
)
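# Note on the record pushed above: it is written as a WARC "resource" record
# whose synthetic url is either the direct media url or, for stitched-up
# downloads handled by the FixupM3u8/Merger postprocessors,
# "youtube-dl:00001:<watch page url>" (playlist index zero-padded to five
# digits), so each video captured from a playlist/watch page gets a distinct
# record url.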
def maybe_heartbeat_site_last_claimed(*args, **kwargs):
# in case yt-dlp takes a long time, heartbeat site.last_claimed
# to prevent another brozzler-worker from claiming the site
try:
if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=worker.SITE_SESSION_MINUTES):
if (
site.rr
and doublethink.utcnow() - site.last_claimed
> datetime.timedelta(minutes=worker.SITE_SESSION_MINUTES)
):
worker.logger.debug(
'heartbeating site.last_claimed to prevent another '
'brozzler-worker claiming this site id=%r', site.id)
"heartbeating site.last_claimed to prevent another "
"brozzler-worker claiming this site id=%r",
site.id,
)
site.last_claimed = doublethink.utcnow()
site.save()
except:
worker.logger.debug(
'problem heartbeating site.last_claimed site id=%r',
site.id, exc_info=True)
"problem heartbeating site.last_claimed site id=%r",
site.id,
exc_info=True,
)
def ydl_postprocess_hook(d):
if d['status'] == 'finished':
worker.logger.info('[ydl_postprocess_hook] Finished postprocessing')
worker.logger.info('[ydl_postprocess_hook] postprocessor: {}'.format(d['postprocessor']))
if d["status"] == "finished":
worker.logger.info("[ydl_postprocess_hook] Finished postprocessing")
worker.logger.info(
"[ydl_postprocess_hook] postprocessor: {}".format(d["postprocessor"])
)
if worker._using_warcprox(site):
_YoutubeDL._push_video_to_warcprox(_YoutubeDL, site, d['info_dict'], d['postprocessor'])
_YoutubeDL._push_video_to_warcprox(
_YoutubeDL, site, d["info_dict"], d["postprocessor"]
)
# default socket_timeout is 20 -- we hit it often when cluster is busy
ydl_opts = {
@ -230,7 +265,6 @@ def _build_youtube_dl(worker, destdir, site, page):
"socket_timeout": 40,
"progress_hooks": [maybe_heartbeat_site_last_claimed],
"postprocessor_hooks": [ydl_postprocess_hook],
# https://github.com/yt-dlp/yt-dlp#format-selection
# "By default, yt-dlp tries to download the best available quality..."
# pre-v.2023.07.06: "format_sort": ["ext"],
@ -238,16 +272,13 @@ def _build_youtube_dl(worker, destdir, site, page):
# recommended: convert working cli to api call with
# https://github.com/yt-dlp/yt-dlp/blob/master/devscripts/cli_to_api.py
"format": "b/bv+ba",
"format_sort": ["res:720","vcodec:h264","acodec:aac"],
"format_sort": ["res:720", "vcodec:h264", "acodec:aac"],
# skip live streams
"match_filter": match_filter_func("!is_live"),
"extractor_args": {'youtube': {'skip': ['dash', 'hls']}},
"extractor_args": {"youtube": {"skip": ["dash", "hls"]}},
# --cache-dir local or..
# this looked like a problem with nfs-mounted homedir, shouldn't be a problem for brozzler on focal?
"cache_dir": "/home/archiveit",
"logger": logging.getLogger("yt_dlp"),
"verbose": False,
"quiet": False,
@ -265,49 +296,53 @@ def _build_youtube_dl(worker, destdir, site, page):
ydl._opener.add_handler(ydl.fetch_spy)
return ydl
def _remember_videos(page, fetches, pushed_videos=None):
'''
"""
Saves info about videos captured by yt-dlp in `page.videos`.
'''
if not 'videos' in page:
"""
if not "videos" in page:
page.videos = []
for fetch in fetches or []:
content_type = fetch['response_headers'].get_content_type()
if (content_type.startswith('video/')
# skip manifests of DASH segmented video -
# see https://github.com/internetarchive/brozzler/pull/70
and content_type != 'video/vnd.mpeg.dash.mpd'
and fetch['method'] == 'GET'
and fetch['response_code'] in (200, 206)):
content_type = fetch["response_headers"].get_content_type()
if (
content_type.startswith("video/")
# skip manifests of DASH segmented video -
# see https://github.com/internetarchive/brozzler/pull/70
and content_type != "video/vnd.mpeg.dash.mpd"
and fetch["method"] == "GET"
and fetch["response_code"] in (200, 206)
):
video = {
'blame': 'youtube-dl',
'url': fetch['url'],
'response_code': fetch['response_code'],
'content-type': content_type,
"blame": "youtube-dl",
"url": fetch["url"],
"response_code": fetch["response_code"],
"content-type": content_type,
}
if 'content-length' in fetch['response_headers']:
video['content-length'] = int(
fetch['response_headers']['content-length'])
if 'content-range' in fetch['response_headers']:
if "content-length" in fetch["response_headers"]:
video["content-length"] = int(
fetch["response_headers"]["content-length"]
)
if "content-range" in fetch["response_headers"]:
# skip chunked youtube video
if 'googlevideo.com/videoplayback' in fetch['url']:
if "googlevideo.com/videoplayback" in fetch["url"]:
continue
video['content-range'] = fetch[
'response_headers']['content-range']
logging.debug('embedded video %s', video)
video["content-range"] = fetch["response_headers"]["content-range"]
logging.debug("embedded video %s", video)
page.videos.append(video)
for pushed_video in pushed_videos or []:
if pushed_video['content-type'].startswith('video/'):
if pushed_video["content-type"].startswith("video/"):
video = {
'blame': 'youtube-dl',
'url': pushed_video['url'],
'response_code': pushed_video['response_code'],
'content-type': pushed_video['content-type'],
'content-length': pushed_video['content-length'],
"blame": "youtube-dl",
"url": pushed_video["url"],
"response_code": pushed_video["response_code"],
"content-type": pushed_video["content-type"],
"content-length": pushed_video["content-length"],
}
logging.debug('embedded video %s', video)
logging.debug("embedded video %s", video)
page.videos.append(video)
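# Illustrative sketch: after _remember_videos() runs, each page.videos entry is
# a plain dict. A hypothetical captured video fetch ends up looking like:
#
#     {
#         "blame": "youtube-dl",
#         "url": "http://example.com/media/small.mp4",
#         "response_code": 200,
#         "content-type": "video/mp4",
#         "content-length": 383631,
#     }
#
# with "content-range" added for 206 partial responses, and videos pushed to
# warcprox recorded with the same keys.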
def _try_youtube_dl(worker, ydl, site, page):
try:
logging.info("trying yt-dlp on %s", page)
@ -317,43 +352,53 @@ def _try_youtube_dl(worker, ydl, site, page):
# no host given>" resulting in ProxyError
# needs automated test
# and yt-dlp needs sanitize_info for extract_info
ie_result = ydl.sanitize_info(ydl.extract_info(str(urlcanon.whatwg(page.url))))
ie_result = ydl.sanitize_info(
ydl.extract_info(str(urlcanon.whatwg(page.url)))
)
_remember_videos(page, ydl.fetch_spy.fetches, ydl.pushed_videos)
if worker._using_warcprox(site):
info_json = json.dumps(ie_result, sort_keys=True, indent=4)
logging.info(
"sending WARCPROX_WRITE_RECORD request to warcprox "
"with yt-dlp json for %s", page)
"sending WARCPROX_WRITE_RECORD request to warcprox "
"with yt-dlp json for %s",
page,
)
worker._warcprox_write_record(
warcprox_address=worker._proxy_for(site),
url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
warc_type="metadata",
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
payload=info_json.encode("utf-8"),
extra_headers=site.extra_headers(page))
warcprox_address=worker._proxy_for(site),
url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
warc_type="metadata",
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
payload=info_json.encode("utf-8"),
extra_headers=site.extra_headers(page),
)
return ie_result
except brozzler.ShutdownRequested as e:
raise
except Exception as e:
if hasattr(e, "exc_info") and e.exc_info[0] == yt_dlp.utils.UnsupportedError:
return None
elif (hasattr(e, "exc_info")
and e.exc_info[0] == urllib.error.HTTPError
and hasattr(e.exc_info[1], "code")
and e.exc_info[1].code == 420):
elif (
hasattr(e, "exc_info")
and e.exc_info[0] == urllib.error.HTTPError
and hasattr(e.exc_info[1], "code")
and e.exc_info[1].code == 420
):
raise brozzler.ReachedLimit(e.exc_info[1])
elif (hasattr(e, 'exc_info')
and e.exc_info[0] == urllib.error.URLError
and worker._proxy_for(site)):
elif (
hasattr(e, "exc_info")
and e.exc_info[0] == urllib.error.URLError
and worker._proxy_for(site)
):
# connection problem when using a proxy == proxy error (XXX?)
raise brozzler.ProxyError(
'yt-dlp hit apparent proxy error from '
'%s' % page.url) from e
"yt-dlp hit apparent proxy error from " "%s" % page.url
) from e
else:
raise
def do_youtube_dl(worker, site, page):
'''
"""
Runs yt-dlp configured for `worker` and `site` to download videos from
`page`.
@ -372,15 +417,19 @@ def do_youtube_dl(worker, site, page):
'response_headers': ...,
}, ...]
`list` of `str`: outlink urls
'''
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
"""
with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
ydl = _build_youtube_dl(worker, tempdir, site, page)
ie_result = _try_youtube_dl(worker, ydl, site, page)
outlinks = set()
if ie_result and (ie_result.get('extractor') == 'youtube:playlist' or
ie_result.get('extractor') == 'youtube:tab'):
if ie_result and (
ie_result.get("extractor") == "youtube:playlist"
or ie_result.get("extractor") == "youtube:tab"
):
# youtube watch pages as outlinks
outlinks = {'https://www.youtube.com/watch?v=%s' % e['id']
for e in ie_result.get('entries_no_dl', [])}
outlinks = {
"https://www.youtube.com/watch?v=%s" % e["id"]
for e in ie_result.get("entries_no_dl", [])
}
# any outlinks for other cases?
return ydl.fetch_spy.fetches, outlinks
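# Illustrative sketch of a hypothetical caller, showing the shape of the two
# return values documented above (brozzler's worker is the intended caller,
# invoking this while brozzling a page):
def _example_do_youtube_dl(worker, site, page):
    fetches, outlinks = do_youtube_dl(worker, site, page)
    media_urls = [f["url"] for f in fetches if f["response_code"] in (200, 206)]
    return media_urls, sorted(outlinks)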

setup.py

@ -1,5 +1,5 @@
#!/usr/bin/env python
'''
"""
setup.py - brozzler setup script
Copyright (C) 2014-2024 Internet Archive
@ -15,89 +15,88 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import setuptools
import os
def find_package_data(package):
pkg_data = []
depth = len(package.split('.'))
path = os.path.join(*package.split('.'))
depth = len(package.split("."))
path = os.path.join(*package.split("."))
for dirpath, dirnames, filenames in os.walk(path):
if not os.path.exists(os.path.join(dirpath, '__init__.py')):
if not os.path.exists(os.path.join(dirpath, "__init__.py")):
relpath = os.path.join(*dirpath.split(os.sep)[depth:])
pkg_data.extend(os.path.join(relpath, f) for f in filenames)
return pkg_data
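# Illustrative sketch: find_package_data("brozzler.dashboard") walks
# brozzler/dashboard/ and returns paths, relative to the package directory, for
# files under subdirectories that are not themselves python packages (no
# __init__.py), i.e. the static dashboard assets. Example output (file names
# hypothetical):
#
#     find_package_data("brozzler.dashboard")
#     # -> ["static/js/app.js", "static/css/app.css", ...]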
setuptools.setup(
name='brozzler',
version='1.5.44',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',
author_email='nlevitt@archive.org',
long_description=open('README.rst', mode='rb').read().decode('UTF-8'),
license='Apache License 2.0',
packages=['brozzler', 'brozzler.dashboard'],
package_data={
'brozzler': [
'js-templates/*.js*', 'behaviors.yaml', 'job_schema.yaml'],
'brozzler.dashboard': find_package_data('brozzler.dashboard'),
},
entry_points={
'console_scripts': [
'brozzle-page=brozzler.cli:brozzle_page',
'brozzler-new-job=brozzler.cli:brozzler_new_job',
'brozzler-new-site=brozzler.cli:brozzler_new_site',
'brozzler-worker=brozzler.cli:brozzler_worker',
'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables',
'brozzler-list-captures=brozzler.cli:brozzler_list_captures',
'brozzler-list-jobs=brozzler.cli:brozzler_list_jobs',
'brozzler-list-sites=brozzler.cli:brozzler_list_sites',
'brozzler-list-pages=brozzler.cli:brozzler_list_pages',
'brozzler-stop-crawl=brozzler.cli:brozzler_stop_crawl',
'brozzler-purge=brozzler.cli:brozzler_purge',
'brozzler-dashboard=brozzler.dashboard:main',
'brozzler-easy=brozzler.easy:main',
'brozzler-wayback=brozzler.pywb:main',
],
},
install_requires=[
'PyYAML>=5.1',
'yt_dlp<2023.11.16',
'reppy==0.3.4',
'requests>=2.21',
'websocket-client>=0.39.0,<=0.48.0',
'pillow>=5.2.0',
'urlcanon>=0.1.dev23',
'doublethink @ git+https://github.com/internetarchive/doublethink.git@Py311',
'rethinkdb<2.4.10',
'cerberus>=1.0.1',
'jinja2>=2.10',
'cryptography>=2.3',
'python-magic>=0.4.15',
name="brozzler",
version="1.5.44",
description="Distributed web crawling with browsers",
url="https://github.com/internetarchive/brozzler",
author="Noah Levitt",
author_email="nlevitt@archive.org",
long_description=open("README.rst", mode="rb").read().decode("UTF-8"),
license="Apache License 2.0",
packages=["brozzler", "brozzler.dashboard"],
package_data={
"brozzler": ["js-templates/*.js*", "behaviors.yaml", "job_schema.yaml"],
"brozzler.dashboard": find_package_data("brozzler.dashboard"),
},
entry_points={
"console_scripts": [
"brozzle-page=brozzler.cli:brozzle_page",
"brozzler-new-job=brozzler.cli:brozzler_new_job",
"brozzler-new-site=brozzler.cli:brozzler_new_site",
"brozzler-worker=brozzler.cli:brozzler_worker",
"brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables",
"brozzler-list-captures=brozzler.cli:brozzler_list_captures",
"brozzler-list-jobs=brozzler.cli:brozzler_list_jobs",
"brozzler-list-sites=brozzler.cli:brozzler_list_sites",
"brozzler-list-pages=brozzler.cli:brozzler_list_pages",
"brozzler-stop-crawl=brozzler.cli:brozzler_stop_crawl",
"brozzler-purge=brozzler.cli:brozzler_purge",
"brozzler-dashboard=brozzler.dashboard:main",
"brozzler-easy=brozzler.easy:main",
"brozzler-wayback=brozzler.pywb:main",
],
extras_require={
'dashboard': [
'flask>=1.0',
'gunicorn>=19.8.1'
],
'easy': [
'warcprox>=2.4.31',
'pywb>=0.33.2,<2',
'flask>=1.0',
'gunicorn>=19.8.1'
],
},
zip_safe=False,
classifiers=[
'Development Status :: 5 - Production/Stable',
'Environment :: Console',
'License :: OSI Approved :: Apache Software License',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Topic :: Internet :: WWW/HTTP',
'Topic :: System :: Archiving',
])
},
install_requires=[
"PyYAML>=5.1",
"yt_dlp<2023.11.16",
"reppy==0.3.4",
"requests>=2.21",
"websocket-client>=0.39.0,<=0.48.0",
"pillow>=5.2.0",
"urlcanon>=0.1.dev23",
"doublethink @ git+https://github.com/internetarchive/doublethink.git@Py311",
"rethinkdb<2.4.10",
"cerberus>=1.0.1",
"jinja2>=2.10",
"cryptography>=2.3",
"python-magic>=0.4.15",
],
extras_require={
"dashboard": ["flask>=1.0", "gunicorn>=19.8.1"],
"easy": [
"warcprox>=2.4.31",
"pywb>=0.33.2,<2",
"flask>=1.0",
"gunicorn>=19.8.1",
],
},
zip_safe=False,
classifiers=[
"Development Status :: 5 - Production/Stable",
"Environment :: Console",
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Topic :: Internet :: WWW/HTTP",
"Topic :: System :: Archiving",
],
)
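# A hedged usage note: with the extras_require above, the optional components
# install via standard pip extras, e.g.
#
#     pip install .              # core brozzler
#     pip install .[dashboard]   # adds flask/gunicorn for brozzler-dashboard
#     pip install .[easy]        # adds warcprox/pywb/flask/gunicorn for brozzler-easy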

test_brozzling.py

@ -1,5 +1,5 @@
#!/usr/bin/env python
'''
"""
test_brozzling.py - XXX explain
Copyright (C) 2016-2018 Internet Archive
@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import pytest
import brozzler
@ -34,79 +34,81 @@ args.log_level = logging.INFO
brozzler.cli.configure_logging(args)
WARCPROX_META_420 = {
'stats': {
'test_limits_bucket': {
'total': {'urls': 0, 'wire_bytes': 0},
'new': {'urls': 0, 'wire_bytes': 0},
'revisit': {'urls': 0, 'wire_bytes': 0},
'bucket': 'test_limits_bucket'
"stats": {
"test_limits_bucket": {
"total": {"urls": 0, "wire_bytes": 0},
"new": {"urls": 0, "wire_bytes": 0},
"revisit": {"urls": 0, "wire_bytes": 0},
"bucket": "test_limits_bucket",
}
},
'reached-limit': {'test_limits_bucket/total/urls': 0}
"reached-limit": {"test_limits_bucket/total/urls": 0},
}
@pytest.fixture(scope='module')
@pytest.fixture(scope="module")
def httpd(request):
class RequestHandler(http.server.SimpleHTTPRequestHandler):
def __init__(self, *args, **kwargs):
self.extensions_map['.mpd'] = 'video/vnd.mpeg.dash.mpd'
self.extensions_map[".mpd"] = "video/vnd.mpeg.dash.mpd"
http.server.SimpleHTTPRequestHandler.__init__(self, *args, **kwargs)
def do_GET(self):
if self.path == '/420':
self.send_response(420, 'Reached limit')
self.send_header('Connection', 'close')
self.send_header('Warcprox-Meta', json.dumps(WARCPROX_META_420))
payload = b'request rejected by warcprox: reached limit test_limits_bucket/total/urls=0\n'
self.send_header('Content-Type', 'text/plain;charset=utf-8')
self.send_header('Content-Length', len(payload))
if self.path == "/420":
self.send_response(420, "Reached limit")
self.send_header("Connection", "close")
self.send_header("Warcprox-Meta", json.dumps(WARCPROX_META_420))
payload = b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=0\n"
self.send_header("Content-Type", "text/plain;charset=utf-8")
self.send_header("Content-Length", len(payload))
self.end_headers()
self.wfile.write(payload)
elif self.path == '/401':
elif self.path == "/401":
self.send_response(401)
self.send_header('WWW-Authenticate', 'Basic realm=\"Test\"')
self.send_header('Content-type', 'text/html')
self.send_header("WWW-Authenticate", 'Basic realm="Test"')
self.send_header("Content-type", "text/html")
self.end_headers()
self.wfile.write(self.headers.get('Authorization', b''))
self.wfile.write(b'not authenticated')
self.wfile.write(self.headers.get("Authorization", b""))
self.wfile.write(b"not authenticated")
else:
super().do_GET()
def do_POST(self):
if self.path == '/login-action':
if self.path == "/login-action":
self.send_response(200)
payload = b'login successful\n'
self.send_header('Content-Type', 'text/plain;charset=utf-8')
self.send_header('Content-Length', len(payload))
payload = b"login successful\n"
self.send_header("Content-Type", "text/plain;charset=utf-8")
self.send_header("Content-Length", len(payload))
self.end_headers()
self.wfile.write(payload)
else:
super().do_POST()
# SimpleHTTPRequestHandler always uses CWD so we have to chdir
os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
os.chdir(os.path.join(os.path.dirname(__file__), "htdocs"))
httpd = http.server.HTTPServer(('localhost', 0), RequestHandler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
httpd = http.server.HTTPServer(("localhost", 0), RequestHandler)
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
httpd_thread.start()
def fin():
httpd.shutdown()
httpd.server_close()
httpd_thread.join()
request.addfinalizer(fin)
return httpd
def test_httpd(httpd):
'''
"""
Tests that our http server is working as expected, and that two fetches
of the same url return the same payload, proving it can be used to test
deduplication.
'''
"""
payload1 = content2 = None
url = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
url = "http://localhost:%s/site1/file1.txt" % httpd.server_port
with urllib.request.urlopen(url) as response:
assert response.status == 200
payload1 = response.read()
@ -119,123 +121,136 @@ def test_httpd(httpd):
assert payload1 == payload2
url = 'http://localhost:%s/420' % httpd.server_port
url = "http://localhost:%s/420" % httpd.server_port
with pytest.raises(urllib.error.HTTPError) as excinfo:
urllib.request.urlopen(url)
assert excinfo.value.getcode() == 420
def test_aw_snap_hes_dead_jim():
chrome_exe = brozzler.suggest_default_chrome_exe()
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
with pytest.raises(brozzler.BrowsingException):
browser.browse_page('chrome://crash')
browser.browse_page("chrome://crash")
# chromium's 401 handling changed???
@pytest.mark.xfail
def test_page_interstitial_exception(httpd):
chrome_exe = brozzler.suggest_default_chrome_exe()
url = 'http://localhost:%s/401' % httpd.server_port
url = "http://localhost:%s/401" % httpd.server_port
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
with pytest.raises(brozzler.PageInterstitialShown):
browser.browse_page(url)
def test_on_response(httpd):
response_urls = []
def on_response(msg):
response_urls.append(msg['params']['response']['url'])
response_urls.append(msg["params"]["response"]["url"])
chrome_exe = brozzler.suggest_default_chrome_exe()
url = 'http://localhost:%s/site3/page.html' % httpd.server_port
url = "http://localhost:%s/site3/page.html" % httpd.server_port
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
browser.browse_page(url, on_response=on_response)
assert response_urls[0] == 'http://localhost:%s/site3/page.html' % httpd.server_port
assert response_urls[1] == 'http://localhost:%s/site3/brozzler.svg' % httpd.server_port
assert response_urls[2] == 'http://localhost:%s/favicon.ico' % httpd.server_port
assert response_urls[0] == "http://localhost:%s/site3/page.html" % httpd.server_port
assert (
response_urls[1] == "http://localhost:%s/site3/brozzler.svg" % httpd.server_port
)
assert response_urls[2] == "http://localhost:%s/favicon.ico" % httpd.server_port
def test_420(httpd):
chrome_exe = brozzler.suggest_default_chrome_exe()
url = 'http://localhost:%s/420' % httpd.server_port
url = "http://localhost:%s/420" % httpd.server_port
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
with pytest.raises(brozzler.ReachedLimit) as excinfo:
browser.browse_page(url)
assert excinfo.value.warcprox_meta == WARCPROX_META_420
def test_js_dialogs(httpd):
chrome_exe = brozzler.suggest_default_chrome_exe()
url = 'http://localhost:%s/site4/alert.html' % httpd.server_port
url = "http://localhost:%s/site4/alert.html" % httpd.server_port
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
# before commit d2ed6b97a24 these would hang and eventually raise
# brozzler.browser.BrowsingTimeout, which would cause this test to fail
browser.browse_page("http://localhost:%s/site4/alert.html" % httpd.server_port)
browser.browse_page(
'http://localhost:%s/site4/alert.html' % httpd.server_port)
browser.browse_page(
'http://localhost:%s/site4/confirm.html' % httpd.server_port)
browser.browse_page(
'http://localhost:%s/site4/prompt.html' % httpd.server_port)
"http://localhost:%s/site4/confirm.html" % httpd.server_port
)
browser.browse_page("http://localhost:%s/site4/prompt.html" % httpd.server_port)
# XXX print dialog unresolved
# browser.browse_page(
# 'http://localhost:%s/site4/print.html' % httpd.server_port)
def test_page_videos(httpd):
# test depends on behavior of youtube-dl and chromium, could fail and need
# to be adjusted on youtube-dl or chromium updates
chrome_exe = brozzler.suggest_default_chrome_exe()
worker = brozzler.BrozzlerWorker(None)
site = brozzler.Site(None, {})
page = brozzler.Page(None, {
'url':'http://localhost:%s/site6/' % httpd.server_port})
page = brozzler.Page(
None, {"url": "http://localhost:%s/site6/" % httpd.server_port}
)
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
worker.brozzle_page(browser, site, page)
assert page.videos
assert len(page.videos) == 4
assert page.videos[0] == {
'blame': 'youtube-dl',
'response_code': 200,
'content-length': 383631,
'content-type': 'video/mp4',
'url': 'http://localhost:%s/site6/small.mp4' % httpd.server_port,
"blame": "youtube-dl",
"response_code": 200,
"content-length": 383631,
"content-type": "video/mp4",
"url": "http://localhost:%s/site6/small.mp4" % httpd.server_port,
}
assert page.videos[1] == {
'blame': 'youtube-dl',
'content-length': 92728,
'content-type': 'video/webm',
'response_code': 200,
'url': 'http://localhost:%s/site6/small-video_280x160_100k.webm' % httpd.server_port
"blame": "youtube-dl",
"content-length": 92728,
"content-type": "video/webm",
"response_code": 200,
"url": "http://localhost:%s/site6/small-video_280x160_100k.webm"
% httpd.server_port,
}
assert page.videos[2] == {
'blame': 'youtube-dl',
'content-length': 101114,
'content-type': 'video/webm',
'response_code': 200,
'url': 'http://localhost:%s/site6/small-audio.webm' % httpd.server_port
"blame": "youtube-dl",
"content-length": 101114,
"content-type": "video/webm",
"response_code": 200,
"url": "http://localhost:%s/site6/small-audio.webm" % httpd.server_port,
}
assert page.videos[3] == {
'blame': 'browser',
"blame": "browser",
# 'response_code': 206,
# 'content-range': 'bytes 0-229454/229455',
'response_code': 200,
'content-length': 229455,
'content-type': 'video/webm',
'url': 'http://localhost:%s/site6/small.webm' % httpd.server_port,
"response_code": 200,
"content-length": 229455,
"content-type": "video/webm",
"url": "http://localhost:%s/site6/small.webm" % httpd.server_port,
}
def test_extract_outlinks(httpd):
chrome_exe = brozzler.suggest_default_chrome_exe()
worker = brozzler.BrozzlerWorker(None)
site = brozzler.Site(None, {})
page = brozzler.Page(None, {
'url':'http://localhost:%s/site8/' % httpd.server_port})
page = brozzler.Page(
None, {"url": "http://localhost:%s/site8/" % httpd.server_port}
)
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
outlinks = worker.brozzle_page(browser, site, page)
assert outlinks == {
'http://example.com/offsite',
'http://localhost:%s/site8/baz/zuh' % httpd.server_port,
'http://localhost:%s/site8/fdjisapofdjisap#1' % httpd.server_port,
'http://localhost:%s/site8/fdjisapofdjisap#2' % httpd.server_port
"http://example.com/offsite",
"http://localhost:%s/site8/baz/zuh" % httpd.server_port,
"http://localhost:%s/site8/fdjisapofdjisap#1" % httpd.server_port,
"http://localhost:%s/site8/fdjisapofdjisap#2" % httpd.server_port,
}
def test_proxy_down():
'''
"""
Test that browsing raises `brozzler.ProxyError` when proxy is down.
See also `test_proxy_down` in test_units.py.
@ -243,40 +258,41 @@ def test_proxy_down():
Tests two different kinds of connection error:
- nothing listening on the port (nobody listens on port 4 :))
- port bound but not accepting connections
'''
"""
sock = socket.socket()
sock.bind(('127.0.0.1', 0))
for not_listening_proxy in (
'127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]):
site = brozzler.Site(None, {'seed':'http://example.com/'})
page = brozzler.Page(None, {'url': 'http://example.com/'})
sock.bind(("127.0.0.1", 0))
for not_listening_proxy in ("127.0.0.1:4", "127.0.0.1:%s" % sock.getsockname()[1]):
site = brozzler.Site(None, {"seed": "http://example.com/"})
page = brozzler.Page(None, {"url": "http://example.com/"})
worker = brozzler.BrozzlerWorker(
frontier=None, proxy=not_listening_proxy)
worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
chrome_exe = brozzler.suggest_default_chrome_exe()
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
with pytest.raises(brozzler.ProxyError):
worker.brozzle_page(browser, site, page)
def test_try_login(httpd):
"""Test try_login behavior.
"""
"""Test try_login behavior."""
response_urls = []
def on_response(msg):
response_urls.append(msg['params']['response']['url'])
response_urls.append(msg["params"]["response"]["url"])
chrome_exe = brozzler.suggest_default_chrome_exe()
form_url = 'http://localhost:%s/site11/form1.html' % httpd.server_port
form_url_other = 'http://localhost:%s/site11/form2.html' % httpd.server_port
favicon_url = 'http://localhost:%s/favicon.ico' % httpd.server_port
login_url = 'http://localhost:%s/login-action' % httpd.server_port
form_url = "http://localhost:%s/site11/form1.html" % httpd.server_port
form_url_other = "http://localhost:%s/site11/form2.html" % httpd.server_port
favicon_url = "http://localhost:%s/favicon.ico" % httpd.server_port
login_url = "http://localhost:%s/login-action" % httpd.server_port
# When username and password are defined and initial page has login form,
# detect login form, submit login, and then return to the initial page.
username = 'user1'
password = 'pass1'
username = "user1"
password = "pass1"
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
browser.browse_page(form_url, username=username, password=password,
on_response=on_response)
browser.browse_page(
form_url, username=username, password=password, on_response=on_response
)
assert len(response_urls) == 4
assert response_urls[0] == form_url
assert response_urls[1] == favicon_url
@ -285,11 +301,15 @@ def test_try_login(httpd):
# We are now supporting a different type of form, we'll test that here.
response_urls = []
username = 'user1'
password = 'pass1'
username = "user1"
password = "pass1"
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
browser.browse_page(form_url_other, username=username, password=password,
on_response=on_response)
browser.browse_page(
form_url_other,
username=username,
password=password,
on_response=on_response,
)
assert len(response_urls) == 4
assert response_urls[0] == form_url_other
assert response_urls[1] == favicon_url
@ -306,10 +326,16 @@ def test_try_login(httpd):
# when the page doesn't have a form with username/password, don't submit it
response_urls = []
form_without_login_url = 'http://localhost:%s/site11/form-no-login.html' % httpd.server_port
form_without_login_url = (
"http://localhost:%s/site11/form-no-login.html" % httpd.server_port
)
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
browser.browse_page(form_without_login_url, username=username,
password=password, on_response=on_response)
browser.browse_page(
form_without_login_url,
username=username,
password=password,
on_response=on_response,
)
assert len(response_urls) == 2
assert response_urls[0] == form_without_login_url
assert response_urls[1] == favicon_url

test_cli.py

@ -1,5 +1,5 @@
#!/usr/bin/env python
'''
"""
test_cli.py - test brozzler commands
Copyright (C) 2017 Internet Archive
@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import brozzler.cli
import pkg_resources
@ -23,59 +23,62 @@ import pytest
import subprocess
import doublethink
def cli_commands():
commands = set(pkg_resources.get_entry_map(
'brozzler')['console_scripts'].keys())
commands.remove('brozzler-wayback')
commands = set(pkg_resources.get_entry_map("brozzler")["console_scripts"].keys())
commands.remove("brozzler-wayback")
try:
import gunicorn
except ImportError:
commands.remove('brozzler-dashboard')
commands.remove("brozzler-dashboard")
try:
import pywb
except ImportError:
commands.remove('brozzler-easy')
commands.remove("brozzler-easy")
return commands
@pytest.mark.parametrize('cmd', cli_commands())
@pytest.mark.parametrize("cmd", cli_commands())
def test_call_entrypoint(capsys, cmd):
entrypoint = pkg_resources.get_entry_map(
'brozzler')['console_scripts'][cmd]
entrypoint = pkg_resources.get_entry_map("brozzler")["console_scripts"][cmd]
callable = entrypoint.resolve()
with pytest.raises(SystemExit):
callable(['/whatever/bin/%s' % cmd, '--version'])
callable(["/whatever/bin/%s" % cmd, "--version"])
out, err = capsys.readouterr()
assert out == 'brozzler %s - %s\n' % (brozzler.__version__, cmd)
assert err == ''
assert out == "brozzler %s - %s\n" % (brozzler.__version__, cmd)
assert err == ""
@pytest.mark.parametrize('cmd', cli_commands())
@pytest.mark.parametrize("cmd", cli_commands())
def test_run_command(capsys, cmd):
proc = subprocess.Popen(
[cmd, '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
[cmd, "--version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
out, err = proc.communicate()
assert err == b''
assert out == ('brozzler %s - %s\n' % (
brozzler.__version__, cmd)).encode('ascii')
assert err == b""
assert out == ("brozzler %s - %s\n" % (brozzler.__version__, cmd)).encode("ascii")
def test_rethinkdb_up():
'''Check that rethinkdb is up and running.'''
"""Check that rethinkdb is up and running."""
# check that rethinkdb is listening and looks sane
rr = doublethink.Rethinker(db='rethinkdb') # built-in db
rr = doublethink.Rethinker(db="rethinkdb") # built-in db
tbls = rr.table_list().run()
assert len(tbls) > 10
# XXX don't know why this test is failing in travis-ci and vagrant while
# test_call_entrypoint tests pass :( (also fails with capfd)
@pytest.mark.xfail
def test_stop_nonexistent_crawl(capsys):
with pytest.raises(SystemExit):
brozzler.cli.brozzler_stop_crawl(['brozzler-stop-crawl', '--site=123'])
brozzler.cli.brozzler_stop_crawl(["brozzler-stop-crawl", "--site=123"])
out, err = capsys.readouterr()
assert err.endswith('site not found with id=123\n')
assert out == ''
assert err.endswith("site not found with id=123\n")
assert out == ""
with pytest.raises(SystemExit):
brozzler.cli.brozzler_stop_crawl(['brozzler-stop-crawl', '--job=abc'])
brozzler.cli.brozzler_stop_crawl(["brozzler-stop-crawl", "--job=abc"])
out, err = capsys.readouterr()
assert err.endswith('''job not found with id='abc'\n''')
assert out == ''
assert err.endswith("""job not found with id='abc'\n""")
assert out == ""

File diff suppressed because it is too large

File diff suppressed because it is too large

test_units.py

@ -1,5 +1,5 @@
#!/usr/bin/env python
'''
"""
test_units.py - some unit tests for parts of brozzler amenable to that
Copyright (C) 2016-2017 Internet Archive
@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import pytest
import http.server
@ -37,99 +37,131 @@ import threading
from unittest import mock
logging.basicConfig(
stream=sys.stderr, level=logging.INFO, format=(
'%(asctime)s %(process)d %(levelname)s %(threadName)s '
'%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s'))
stream=sys.stderr,
level=logging.INFO,
format=(
"%(asctime)s %(process)d %(levelname)s %(threadName)s "
"%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s"
),
)
@pytest.fixture(scope='module')
@pytest.fixture(scope="module")
def httpd(request):
# SimpleHTTPRequestHandler always uses CWD so we have to chdir
os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
os.chdir(os.path.join(os.path.dirname(__file__), "htdocs"))
httpd = http.server.HTTPServer(
('localhost', 0), http.server.SimpleHTTPRequestHandler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
("localhost", 0), http.server.SimpleHTTPRequestHandler
)
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
httpd_thread.start()
def fin():
httpd.shutdown()
httpd.server_close()
httpd_thread.join()
request.addfinalizer(fin)
return httpd
def test_robots(httpd):
'''
"""
Basic test of robots.txt user-agent substring matching.
'''
url = 'http://localhost:%s/' % httpd.server_port
site = brozzler.Site(None, {'seed':url,'user_agent':'im/a/GoOdbot/yep'})
"""
url = "http://localhost:%s/" % httpd.server_port
site = brozzler.Site(None, {"seed": url, "user_agent": "im/a/GoOdbot/yep"})
assert brozzler.is_permitted_by_robots(site, url)
site = brozzler.Site(None, {'seed':url,'user_agent':'im/a bAdBOt/uh huh'})
site = brozzler.Site(None, {"seed": url, "user_agent": "im/a bAdBOt/uh huh"})
assert not brozzler.is_permitted_by_robots(site, url)
def test_robots_http_statuses():
for status in (
200, 204, 400, 401, 402, 403, 404, 405,
500, 501, 502, 503, 504, 505):
200,
204,
400,
401,
402,
403,
404,
405,
500,
501,
502,
503,
504,
505,
):
class Handler(http.server.BaseHTTPRequestHandler):
def do_GET(self):
response = (('HTTP/1.1 %s Meaningless message\r\n'
+ 'Content-length: 0\r\n'
+ '\r\n') % status).encode('utf-8')
response = (
(
"HTTP/1.1 %s Meaningless message\r\n"
+ "Content-length: 0\r\n"
+ "\r\n"
)
% status
).encode("utf-8")
self.connection.sendall(response)
# self.send_response(status)
# self.end_headers()
httpd = http.server.HTTPServer(('localhost', 0), Handler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
httpd = http.server.HTTPServer(("localhost", 0), Handler)
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
httpd_thread.start()
try:
url = 'http://localhost:%s/' % httpd.server_port
site = brozzler.Site(None, {'seed': url})
url = "http://localhost:%s/" % httpd.server_port
site = brozzler.Site(None, {"seed": url})
assert brozzler.is_permitted_by_robots(site, url)
finally:
httpd.shutdown()
httpd.server_close()
httpd_thread.join()
def test_robots_empty_response():
class Handler(http.server.BaseHTTPRequestHandler):
def do_GET(self):
self.connection.shutdown(socket.SHUT_RDWR)
self.connection.close()
httpd = http.server.HTTPServer(('localhost', 0), Handler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
httpd = http.server.HTTPServer(("localhost", 0), Handler)
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
httpd_thread.start()
try:
url = 'http://localhost:%s/' % httpd.server_port
site = brozzler.Site(None, {'seed': url})
url = "http://localhost:%s/" % httpd.server_port
site = brozzler.Site(None, {"seed": url})
assert brozzler.is_permitted_by_robots(site, url)
finally:
httpd.shutdown()
httpd.server_close()
httpd_thread.join()
def test_robots_socket_timeout():
stop_hanging = threading.Event()
class Handler(http.server.BaseHTTPRequestHandler):
def do_GET(self):
stop_hanging.wait(60)
self.connection.sendall(
b'HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n')
self.connection.sendall(b"HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n")
orig_timeout = brozzler.robots._SessionRaiseOn420.timeout
httpd = http.server.HTTPServer(('localhost', 0), Handler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
httpd = http.server.HTTPServer(("localhost", 0), Handler)
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
httpd_thread.start()
try:
url = 'http://localhost:%s/' % httpd.server_port
site = brozzler.Site(None, {'seed': url})
url = "http://localhost:%s/" % httpd.server_port
site = brozzler.Site(None, {"seed": url})
brozzler.robots._SessionRaiseOn420.timeout = 2
assert brozzler.is_permitted_by_robots(site, url)
finally:
@ -139,20 +171,24 @@ def test_robots_socket_timeout():
httpd.server_close()
httpd_thread.join()
def test_robots_dns_failure():
# .invalid. is guaranteed nonexistent per rfc 6761
url = 'http://whatever.invalid./'
site = brozzler.Site(None, {'seed': url})
url = "http://whatever.invalid./"
site = brozzler.Site(None, {"seed": url})
assert brozzler.is_permitted_by_robots(site, url)
def test_robots_connection_failure():
# .invalid. is guaranteed nonexistent per rfc 6761
url = 'http://localhost:4/' # nobody listens on port 4
site = brozzler.Site(None, {'seed': url})
url = "http://localhost:4/" # nobody listens on port 4
site = brozzler.Site(None, {"seed": url})
assert brozzler.is_permitted_by_robots(site, url)
def test_scoping():
test_scope = yaml.safe_load('''
test_scope = yaml.safe_load(
"""
max_hops: 100
accepts:
- url_match: REGEX_MATCH
@ -169,40 +205,73 @@ blocks:
- domain: twitter.com
url_match: REGEX_MATCH
value: ^.*lang=(?!en).*$
''')
"""
)
site = brozzler.Site(None, {
'id': 1, 'seed': 'http://example.com/foo/bar?baz=quux#monkey',
'scope': test_scope})
page = brozzler.Page(None, {
'url': 'http://example.com/foo/bar?baz=quux#monkey',
'site_id': site.id})
site = brozzler.Site(
None,
{
"id": 1,
"seed": "http://example.com/foo/bar?baz=quux#monkey",
"scope": test_scope,
},
)
page = brozzler.Page(
None, {"url": "http://example.com/foo/bar?baz=quux#monkey", "site_id": site.id}
)
assert site.accept_reject_or_neither('http://example.com/foo/bar', page) is True
assert site.accept_reject_or_neither('http://example.com/foo/baz', page) is None
assert site.accept_reject_or_neither("http://example.com/foo/bar", page) is True
assert site.accept_reject_or_neither("http://example.com/foo/baz", page) is None
assert site.accept_reject_or_neither('http://foo.com/some.mp3', page) is None
assert site.accept_reject_or_neither('http://foo.com/blah/audio_file/some.mp3', page) is True
assert site.accept_reject_or_neither("http://foo.com/some.mp3", page) is None
assert (
site.accept_reject_or_neither("http://foo.com/blah/audio_file/some.mp3", page)
is True
)
assert site.accept_reject_or_neither('http://a.b.vimeocdn.com/blahblah', page) is True
assert site.accept_reject_or_neither('https://a.b.vimeocdn.com/blahblah', page) is None
assert (
site.accept_reject_or_neither("http://a.b.vimeocdn.com/blahblah", page) is True
)
assert (
site.accept_reject_or_neither("https://a.b.vimeocdn.com/blahblah", page) is None
)
assert site.accept_reject_or_neither('https://twitter.com/twit', page) is True
assert site.accept_reject_or_neither('https://twitter.com/twit?lang=en', page) is True
assert site.accept_reject_or_neither('https://twitter.com/twit?lang=es', page) is False
assert site.accept_reject_or_neither("https://twitter.com/twit", page) is True
assert (
site.accept_reject_or_neither("https://twitter.com/twit?lang=en", page) is True
)
assert (
site.accept_reject_or_neither("https://twitter.com/twit?lang=es", page) is False
)
assert site.accept_reject_or_neither('https://www.facebook.com/whatevz', page) is True
assert (
site.accept_reject_or_neither("https://www.facebook.com/whatevz", page) is True
)
assert (
site.accept_reject_or_neither(
"https://www.youtube.com/watch?v=dUIn5OAPS5s", page
)
is None
)
yt_user_page = brozzler.Page(
None,
{
"url": "https://www.youtube.com/user/SonoraSantaneraVEVO",
"site_id": site.id,
"hops_from_seed": 10,
},
)
assert (
site.accept_reject_or_neither(
"https://www.youtube.com/watch?v=dUIn5OAPS5s", yt_user_page
)
is True
)
assert site.accept_reject_or_neither(
'https://www.youtube.com/watch?v=dUIn5OAPS5s', page) is None
yt_user_page = brozzler.Page(None, {
'url': 'https://www.youtube.com/user/SonoraSantaneraVEVO',
'site_id': site.id, 'hops_from_seed': 10})
assert site.accept_reject_or_neither(
'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page) is True
def test_proxy_down():
'''
"""
Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.
This test needs to cover every possible fetch through the proxy other than
@ -211,24 +280,24 @@ def test_proxy_down():
Tests two different kinds of connection error:
- nothing listening on the port (nobody listens on port 4 :))
- port bound but not accepting connections
'''
"""
sock = socket.socket()
sock.bind(('127.0.0.1', 0))
for not_listening_proxy in (
'127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]):
worker = brozzler.BrozzlerWorker(
frontier=None, proxy=not_listening_proxy)
site = brozzler.Site(None, {
'id': str(uuid.uuid4()), 'seed': 'http://example.com/'})
page = brozzler.Page(None, {'url': 'http://example.com/'})
sock.bind(("127.0.0.1", 0))
for not_listening_proxy in ("127.0.0.1:4", "127.0.0.1:%s" % sock.getsockname()[1]):
worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
site = brozzler.Site(
None, {"id": str(uuid.uuid4()), "seed": "http://example.com/"}
)
page = brozzler.Page(None, {"url": "http://example.com/"})
# robots.txt fetch
with pytest.raises(brozzler.ProxyError):
brozzler.is_permitted_by_robots(
site, 'http://example.com/', proxy=not_listening_proxy)
site, "http://example.com/", proxy=not_listening_proxy
)
# youtube-dl fetch
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
with pytest.raises(brozzler.ProxyError):
brozzler.ydl.do_youtube_dl(worker, site, page)
@ -239,47 +308,58 @@ def test_proxy_down():
# WARCPROX_WRITE_RECORD
with pytest.raises(brozzler.ProxyError):
worker._warcprox_write_record(
warcprox_address=not_listening_proxy,
url='test://proxy_down/warcprox_write_record',
warc_type='metadata',
content_type='text/plain',
payload=b'''payload doesn't matter here''')
warcprox_address=not_listening_proxy,
url="test://proxy_down/warcprox_write_record",
warc_type="metadata",
content_type="text/plain",
payload=b"""payload doesn't matter here""",
)
def test_start_stop_backwards_compat():
site = brozzler.Site(None, {'seed': 'http://example.com/'})
site = brozzler.Site(None, {"seed": "http://example.com/"})
assert len(site.starts_and_stops) == 1
assert site.starts_and_stops[0]['start']
assert site.starts_and_stops[0]['stop'] is None
assert not 'start_time' in site
assert site.starts_and_stops[0]["start"]
assert site.starts_and_stops[0]["stop"] is None
assert not "start_time" in site
site = brozzler.Site(None, {
'seed': 'http://example.com/',
'start_time': datetime.datetime(2017,1,1)})
site = brozzler.Site(
None,
{"seed": "http://example.com/", "start_time": datetime.datetime(2017, 1, 1)},
)
assert len(site.starts_and_stops) == 1
assert site.starts_and_stops[0]['start'] == datetime.datetime(2017, 1, 1)
assert site.starts_and_stops[0]['stop'] is None
assert not 'start_time' in site
assert site.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
assert site.starts_and_stops[0]["stop"] is None
assert not "start_time" in site
job = brozzler.Job(None, {'seeds': [{'url':'https://example.com/'}]})
assert job.starts_and_stops[0]['start']
assert job.starts_and_stops[0]['stop'] is None
assert not 'started' in job
assert not 'finished' in job
job = brozzler.Job(None, {"seeds": [{"url": "https://example.com/"}]})
assert job.starts_and_stops[0]["start"]
assert job.starts_and_stops[0]["stop"] is None
assert not "started" in job
assert not "finished" in job
job = brozzler.Job(
None,
{
"seeds": [{"url": "https://example.com/"}],
"started": datetime.datetime(2017, 1, 1),
"finished": datetime.datetime(2017, 1, 2),
},
)
assert job.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
assert job.starts_and_stops[0]["stop"] == datetime.datetime(2017, 1, 2)
assert not "started" in job
assert not "finished" in job
job = brozzler.Job(None, {
'seeds': [{'url':'https://example.com/'}],
'started': datetime.datetime(2017, 1, 1),
'finished': datetime.datetime(2017, 1, 2)})
assert job.starts_and_stops[0]['start'] == datetime.datetime(2017, 1, 1)
assert job.starts_and_stops[0]['stop'] == datetime.datetime(2017, 1, 2)
assert not 'started' in job
assert not 'finished' in job
class Exception1(Exception):
pass
class Exception2(Exception):
pass
def test_thread_raise_not_accept():
def never_accept():
try:
@ -297,6 +377,7 @@ def test_thread_raise_not_accept():
th.join()
assert thread_caught_exception is None
def test_thread_raise_immediate():
def accept_immediately():
try:
@ -317,13 +398,17 @@ def test_thread_raise_immediate():
assert isinstance(thread_caught_exception, Exception1)
assert time.time() - start < 1.0
def test_thread_raise_safe_exit():
def delay_context_exit():
gate = brozzler.thread_accept_exceptions()
orig_exit = type(gate).__exit__
try:
type(gate).__exit__ = lambda self, et, ev, t: (
brozzler.sleep(2), orig_exit(self, et, ev, t), False)[-1]
brozzler.sleep(2),
orig_exit(self, et, ev, t),
False,
)[-1]
with brozzler.thread_accept_exceptions() as gate:
brozzler.sleep(2)
except Exception as e:
@@ -345,6 +430,7 @@ def test_thread_raise_safe_exit():
assert thread_caught_exception
assert isinstance(thread_caught_exception, Exception1)
def test_thread_raise_pending_exception():
def accept_eventually():
try:
@@ -365,16 +451,17 @@ def test_thread_raise_pending_exception():
assert isinstance(thread_caught_exception, Exception1)
assert time.time() - start > 1.0
def test_thread_raise_second_with_block():
def two_with_blocks():
try:
with brozzler.thread_accept_exceptions():
time.sleep(2)
return # test fails
return # test fails
except Exception1 as e:
pass
except:
return # fail test
return # fail test
try:
with brozzler.thread_accept_exceptions():
@@ -393,52 +480,79 @@ def test_thread_raise_second_with_block():
th.join()
assert isinstance(thread_caught_exception, Exception2)
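
The thread_raise tests above exercise brozzler's cooperative cross-thread exception delivery: an exception lands in a target thread only while that thread is inside a brozzler.thread_accept_exceptions() block. A minimal usage sketch, reusing Exception1 from above and assuming brozzler.thread_raise(thread, exc_type) as the sending side:

    import threading
    import time
    import brozzler

    caught = []

    def worker():
        try:
            with brozzler.thread_accept_exceptions():
                brozzler.sleep(30)  # brozzler's interruptible sleep, as used above
        except Exception1 as e:
            caught.append(e)  # delivered because the accept block was active

    th = threading.Thread(target=worker)
    th.start()
    time.sleep(0.5)  # give the worker time to enter the accept block
    brozzler.thread_raise(th, Exception1)  # assumed signature: (thread, exception class)
    th.join()
    assert isinstance(caught[0], Exception1)
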
def test_needs_browsing():
# only one test case here right now, which exposed a bug
class ConvenientHeaders(http.client.HTTPMessage):
def __init__(self, headers):
http.client.HTTPMessage.__init__(self)
for (k, v) in headers.items():
for k, v in headers.items():
self.add_header(k, v)
page = brozzler.Page(None, {
'url':'http://example.com/a'})
page = brozzler.Page(None, {"url": "http://example.com/a"})
spy = brozzler.ydl.YoutubeDLSpy()
spy.fetches.append({
'url': 'http://example.com/a',
'method': 'HEAD',
'response_code': 301,
'response_headers': ConvenientHeaders({'Location': '/b'})})
spy.fetches.append({
'url': 'http://example.com/b',
'method': 'GET',
'response_code': 200,
'response_headers': ConvenientHeaders({
'Content-Type': 'application/pdf'})})
spy.fetches.append(
{
"url": "http://example.com/a",
"method": "HEAD",
"response_code": 301,
"response_headers": ConvenientHeaders({"Location": "/b"}),
}
)
spy.fetches.append(
{
"url": "http://example.com/b",
"method": "GET",
"response_code": 200,
"response_headers": ConvenientHeaders({"Content-Type": "application/pdf"}),
}
)
assert not brozzler.worker.BrozzlerWorker._needs_browsing(None, page, spy.fetches)
assert not brozzler.worker.BrozzlerWorker._needs_browsing(
None, page, spy.fetches)
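
test_needs_browsing feeds _needs_browsing a redirect chain that ends in a PDF and expects False. As a rough illustration of that kind of content-type check (an assumption for clarity, not brozzler's actual implementation):

    def looks_like_it_needs_browsing(fetches):
        # If the probe ended on an HTML document, hand the page to a real
        # browser; for a PDF or other non-HTML response the plain fetch is enough.
        if not fetches:
            return True
        content_type = fetches[-1]["response_headers"].get_content_type()
        return content_type in ("text/html", "application/xhtml+xml")

    # With the fetches built above (301, then application/pdf), this returns False.
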
def test_seed_redirect():
site = brozzler.Site(None, {'seed': 'http://foo.com/'})
site.note_seed_redirect('https://foo.com/a/b/c')
assert site.scope == {'accepts': [
{'ssurt': 'com,foo,//http:/',},
{'ssurt': 'com,foo,//https:/',}]}
site = brozzler.Site(None, {"seed": "http://foo.com/"})
site.note_seed_redirect("https://foo.com/a/b/c")
assert site.scope == {
"accepts": [
{
"ssurt": "com,foo,//http:/",
},
{
"ssurt": "com,foo,//https:/",
},
]
}
site = brozzler.Site(None, {'seed': 'https://foo.com/'})
site.note_seed_redirect('http://foo.com/a/b/c')
assert site.scope == {'accepts': [
{'ssurt': 'com,foo,//https:/',},
{'ssurt': 'com,foo,//http:/',}]}
site = brozzler.Site(None, {"seed": "https://foo.com/"})
site.note_seed_redirect("http://foo.com/a/b/c")
assert site.scope == {
"accepts": [
{
"ssurt": "com,foo,//https:/",
},
{
"ssurt": "com,foo,//http:/",
},
]
}
site = brozzler.Site(None, {"seed": "http://foo.com/"})
site.note_seed_redirect("https://bar.com/a/b/c")
assert site.scope == {
"accepts": [
{
"ssurt": "com,foo,//http:/",
},
{
"ssurt": "com,bar,//https:/a/b/c",
},
]
}
site = brozzler.Site(None, {'seed': 'http://foo.com/'})
site.note_seed_redirect('https://bar.com/a/b/c')
assert site.scope == {'accepts': [
{'ssurt': 'com,foo,//http:/',},
{'ssurt': 'com,bar,//https:/a/b/c',}]}
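
The ssurt values asserted here come from urlcanon, the canonicalization library brozzler's scoping is built on. One of them can be reproduced directly, assuming urlcanon's semantic canonicalizer and ParsedUrl.ssurt():

    import urlcanon

    url = urlcanon.semantic(urlcanon.parse_url("https://bar.com/a/b/c"))
    print(url.ssurt())  # expected: b'com,bar,//https:/a/b/c', matching the accept rule above
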
def test_limit_failures():
page = mock.Mock()
@@ -446,9 +560,9 @@ def test_limit_failures():
page.brozzle_count = 0
site = mock.Mock()
site.status = 'ACTIVE'
site.status = "ACTIVE"
site.active_brozzling_time = 0
site.starts_and_stops = [{'start':datetime.datetime.utcnow()}]
site.starts_and_stops = [{"start": datetime.datetime.utcnow()}]
rr = mock.Mock()
rr.servers = [mock.Mock()]
@@ -456,11 +570,12 @@ def test_limit_failures():
rr.db_list = mock.Mock(return_value=rethink_query)
rr.table_list = mock.Mock(return_value=rethink_query)
rr.table = mock.Mock(
return_value=mock.Mock(
between=mock.Mock(
return_value=mock.Mock(
limit=mock.Mock(
return_value=rethink_query)))))
return_value=mock.Mock(
between=mock.Mock(
return_value=mock.Mock(limit=mock.Mock(return_value=rethink_query))
)
)
)
assert rr.table().between().limit().run() == []
frontier = brozzler.RethinkDbFrontier(rr)
frontier.enforce_time_limit = mock.Mock()
@@ -475,20 +590,19 @@ def test_limit_failures():
assert page.failed_attempts is None
assert page.brozzle_count == 0
assert site.status == 'ACTIVE'
assert site.status == "ACTIVE"
worker.brozzle_site(browser, site)
assert page.failed_attempts == 1
assert page.brozzle_count == 0
assert site.status == 'ACTIVE'
assert site.status == "ACTIVE"
worker.brozzle_site(browser, site)
assert page.failed_attempts == 2
assert page.brozzle_count == 0
assert site.status == 'ACTIVE'
assert site.status == "ACTIVE"
worker.brozzle_site(browser, site)
assert page.failed_attempts == 3
assert page.brozzle_count == 1
assert site.status == 'FINISHED'
assert site.status == "FINISHED"
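
The nested mock.Mock(return_value=...) chain earlier in this test stubs RethinkDB's fluent query interface so that rr.table().between().limit().run() returns an empty batch. A MagicMock expresses the same stub in one line (an alternative sketch, not part of this change):

    from unittest import mock

    rr = mock.MagicMock()
    rr.table.return_value.between.return_value.limit.return_value.run.return_value = []
    assert rr.table().between().limit().run() == []
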

vagrant-brozzler-new-job.py
View File

@@ -1,5 +1,5 @@
#!/usr/bin/env python
'''
"""
vagrant-brozzler-new-job.py - runs brozzler-new-job inside the vagrant vm to
queue a job for your vagrant brozzler deployment.
@@ -20,30 +20,39 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import sys
import os
import argparse
import subprocess
def main(argv=[]):
arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0]))
arg_parser.add_argument(
'job_conf_file', metavar='JOB_CONF_FILE',
help='brozzler job configuration file in yaml')
"job_conf_file",
metavar="JOB_CONF_FILE",
help="brozzler job configuration file in yaml",
)
args = arg_parser.parse_args(args=argv[1:])
# cd to path with Vagrantfile so "vagrant ssh" knows what to do
os.chdir(os.path.dirname(__file__))
with open(args.job_conf_file, 'rb') as f:
subprocess.call([
'vagrant', 'ssh', '--',
'f=`mktemp` && cat > $f && '
'/home/vagrant/brozzler-ve3/bin/python '
'/home/vagrant/brozzler-ve3/bin/brozzler-new-job $f'],
stdin=f)
with open(args.job_conf_file, "rb") as f:
subprocess.call(
[
"vagrant",
"ssh",
"--",
"f=`mktemp` && cat > $f && "
"/home/vagrant/brozzler-ve3/bin/python "
"/home/vagrant/brozzler-ve3/bin/brozzler-new-job $f",
],
stdin=f,
)
if __name__ == '__main__':
if __name__ == "__main__":
main(sys.argv)

vagrant-brozzler-new-site.py
View File

@@ -1,5 +1,5 @@
#!/usr/bin/env python
'''
"""
vagrant-brozzler-new-site.py - runs brozzler-new-site inside the vagrant vm to
queue a site for your vagrant brozzler deployment.
@@ -23,61 +23,69 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
"""
import sys
import os
import argparse
import subprocess
try:
from shlex import quote
except:
from pipes import quote
def main(argv=[]):
arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0]))
arg_parser.add_argument('seed', metavar='SEED', help='seed url')
arg_parser.add_argument("seed", metavar="SEED", help="seed url")
arg_parser.add_argument(
'--time-limit', dest='time_limit', default=None,
help='time limit in seconds for this site')
"--time-limit",
dest="time_limit",
default=None,
help="time limit in seconds for this site",
)
arg_parser.add_argument(
'--ignore-robots', dest='ignore_robots', action='store_true',
help='ignore robots.txt for this site')
"--ignore-robots",
dest="ignore_robots",
action="store_true",
help="ignore robots.txt for this site",
)
arg_parser.add_argument(
'--warcprox-meta', dest='warcprox_meta',
help=(
'Warcprox-Meta http request header to send with each request; '
'must be a json blob, ignored unless warcprox features are '
'enabled'))
arg_parser.add_argument(
'-q', '--quiet', dest='quiet', action='store_true')
arg_parser.add_argument(
'-v', '--verbose', dest='verbose', action='store_true')
"--warcprox-meta",
dest="warcprox_meta",
help=(
"Warcprox-Meta http request header to send with each request; "
"must be a json blob, ignored unless warcprox features are "
"enabled"
),
)
arg_parser.add_argument("-q", "--quiet", dest="quiet", action="store_true")
arg_parser.add_argument("-v", "--verbose", dest="verbose", action="store_true")
args = arg_parser.parse_args(args=argv[1:])
options = []
if args.time_limit:
options.append('--time-limit=%s' % args.time_limit)
options.append("--time-limit=%s" % args.time_limit)
if args.ignore_robots:
options.append('--ignore-robots')
options.append("--ignore-robots")
if args.warcprox_meta:
# I think this shell escaping is correct?
options.append(
'--warcprox-meta=%s' % quote(args.warcprox_meta))
options.append("--warcprox-meta=%s" % quote(args.warcprox_meta))
if args.quiet:
options.append('--quiet')
options.append("--quiet")
if args.verbose:
options.append('--verbose')
options.append("--verbose")
# cd to path with Vagrantfile so "vagrant ssh" knows what to do
os.chdir(os.path.dirname(__file__))
cmd = (
'/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site '
'%s %s') % (' '.join(options), args.seed)
subprocess.call(['vagrant', 'ssh', '--', cmd])
"/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site " "%s %s"
) % (" ".join(options), args.seed)
subprocess.call(["vagrant", "ssh", "--", cmd])
if __name__ == '__main__':
if __name__ == "__main__":
main(sys.argv)
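
Because the whole remote command is handed to "vagrant ssh --" as a single string, the --warcprox-meta JSON has to survive a round of shell parsing inside the VM; that is what the shlex.quote fallback above is for. A small sketch of the effect (the header value is a made-up example):

    from shlex import quote

    warcprox_meta = '{"warc-prefix": "test-crawl"}'
    option = "--warcprox-meta=%s" % quote(warcprox_meta)
    # quote() wraps the JSON in single quotes, so the remote shell passes the
    # blob through to brozzler-new-site as a single argument.
    print(option)  # --warcprox-meta='{"warc-prefix": "test-crawl"}'
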