Merge branch 'blacked' into qa

This commit is contained in:
Barbara Miller 2024-02-08 12:31:39 -08:00
commit 8f14dc1aec
23 changed files with 4048 additions and 2796 deletions

31
.github/workflows/python-formatting.yml vendored Normal file
View file

@ -0,0 +1,31 @@
name: Python Formatting Check
on:
push:
branches:
- main
- master
pull_request:
branches:
- main
- master
jobs:
formatting:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Python 3.8
uses: actions/setup-python@v5
with:
python-version: '3.8'
- name: Create virtual environment
run: python -m venv venv
- name: Install black
run: |
./venv/bin/pip install --upgrade pip
./venv/bin/pip install black
- name: Run formatting check
run: make ck-format

2
.gitignore vendored
View file

@ -2,3 +2,5 @@
*.diff *.diff
.*.sw* .*.sw*
/brozzler.egg-info/ /brozzler.egg-info/
venv
.idea

7
Makefile Normal file
View file

@ -0,0 +1,7 @@
.PHONY: format
format:
venv/bin/black -t py35 -t py36 -t py37 -t py38 -t py39 -t py310 -t py311 -t py312 .
.PHONY: ck-format
ck-format:
venv/bin/black --check .

View file

@ -19,33 +19,41 @@ limitations under the License.
import logging import logging
from pkg_resources import get_distribution as _get_distribution from pkg_resources import get_distribution as _get_distribution
__version__ = _get_distribution('brozzler').version
__version__ = _get_distribution("brozzler").version
class ShutdownRequested(Exception): class ShutdownRequested(Exception):
pass pass
class NothingToClaim(Exception): class NothingToClaim(Exception):
pass pass
class CrawlStopped(Exception): class CrawlStopped(Exception):
pass pass
class PageInterstitialShown(Exception): class PageInterstitialShown(Exception):
pass pass
class ProxyError(Exception): class ProxyError(Exception):
pass pass
class ReachedTimeLimit(Exception): class ReachedTimeLimit(Exception):
pass pass
class ReachedLimit(Exception): class ReachedLimit(Exception):
def __init__(self, http_error=None, warcprox_meta=None, http_payload=None): def __init__(self, http_error=None, warcprox_meta=None, http_payload=None):
import json import json
if http_error: if http_error:
if "warcprox-meta" in http_error.headers: if "warcprox-meta" in http_error.headers:
self.warcprox_meta = json.loads( self.warcprox_meta = json.loads(http_error.headers["warcprox-meta"])
http_error.headers["warcprox-meta"])
else: else:
self.warcprox_meta = None self.warcprox_meta = None
self.http_payload = http_error.read() self.http_payload = http_error.read()
@ -55,28 +63,39 @@ class ReachedLimit(Exception):
def __repr__(self): def __repr__(self):
return "ReachedLimit(warcprox_meta=%r,http_payload=%r)" % ( return "ReachedLimit(warcprox_meta=%r,http_payload=%r)" % (
self.warcprox_meta if hasattr(self, 'warcprox_meta') else None, self.warcprox_meta if hasattr(self, "warcprox_meta") else None,
self.http_payload if hasattr(self, 'http_payload') else None) self.http_payload if hasattr(self, "http_payload") else None,
)
def __str__(self): def __str__(self):
return self.__repr__() return self.__repr__()
# monkey-patch log levels TRACE and NOTICE # monkey-patch log levels TRACE and NOTICE
logging.TRACE = (logging.NOTSET + logging.DEBUG) // 2 logging.TRACE = (logging.NOTSET + logging.DEBUG) // 2
def _logger_trace(self, msg, *args, **kwargs): def _logger_trace(self, msg, *args, **kwargs):
if self.isEnabledFor(logging.TRACE): if self.isEnabledFor(logging.TRACE):
self._log(logging.TRACE, msg, args, **kwargs) self._log(logging.TRACE, msg, args, **kwargs)
logging.Logger.trace = _logger_trace logging.Logger.trace = _logger_trace
logging.trace = logging.root.trace logging.trace = logging.root.trace
logging.addLevelName(logging.TRACE, 'TRACE') logging.addLevelName(logging.TRACE, "TRACE")
logging.NOTICE = (logging.INFO + logging.WARN) // 2 logging.NOTICE = (logging.INFO + logging.WARN) // 2
def _logger_notice(self, msg, *args, **kwargs): def _logger_notice(self, msg, *args, **kwargs):
if self.isEnabledFor(logging.NOTICE): if self.isEnabledFor(logging.NOTICE):
self._log(logging.NOTICE, msg, args, **kwargs) self._log(logging.NOTICE, msg, args, **kwargs)
logging.Logger.notice = _logger_notice logging.Logger.notice = _logger_notice
logging.notice = logging.root.notice logging.notice = logging.root.notice
logging.addLevelName(logging.NOTICE, 'NOTICE') logging.addLevelName(logging.NOTICE, "NOTICE")
# see https://github.com/internetarchive/brozzler/issues/91 # see https://github.com/internetarchive/brozzler/issues/91
def _logging_handler_handle(self, record): def _logging_handler_handle(self, record):
@ -91,9 +110,13 @@ def _logging_handler_handle(self, record):
except: except:
pass pass
return rv return rv
logging.Handler.handle = _logging_handler_handle logging.Handler.handle = _logging_handler_handle
_behaviors = None _behaviors = None
def behaviors(behaviors_dir=None): def behaviors(behaviors_dir=None):
"""Return list of JS behaviors loaded from YAML file. """Return list of JS behaviors loaded from YAML file.
@ -101,35 +124,43 @@ def behaviors(behaviors_dir=None):
`js-templates/`. Defaults to brozzler dir. `js-templates/`. Defaults to brozzler dir.
""" """
import os, yaml, string import os, yaml, string
global _behaviors global _behaviors
if _behaviors is None: if _behaviors is None:
d = behaviors_dir or os.path.dirname(__file__) d = behaviors_dir or os.path.dirname(__file__)
behaviors_yaml = os.path.join(d, 'behaviors.yaml') behaviors_yaml = os.path.join(d, "behaviors.yaml")
with open(behaviors_yaml) as fin: with open(behaviors_yaml) as fin:
_behaviors = yaml.safe_load(fin) _behaviors = yaml.safe_load(fin)
return _behaviors return _behaviors
def behavior_script(url, template_parameters=None, behaviors_dir=None): def behavior_script(url, template_parameters=None, behaviors_dir=None):
''' """
Returns the javascript behavior string populated with template_parameters. Returns the javascript behavior string populated with template_parameters.
''' """
import re, logging, json import re, logging, json
for behavior in behaviors(behaviors_dir=behaviors_dir): for behavior in behaviors(behaviors_dir=behaviors_dir):
if re.match(behavior['url_regex'], url): if re.match(behavior["url_regex"], url):
parameters = dict() parameters = dict()
if 'default_parameters' in behavior: if "default_parameters" in behavior:
parameters.update(behavior['default_parameters']) parameters.update(behavior["default_parameters"])
if template_parameters: if template_parameters:
parameters.update(template_parameters) parameters.update(template_parameters)
template = jinja2_environment(behaviors_dir).get_template( template = jinja2_environment(behaviors_dir).get_template(
behavior['behavior_js_template']) behavior["behavior_js_template"]
)
script = template.render(parameters) script = template.render(parameters)
logging.info( logging.info(
'using template=%r populated with parameters=%r for %r', "using template=%r populated with parameters=%r for %r",
behavior['behavior_js_template'], json.dumps(parameters), url) behavior["behavior_js_template"],
json.dumps(parameters),
url,
)
return script return script
return None return None
class ThreadExceptionGate: class ThreadExceptionGate:
logger = logging.getLogger(__module__ + "." + __qualname__) logger = logging.getLogger(__module__ + "." + __qualname__)
@ -142,8 +173,7 @@ class ThreadExceptionGate:
def __enter__(self): def __enter__(self):
assert self.thread == threading.current_thread() assert self.thread == threading.current_thread()
if self.pending_exception: if self.pending_exception:
self.logger.info( self.logger.info("raising pending exception %s", self.pending_exception)
'raising pending exception %s', self.pending_exception)
tmp = self.pending_exception tmp = self.pending_exception
self.pending_exception = None self.pending_exception = None
raise tmp raise tmp
@ -160,19 +190,26 @@ class ThreadExceptionGate:
with self.lock: with self.lock:
if self.pending_exception: if self.pending_exception:
self.logger.warning( self.logger.warning(
'%r already pending for thread %r, discarding %r', "%r already pending for thread %r, discarding %r",
self.pending_exception, self.thread, e) self.pending_exception,
self.thread,
e,
)
else: else:
self.pending_exception = e self.pending_exception = e
def __repr__(self): def __repr__(self):
return '<ThreadExceptionGate(%s)>' % self.thread return "<ThreadExceptionGate(%s)>" % self.thread
import threading import threading
_thread_exception_gates = {} _thread_exception_gates = {}
_thread_exception_gates_lock = threading.Lock() _thread_exception_gates_lock = threading.Lock()
def thread_exception_gate(thread=None): def thread_exception_gate(thread=None):
''' """
Returns a `ThreadExceptionGate` for `thread` (current thread by default). Returns a `ThreadExceptionGate` for `thread` (current thread by default).
`ThreadExceptionGate` is a context manager which allows exceptions to be `ThreadExceptionGate` is a context manager which allows exceptions to be
@ -191,7 +228,7 @@ def thread_exception_gate(thread=None):
is queued, and raised immediately if and when the thread enters the is queued, and raised immediately if and when the thread enters the
context. Only one exception will be queued this way at a time, others are context. Only one exception will be queued this way at a time, others are
discarded. discarded.
''' """
if not thread: if not thread:
thread = threading.current_thread() thread = threading.current_thread()
@ -201,10 +238,12 @@ def thread_exception_gate(thread=None):
return _thread_exception_gates[thread] return _thread_exception_gates[thread]
thread_accept_exceptions = thread_exception_gate thread_accept_exceptions = thread_exception_gate
def thread_raise(thread, exctype): def thread_raise(thread, exctype):
''' """
Raises or queues the exception `exctype` for the thread `thread`. Raises or queues the exception `exctype` for the thread `thread`.
See the documentation on the function `thread_exception_gate()` for more See the documentation on the function `thread_exception_gate()` for more
@ -218,40 +257,43 @@ def thread_raise(thread, exctype):
Raises: Raises:
TypeError if `exctype` is not a class TypeError if `exctype` is not a class
ValueError, SystemError in case of unexpected problems ValueError, SystemError in case of unexpected problems
''' """
import ctypes, inspect, threading, logging import ctypes, inspect, threading, logging
if not inspect.isclass(exctype): if not inspect.isclass(exctype):
raise TypeError( raise TypeError(
'cannot raise %s, only exception types can be raised (not ' "cannot raise %s, only exception types can be raised (not "
'instances)' % exctype) "instances)" % exctype
)
gate = thread_exception_gate(thread) gate = thread_exception_gate(thread)
with gate.lock: with gate.lock:
if gate.ok_to_raise.is_set() and thread.is_alive(): if gate.ok_to_raise.is_set() and thread.is_alive():
gate.ok_to_raise.clear() gate.ok_to_raise.clear()
logging.info('raising %s in thread %s', exctype, thread) logging.info("raising %s in thread %s", exctype, thread)
res = ctypes.pythonapi.PyThreadState_SetAsyncExc( res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
ctypes.c_long(thread.ident), ctypes.py_object(exctype)) ctypes.c_long(thread.ident), ctypes.py_object(exctype)
)
if res == 0: if res == 0:
raise ValueError( raise ValueError("invalid thread id? thread.ident=%s" % thread.ident)
'invalid thread id? thread.ident=%s' % thread.ident)
elif res != 1: elif res != 1:
# if it returns a number greater than one, you're in trouble, # if it returns a number greater than one, you're in trouble,
# and you should call it again with exc=NULL to revert the effect # and you should call it again with exc=NULL to revert the effect
ctypes.pythonapi.PyThreadState_SetAsyncExc(thread.ident, 0) ctypes.pythonapi.PyThreadState_SetAsyncExc(thread.ident, 0)
raise SystemError('PyThreadState_SetAsyncExc failed') raise SystemError("PyThreadState_SetAsyncExc failed")
else: else:
logging.info('queueing %s for thread %s', exctype, thread) logging.info("queueing %s for thread %s", exctype, thread)
gate.queue_exception(exctype) gate.queue_exception(exctype)
def sleep(duration): def sleep(duration):
''' """
Sleeps for duration seconds in increments of 0.5 seconds. Sleeps for duration seconds in increments of 0.5 seconds.
Use this so that the sleep can be interrupted by thread_raise(). Use this so that the sleep can be interrupted by thread_raise().
''' """
import time import time
start = time.time() start = time.time()
while True: while True:
elapsed = time.time() - start elapsed = time.time() - start
@ -259,32 +301,41 @@ def sleep(duration):
break break
time.sleep(min(duration - elapsed, 0.5)) time.sleep(min(duration - elapsed, 0.5))
_jinja2_env = None _jinja2_env = None
def jinja2_environment(behaviors_dir=None): def jinja2_environment(behaviors_dir=None):
global _jinja2_env global _jinja2_env
if not _jinja2_env: if not _jinja2_env:
import os, jinja2, json import os, jinja2, json
if behaviors_dir: if behaviors_dir:
_loader = jinja2.FileSystemLoader(os.path.join(behaviors_dir, _loader = jinja2.FileSystemLoader(
'js-templates')) os.path.join(behaviors_dir, "js-templates")
)
else: else:
_loader=jinja2.PackageLoader('brozzler', 'js-templates') _loader = jinja2.PackageLoader("brozzler", "js-templates")
_jinja2_env = jinja2.Environment(loader=_loader, auto_reload=False) _jinja2_env = jinja2.Environment(loader=_loader, auto_reload=False)
_jinja2_env.filters['json'] = json.dumps _jinja2_env.filters["json"] = json.dumps
return _jinja2_env return _jinja2_env
import urlcanon import urlcanon
def _remove_query(url): def _remove_query(url):
url.question_mark = b'' url.question_mark = b""
url.query = b'' url.query = b""
# XXX chop off path after last slash?? # XXX chop off path after last slash??
site_surt_canon = urlcanon.Canonicalizer( site_surt_canon = urlcanon.Canonicalizer(urlcanon.semantic.steps + [_remove_query])
urlcanon.semantic.steps + [_remove_query])
import doublethink import doublethink
import datetime import datetime
EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(
tzinfo=doublethink.UTC) EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=doublethink.UTC)
# we could make this configurable if there's a good reason # we could make this configurable if there's a good reason
MAX_PAGE_FAILURES = 3 MAX_PAGE_FAILURES = 3
@ -294,10 +345,31 @@ from brozzler.robots import is_permitted_by_robots
from brozzler.frontier import RethinkDbFrontier from brozzler.frontier import RethinkDbFrontier
from brozzler.browser import Browser, BrowserPool, BrowsingException from brozzler.browser import Browser, BrowserPool, BrowsingException
from brozzler.model import ( from brozzler.model import (
new_job, new_job_file, new_site, Job, Page, Site, InvalidJobConf) new_job,
new_job_file,
new_site,
Job,
Page,
Site,
InvalidJobConf,
)
from brozzler.cli import suggest_default_chrome_exe from brozzler.cli import suggest_default_chrome_exe
__all__ = ['Page', 'Site', 'BrozzlerWorker', 'is_permitted_by_robots', __all__ = [
'RethinkDbFrontier', 'Browser', 'BrowserPool', 'BrowsingException', "Page",
'new_job', 'new_site', 'Job', 'new_job_file', 'InvalidJobConf', "Site",
'sleep', 'thread_accept_exceptions', 'thread_raise'] "BrozzlerWorker",
"is_permitted_by_robots",
"RethinkDbFrontier",
"Browser",
"BrowserPool",
"BrowsingException",
"new_job",
"new_site",
"Job",
"new_job_file",
"InvalidJobConf",
"sleep",
"thread_accept_exceptions",
"thread_raise",
]

View file

@ -1,4 +1,4 @@
''' """
brozzler/browser.py - manages the browsers for brozzler brozzler/browser.py - manages the browsers for brozzler
Copyright (C) 2014-2023 Internet Archive Copyright (C) 2014-2023 Internet Archive
@ -14,7 +14,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
''' """
import logging import logging
import time import time
@ -33,30 +33,35 @@ from brozzler.chrome import Chrome
import socket import socket
import urlcanon import urlcanon
class BrowsingException(Exception): class BrowsingException(Exception):
pass pass
class NoBrowsersAvailable(Exception): class NoBrowsersAvailable(Exception):
pass pass
class BrowsingTimeout(BrowsingException): class BrowsingTimeout(BrowsingException):
pass pass
class BrowserPool: class BrowserPool:
''' """
Manages pool of browsers. Automatically chooses available port for the Manages pool of browsers. Automatically chooses available port for the
debugging protocol. debugging protocol.
''' """
logger = logging.getLogger(__module__ + '.' + __qualname__)
logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self, size=3, **kwargs): def __init__(self, size=3, **kwargs):
''' """
Initializes the pool. Initializes the pool.
Args: Args:
size: size of pool (default 3) size: size of pool (default 3)
**kwargs: arguments for Browser(...) **kwargs: arguments for Browser(...)
''' """
self.size = size self.size = size
self.kwargs = kwargs self.kwargs = kwargs
self._in_use = set() self._in_use = set()
@ -65,7 +70,7 @@ class BrowserPool:
def _fresh_browser(self): def _fresh_browser(self):
# choose available port # choose available port
sock = socket.socket() sock = socket.socket()
sock.bind(('0.0.0.0', 0)) sock.bind(("0.0.0.0", 0))
port = sock.getsockname()[1] port = sock.getsockname()[1]
sock.close() sock.close()
@ -73,12 +78,12 @@ class BrowserPool:
return browser return browser
def acquire_multi(self, n=1): def acquire_multi(self, n=1):
''' """
Returns a list of up to `n` browsers. Returns a list of up to `n` browsers.
Raises: Raises:
NoBrowsersAvailable if none available NoBrowsersAvailable if none available
''' """
browsers = [] browsers = []
with self._lock: with self._lock:
if len(self._in_use) >= self.size: if len(self._in_use) >= self.size:
@ -90,7 +95,7 @@ class BrowserPool:
return browsers return browsers
def acquire(self): def acquire(self):
''' """
Returns an available instance. Returns an available instance.
Returns: Returns:
@ -98,7 +103,7 @@ class BrowserPool:
Raises: Raises:
NoBrowsersAvailable if none available NoBrowsersAvailable if none available
''' """
with self._lock: with self._lock:
if len(self._in_use) >= self.size: if len(self._in_use) >= self.size:
raise NoBrowsersAvailable raise NoBrowsersAvailable
@ -120,8 +125,8 @@ class BrowserPool:
def shutdown_now(self): def shutdown_now(self):
self.logger.info( self.logger.info(
'shutting down browser pool (%s browsers in use)', "shutting down browser pool (%s browsers in use)", len(self._in_use)
len(self._in_use)) )
with self._lock: with self._lock:
for browser in self._in_use: for browser in self._in_use:
browser.stop() browser.stop()
@ -132,8 +137,9 @@ class BrowserPool:
def num_in_use(self): def num_in_use(self):
return len(self._in_use) return len(self._in_use)
class WebsockReceiverThread(threading.Thread): class WebsockReceiverThread(threading.Thread):
logger = logging.getLogger(__module__ + '.' + __qualname__) logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self, websock, name=None, daemon=True): def __init__(self, websock, name=None, daemon=True):
super().__init__(name=name, daemon=daemon) super().__init__(name=name, daemon=daemon)
@ -175,50 +181,54 @@ class WebsockReceiverThread(threading.Thread):
self.is_open = True self.is_open = True
def _on_error(self, websock, e): def _on_error(self, websock, e):
''' """
Raises BrowsingException in the thread that created this instance. Raises BrowsingException in the thread that created this instance.
''' """
if isinstance(e, ( if isinstance(
websocket.WebSocketConnectionClosedException, e, (websocket.WebSocketConnectionClosedException, ConnectionResetError)
ConnectionResetError)): ):
self.logger.error('websocket closed, did chrome die?') self.logger.error("websocket closed, did chrome die?")
else: else:
self.logger.error( self.logger.error("exception from websocket receiver thread", exc_info=1)
'exception from websocket receiver thread',
exc_info=1)
brozzler.thread_raise(self.calling_thread, BrowsingException) brozzler.thread_raise(self.calling_thread, BrowsingException)
def run(self): def run(self):
# ping_timeout is used as the timeout for the call to select.select() # ping_timeout is used as the timeout for the call to select.select()
# in addition to its documented purpose, and must have a value to avoid # in addition to its documented purpose, and must have a value to avoid
# hangs in certain situations # hangs in certain situations
self.websock.run_forever(sockopt=((socket.IPPROTO_TCP, socket.TCP_NODELAY, 1),), self.websock.run_forever(
ping_timeout=0.5) sockopt=((socket.IPPROTO_TCP, socket.TCP_NODELAY, 1),), ping_timeout=0.5
)
def _on_message(self, websock, message): def _on_message(self, websock, message):
try: try:
self._handle_message(websock, message) self._handle_message(websock, message)
except: except:
self.logger.error( self.logger.error(
'uncaught exception in _handle_message message=%s', "uncaught exception in _handle_message message=%s",
message, exc_info=True) message,
exc_info=True,
)
def _network_response_received(self, message): def _network_response_received(self, message):
status = message['params']['response'].get('status') status = message["params"]["response"].get("status")
if (status == 420 and 'Warcprox-Meta' in CaseInsensitiveDict( if status == 420 and "Warcprox-Meta" in CaseInsensitiveDict(
message['params']['response']['headers'])): message["params"]["response"]["headers"]
):
if not self.reached_limit: if not self.reached_limit:
warcprox_meta = json.loads(CaseInsensitiveDict( warcprox_meta = json.loads(
message['params']['response']['headers'])['Warcprox-Meta']) CaseInsensitiveDict(message["params"]["response"]["headers"])[
self.reached_limit = brozzler.ReachedLimit( "Warcprox-Meta"
warcprox_meta=warcprox_meta) ]
self.logger.info('reached limit %s', self.reached_limit) )
brozzler.thread_raise( self.reached_limit = brozzler.ReachedLimit(warcprox_meta=warcprox_meta)
self.calling_thread, brozzler.ReachedLimit) self.logger.info("reached limit %s", self.reached_limit)
brozzler.thread_raise(self.calling_thread, brozzler.ReachedLimit)
else: else:
self.logger.info( self.logger.info(
'reached limit but self.reached_limit is already set, ' "reached limit but self.reached_limit is already set, "
'assuming the calling thread is already handling this') "assuming the calling thread is already handling this"
)
if self.on_response: if self.on_response:
self.on_response(message) self.on_response(message)
@ -226,75 +236,92 @@ class WebsockReceiverThread(threading.Thread):
self.page_status = status self.page_status = status
def _javascript_dialog_opening(self, message): def _javascript_dialog_opening(self, message):
self.logger.info('javascript dialog opened: %s', message) self.logger.info("javascript dialog opened: %s", message)
if message['params']['type'] == 'alert': if message["params"]["type"] == "alert":
accept = True accept = True
else: else:
accept = False accept = False
self.websock.send( self.websock.send(
json.dumps(dict( json.dumps(
id=0, method='Page.handleJavaScriptDialog', dict(
params={'accept': accept}), separators=',:')) id=0,
method="Page.handleJavaScriptDialog",
params={"accept": accept},
),
separators=",:",
)
)
def _handle_message(self, websock, json_message): def _handle_message(self, websock, json_message):
message = json.loads(json_message) message = json.loads(json_message)
if 'method' in message: if "method" in message:
if message['method'] == 'Page.loadEventFired': if message["method"] == "Page.loadEventFired":
self.got_page_load_event = datetime.datetime.utcnow() self.got_page_load_event = datetime.datetime.utcnow()
elif message['method'] == 'Network.responseReceived': elif message["method"] == "Network.responseReceived":
self._network_response_received(message) self._network_response_received(message)
elif message['method'] == 'Network.requestWillBeSent': elif message["method"] == "Network.requestWillBeSent":
if self.on_request: if self.on_request:
self.on_request(message) self.on_request(message)
elif message['method'] == 'Page.interstitialShown': elif message["method"] == "Page.interstitialShown":
# AITFIVE-1529: handle http auth # AITFIVE-1529: handle http auth
# we should kill the browser when we receive Page.interstitialShown and # we should kill the browser when we receive Page.interstitialShown and
# consider the page finished, until this is fixed: # consider the page finished, until this is fixed:
# https://bugs.chromium.org/p/chromium/issues/detail?id=764505 # https://bugs.chromium.org/p/chromium/issues/detail?id=764505
self.logger.info('Page.interstialShown (likely unsupported http auth request)') self.logger.info(
brozzler.thread_raise(self.calling_thread, brozzler.PageInterstitialShown) "Page.interstialShown (likely unsupported http auth request)"
elif message['method'] == 'Inspector.targetCrashed': )
self.logger.error( brozzler.thread_raise(
'''chrome tab went "aw snap" or "he's dead jim"!''') self.calling_thread, brozzler.PageInterstitialShown
)
elif message["method"] == "Inspector.targetCrashed":
self.logger.error("""chrome tab went "aw snap" or "he's dead jim"!""")
brozzler.thread_raise(self.calling_thread, BrowsingException) brozzler.thread_raise(self.calling_thread, BrowsingException)
elif message['method'] == 'Console.messageAdded': elif message["method"] == "Console.messageAdded":
self.logger.debug( self.logger.debug(
'console.%s %s', message['params']['message']['level'], "console.%s %s",
message['params']['message']['text']) message["params"]["message"]["level"],
elif message['method'] == 'Runtime.exceptionThrown': message["params"]["message"]["text"],
self.logger.debug('uncaught exception: %s', message) )
elif message['method'] == 'Page.javascriptDialogOpening': elif message["method"] == "Runtime.exceptionThrown":
self.logger.debug("uncaught exception: %s", message)
elif message["method"] == "Page.javascriptDialogOpening":
self._javascript_dialog_opening(message) self._javascript_dialog_opening(message)
elif (message['method'] == 'Network.loadingFailed' elif (
and 'params' in message and 'errorText' in message['params'] message["method"] == "Network.loadingFailed"
and message['params']['errorText'] == 'net::ERR_PROXY_CONNECTION_FAILED'): and "params" in message
and "errorText" in message["params"]
and message["params"]["errorText"] == "net::ERR_PROXY_CONNECTION_FAILED"
):
brozzler.thread_raise(self.calling_thread, brozzler.ProxyError) brozzler.thread_raise(self.calling_thread, brozzler.ProxyError)
elif message['method'] == 'ServiceWorker.workerVersionUpdated': elif message["method"] == "ServiceWorker.workerVersionUpdated":
if self.on_service_worker_version_updated: if self.on_service_worker_version_updated:
self.on_service_worker_version_updated(message) self.on_service_worker_version_updated(message)
# else: # else:
# self.logger.debug("%s %s", message["method"], json_message) # self.logger.debug("%s %s", message["method"], json_message)
elif 'result' in message: elif "result" in message:
if message['id'] in self._result_messages: if message["id"] in self._result_messages:
self._result_messages[message['id']] = message self._result_messages[message["id"]] = message
# else: # else:
# self.logger.debug("%s", json_message) # self.logger.debug("%s", json_message)
# else: # else:
# self.logger.debug("%s", json_message) # self.logger.debug("%s", json_message)
class Browser: class Browser:
''' """
Manages an instance of Chrome for browsing pages. Manages an instance of Chrome for browsing pages.
''' """
logger = logging.getLogger(__module__ + '.' + __qualname__)
logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self, **kwargs): def __init__(self, **kwargs):
''' """
Initializes the Browser. Initializes the Browser.
Args: Args:
**kwargs: arguments for Chrome(...) **kwargs: arguments for Chrome(...)
''' """
self.chrome = Chrome(**kwargs) self.chrome = Chrome(**kwargs)
self.websock_url = None self.websock_url = None
self.websock = None self.websock = None
@ -311,9 +338,9 @@ class Browser:
self.stop() self.stop()
def _wait_for(self, callback, timeout=None): def _wait_for(self, callback, timeout=None):
''' """
Spins until callback() returns truthy. Spins until callback() returns truthy.
''' """
start = time.time() start = time.time()
while True: while True:
if callback(): if callback():
@ -321,112 +348,140 @@ class Browser:
elapsed = time.time() - start elapsed = time.time() - start
if timeout and elapsed > timeout: if timeout and elapsed > timeout:
raise BrowsingTimeout( raise BrowsingTimeout(
'timed out after %.1fs waiting for: %s' % ( "timed out after %.1fs waiting for: %s" % (elapsed, callback)
elapsed, callback)) )
brozzler.sleep(self._wait_interval) brozzler.sleep(self._wait_interval)
def send_to_chrome(self, suppress_logging=False, **kwargs): def send_to_chrome(self, suppress_logging=False, **kwargs):
msg_id = next(self._command_id) msg_id = next(self._command_id)
kwargs['id'] = msg_id kwargs["id"] = msg_id
msg = json.dumps(kwargs, separators=',:') msg = json.dumps(kwargs, separators=",:")
logging.log( logging.log(
logging.TRACE if suppress_logging else logging.DEBUG, logging.TRACE if suppress_logging else logging.DEBUG,
'sending message to %s: %s', self.websock, msg) "sending message to %s: %s",
self.websock,
msg,
)
self.websock.send(msg) self.websock.send(msg)
return msg_id return msg_id
def start(self, **kwargs): def start(self, **kwargs):
''' """
Starts chrome if it's not running. Starts chrome if it's not running.
Args: Args:
**kwargs: arguments for self.chrome.start(...) **kwargs: arguments for self.chrome.start(...)
''' """
if not self.is_running(): if not self.is_running():
self.websock_url = self.chrome.start(**kwargs) self.websock_url = self.chrome.start(**kwargs)
self.websock = websocket.WebSocketApp(self.websock_url) self.websock = websocket.WebSocketApp(self.websock_url)
self.websock_thread = WebsockReceiverThread( self.websock_thread = WebsockReceiverThread(
self.websock, name='WebsockThread:%s' % self.chrome.port) self.websock, name="WebsockThread:%s" % self.chrome.port
)
self.websock_thread.start() self.websock_thread.start()
self._wait_for(lambda: self.websock_thread.is_open, timeout=30) self._wait_for(lambda: self.websock_thread.is_open, timeout=30)
# tell browser to send us messages we're interested in # tell browser to send us messages we're interested in
self.send_to_chrome(method='Network.enable') self.send_to_chrome(method="Network.enable")
self.send_to_chrome(method='Page.enable') self.send_to_chrome(method="Page.enable")
# Enable Console & Runtime output only when debugging. # Enable Console & Runtime output only when debugging.
# After all, we just print these events with debug(), we don't use # After all, we just print these events with debug(), we don't use
# them in Brozzler logic. # them in Brozzler logic.
if self.logger.isEnabledFor(logging.DEBUG): if self.logger.isEnabledFor(logging.DEBUG):
self.send_to_chrome(method='Console.enable') self.send_to_chrome(method="Console.enable")
self.send_to_chrome(method='Runtime.enable') self.send_to_chrome(method="Runtime.enable")
self.send_to_chrome(method='ServiceWorker.enable') self.send_to_chrome(method="ServiceWorker.enable")
self.send_to_chrome(method='ServiceWorker.setForceUpdateOnPageLoad') self.send_to_chrome(method="ServiceWorker.setForceUpdateOnPageLoad")
# disable google analytics and amp analytics # disable google analytics and amp analytics
self.send_to_chrome( self.send_to_chrome(
method='Network.setBlockedURLs', method="Network.setBlockedURLs",
params={'urls': ['*google-analytics.com/analytics.js*', params={
'*google-analytics.com/ga.js*', "urls": [
'*google-analytics.com/ga_exp.js*', "*google-analytics.com/analytics.js*",
'*google-analytics.com/urchin.js*', "*google-analytics.com/ga.js*",
'*google-analytics.com/collect*', "*google-analytics.com/ga_exp.js*",
'*google-analytics.com/r/collect*', "*google-analytics.com/urchin.js*",
'*google-analytics.com/__utm.gif*', "*google-analytics.com/collect*",
'*google-analytics.com/gtm/js?*', "*google-analytics.com/r/collect*",
'*google-analytics.com/cx/api.js*', "*google-analytics.com/__utm.gif*",
'*cdn.ampproject.org/*/amp-analytics*.js']}) "*google-analytics.com/gtm/js?*",
"*google-analytics.com/cx/api.js*",
"*cdn.ampproject.org/*/amp-analytics*.js",
]
},
)
def stop(self): def stop(self):
''' """
Stops chrome if it's running. Stops chrome if it's running.
''' """
try: try:
if (self.websock and self.websock.sock if self.websock and self.websock.sock and self.websock.sock.connected:
and self.websock.sock.connected): self.logger.info("shutting down websocket connection")
self.logger.info('shutting down websocket connection')
try: try:
self.websock.close() self.websock.close()
except BaseException as e: except BaseException as e:
self.logger.error( self.logger.error(
'exception closing websocket %s - %s', "exception closing websocket %s - %s", self.websock, e
self.websock, e) )
self.chrome.stop() self.chrome.stop()
if self.websock_thread and ( if self.websock_thread and (
self.websock_thread != threading.current_thread()): self.websock_thread != threading.current_thread()
):
self.websock_thread.join(timeout=30) self.websock_thread.join(timeout=30)
if self.websock_thread.is_alive(): if self.websock_thread.is_alive():
self.logger.error( self.logger.error(
'%s still alive 30 seconds after closing %s, will ' "%s still alive 30 seconds after closing %s, will "
'forcefully nudge it again', self.websock_thread, "forcefully nudge it again",
self.websock) self.websock_thread,
self.websock,
)
self.websock.keep_running = False self.websock.keep_running = False
self.websock_thread.join(timeout=30) self.websock_thread.join(timeout=30)
if self.websock_thread.is_alive(): if self.websock_thread.is_alive():
self.logger.critical( self.logger.critical(
'%s still alive 60 seconds after closing %s', "%s still alive 60 seconds after closing %s",
self.websock_thread, self.websock) self.websock_thread,
self.websock,
)
self.websock_url = None self.websock_url = None
except: except:
self.logger.error('problem stopping', exc_info=True) self.logger.error("problem stopping", exc_info=True)
def is_running(self): def is_running(self):
return self.websock_url is not None return self.websock_url is not None
def browse_page( def browse_page(
self, page_url, extra_headers=None, self,
user_agent=None, behavior_parameters=None, behaviors_dir=None, page_url,
on_request=None, on_response=None, extra_headers=None,
on_service_worker_version_updated=None, on_screenshot=None, user_agent=None,
username=None, password=None, hashtags=None, behavior_parameters=None,
screenshot_full_page=False, skip_extract_outlinks=False, behaviors_dir=None,
skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False, on_request=None,
page_timeout=300, behavior_timeout=900, on_response=None,
extract_outlinks_timeout=60, download_throughput=-1, stealth=False): on_service_worker_version_updated=None,
''' on_screenshot=None,
username=None,
password=None,
hashtags=None,
screenshot_full_page=False,
skip_extract_outlinks=False,
skip_visit_hashtags=False,
skip_youtube_dl=False,
simpler404=False,
page_timeout=300,
behavior_timeout=900,
extract_outlinks_timeout=60,
download_throughput=-1,
stealth=False,
):
"""
Browses page in browser. Browses page in browser.
Browser should already be running, i.e. start() should have been Browser should already be running, i.e. start() should have been
@ -473,54 +528,60 @@ class Browser:
Raises: Raises:
brozzler.ProxyError: in case of proxy connection error brozzler.ProxyError: in case of proxy connection error
BrowsingException: if browsing the page fails in some other way BrowsingException: if browsing the page fails in some other way
''' """
if not self.is_running(): if not self.is_running():
raise BrowsingException('browser has not been started') raise BrowsingException("browser has not been started")
if self.is_browsing: if self.is_browsing:
raise BrowsingException('browser is already busy browsing a page') raise BrowsingException("browser is already busy browsing a page")
self.is_browsing = True self.is_browsing = True
if on_request: if on_request:
self.websock_thread.on_request = on_request self.websock_thread.on_request = on_request
if on_response: if on_response:
self.websock_thread.on_response = on_response self.websock_thread.on_response = on_response
if on_service_worker_version_updated: if on_service_worker_version_updated:
self.websock_thread.on_service_worker_version_updated = \ self.websock_thread.on_service_worker_version_updated = (
on_service_worker_version_updated on_service_worker_version_updated
)
try: try:
with brozzler.thread_accept_exceptions(): with brozzler.thread_accept_exceptions():
self.configure_browser( self.configure_browser(
extra_headers=extra_headers, extra_headers=extra_headers,
user_agent=user_agent, user_agent=user_agent,
download_throughput=download_throughput, download_throughput=download_throughput,
stealth=stealth) stealth=stealth,
)
self.navigate_to_page(page_url, timeout=page_timeout) self.navigate_to_page(page_url, timeout=page_timeout)
if password: if password:
self.try_login(username, password, timeout=page_timeout) self.try_login(username, password, timeout=page_timeout)
# if login redirected us, return to page_url # if login redirected us, return to page_url
if page_url != self.url().split('#')[0]: if page_url != self.url().split("#")[0]:
self.logger.debug( self.logger.debug(
'login navigated away from %s; returning!', "login navigated away from %s; returning!", page_url
page_url) )
self.navigate_to_page(page_url, timeout=page_timeout) self.navigate_to_page(page_url, timeout=page_timeout)
# If the target page HTTP status is 4xx/5xx, there is no point # If the target page HTTP status is 4xx/5xx, there is no point
# in running behaviors, screenshot, outlink and hashtag # in running behaviors, screenshot, outlink and hashtag
# extraction as we didn't get a valid page. # extraction as we didn't get a valid page.
# This is only enabled with option `simpler404`. # This is only enabled with option `simpler404`.
run_behaviors = True run_behaviors = True
if simpler404 and (self.websock_thread.page_status is None or if simpler404 and (
self.websock_thread.page_status >= 400): self.websock_thread.page_status is None
or self.websock_thread.page_status >= 400
):
run_behaviors = False run_behaviors = False
if run_behaviors and behavior_timeout > 0: if run_behaviors and behavior_timeout > 0:
behavior_script = brozzler.behavior_script( behavior_script = brozzler.behavior_script(
page_url, behavior_parameters, page_url, behavior_parameters, behaviors_dir=behaviors_dir
behaviors_dir=behaviors_dir) )
self.run_behavior(behavior_script, timeout=behavior_timeout) self.run_behavior(behavior_script, timeout=behavior_timeout)
final_page_url = self.url() final_page_url = self.url()
if on_screenshot: if on_screenshot:
if simpler404: if simpler404:
if self.websock_thread.page_status and \ if (
self.websock_thread.page_status < 400: self.websock_thread.page_status
and self.websock_thread.page_status < 400
):
self._try_screenshot(on_screenshot, screenshot_full_page) self._try_screenshot(on_screenshot, screenshot_full_page)
else: else:
self._try_screenshot(on_screenshot, screenshot_full_page) self._try_screenshot(on_screenshot, screenshot_full_page)
@ -528,9 +589,7 @@ class Browser:
if not run_behaviors or skip_extract_outlinks: if not run_behaviors or skip_extract_outlinks:
outlinks = [] outlinks = []
else: else:
outlinks = self.extract_outlinks( outlinks = self.extract_outlinks(timeout=extract_outlinks_timeout)
timeout=extract_outlinks_timeout
)
if run_behaviors and not skip_visit_hashtags: if run_behaviors and not skip_visit_hashtags:
self.visit_hashtags(final_page_url, hashtags, outlinks) self.visit_hashtags(final_page_url, hashtags, outlinks)
return final_page_url, outlinks return final_page_url, outlinks
@ -539,7 +598,7 @@ class Browser:
# more information, raise that one # more information, raise that one
raise self.websock_thread.reached_limit raise self.websock_thread.reached_limit
except websocket.WebSocketConnectionClosedException as e: except websocket.WebSocketConnectionClosedException as e:
self.logger.error('websocket closed, did chrome die?') self.logger.error("websocket closed, did chrome die?")
raise BrowsingException(e) raise BrowsingException(e)
finally: finally:
self.is_browsing = False self.is_browsing = False
@ -550,21 +609,24 @@ class Browser:
"""The browser instance must be scrolled to the top of the page before """The browser instance must be scrolled to the top of the page before
trying to get a screenshot. trying to get a screenshot.
""" """
self.send_to_chrome(method='Runtime.evaluate', suppress_logging=True, self.send_to_chrome(
params={'expression': 'window.scroll(0,0)'}) method="Runtime.evaluate",
suppress_logging=True,
params={"expression": "window.scroll(0,0)"},
)
for i in range(3): for i in range(3):
try: try:
jpeg_bytes = self.screenshot(full_page) jpeg_bytes = self.screenshot(full_page)
on_screenshot(jpeg_bytes) on_screenshot(jpeg_bytes)
return return
except BrowsingTimeout as e: except BrowsingTimeout as e:
logging.error('attempt %s/3: %s', i+1, e) logging.error("attempt %s/3: %s", i + 1, e)
def visit_hashtags(self, page_url, hashtags, outlinks): def visit_hashtags(self, page_url, hashtags, outlinks):
_hashtags = set(hashtags or []) _hashtags = set(hashtags or [])
for outlink in outlinks: for outlink in outlinks:
url = urlcanon.whatwg(outlink) url = urlcanon.whatwg(outlink)
hashtag = (url.hash_sign + url.fragment).decode('utf-8') hashtag = (url.hash_sign + url.fragment).decode("utf-8")
urlcanon.canon.remove_fragment(url) urlcanon.canon.remove_fragment(url)
if hashtag and str(url) == page_url: if hashtag and str(url) == page_url:
_hashtags.add(hashtag) _hashtags.add(hashtag)
@ -572,84 +634,85 @@ class Browser:
# out which hashtags were visited already and skip those # out which hashtags were visited already and skip those
for hashtag in _hashtags: for hashtag in _hashtags:
# navigate_to_hashtag (nothing to wait for so no timeout?) # navigate_to_hashtag (nothing to wait for so no timeout?)
self.logger.debug('navigating to hashtag %s', hashtag) self.logger.debug("navigating to hashtag %s", hashtag)
url = urlcanon.whatwg(page_url) url = urlcanon.whatwg(page_url)
url.hash_sign = b'#' url.hash_sign = b"#"
url.fragment = hashtag[1:].encode('utf-8') url.fragment = hashtag[1:].encode("utf-8")
self.send_to_chrome( self.send_to_chrome(method="Page.navigate", params={"url": str(url)})
method='Page.navigate', params={'url': str(url)})
time.sleep(5) # um.. wait for idleness or something? time.sleep(5) # um.. wait for idleness or something?
# take another screenshot? # take another screenshot?
# run behavior again with short timeout? # run behavior again with short timeout?
# retrieve outlinks again and append to list? # retrieve outlinks again and append to list?
def configure_browser(self, extra_headers=None, user_agent=None, def configure_browser(
download_throughput=-1, stealth=False): self, extra_headers=None, user_agent=None, download_throughput=-1, stealth=False
):
headers = extra_headers or {} headers = extra_headers or {}
headers['Accept-Encoding'] = 'gzip' # avoid encodings br, sdch headers["Accept-Encoding"] = "gzip" # avoid encodings br, sdch
self.websock_thread.expect_result(self._command_id.peek()) self.websock_thread.expect_result(self._command_id.peek())
msg_id = self.send_to_chrome( msg_id = self.send_to_chrome(
method='Network.setExtraHTTPHeaders', method="Network.setExtraHTTPHeaders", params={"headers": headers}
params={'headers': headers}) )
self._wait_for( self._wait_for(lambda: self.websock_thread.received_result(msg_id), timeout=10)
lambda: self.websock_thread.received_result(msg_id),
timeout=10)
if user_agent: if user_agent:
msg_id = self.send_to_chrome( msg_id = self.send_to_chrome(
method='Network.setUserAgentOverride', method="Network.setUserAgentOverride", params={"userAgent": user_agent}
params={'userAgent': user_agent}) )
if download_throughput > -1: if download_throughput > -1:
# traffic shaping already used by SPN2 to aid warcprox resilience # traffic shaping already used by SPN2 to aid warcprox resilience
# parameter value as bytes/second, or -1 to disable (default) # parameter value as bytes/second, or -1 to disable (default)
msg_id = self.send_to_chrome(method='Network.emulateNetworkConditions', msg_id = self.send_to_chrome(
params={'downloadThroughput': download_throughput}) method="Network.emulateNetworkConditions",
params={"downloadThroughput": download_throughput},
)
if stealth: if stealth:
self.websock_thread.expect_result(self._command_id.peek()) self.websock_thread.expect_result(self._command_id.peek())
js = brozzler.jinja2_environment().get_template('stealth.js').render() js = brozzler.jinja2_environment().get_template("stealth.js").render()
msg_id = self.send_to_chrome( msg_id = self.send_to_chrome(
method='Page.addScriptToEvaluateOnNewDocument', method="Page.addScriptToEvaluateOnNewDocument", params={"source": js}
params={'source': js}) )
self._wait_for( self._wait_for(
lambda: self.websock_thread.received_result(msg_id), lambda: self.websock_thread.received_result(msg_id), timeout=10
timeout=10) )
def navigate_to_page(self, page_url, timeout=300): def navigate_to_page(self, page_url, timeout=300):
self.logger.info('navigating to page %s', page_url) self.logger.info("navigating to page %s", page_url)
self.websock_thread.got_page_load_event = None self.websock_thread.got_page_load_event = None
self.websock_thread.page_status = None self.websock_thread.page_status = None
self.send_to_chrome(method='Page.navigate', params={'url': page_url}) self.send_to_chrome(method="Page.navigate", params={"url": page_url})
self._wait_for( self._wait_for(lambda: self.websock_thread.got_page_load_event, timeout=timeout)
lambda: self.websock_thread.got_page_load_event,
timeout=timeout)
def extract_outlinks(self, timeout=60): def extract_outlinks(self, timeout=60):
self.logger.info('extracting outlinks') self.logger.info("extracting outlinks")
self.websock_thread.expect_result(self._command_id.peek()) self.websock_thread.expect_result(self._command_id.peek())
js = brozzler.jinja2_environment().get_template( js = brozzler.jinja2_environment().get_template("extract-outlinks.js").render()
'extract-outlinks.js').render()
msg_id = self.send_to_chrome( msg_id = self.send_to_chrome(
method='Runtime.evaluate', params={'expression': js}) method="Runtime.evaluate", params={"expression": js}
)
self._wait_for( self._wait_for(
lambda: self.websock_thread.received_result(msg_id), lambda: self.websock_thread.received_result(msg_id), timeout=timeout
timeout=timeout) )
message = self.websock_thread.pop_result(msg_id) message = self.websock_thread.pop_result(msg_id)
if ('result' in message and 'result' in message['result'] if (
and 'value' in message['result']['result']): "result" in message
if message['result']['result']['value']: and "result" in message["result"]
and "value" in message["result"]["result"]
):
if message["result"]["result"]["value"]:
out = [] out = []
for link in message['result']['result']['value'].split('\n'): for link in message["result"]["result"]["value"].split("\n"):
try: try:
out.append(str(urlcanon.whatwg(link))) out.append(str(urlcanon.whatwg(link)))
except AddressValueError: except AddressValueError:
self.logger.warning('skip invalid outlink: %s', link) self.logger.warning("skip invalid outlink: %s", link)
return frozenset(out) return frozenset(out)
else: else:
# no links found # no links found
return frozenset() return frozenset()
else: else:
self.logger.error( self.logger.error(
'problem extracting outlinks, result message: %s', message) "problem extracting outlinks, result message: %s", message
)
return frozenset() return frozenset()
def screenshot(self, full_page=False, timeout=45): def screenshot(self, full_page=False, timeout=45):
@ -657,121 +720,141 @@ class Browser:
inspiration: inspiration:
https://github.com/GoogleChrome/puppeteer/blob/master/lib/Page.js#L898 https://github.com/GoogleChrome/puppeteer/blob/master/lib/Page.js#L898
""" """
self.logger.info('taking screenshot') self.logger.info("taking screenshot")
if full_page: if full_page:
self.websock_thread.expect_result(self._command_id.peek()) self.websock_thread.expect_result(self._command_id.peek())
msg_id = self.send_to_chrome(method='Page.getLayoutMetrics') msg_id = self.send_to_chrome(method="Page.getLayoutMetrics")
self._wait_for( self._wait_for(
lambda: self.websock_thread.received_result(msg_id), lambda: self.websock_thread.received_result(msg_id), timeout=timeout
timeout=timeout) )
message = self.websock_thread.pop_result(msg_id) message = self.websock_thread.pop_result(msg_id)
width = message['result']['contentSize']['width'] width = message["result"]["contentSize"]["width"]
height = message['result']['contentSize']['height'] height = message["result"]["contentSize"]["height"]
clip = dict(x=0, y=0, width=width, height=height, scale=1) clip = dict(x=0, y=0, width=width, height=height, scale=1)
deviceScaleFactor = 1 deviceScaleFactor = 1
screenOrientation = {'angle': 0, 'type': 'portraitPrimary'} screenOrientation = {"angle": 0, "type": "portraitPrimary"}
self.send_to_chrome( self.send_to_chrome(
method='Emulation.setDeviceMetricsOverride', method="Emulation.setDeviceMetricsOverride",
params=dict(mobile=False, width=width, height=height, params=dict(
mobile=False,
width=width,
height=height,
deviceScaleFactor=deviceScaleFactor, deviceScaleFactor=deviceScaleFactor,
screenOrientation=screenOrientation) screenOrientation=screenOrientation,
),
) )
capture_params = {'format': 'jpeg', 'quality': 95, 'clip': clip} capture_params = {"format": "jpeg", "quality": 95, "clip": clip}
else: else:
capture_params = {'format': 'jpeg', 'quality': 95} capture_params = {"format": "jpeg", "quality": 95}
self.websock_thread.expect_result(self._command_id.peek()) self.websock_thread.expect_result(self._command_id.peek())
msg_id = self.send_to_chrome(method='Page.captureScreenshot', msg_id = self.send_to_chrome(
params=capture_params) method="Page.captureScreenshot", params=capture_params
)
self._wait_for( self._wait_for(
lambda: self.websock_thread.received_result(msg_id), lambda: self.websock_thread.received_result(msg_id), timeout=timeout
timeout=timeout) )
message = self.websock_thread.pop_result(msg_id) message = self.websock_thread.pop_result(msg_id)
jpeg_bytes = base64.b64decode(message['result']['data']) jpeg_bytes = base64.b64decode(message["result"]["data"])
return jpeg_bytes return jpeg_bytes
def url(self, timeout=30): def url(self, timeout=30):
''' """
Returns value of document.URL from the browser. Returns value of document.URL from the browser.
''' """
self.websock_thread.expect_result(self._command_id.peek()) self.websock_thread.expect_result(self._command_id.peek())
msg_id = self.send_to_chrome( msg_id = self.send_to_chrome(
method='Runtime.evaluate', method="Runtime.evaluate", params={"expression": "document.URL"}
params={'expression': 'document.URL'}) )
self._wait_for( self._wait_for(
lambda: self.websock_thread.received_result(msg_id), lambda: self.websock_thread.received_result(msg_id), timeout=timeout
timeout=timeout) )
message = self.websock_thread.pop_result(msg_id) message = self.websock_thread.pop_result(msg_id)
return message['result']['result']['value'] return message["result"]["result"]["value"]
def run_behavior(self, behavior_script, timeout=900): def run_behavior(self, behavior_script, timeout=900):
self.send_to_chrome( self.send_to_chrome(
method='Runtime.evaluate', suppress_logging=True, method="Runtime.evaluate",
params={'expression': behavior_script}) suppress_logging=True,
params={"expression": behavior_script},
)
check_interval = min(timeout, 7) check_interval = min(timeout, 7)
start = time.time() start = time.time()
while True: while True:
elapsed = time.time() - start elapsed = time.time() - start
if elapsed > timeout: if elapsed > timeout:
logging.info( logging.info("behavior reached hard timeout after %.1fs", elapsed)
'behavior reached hard timeout after %.1fs', elapsed)
return return
brozzler.sleep(check_interval) brozzler.sleep(check_interval)
self.websock_thread.expect_result(self._command_id.peek()) self.websock_thread.expect_result(self._command_id.peek())
msg_id = self.send_to_chrome( msg_id = self.send_to_chrome(
method='Runtime.evaluate', suppress_logging=True, method="Runtime.evaluate",
params={'expression': 'umbraBehaviorFinished()'}) suppress_logging=True,
params={"expression": "umbraBehaviorFinished()"},
)
try: try:
self._wait_for( self._wait_for(
lambda: self.websock_thread.received_result(msg_id), lambda: self.websock_thread.received_result(msg_id), timeout=5
timeout=5) )
msg = self.websock_thread.pop_result(msg_id) msg = self.websock_thread.pop_result(msg_id)
if (msg and 'result' in msg if (
and not ('exceptionDetails' in msg['result']) msg
and not ('wasThrown' in msg['result'] and "result" in msg
and msg['result']['wasThrown']) and not ("exceptionDetails" in msg["result"])
                        and not (
                            "wasThrown" in msg["result"] and msg["result"]["wasThrown"]
                        )
                        and "result" in msg["result"]
                        and type(msg["result"]["result"]["value"]) == bool
                        and msg["result"]["result"]["value"]
                    ):
                        self.logger.info("behavior decided it has finished")
                        return
            except BrowsingTimeout:
                pass

    def try_login(self, username, password, timeout=300):
        try_login_js = (
            brozzler.jinja2_environment()
            .get_template("try-login.js.j2")
            .render(username=username, password=password)
        )

        self.websock_thread.got_page_load_event = None
        self.send_to_chrome(
            method="Runtime.evaluate",
            suppress_logging=True,
            params={"expression": try_login_js},
        )

        # wait for tryLogin to finish trying (should be very very quick)
        start = time.time()
        while True:
            self.websock_thread.expect_result(self._command_id.peek())
            msg_id = self.send_to_chrome(
                method="Runtime.evaluate",
                params={
                    "expression": 'try { __brzl_tryLoginState } catch (e) { "maybe-submitted-form" }'
                },
            )
            try:
                self._wait_for(
                    lambda: self.websock_thread.received_result(msg_id), timeout=5
                )
                msg = self.websock_thread.pop_result(msg_id)
                if msg and "result" in msg and "result" in msg["result"]:
                    result = msg["result"]["result"]["value"]
                    if result == "login-form-not-found":
                        # we're done
                        return
                    elif result in ("submitted-form", "maybe-submitted-form"):
                        # wait for page load event below
                        self.logger.info(
                            "submitted a login form, waiting for another "
                            "page load event"
                        )
                        break
                    # else try again to get __brzl_tryLoginState
@@ -780,23 +863,23 @@ class Browser:
            if time.time() - start > 30:
                raise BrowsingException(
                    "timed out trying to check if tryLogin finished"
                )

        # if we get here, we submitted a form, now we wait for another page
        # load event
        self._wait_for(lambda: self.websock_thread.got_page_load_event, timeout=timeout)


class Counter:
    def __init__(self):
        self.next_value = 0

    def __next__(self):
        try:
            return self.next_value
        finally:
            self.next_value += 1

    def peek(self):
        return self.next_value
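Aside (illustrative, not part of this commit): a Counter like the one above appears to back the self._command_id.peek() calls in try_login, handing out sequential chrome devtools command ids. A minimal sketch of that behavior, assuming only the Counter class shown above:

    counter = Counter()
    assert counter.peek() == 0   # peek() shows the id the next command would get
    first_id = next(counter)     # __next__ returns 0, then advances
    second_id = next(counter)    # returns 1
    assert (first_id, second_id, counter.peek()) == (0, 1, 2)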
@@ -1,4 +1,4 @@
"""
brozzler/chrome.py - manages the chrome/chromium browser for brozzler
Copyright (C) 2014-2023 Internet Archive
@@ -14,7 +14,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import logging
import urllib.request
@@ -31,12 +31,13 @@ import json
import tempfile
import sys


def check_version(chrome_exe):
    """
    Raises SystemExit if `chrome_exe` is not a supported browser version.

    Must run in the main thread to have the desired effect.
    """
    # mac$ /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --version
    # Google Chrome 64.0.3282.140
    # mac$ /Applications/Google\ Chrome\ Canary.app/Contents/MacOS/Google\ Chrome\ Canary --version
@@ -45,25 +46,28 @@ def check_version(chrome_exe):
    # Using PPAPI flash.
    #  --ppapi-flash-path=/usr/lib/adobe-flashplugin/libpepflashplayer.so --ppapi-flash-version=
    # Chromium 61.0.3163.100 Built on Ubuntu , running on Ubuntu 16.04
    cmd = [chrome_exe, "--version"]
    out = subprocess.check_output(cmd, timeout=60)
    m = re.search(rb"(Chromium|Google Chrome) ([\d.]+)", out)
    if not m:
        sys.exit(
            "unable to parse browser version from output of "
            "%r: %r" % (subprocess.list2cmdline(cmd), out)
        )
    version_str = m.group(2).decode()
    major_version = int(version_str.split(".")[0])
    if major_version < 64:
        sys.exit(
            "brozzler requires chrome/chromium version 64 or "
            "later but %s reports version %s" % (chrome_exe, version_str)
        )


class Chrome:
    logger = logging.getLogger(__module__ + "." + __qualname__)

    def __init__(self, chrome_exe, port=9222, ignore_cert_errors=False):
        """
        Initializes instance of this class.

        Doesn't start the browser, start() does that.
@@ -73,7 +77,7 @@ class Chrome:
            port: chrome debugging protocol port (default 9222)
            ignore_cert_errors: configure chrome to accept all certs (default
                False)
        """
        self.port = port
        self.chrome_exe = chrome_exe
        self.ignore_cert_errors = ignore_cert_errors
@@ -81,63 +85,72 @@ class Chrome:
        self.chrome_process = None

    def __enter__(self):
        """
        Returns websocket url to chrome window with about:blank loaded.
        """
        return self.start()

    def __exit__(self, *args):
        self.stop()

    def _init_cookie_db(self, cookie_db):
        cookie_dir = os.path.join(self._chrome_user_data_dir, "Default")
        cookie_location = os.path.join(cookie_dir, "Cookies")
        self.logger.debug("cookie DB provided, writing to %s", cookie_location)
        os.makedirs(cookie_dir, exist_ok=True)
        try:
            with open(cookie_location, "wb") as cookie_file:
                cookie_file.write(cookie_db)
        except OSError:
            self.logger.error(
                "exception writing cookie file at %s", cookie_location, exc_info=True
            )

    def persist_and_read_cookie_db(self):
        cookie_location = os.path.join(self._chrome_user_data_dir, "Default", "Cookies")
        self.logger.debug(
            "marking cookies persistent then reading file into memory: %s",
            cookie_location,
        )
        try:
            with sqlite3.connect(cookie_location) as conn:
                cur = conn.cursor()
                cur.execute("UPDATE cookies SET is_persistent = 1")
        except sqlite3.Error:
            try:
                # db schema changed around version 66, this is the old schema
                with sqlite3.connect(cookie_location) as conn:
                    cur = conn.cursor()
                    cur.execute("UPDATE cookies SET persistent = 1")
            except sqlite3.Error:
                self.logger.error(
                    "exception updating cookie DB %s", cookie_location, exc_info=True
                )
        cookie_db = None
        try:
            with open(cookie_location, "rb") as cookie_file:
                cookie_db = cookie_file.read()
        except OSError:
            self.logger.error(
                "exception reading from cookie DB file %s",
                cookie_location,
                exc_info=True,
            )
        return cookie_db

    def start(
        self,
        proxy=None,
        cookie_db=None,
        disk_cache_dir=None,
        disk_cache_size=None,
        websocket_timeout=60,
        window_height=900,
        window_width=1400,
    ):
        """
        Starts chrome/chromium process.

        Args:
@@ -154,103 +167,126 @@ class Chrome:
            window_height, window_width: window height and width, in pixels
        Returns:
            websocket url to chrome window with about:blank loaded
        """
        # these can raise exceptions
        self._home_tmpdir = tempfile.TemporaryDirectory()
        self._chrome_user_data_dir = os.path.join(
            self._home_tmpdir.name, "chrome-user-data"
        )
        if cookie_db:
            self._init_cookie_db(cookie_db)
        self._shutdown.clear()

        new_env = os.environ.copy()
        new_env["HOME"] = self._home_tmpdir.name
        chrome_args = [
            self.chrome_exe,
            "-v",
            "--headless",
            "--remote-debugging-port=%s" % self.port,
            "--use-mock-keychain",  # mac thing
            "--user-data-dir=%s" % self._chrome_user_data_dir,
            "--disable-background-networking",
            "--disable-breakpad",
            "--disable-renderer-backgrounding",
            "--disable-hang-monitor",
            "--disable-background-timer-throttling",
            "--mute-audio",
            "--disable-web-sockets",
            f"--window-size={window_width},{window_height}",
            "--no-default-browser-check",
            "--disable-first-run-ui",
            "--no-first-run",
            "--homepage=about:blank",
            "--disable-direct-npapi-requests",
            "--disable-web-security",
            "--disable-notifications",
            "--disable-extensions",
            "--disable-save-password-bubble",
            "--disable-sync",
        ]

        extra_chrome_args = os.environ.get("BROZZLER_EXTRA_CHROME_ARGS")
        if extra_chrome_args:
            chrome_args.extend(extra_chrome_args.split())
        if disk_cache_dir:
            chrome_args.append("--disk-cache-dir=%s" % disk_cache_dir)
        if disk_cache_size:
            chrome_args.append("--disk-cache-size=%s" % disk_cache_size)
        if self.ignore_cert_errors:
            chrome_args.append("--ignore-certificate-errors")
        if proxy:
            chrome_args.append("--proxy-server=%s" % proxy)
        chrome_args.append("about:blank")
        self.logger.info("running: %r", subprocess.list2cmdline(chrome_args))
        # start_new_session - new process group so we can kill the whole group
        self.chrome_process = subprocess.Popen(
            chrome_args,
            env=new_env,
            start_new_session=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            bufsize=0,
        )
        self._out_reader_thread = threading.Thread(
            target=self._read_stderr_stdout,
            name="ChromeOutReaderThread:%s" % self.port,
            daemon=True,
        )
        self._out_reader_thread.start()
        self.logger.info("chrome running, pid %s" % self.chrome_process.pid)

        return self._websocket_url(timeout_sec=websocket_timeout)

    def _websocket_url(self, timeout_sec=60):
        json_url = "http://localhost:%s/json" % self.port
        # make this a member variable so that kill -QUIT reports it
        self._start = time.time()
        self._last_warning = self._start
        while True:
            try:
                raw_json = urllib.request.urlopen(json_url, timeout=30).read()
                all_debug_info = json.loads(raw_json.decode("utf-8"))
                debug_info = [x for x in all_debug_info if x["url"] == "about:blank"]

                if debug_info and "webSocketDebuggerUrl" in debug_info[0]:
                    self.logger.debug("%s returned %s", json_url, raw_json)
                    url = debug_info[0]["webSocketDebuggerUrl"]
                    self.logger.info(
                        "got chrome window websocket debug url %s from %s",
                        url,
                        json_url,
                    )
                    return url
            except brozzler.ShutdownRequested:
                raise
            except Exception as e:
                if time.time() - self._last_warning > 30:
                    self.logger.warning(
                        "problem with %s (will keep trying until timeout "
                        "of %d seconds): %s",
                        json_url,
                        timeout_sec,
                        e,
                    )
                    self._last_warning = time.time()
            finally:
                e = None
                if self.chrome_process:
                    if time.time() - self._start > timeout_sec:
                        e = Exception(
                            "killing chrome, failed to retrieve %s after "
                            "%s seconds" % (json_url, time.time() - self._start)
                        )
                    elif self.chrome_process.poll() is not None:
                        e = Exception(
                            "chrome process died with status %s"
                            % self.chrome_process.poll()
                        )
                    else:
                        time.sleep(0.5)
                else:
                    e = Exception("??? self.chrome_process is not set ???")
                if e:
                    self.stop()
                    raise e
@@ -258,11 +294,13 @@ class Chrome:
    def _read_stderr_stdout(self):
        # XXX select doesn't work on windows
        def readline_nonblock(f):
            buf = b""
            try:
                while (
                    not self._shutdown.is_set()
                    and (len(buf) == 0 or buf[-1] != 0xA)
                    and select.select([f], [], [], 0.5)[0]
                ):
                    buf += f.read(1)
            except (ValueError, OSError):
                # When the chrome process crashes, stdout & stderr are closed
@@ -276,16 +314,16 @@ class Chrome:
                buf = readline_nonblock(self.chrome_process.stdout)
                if buf:
                    self.logger.trace(
                        "chrome pid %s STDOUT %s", self.chrome_process.pid, buf
                    )
                buf = readline_nonblock(self.chrome_process.stderr)
                if buf:
                    self.logger.trace(
                        "chrome pid %s STDERR %s", self.chrome_process.pid, buf
                    )
        except:
            self.logger.error("unexpected exception", exc_info=True)

    def stop(self):
        if not self.chrome_process or self._shutdown.is_set():
@@ -294,8 +332,7 @@ class Chrome:
        timeout_sec = 300
        if self.chrome_process.poll() is None:
            self.logger.info("terminating chrome pgid %s", self.chrome_process.pid)

            os.killpg(self.chrome_process.pid, signal.SIGTERM)
        t0 = time.time()
@@ -306,12 +343,14 @@ class Chrome:
                if status is not None:
                    if status == 0:
                        self.logger.info(
                            "chrome pid %s exited normally", self.chrome_process.pid
                        )
                    else:
                        self.logger.warning(
                            "chrome pid %s exited with nonzero status %s",
                            self.chrome_process.pid,
                            status,
                        )

                    # XXX I would like to forcefully kill the process group
                    # here to guarantee no orphaned chromium subprocesses hang
@@ -321,14 +360,18 @@ class Chrome:
                time.sleep(0.5)

            self.logger.warning(
                "chrome pid %s still alive %.1f seconds after sending "
                "SIGTERM, sending SIGKILL",
                self.chrome_process.pid,
                time.time() - t0,
            )
            os.killpg(self.chrome_process.pid, signal.SIGKILL)
            status = self.chrome_process.wait()
            self.logger.warning(
                "chrome pid %s reaped (status=%s) after killing with " "SIGKILL",
                self.chrome_process.pid,
                status,
            )
        finally:
            self.chrome_process.stdout.close()
@@ -337,8 +380,7 @@ class Chrome:
                self._home_tmpdir.cleanup()
            except:
                self.logger.error(
                    "exception deleting %s", self._home_tmpdir, exc_info=True
                )
            self._out_reader_thread.join()
            self.chrome_process = None
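Aside (illustrative, not part of this commit): because __enter__ returns start() and __exit__ calls stop(), the Chrome class above is naturally driven as a context manager. A minimal sketch under that assumption; the chromium path and the extra flag below are placeholders, not values taken from this diff:

    import os

    # start() reads optional extra flags from this environment variable
    os.environ["BROZZLER_EXTRA_CHROME_ARGS"] = "--blink-settings=imagesEnabled=false"  # placeholder flag

    chrome = Chrome("/usr/bin/chromium-browser", port=9222)  # hypothetical executable path
    with chrome as websocket_url:  # __enter__ -> start(), returns the devtools websocket url
        print("chrome devtools websocket url:", websocket_url)
    # leaving the block calls stop(), which SIGTERMs the process group and
    # falls back to SIGKILL if chrome has not exited within the timeout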
File diff suppressed because it is too large
@@ -1,4 +1,4 @@
"""
brozzler/dashboard/__init__.py - flask app for brozzler dashboard, defines api
endspoints etc
@@ -15,17 +15,20 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import logging
import sys

try:
    import flask
except ImportError as e:
    logging.critical(
        '%s: %s\n\nYou might need to run "pip install '
        'brozzler[dashboard]".\nSee README.rst for more information.',
        type(e).__name__,
        e,
    )
    sys.exit(1)
import doublethink
import json
@@ -41,33 +44,44 @@ app = flask.Flask(__name__)

# configure with environment variables
SETTINGS = {
    "RETHINKDB_SERVERS": os.environ.get(
        "BROZZLER_RETHINKDB_SERVERS", "localhost"
    ).split(","),
    "RETHINKDB_DB": os.environ.get("BROZZLER_RETHINKDB_DB", "brozzler"),
    "WAYBACK_BASEURL": os.environ.get(
        "WAYBACK_BASEURL", "http://localhost:8880/brozzler"
    ),
    "DASHBOARD_PORT": os.environ.get("DASHBOARD_PORT", "8000"),
    "DASHBOARD_INTERFACE": os.environ.get("DASHBOARD_INTERFACE", "localhost"),
}
rr = doublethink.Rethinker(SETTINGS["RETHINKDB_SERVERS"], db=SETTINGS["RETHINKDB_DB"])
_svc_reg = None


def service_registry():
    global _svc_reg
    if not _svc_reg:
        _svc_reg = doublethink.ServiceRegistry(rr)
    return _svc_reg


@app.route("/api/sites/<site_id>/queued_count")
@app.route("/api/site/<site_id>/queued_count")
def queued_count(site_id):
    reql = (
        rr.table("pages")
        .between(
            [site_id, 0, False, r.minval],
            [site_id, 0, False, r.maxval],
            index="priority_by_site",
        )
        .count()
    )
    logging.debug("querying rethinkdb: %s", reql)
    count = reql.run()
    return flask.jsonify(count=count)


@app.route("/api/sites/<site_id>/queue")
@app.route("/api/site/<site_id>/queue")
def queue(site_id):
@@ -75,38 +89,52 @@ def queue(site_id):
    start = flask.request.args.get("start", 0)
    end = flask.request.args.get("end", start + 90)
    reql = rr.table("pages").between(
        [site_id, 0, False, r.minval],
        [site_id, 0, False, r.maxval],
        index="priority_by_site",
    )[start:end]
    logging.debug("querying rethinkdb: %s", reql)
    queue_ = reql.run()
    return flask.jsonify(queue_=list(queue_))


@app.route("/api/sites/<site_id>/pages_count")
@app.route("/api/site/<site_id>/pages_count")
@app.route("/api/sites/<site_id>/page_count")
@app.route("/api/site/<site_id>/page_count")
def page_count(site_id):
    reql = (
        rr.table("pages")
        .between(
            [site_id, 1, False, r.minval],
            [site_id, r.maxval, False, r.maxval],
            index="priority_by_site",
        )
        .count()
    )
    logging.debug("querying rethinkdb: %s", reql)
    count = reql.run()
    return flask.jsonify(count=count)


@app.route("/api/sites/<site_id>/pages")
@app.route("/api/site/<site_id>/pages")
def pages(site_id):
    """Pages already crawled."""
    start = int(flask.request.args.get("start", 0))
    end = int(flask.request.args.get("end", start + 90))
    reql = (
        rr.table("pages")
        .between(
            [site_id, 1, r.minval], [site_id, r.maxval, r.maxval], index="least_hops"
        )
        .order_by(index="least_hops")[start:end]
    )
    logging.debug("querying rethinkdb: %s", reql)
    pages_ = reql.run()
    return flask.jsonify(pages=list(pages_))


@app.route("/api/pages/<page_id>")
@app.route("/api/page/<page_id>")
def page(page_id):
@@ -115,6 +143,7 @@ def page(page_id):
    page_ = reql.run()
    return flask.jsonify(page_)


@app.route("/api/pages/<page_id>/yaml")
@app.route("/api/page/<page_id>/yaml")
def page_yaml(page_id):
@@ -122,8 +151,9 @@ def page_yaml(page_id):
    logging.debug("querying rethinkdb: %s", reql)
    page_ = reql.run()
    return app.response_class(
        yaml.dump(page_, default_flow_style=False), mimetype="application/yaml"
    )


@app.route("/api/sites/<site_id>")
@app.route("/api/site/<site_id>")
@@ -135,6 +165,7 @@ def site(site_id):
        s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
    return flask.jsonify(s)


@app.route("/api/sites/<site_id>/yaml")
@app.route("/api/site/<site_id>/yaml")
def site_yaml(site_id):
@@ -142,8 +173,9 @@ def site_yaml(site_id):
    logging.debug("querying rethinkdb: %s", reql)
    site_ = reql.run()
    return app.response_class(
        yaml.dump(site_, default_flow_style=False), mimetype="application/yaml"
    )


@app.route("/api/stats/<bucket>")
def stats(bucket):
@@ -152,6 +184,7 @@ def stats(bucket):
    stats_ = reql.run()
    return flask.jsonify(stats_)


@app.route("/api/jobs/<job_id>/sites")
@app.route("/api/job/<job_id>/sites")
def sites(job_id):
@@ -168,6 +201,7 @@ def sites(job_id):
        s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
    return flask.jsonify(sites=sites_)


@app.route("/api/jobless-sites")
def jobless_sites():
    # XXX inefficient (unindexed) query
@@ -180,6 +214,7 @@ def jobless_sites():
        s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
    return flask.jsonify(sites=sites_)


@app.route("/api/jobs/<job_id>")
@app.route("/api/job/<job_id>")
def job(job_id):
@@ -192,6 +227,7 @@ def job(job_id):
    job_ = reql.run()
    return flask.jsonify(job_)


@app.route("/api/jobs/<job_id>/yaml")
@app.route("/api/job/<job_id>/yaml")
def job_yaml(job_id):
@@ -203,19 +239,22 @@ def job_yaml(job_id):
    logging.debug("querying rethinkdb: %s", reql)
    job_ = reql.run()
    return app.response_class(
        yaml.dump(job_, default_flow_style=False), mimetype="application/yaml"
    )


@app.route("/api/workers")
def workers():
    workers_ = service_registry().available_services("brozzler-worker")
    return flask.jsonify(workers=list(workers_))


@app.route("/api/services")
def services():
    services_ = service_registry().available_services()
    return flask.jsonify(services=list(services_))


@app.route("/api/jobs")
def jobs():
    reql = rr.table("jobs").order_by(r.desc("id"))
@@ -223,20 +262,24 @@ def jobs():
    jobs_ = list(reql.run())
    return flask.jsonify(jobs=jobs_)


@app.route("/api/config")
def config():
    return flask.jsonify(config=SETTINGS)


@app.route("/api/<path:path>")
@app.route("/api", defaults={"path": ""})
def api404(path):
    flask.abort(404)


@app.route("/", defaults={"path": ""})
@app.route("/<path:path>")
def root(path):
    return flask.render_template("index.html")


try:
    import gunicorn.app.base
    from gunicorn.six import iteritems
@@ -255,8 +298,12 @@ try:

        def load_config(self):
            config = dict(
                [
                    (key, value)
                    for key, value in iteritems(self.options)
                    if key in self.cfg.settings and value is not None
                ]
            )
            for key, value in iteritems(config):
                self.cfg.set(key.lower(), value)
            self.cfg.set("logger_class", BypassGunicornLogging)
@@ -270,37 +317,42 @@ try:
        GunicornBrozzlerDashboard(app, options).run()

except ImportError:

    def run():
        logging.info("running brozzler-dashboard using simple flask app.run")
        app.run(host=SETTINGS["DASHBOARD_INTERFACE"], port=SETTINGS["DASHBOARD_PORT"])


def main(argv=None):
    import argparse
    import brozzler.cli

    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=(
            "brozzler-dashboard - web application for viewing brozzler " "crawl status"
        ),
        epilog=(
            "brozzler-dashboard has no command line options, but can be "
            "configured using the following environment variables:\n\n"
            " BROZZLER_RETHINKDB_SERVERS rethinkdb servers, e.g. "
            "db0.foo.org,db0.foo.org:38015,db1.foo.org (default: "
            "localhost)\n"
            " BROZZLER_RETHINKDB_DB rethinkdb database name "
            "(default: brozzler)\n"
            " WAYBACK_BASEURL base url for constructing wayback "
            "links (default http://localhost:8880/brozzler)"
            " DASHBOARD_PORT brozzler-dashboard listening port (default: 8000)\n"
            " DASHBOARD_INTERFACE brozzler-dashboard network interface binding (default: localhost)"
        ),
    )
    brozzler.cli.add_common_options(arg_parser, argv)
    args = arg_parser.parse_args(args=argv[1:])
    brozzler.cli.configure_logging(args)
    run()


if __name__ == "__main__":
    main()
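Aside (illustrative, not part of this commit): as the epilog above says, the dashboard is configured only through environment variables, and SETTINGS is populated from os.environ at import time, so the variables have to be set before brozzler.dashboard is imported. A small sketch with made-up values:

    import os

    os.environ["BROZZLER_RETHINKDB_SERVERS"] = "db0.example.org,db1.example.org"  # hypothetical hosts
    os.environ["DASHBOARD_PORT"] = "8001"  # hypothetical port

    import brozzler.dashboard

    brozzler.dashboard.main(["brozzler-dashboard"])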
@@ -1,5 +1,5 @@
#!/usr/bin/env python
"""
brozzler-easy - brozzler-worker, warcprox, pywb, and brozzler-dashboard all
working together in a single process
@@ -16,10 +16,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import sys
import logging

try:
    import warcprox
    import warcprox.main
@@ -32,7 +33,9 @@ except ImportError as e:
    logging.critical(
        '%s: %s\n\nYou might need to run "pip install '
        'brozzler[easy]".\nSee README.rst for more information.',
        type(e).__name__,
        e,
    )
    sys.exit(1)
import argparse
import brozzler
@@ -46,76 +49,112 @@ import doublethink
import traceback
import socketserver


def _build_arg_parser(argv=None):
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        formatter_class=brozzler.cli.BetterArgumentDefaultsHelpFormatter,
        prog=os.path.basename(argv[0]),
        description=(
            "brozzler-easy - easy deployment of brozzler, with "
            "brozzler-worker, warcprox, pywb, and brozzler-dashboard all "
            "running in a single process"
        ),
    )

    # common args
    brozzler.cli.add_rethinkdb_options(arg_parser)
    arg_parser.add_argument(
        "-d",
        "--warcs-dir",
        dest="warcs_dir",
        default="./warcs",
        help="where to write warcs",
    )

    # warcprox args
    arg_parser.add_argument(
        "-c",
        "--cacert",
        dest="cacert",
        default="./%s-warcprox-ca.pem" % socket.gethostname(),
        help=(
            "warcprox CA certificate file; if file does not exist, it "
            "will be created"
        ),
    )
    arg_parser.add_argument(
        "--certs-dir",
        dest="certs_dir",
        default="./%s-warcprox-ca" % socket.gethostname(),
        help="where warcprox will store and load generated certificates",
    )
    arg_parser.add_argument(
        "--onion-tor-socks-proxy",
        dest="onion_tor_socks_proxy",
        default=None,
        help=("host:port of tor socks proxy, used only to connect to " ".onion sites"),
    )

    # brozzler-worker args
    arg_parser.add_argument(
        "-e",
        "--chrome-exe",
        dest="chrome_exe",
        default=brozzler.cli.suggest_default_chrome_exe(),
        help="executable to use to invoke chrome",
    )
    arg_parser.add_argument(
        "-n",
        "--max-browsers",
        dest="max_browsers",
        type=int,
        default=1,
        help=("max number of chrome instances simultaneously " "browsing pages"),
    )

    # pywb args
    arg_parser.add_argument(
        "--pywb-address",
        dest="pywb_address",
        default="0.0.0.0",
        help="pywb wayback address to listen on",
    )
    arg_parser.add_argument(
        "--pywb-port",
        dest="pywb_port",
        type=int,
        default=8880,
        help="pywb wayback port",
    )

    # dashboard args
    arg_parser.add_argument(
        "--dashboard-address",
        dest="dashboard_address",
        default="localhost",
        help="brozzler dashboard address to listen on",
    )
    arg_parser.add_argument(
        "--dashboard-port",
        dest="dashboard_port",
        type=int,
        default=8881,
        help="brozzler dashboard port",
    )

    # common at the bottom args
    brozzler.cli.add_common_options(arg_parser, argv)

    return arg_parser


class ThreadingWSGIServer(
    socketserver.ThreadingMixIn, wsgiref.simple_server.WSGIServer
):
    pass


class BrozzlerEasyController:
    logger = logging.getLogger(__module__ + "." + __qualname__)
@@ -123,25 +162,31 @@ class BrozzlerEasyController:
        self.stop = threading.Event()
        self.args = args
        self.warcprox_controller = warcprox.controller.WarcproxController(
            self._warcprox_opts(args)
        )
        self.brozzler_worker = self._init_brozzler_worker(args)
        self.pywb_httpd = self._init_pywb(args)
        self.dashboard_httpd = self._init_brozzler_dashboard(args)

    def _init_brozzler_dashboard(self, args):
        return wsgiref.simple_server.make_server(
            args.dashboard_address,
            args.dashboard_port,
            brozzler.dashboard.app,
            ThreadingWSGIServer,
        )

    def _init_brozzler_worker(self, args):
        rr = doublethink.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db)
        frontier = brozzler.RethinkDbFrontier(rr)
        service_registry = doublethink.ServiceRegistry(rr)
        worker = brozzler.worker.BrozzlerWorker(
            frontier,
            service_registry,
            chrome_exe=args.chrome_exe,
            proxy="%s:%s" % self.warcprox_controller.proxy.server_address,
            max_browsers=args.max_browsers,
        )
        return worker

    def _init_pywb(self, args):
@@ -152,66 +197,67 @@ class BrozzlerEasyController:
        brozzler.pywb.monkey_patch_fuzzy_query()
        brozzler.pywb.monkey_patch_calc_search_range()

        if args.warcs_dir.endswith("/"):
            warcs_dir = args.warcs_dir
        else:
            warcs_dir = args.warcs_dir + "/"

        conf = {
            "collections": {
                "brozzler": {
                    "index_paths": brozzler.pywb.RethinkCDXSource(
                        servers=args.rethinkdb_servers.split(","),
                        db=args.rethinkdb_db,
                        table="captures",
                    )
                },
            },
            # 'enable_http_proxy': True,
            # 'enable_memento': True,
            "archive_paths": warcs_dir,
            "enable_cdx_api": True,
            "framed_replay": True,
            "port": args.pywb_port,
            "enable_auto_colls": False,
        }
        wsgi_app = pywb.framework.wsgi_wrappers.init_app(
            pywb.webapp.pywb_init.create_wb_router, config=conf, load_yaml=False
        )
        # disable is_hop_by_hop restrictions
        wsgiref.handlers.is_hop_by_hop = lambda x: False
        return wsgiref.simple_server.make_server(
            args.pywb_address, args.pywb_port, wsgi_app, ThreadingWSGIServer
        )

    def start(self):
        self.logger.info("starting warcprox")
        self.warcprox_controller.start()
        # XXX wait til fully started?
        self.logger.info("starting brozzler-worker")
        self.brozzler_worker.start()
        self.logger.info("starting pywb at %s:%s", *self.pywb_httpd.server_address)
        threading.Thread(target=self.pywb_httpd.serve_forever).start()
        self.logger.info(
            "starting brozzler-dashboard at %s:%s", *self.dashboard_httpd.server_address
        )
        threading.Thread(target=self.dashboard_httpd.serve_forever).start()

    def shutdown(self):
        self.logger.info("shutting down brozzler-dashboard")
        self.dashboard_httpd.shutdown()
        self.logger.info("shutting down brozzler-worker")
        self.brozzler_worker.shutdown_now()
        # brozzler-worker is fully shut down at this point
        self.logger.info("shutting down pywb")
        self.pywb_httpd.shutdown()
        self.logger.info("shutting down warcprox")
        self.warcprox_controller.shutdown()

    def wait_for_shutdown_request(self):
@@ -222,14 +268,14 @@ class BrozzlerEasyController:
        self.shutdown()

    def _warcprox_opts(self, args):
        """
        Takes args as produced by the argument parser built by
        _build_arg_parser and builds warcprox arguments object suitable to pass
        to warcprox.main.init_controller. Copies some arguments, renames some,
        populates some with defaults appropriate for brozzler-easy, etc.
        """
        warcprox_opts = warcprox.Options()
        warcprox_opts.address = "localhost"
        # let the OS choose an available port; discover it later using
        # sock.getsockname()[1]
        warcprox_opts.port = 0
@@ -237,17 +283,18 @@ class BrozzlerEasyController:
        warcprox_opts.certs_dir = args.certs_dir
        warcprox_opts.directory = args.warcs_dir
        warcprox_opts.gzip = True
        warcprox_opts.prefix = "brozzler"
        warcprox_opts.size = 1000 * 1000 * 1000
        warcprox_opts.rollover_idle_time = 3 * 60
        warcprox_opts.digest_algorithm = "sha1"
        warcprox_opts.base32 = True
        warcprox_opts.stats_db_file = None
        warcprox_opts.playback_port = None
        warcprox_opts.playback_index_db_file = None
        warcprox_opts.rethinkdb_big_table_url = "rethinkdb://%s/%s/captures" % (
            args.rethinkdb_servers,
            args.rethinkdb_db,
        )
        warcprox_opts.queue_size = 500
        warcprox_opts.max_threads = None
        warcprox_opts.profile = False
@@ -259,9 +306,11 @@ class BrozzlerEasyController:
        for th in threading.enumerate():
            state_strs.append(str(th))
            stack = traceback.format_stack(sys._current_frames()[th.ident])
            state_strs.append("".join(stack))
        logging.warning(
            "dumping state (caught signal {})\n{}".format(signum, "\n".join(state_strs))
        )


def main(argv=None):
    argv = argv or sys.argv
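Aside (illustrative, not part of this commit): _warcprox_opts sets warcprox_opts.port = 0 so the OS picks a free port, and _init_brozzler_worker later reads the bound address back from self.warcprox_controller.proxy.server_address to build the proxy string. The same trick in isolation, as a self-contained sketch:

    import socket

    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind(("localhost", 0))      # port 0: let the OS choose an available port
    host, port = sock.getsockname()  # discover which port was actually bound
    proxy = "%s:%s" % (host, port)
    sock.close()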
@@ -1,4 +1,4 @@
"""
brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages
Copyright (C) 2014-2018 Internet Archive
@@ -14,7 +14,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import logging
import brozzler
@@ -27,9 +27,11 @@ import urlcanon

r = rdb.RethinkDB()


class UnexpectedDbResult(Exception):
    pass


class RethinkDbFrontier:
    logger = logging.getLogger(__module__ + "." + __qualname__)
@@ -47,40 +49,49 @@ class RethinkDbFrontier:
        tables = self.rr.table_list().run()
        if not "sites" in tables:
            self.logger.info(
                "creating rethinkdb table 'sites' in database %r", self.rr.dbname
            )
            self.rr.table_create(
                "sites", shards=self.shards, replicas=self.replicas
            ).run()
            self.rr.table("sites").index_create(
                "sites_last_disclaimed", [r.row["status"], r.row["last_disclaimed"]]
            ).run()
            self.rr.table("sites").index_create("job_id").run()
        if not "pages" in tables:
            self.logger.info(
                "creating rethinkdb table 'pages' in database %r", self.rr.dbname
            )
            self.rr.table_create(
                "pages", shards=self.shards, replicas=self.replicas
            ).run()
            self.rr.table("pages").index_create(
                "priority_by_site",
                [
                    r.row["site_id"],
                    r.row["brozzle_count"],
                    r.row["claimed"],
                    r.row["priority"],
                ],
            ).run()
            # this index is for displaying pages in a sensible order in the web
            # console
            self.rr.table("pages").index_create(
                "least_hops",
                [r.row["site_id"], r.row["brozzle_count"], r.row["hops_from_seed"]],
            ).run()
        if not "jobs" in tables:
            self.logger.info(
                "creating rethinkdb table 'jobs' in database %r", self.rr.dbname
            )
            self.rr.table_create(
                "jobs", shards=self.shards, replicas=self.replicas
            ).run()

    def _vet_result(self, result, **kwargs):
        # self.logger.debug("vetting expected=%s result=%s", kwargs, result)
        # {'replaced': 0, 'errors': 0, 'skipped': 0, 'inserted': 1, 'deleted': 0, 'generated_keys': ['292859c1-4926-4b27-9d87-b2c367667058'], 'unchanged': 0}
        for k in ["replaced", "errors", "skipped", "inserted", "deleted", "unchanged"]:
            if k in kwargs:
                expected = kwargs[k]
            else:
@@ -88,55 +99,81 @@ class RethinkDbFrontier:
            if isinstance(expected, list):
                if result.get(k) not in kwargs[k]:
                    raise UnexpectedDbResult(
                        "expected %r to be one of %r in %r" % (k, expected, result)
                    )
            else:
                if result.get(k) != expected:
                    raise UnexpectedDbResult(
                        "expected %r to be %r in %r" % (k, expected, result)
                    )

    def claim_sites(self, n=1):
        self.logger.trace("claiming up to %s sites to brozzle", n)
        result = (
            self.rr.table("sites")
            .get_all(
                r.args(
                    r.db(self.rr.dbname)
                    .table("sites", read_mode="majority")
                    .between(
                        ["ACTIVE", r.minval],
                        ["ACTIVE", r.maxval],
                        index="sites_last_disclaimed",
                    )
                    .order_by(r.desc("claimed"), "last_disclaimed")
                    .fold(
                        {},
                        lambda acc, site: acc.merge(
                            r.branch(
                                site.has_fields("job_id"),
                                r.object(
                                    site["job_id"].coerce_to("string"),
                                    acc[site["job_id"].coerce_to("string")]
                                    .default(0)
                                    .add(1),
                                ),
                                {},
                            )
                        ),
                        emit=lambda acc, site, new_acc: r.branch(
                            r.and_(
                                r.or_(
                                    site["claimed"].not_(),
                                    site["last_claimed"].lt(r.now().sub(60 * 60)),
                                ),
                                r.or_(
                                    site.has_fields("max_claimed_sites").not_(),
                                    new_acc[site["job_id"].coerce_to("string")].le(
                                        site["max_claimed_sites"]
                                    ),
                                ),
                            ),
                            [site["id"]],
                            [],
                        ),
                    )
                    .limit(n)
                )
            )
            .update(
                # try to avoid a race condition resulting in multiple
                # brozzler-workers claiming the same site
                # see https://github.com/rethinkdb/rethinkdb/issues/3235#issuecomment-60283038
                r.branch(
                    r.or_(
                        r.row["claimed"].not_(),
                        r.row["last_claimed"].lt(r.now().sub(60 * 60)),
                    ),
                    {"claimed": True, "last_claimed": r.now()},
                    {},
                ),
                return_changes=True,
            )
        ).run()
        self._vet_result(
            result, replaced=list(range(n + 1)), unchanged=list(range(n + 1))
        )
        sites = []
        for i in range(result["replaced"]):
            if result["changes"][i]["old_val"]["claimed"]:
@@ -145,24 +182,27 @@ class RethinkDbFrontier:
                    "because it was last claimed a long time ago "
                    "at %s, and presumably some error stopped it from "
                    "being disclaimed",
                    result["changes"][i]["old_val"]["last_claimed"],
                )
            site = brozzler.Site(self.rr, result["changes"][i]["new_val"])
            sites.append(site)
        self.logger.debug("claimed %s sites", len(sites))
        if sites:
            return sites
        else:
            raise brozzler.NothingToClaim

    def enforce_time_limit(self, site):
        """
        Raises `brozzler.ReachedTimeLimit` if appropriate.
        """
        if site.time_limit and site.time_limit > 0 and site.elapsed() > site.time_limit:
            self.logger.debug(
                "site FINISHED_TIME_LIMIT! time_limit=%s " "elapsed=%s %s",
                site.time_limit,
                site.elapsed(),
                site,
            )
            raise brozzler.ReachedTimeLimit

    def claim_page(self, site, worker_id):
@@ -170,15 +210,20 @@ class RethinkDbFrontier:
        # brozzler-worker can be working on a site at a time, and that would
        # have to be the worker calling this method, so if something is claimed
        # already, it must have been left that way because of some error
        result = (
            self.rr.table("pages")
            .between(
                [site.id, 0, r.minval, r.minval],
                [site.id, 0, r.maxval, r.maxval],
                index="priority_by_site",
            )
            .order_by(index=r.desc("priority_by_site"))
            .limit(1)
            .update(
                {"claimed": True, "last_claimed_by": worker_id}, return_changes="always"
            )
            .run()
        )
        self._vet_result(result, unchanged=[0, 1], replaced=[0, 1])
        if result["unchanged"] == 0 and result["replaced"] == 0:
            raise brozzler.NothingToClaim
@@ -186,10 +231,16 @@ class RethinkDbFrontier:
            return brozzler.Page(self.rr, result["changes"][0]["new_val"])

    def has_outstanding_pages(self, site):
        results_iter = (
            self.rr.table("pages")
            .between(
                [site.id, 0, r.minval, r.minval],
                [site.id, 0, r.maxval, r.maxval],
                index="priority_by_site",
            )
            .limit(1)
            .run()
        )
        return len(list(results_iter)) > 0

    def completed_page(self, site, page):
@@ -209,15 +260,17 @@ class RethinkDbFrontier:
    def honor_stop_request(self, site):
        """Raises brozzler.CrawlStopped if stop has been requested."""
        site.refresh()
        if site.stop_requested and site.stop_requested <= doublethink.utcnow():
            self.logger.info("stop requested for site %s", site.id)
            raise brozzler.CrawlStopped

        if site.job_id:
            job = brozzler.Job.load(self.rr, site.job_id)
            if (
                job
                and job.stop_requested
                and job.stop_requested <= doublethink.utcnow()
            ):
                self.logger.info("stop requested for job %s", site.job_id)
                raise brozzler.CrawlStopped
@@ -239,8 +292,7 @@ class RethinkDbFrontier:
                return False
            n += 1

        self.logger.info("all %s sites finished, job %s is FINISHED!", n, job.id)
        job.finish()
        job.save()
        return True
@@ -270,13 +322,11 @@ class RethinkDbFrontier:
    def resume_job(self, job):
        job.status = "ACTIVE"
        job.stop_requested = None
        job.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None})
        job.save()
        for site in self.job_sites(job.id):
            site.status = "ACTIVE"
            site.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None})
{"start":doublethink.utcnow(), "stop":None})
site.save() site.save()
def resume_site(self, site): def resume_site(self, site):
@ -285,51 +335,55 @@ class RethinkDbFrontier:
job = brozzler.Job.load(self.rr, site.job_id) job = brozzler.Job.load(self.rr, site.job_id)
job.status = "ACTIVE" job.status = "ACTIVE"
site.stop_requested = None site.stop_requested = None
job.starts_and_stops.append( job.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None})
{"start":doublethink.utcnow(), "stop":None})
job.save() job.save()
site.status = "ACTIVE" site.status = "ACTIVE"
site.starts_and_stops.append( site.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None})
{"start":doublethink.utcnow(), "stop":None})
site.save() site.save()
def _build_fresh_page(self, site, parent_page, url, hops_off=0): def _build_fresh_page(self, site, parent_page, url, hops_off=0):
url_for_scoping = urlcanon.semantic(url) url_for_scoping = urlcanon.semantic(url)
url_for_crawling = urlcanon.whatwg(url) url_for_crawling = urlcanon.whatwg(url)
hashtag = (url_for_crawling.hash_sign hashtag = (url_for_crawling.hash_sign + url_for_crawling.fragment).decode(
+ url_for_crawling.fragment).decode('utf-8') "utf-8"
)
urlcanon.canon.remove_fragment(url_for_crawling) urlcanon.canon.remove_fragment(url_for_crawling)
page = brozzler.Page(self.rr, { page = brozzler.Page(
'url': str(url_for_crawling), self.rr,
'site_id': site.id, {
'job_id': site.job_id, "url": str(url_for_crawling),
'hops_from_seed': parent_page.hops_from_seed + 1, "site_id": site.id,
'hop_path': str(parent_page.hop_path if parent_page.hop_path else "") + "L", "job_id": site.job_id,
'via_page_id': parent_page.id, "hops_from_seed": parent_page.hops_from_seed + 1,
'via_page_url': parent_page.url, "hop_path": str(parent_page.hop_path if parent_page.hop_path else "")
'hops_off_surt': hops_off, + "L",
'hashtags': [hashtag] if hashtag else []}) "via_page_id": parent_page.id,
"via_page_url": parent_page.url,
"hops_off_surt": hops_off,
"hashtags": [hashtag] if hashtag else [],
},
)
return page return page
def _merge_page(self, existing_page, fresh_page): def _merge_page(self, existing_page, fresh_page):
''' """
Utility method for merging info from `brozzler.Page` instances Utility method for merging info from `brozzler.Page` instances
representing the same url but with possibly different metadata. representing the same url but with possibly different metadata.
''' """
existing_page.priority += fresh_page.priority existing_page.priority += fresh_page.priority
existing_page.hashtags = list(set( existing_page.hashtags = list(
(existing_page.hashtags or []) + (fresh_page.hashtags or []))) set((existing_page.hashtags or []) + (fresh_page.hashtags or []))
existing_page.hops_off = min( )
existing_page.hops_off, fresh_page.hops_off) existing_page.hops_off = min(existing_page.hops_off, fresh_page.hops_off)
def _scope_and_enforce_robots(self, site, parent_page, outlinks): def _scope_and_enforce_robots(self, site, parent_page, outlinks):
''' """
Returns tuple ( Returns tuple (
dict of {page_id: Page} of fresh `brozzler.Page` representing in dict of {page_id: Page} of fresh `brozzler.Page` representing in
scope links accepted by robots policy, scope links accepted by robots policy,
set of in scope urls (canonicalized) blocked by robots policy, set of in scope urls (canonicalized) blocked by robots policy,
set of out-of-scope urls (canonicalized)). set of out-of-scope urls (canonicalized)).
''' """
pages = {} # {page_id: Page, ...} pages = {} # {page_id: Page, ...}
blocked = set() blocked = set()
out_of_scope = set() out_of_scope = set()
@ -337,17 +391,18 @@ class RethinkDbFrontier:
url_for_scoping = urlcanon.semantic(url) url_for_scoping = urlcanon.semantic(url)
url_for_crawling = urlcanon.whatwg(url) url_for_crawling = urlcanon.whatwg(url)
decision = site.accept_reject_or_neither( decision = site.accept_reject_or_neither(
url_for_scoping, parent_page=parent_page) url_for_scoping, parent_page=parent_page
)
if decision is True: if decision is True:
hops_off = 0 hops_off = 0
elif decision is None: elif decision is None:
decision = parent_page.hops_off < site.scope.get( decision = parent_page.hops_off < site.scope.get("max_hops_off", 0)
'max_hops_off', 0)
hops_off = parent_page.hops_off + 1 hops_off = parent_page.hops_off + 1
if decision is True: if decision is True:
if brozzler.is_permitted_by_robots(site, str(url_for_crawling)): if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
fresh_page = self._build_fresh_page( fresh_page = self._build_fresh_page(
site, parent_page, url, hops_off) site, parent_page, url, hops_off
)
if fresh_page.id in pages: if fresh_page.id in pages:
self._merge_page(pages[fresh_page.id], fresh_page) self._merge_page(pages[fresh_page.id], fresh_page)
else: else:
@ -359,31 +414,32 @@ class RethinkDbFrontier:
return pages, blocked, out_of_scope return pages, blocked, out_of_scope
def scope_and_schedule_outlinks(self, site, parent_page, outlinks): def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
decisions = {'accepted':set(),'blocked':set(),'rejected':set()} decisions = {"accepted": set(), "blocked": set(), "rejected": set()}
counts = {'added':0,'updated':0,'rejected':0,'blocked':0} counts = {"added": 0, "updated": 0, "rejected": 0, "blocked": 0}
fresh_pages, blocked, out_of_scope = self._scope_and_enforce_robots( fresh_pages, blocked, out_of_scope = self._scope_and_enforce_robots(
site, parent_page, outlinks) site, parent_page, outlinks
decisions['blocked'] = blocked )
decisions['rejected'] = out_of_scope decisions["blocked"] = blocked
counts['blocked'] += len(blocked) decisions["rejected"] = out_of_scope
counts['rejected'] += len(out_of_scope) counts["blocked"] += len(blocked)
counts["rejected"] += len(out_of_scope)
# get existing pages from rethinkdb # get existing pages from rethinkdb
results = self.rr.table('pages').get_all(*fresh_pages.keys()).run() results = self.rr.table("pages").get_all(*fresh_pages.keys()).run()
pages = {doc['id']: brozzler.Page(self.rr, doc) for doc in results} pages = {doc["id"]: brozzler.Page(self.rr, doc) for doc in results}
# build list of pages to save, consisting of new pages, and existing # build list of pages to save, consisting of new pages, and existing
# pages updated with higher priority and new hashtags # pages updated with higher priority and new hashtags
for fresh_page in fresh_pages.values(): for fresh_page in fresh_pages.values():
decisions['accepted'].add(fresh_page.url) decisions["accepted"].add(fresh_page.url)
if fresh_page.id in pages: if fresh_page.id in pages:
page = pages[fresh_page.id] page = pages[fresh_page.id]
self._merge_page(page, fresh_page) self._merge_page(page, fresh_page)
counts['updated'] += 1 counts["updated"] += 1
else: else:
pages[fresh_page.id] = fresh_page pages[fresh_page.id] = fresh_page
counts['added'] += 1 counts["added"] += 1
# make sure we're not stepping on our own toes in case we have a link # make sure we're not stepping on our own toes in case we have a link
# back to parent_page, which I think happens because of hashtags # back to parent_page, which I think happens because of hashtags
@ -398,17 +454,20 @@ class RethinkDbFrontier:
l = list(pages.values()) l = list(pages.values())
for batch in (l[i : i + 50] for i in range(0, len(l), 50)): for batch in (l[i : i + 50] for i in range(0, len(l), 50)):
try: try:
self.logger.debug( self.logger.debug("inserting/replacing batch of %s pages", len(batch))
'inserting/replacing batch of %s pages', len(batch)) reql = self.rr.table("pages").insert(batch, conflict="replace")
reql = self.rr.table('pages').insert(batch, conflict='replace')
self.logger.trace( self.logger.trace(
'running query self.rr.table("pages").insert(%r, ' 'running query self.rr.table("pages").insert(%r, '
'conflict="replace")', batch) 'conflict="replace")',
batch,
)
result = reql.run() result = reql.run()
except Exception as e: except Exception as e:
self.logger.error( self.logger.error(
'problem inserting/replacing batch of %s pages', "problem inserting/replacing batch of %s pages",
len(batch), exc_info=True) len(batch),
exc_info=True,
)
parent_page.outlinks = {} parent_page.outlinks = {}
for k in decisions: for k in decisions:
@ -416,43 +475,56 @@ class RethinkDbFrontier:
parent_page.save() parent_page.save()
self.logger.info( self.logger.info(
'%s new links added, %s existing links updated, %s links ' "%s new links added, %s existing links updated, %s links "
'rejected, %s links blocked by robots from %s', "rejected, %s links blocked by robots from %s",
counts['added'], counts['updated'], counts['rejected'], counts["added"],
counts['blocked'], parent_page) counts["updated"],
counts["rejected"],
counts["blocked"],
parent_page,
)
def reached_limit(self, site, e): def reached_limit(self, site, e):
self.logger.info("reached_limit site=%s e=%s", site, e) self.logger.info("reached_limit site=%s e=%s", site, e)
assert isinstance(e, brozzler.ReachedLimit) assert isinstance(e, brozzler.ReachedLimit)
if (site.reached_limit if (
and site.reached_limit != e.warcprox_meta["reached-limit"]): site.reached_limit
and site.reached_limit != e.warcprox_meta["reached-limit"]
):
self.logger.warning( self.logger.warning(
"reached limit %s but site had already reached limit %s", "reached limit %s but site had already reached limit %s",
e.warcprox_meta["reached-limit"], self.reached_limit) e.warcprox_meta["reached-limit"],
self.reached_limit,
)
else: else:
site.reached_limit = e.warcprox_meta["reached-limit"] site.reached_limit = e.warcprox_meta["reached-limit"]
self.finished(site, "FINISHED_REACHED_LIMIT") self.finished(site, "FINISHED_REACHED_LIMIT")
def job_sites(self, job_id): def job_sites(self, job_id):
results = self.rr.table('sites').get_all(job_id, index="job_id").run() results = self.rr.table("sites").get_all(job_id, index="job_id").run()
for result in results: for result in results:
yield brozzler.Site(self.rr, result) yield brozzler.Site(self.rr, result)
def seed_page(self, site_id): def seed_page(self, site_id):
results = self.rr.table("pages").between( results = (
self.rr.table("pages")
.between(
[site_id, r.minval, r.minval, r.minval], [site_id, r.minval, r.minval, r.minval],
[site_id, r.maxval, r.maxval, r.maxval], [site_id, r.maxval, r.maxval, r.maxval],
index="priority_by_site").filter({"hops_from_seed":0}).run() index="priority_by_site",
)
.filter({"hops_from_seed": 0})
.run()
)
pages = list(results) pages = list(results)
if len(pages) > 1: if len(pages) > 1:
self.logger.warning( self.logger.warning("more than one seed page for site_id %s ?", site_id)
"more than one seed page for site_id %s ?", site_id)
if len(pages) < 1: if len(pages) < 1:
return None return None
return brozzler.Page(self.rr, pages[0]) return brozzler.Page(self.rr, pages[0])
def site_pages(self, site_id, brozzled=None): def site_pages(self, site_id, brozzled=None):
''' """
Args: Args:
site_id (str or int): site_id (str or int):
brozzled (bool): if true, results include only pages that have brozzled (bool): if true, results include only pages that have
@ -460,16 +532,14 @@ class RethinkDbFrontier:
not been brozzled; and if None (the default), all pages not been brozzled; and if None (the default), all pages
Returns: Returns:
iterator of brozzler.Page iterator of brozzler.Page
''' """
query = self.rr.table("pages").between( query = self.rr.table("pages").between(
[site_id, 1 if brozzled is True else 0, [site_id, 1 if brozzled is True else 0, r.minval, r.minval],
r.minval, r.minval], [site_id, 0 if brozzled is False else r.maxval, r.maxval, r.maxval],
[site_id, 0 if brozzled is False else r.maxval, index="priority_by_site",
r.maxval, r.maxval], )
index="priority_by_site")
self.logger.trace("running query: %r", query) self.logger.trace("running query: %r", query)
results = query.run() results = query.run()
for result in results: for result in results:
self.logger.trace("yielding result: %r", result) self.logger.trace("yielding result: %r", result)
yield brozzler.Page(self.rr, result) yield brozzler.Page(self.rr, result)
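The between() bounds in site_pages() above encode the brozzled filter in the second component of the "priority_by_site" index key. A plain-Python restatement of the three cases (assuming that component is 0 for unbrozzled pages and 1 for brozzled ones):

def priority_by_site_bounds(site_id, brozzled, minval="MIN", maxval="MAX"):
    # mirrors the expressions passed to between() in site_pages() above
    lower = [site_id, 1 if brozzled is True else 0, minval, minval]
    upper = [site_id, 0 if brozzled is False else maxval, maxval, maxval]
    return lower, upper

# brozzled=True  -> second key spans 1..MAX  (only brozzled pages)
# brozzled=False -> second key spans 0..0    (only unbrozzled pages)
# brozzled=None  -> second key spans 0..MAX  (every page for the site)
print(priority_by_site_bounds("some-site-id", None))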

brozzler/models.py
View file

@ -1,4 +1,4 @@
''' """
brozzler/models.py - model classes representing jobs, sites, and pages, with brozzler/models.py - model classes representing jobs, sites, and pages, with
related logic related logic
@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
''' """
import brozzler import brozzler
import base64 import base64
@ -36,15 +36,18 @@ import yaml
import zlib import zlib
from typing import Optional from typing import Optional
def load_schema(): def load_schema():
schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yaml') schema_file = os.path.join(os.path.dirname(__file__), "job_schema.yaml")
with open(schema_file) as f: with open(schema_file) as f:
return yaml.safe_load(f) return yaml.safe_load(f)
class JobValidator(cerberus.Validator): class JobValidator(cerberus.Validator):
def _validate_type_url(self, value): def _validate_type_url(self, value):
url = urllib.parse.urlparse(value) url = urllib.parse.urlparse(value)
return url.scheme in ('http', 'https', 'ftp') return url.scheme in ("http", "https", "ftp")
class InvalidJobConf(Exception): class InvalidJobConf(Exception):
def __init__(self, validator): def __init__(self, validator):
@ -53,15 +56,17 @@ class InvalidJobConf(Exception):
# Cerberus does a nice job hiding the bad value. In the case I # Cerberus does a nice job hiding the bad value. In the case I
# debugged, I found it here. Maybe there's a better way to see it. # debugged, I found it here. Maybe there's a better way to see it.
value = validator._errors[0].info[0][0].info[0][0].value value = validator._errors[0].info[0][0].info[0][0].value
self.errors['bad value'] = value self.errors["bad value"] = value
except: except:
value = None value = None
def validate_conf(job_conf, schema=load_schema()): def validate_conf(job_conf, schema=load_schema()):
v = JobValidator(schema) v = JobValidator(schema)
if not v.validate(job_conf, normalize=False): if not v.validate(job_conf, normalize=False):
raise InvalidJobConf(v) raise InvalidJobConf(v)
def merge(a, b): def merge(a, b):
if isinstance(a, dict) and isinstance(b, dict): if isinstance(a, dict) and isinstance(b, dict):
merged = dict(a) merged = dict(a)
@ -75,19 +80,22 @@ def merge(a, b):
else: else:
return a return a
def new_job_file(frontier, job_conf_file): def new_job_file(frontier, job_conf_file):
'''Returns new Job.''' """Returns new Job."""
logging.info("loading %s", job_conf_file) logging.info("loading %s", job_conf_file)
with open(job_conf_file) as f: with open(job_conf_file) as f:
job_conf = yaml.safe_load(f) job_conf = yaml.safe_load(f)
return new_job(frontier, job_conf) return new_job(frontier, job_conf)
def new_job(frontier, job_conf): def new_job(frontier, job_conf):
'''Returns new Job.''' """Returns new Job."""
validate_conf(job_conf) validate_conf(job_conf)
job = Job(frontier.rr, { job = Job(
"conf": job_conf, "status": "ACTIVE", frontier.rr,
"started": doublethink.utcnow()}) {"conf": job_conf, "status": "ACTIVE", "started": doublethink.utcnow()},
)
if "id" in job_conf: if "id" in job_conf:
job.id = job_conf["id"] job.id = job_conf["id"]
if "max_claimed_sites" in job_conf: if "max_claimed_sites" in job_conf:
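new_job() above validates the conf against job_schema.yaml before creating the Job, its sites, and their seed pages. A hedged usage sketch; the "seeds"/"url" keys and the doublethink/frontier setup are assumptions about the surrounding project, not something this diff shows:

import brozzler
import doublethink

rr = doublethink.Rethinker(servers=["localhost"], db="brozzler")
frontier = brozzler.RethinkDbFrontier(rr)

job_conf = {
    "id": "example-job",  # optional, handled explicitly above
    "max_claimed_sites": 2,  # optional, handled explicitly above
    "seeds": [{"url": "https://example.com/"}],  # assumed schema keys
}
job = brozzler.new_job(frontier, job_conf)  # raises InvalidJobConf on bad input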
@ -109,31 +117,39 @@ def new_job(frontier, job_conf):
# insert in batches to avoid this error # insert in batches to avoid this error
# rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in: # rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:
for batch in (pages[i : i + 500] for i in range(0, len(pages), 500)): for batch in (pages[i : i + 500] for i in range(0, len(pages), 500)):
logging.info('inserting batch of %s pages', len(batch)) logging.info("inserting batch of %s pages", len(batch))
result = frontier.rr.table('pages').insert(batch).run() result = frontier.rr.table("pages").insert(batch).run()
for batch in (sites[i : i + 100] for i in range(0, len(sites), 100)): for batch in (sites[i : i + 100] for i in range(0, len(sites), 100)):
logging.info('inserting batch of %s sites', len(batch)) logging.info("inserting batch of %s sites", len(batch))
result = frontier.rr.table('sites').insert(batch).run() result = frontier.rr.table("sites").insert(batch).run()
logging.info('job %s fully started', job.id) logging.info("job %s fully started", job.id)
return job return job
def new_seed_page(frontier, site): def new_seed_page(frontier, site):
url = urlcanon.parse_url(site.seed) url = urlcanon.parse_url(site.seed)
hashtag = (url.hash_sign + url.fragment).decode("utf-8") hashtag = (url.hash_sign + url.fragment).decode("utf-8")
urlcanon.canon.remove_fragment(url) urlcanon.canon.remove_fragment(url)
page = brozzler.Page(frontier.rr, { page = brozzler.Page(
frontier.rr,
{
"url": str(url), "url": str(url),
"site_id": site.get("id"), "site_id": site.get("id"),
"job_id": site.get("job_id"), "job_id": site.get("job_id"),
"hops_from_seed": 0, "hops_from_seed": 0,
"priority": 1000, "priority": 1000,
"needs_robots_check": True, "needs_robots_check": True,
"hop_path": None}) "hop_path": None,
},
)
if hashtag: if hashtag:
page.hashtags = [hashtag,] page.hashtags = [
hashtag,
]
return page return page
def new_site(frontier, site): def new_site(frontier, site):
logging.info("new site %s", site) logging.info("new site %s", site)
site.id = site.id or str(uuid.uuid4()) site.id = site.id or str(uuid.uuid4())
@ -148,9 +164,10 @@ def new_site(frontier, site):
# finally block because we want to insert the Site no matter what # finally block because we want to insert the Site no matter what
site.save() site.save()
class ElapsedMixIn(object): class ElapsedMixIn(object):
def elapsed(self): def elapsed(self):
''' """
Returns elapsed crawl time as a float in seconds. Returns elapsed crawl time as a float in seconds.
This metric includes all the time that a site was in active rotation, This metric includes all the time that a site was in active rotation,
@ -158,21 +175,22 @@ class ElapsedMixIn(object):
In contrast `Site.active_brozzling_time` only counts time when a In contrast `Site.active_brozzling_time` only counts time when a
brozzler worker claimed the site and was actively brozzling it. brozzler worker claimed the site and was actively brozzling it.
''' """
dt = 0 dt = 0
for ss in self.starts_and_stops[:-1]: for ss in self.starts_and_stops[:-1]:
if ss['stop']: if ss["stop"]:
dt += (ss['stop'] - ss['start']).total_seconds() dt += (ss["stop"] - ss["start"]).total_seconds()
else: else:
self.logger.warning("missing expected ss['stop']") self.logger.warning("missing expected ss['stop']")
dt += (doublethink.utcnow() - ss['start']).total_seconds() dt += (doublethink.utcnow() - ss["start"]).total_seconds()
ss = self.starts_and_stops[-1] ss = self.starts_and_stops[-1]
if ss['stop']: if ss["stop"]:
dt += (ss['stop'] - ss['start']).total_seconds() dt += (ss["stop"] - ss["start"]).total_seconds()
else: # crawl is active else: # crawl is active
dt += (doublethink.utcnow() - ss['start']).total_seconds() dt += (doublethink.utcnow() - ss["start"]).total_seconds()
return dt return dt
class Job(doublethink.Document, ElapsedMixIn): class Job(doublethink.Document, ElapsedMixIn):
logger = logging.getLogger(__module__ + "." + __qualname__) logger = logging.getLogger(__module__ + "." + __qualname__)
table = "jobs" table = "jobs"
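A worked example of the interval bookkeeping elapsed() above performs over starts_and_stops (simplified: the missing-'stop' warning path becomes a plain fallback to "now"; the timestamps are made up but tz-aware, like doublethink.utcnow()):

import datetime

def total_elapsed(starts_and_stops, now):
    dt = 0
    for ss in starts_and_stops:
        stop = ss["stop"] or now
        dt += (stop - ss["start"]).total_seconds()
    return dt

utc = datetime.timezone.utc
history = [
    {
        "start": datetime.datetime(2024, 2, 8, 10, 0, tzinfo=utc),
        "stop": datetime.datetime(2024, 2, 8, 10, 30, tzinfo=utc),
    },
    {"start": datetime.datetime(2024, 2, 8, 11, 0, tzinfo=utc), "stop": None},
]
now = datetime.datetime(2024, 2, 8, 11, 15, tzinfo=utc)
assert total_elapsed(history, now) == 45 * 60.0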
@ -182,28 +200,29 @@ class Job(doublethink.Document, ElapsedMixIn):
self.status = "ACTIVE" self.status = "ACTIVE"
if not "starts_and_stops" in self: if not "starts_and_stops" in self:
if self.get("started"): # backward compatibility if self.get("started"): # backward compatibility
self.starts_and_stops = [{ self.starts_and_stops = [
"start": self.get("started"), {"start": self.get("started"), "stop": self.get("finished")}
"stop": self.get("finished")}] ]
del self["started"] del self["started"]
if "finished" in self: if "finished" in self:
del self["finished"] del self["finished"]
else: else:
self.starts_and_stops = [ self.starts_and_stops = [{"start": doublethink.utcnow(), "stop": None}]
{"start":doublethink.utcnow(),"stop":None}]
def finish(self): def finish(self):
if self.status == "FINISHED" or self.starts_and_stops[-1]["stop"]: if self.status == "FINISHED" or self.starts_and_stops[-1]["stop"]:
self.logger.error( self.logger.error(
"job is already finished status=%s " "job is already finished status=%s " "starts_and_stops[-1]['stop']=%s",
"starts_and_stops[-1]['stop']=%s", self.status, self.status,
self.starts_and_stops[-1]["stop"]) self.starts_and_stops[-1]["stop"],
)
self.status = "FINISHED" self.status = "FINISHED"
self.starts_and_stops[-1]["stop"] = doublethink.utcnow() self.starts_and_stops[-1]["stop"] = doublethink.utcnow()
class Site(doublethink.Document, ElapsedMixIn): class Site(doublethink.Document, ElapsedMixIn):
logger = logging.getLogger(__module__ + "." + __qualname__) logger = logging.getLogger(__module__ + "." + __qualname__)
table = 'sites' table = "sites"
def populate_defaults(self): def populate_defaults(self):
if not "status" in self: if not "status" in self:
@ -225,26 +244,26 @@ class Site(doublethink.Document, ElapsedMixIn):
del self.scope["surt"] del self.scope["surt"]
# backward compatibility # backward compatibility
if ("max_hops_off_surt" in self.scope if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope:
and not "max_hops_off" in self.scope):
self.scope["max_hops_off"] = self.scope["max_hops_off_surt"] self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
if "max_hops_off_surt" in self.scope: if "max_hops_off_surt" in self.scope:
del self.scope["max_hops_off_surt"] del self.scope["max_hops_off_surt"]
if self.seed: if self.seed:
self._accept_ssurt_if_not_redundant( self._accept_ssurt_if_not_redundant(
brozzler.site_surt_canon(self.seed).ssurt().decode('ascii')) brozzler.site_surt_canon(self.seed).ssurt().decode("ascii")
)
if not "starts_and_stops" in self: if not "starts_and_stops" in self:
if self.get("start_time"): # backward compatibility if self.get("start_time"): # backward compatibility
self.starts_and_stops = [{ self.starts_and_stops = [
"start":self.get("start_time"),"stop":None}] {"start": self.get("start_time"), "stop": None}
]
if self.get("status") != "ACTIVE": if self.get("status") != "ACTIVE":
self.starts_and_stops[0]["stop"] = self.last_disclaimed self.starts_and_stops[0]["stop"] = self.last_disclaimed
del self["start_time"] del self["start_time"]
else: else:
self.starts_and_stops = [ self.starts_and_stops = [{"start": doublethink.utcnow(), "stop": None}]
{"start":doublethink.utcnow(),"stop":None}]
def __str__(self): def __str__(self):
return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed) return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)
@ -253,11 +272,12 @@ class Site(doublethink.Document, ElapsedMixIn):
if not "accepts" in self.scope: if not "accepts" in self.scope:
self.scope["accepts"] = [] self.scope["accepts"] = []
simple_rule_ssurts = ( simple_rule_ssurts = (
rule["ssurt"] for rule in self.scope["accepts"] rule["ssurt"]
if set(rule.keys()) == {'ssurt'}) for rule in self.scope["accepts"]
if set(rule.keys()) == {"ssurt"}
)
if not any(ssurt.startswith(ss) for ss in simple_rule_ssurts): if not any(ssurt.startswith(ss) for ss in simple_rule_ssurts):
self.logger.info( self.logger.info("adding ssurt %s to scope accept rules", ssurt)
"adding ssurt %s to scope accept rules", ssurt)
self.scope["accepts"].append({"ssurt": ssurt}) self.scope["accepts"].append({"ssurt": ssurt})
def note_seed_redirect(self, url): def note_seed_redirect(self, url):
@ -266,14 +286,14 @@ class Site(doublethink.Document, ElapsedMixIn):
# if http://foo.com/ redirects to https://foo.com/a/b/c let's also # if http://foo.com/ redirects to https://foo.com/a/b/c let's also
# put all of https://foo.com/ in scope # put all of https://foo.com/ in scope
if (canon_seed_redirect.authority == canon_seed.authority if (
and canon_seed_redirect.scheme != canon_seed.scheme): canon_seed_redirect.authority == canon_seed.authority
and canon_seed_redirect.scheme != canon_seed.scheme
):
canon_seed.scheme = canon_seed_redirect.scheme canon_seed.scheme = canon_seed_redirect.scheme
self._accept_ssurt_if_not_redundant( self._accept_ssurt_if_not_redundant(canon_seed.ssurt().decode("ascii"))
canon_seed.ssurt().decode('ascii'))
self._accept_ssurt_if_not_redundant( self._accept_ssurt_if_not_redundant(canon_seed_redirect.ssurt().decode("ascii"))
canon_seed_redirect.ssurt().decode('ascii'))
def extra_headers(self, page: Optional["Page"] = None): def extra_headers(self, page: Optional["Page"] = None):
hdrs = {} hdrs = {}
@ -281,28 +301,34 @@ class Site(doublethink.Document, ElapsedMixIn):
temp_warcprox_meta = copy.deepcopy(self.warcprox_meta) temp_warcprox_meta = copy.deepcopy(self.warcprox_meta)
if "blocks" in self.warcprox_meta: if "blocks" in self.warcprox_meta:
# delete temp_warcprox_meta's 'blocks' (they may be big!) # delete temp_warcprox_meta's 'blocks' (they may be big!)
del temp_warcprox_meta['blocks'] del temp_warcprox_meta["blocks"]
# str-ify blocks # str-ify blocks
blocks_str = json.dumps(self.warcprox_meta['blocks'], separators=(',', ':')) blocks_str = json.dumps(
self.warcprox_meta["blocks"], separators=(",", ":")
)
# encode(), compress, b64encode, decode() # encode(), compress, b64encode, decode()
temp_warcprox_meta['compressed_blocks'] = base64.b64encode(zlib.compress(blocks_str.encode())).decode() temp_warcprox_meta["compressed_blocks"] = base64.b64encode(
zlib.compress(blocks_str.encode())
).decode()
if page is not None: if page is not None:
temp_warcprox_meta["metadata"]["hop_path"] = page.hop_path temp_warcprox_meta["metadata"]["hop_path"] = page.hop_path
temp_warcprox_meta["metadata"]["brozzled_url"] = page.url temp_warcprox_meta["metadata"]["brozzled_url"] = page.url
temp_warcprox_meta["metadata"]["hop_via_url"] = page.via_page_url temp_warcprox_meta["metadata"]["hop_via_url"] = page.via_page_url
hdrs["Warcprox-Meta"] = json.dumps(temp_warcprox_meta, separators=(',', ':')) hdrs["Warcprox-Meta"] = json.dumps(
temp_warcprox_meta, separators=(",", ":")
)
return hdrs return hdrs
def accept_reject_or_neither(self, url, parent_page=None): def accept_reject_or_neither(self, url, parent_page=None):
''' """
Returns `True` (accepted), `False` (rejected), or `None` (no decision). Returns `True` (accepted), `False` (rejected), or `None` (no decision).
`None` usually means rejected, unless `max_hops_off` comes into play. `None` usually means rejected, unless `max_hops_off` comes into play.
''' """
if not isinstance(url, urlcanon.ParsedUrl): if not isinstance(url, urlcanon.ParsedUrl):
url = urlcanon.semantic(url) url = urlcanon.semantic(url)
if not url.scheme in (b'http', b'https'): if not url.scheme in (b"http", b"https"):
# XXX doesn't belong here maybe (where? worker ignores unknown # XXX doesn't belong here maybe (where? worker ignores unknown
# schemes?) # schemes?)
return False return False
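For reference, the "compressed_blocks" value assembled in extra_headers() above is just json, then zlib, then base64; a consumer reverses the same three steps. Self-contained round-trip sketch (the block rule is a made-up placeholder, not warcprox's actual rule schema):

import base64
import json
import zlib

blocks = [{"domain": "example.com"}]  # placeholder rule
blocks_str = json.dumps(blocks, separators=(",", ":"))
compressed_blocks = base64.b64encode(zlib.compress(blocks_str.encode())).decode()

# the receiving side reverses the steps:
restored = json.loads(zlib.decompress(base64.b64decode(compressed_blocks)).decode())
assert restored == blocks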
@ -311,12 +337,14 @@ class Site(doublethink.Document, ElapsedMixIn):
if parent_page: if parent_page:
try_parent_urls.append(urlcanon.semantic(parent_page.url)) try_parent_urls.append(urlcanon.semantic(parent_page.url))
if parent_page.redirect_url: if parent_page.redirect_url:
try_parent_urls.append( try_parent_urls.append(urlcanon.semantic(parent_page.redirect_url))
urlcanon.semantic(parent_page.redirect_url))
# enforce max_hops # enforce max_hops
if (parent_page and "max_hops" in self.scope if (
and parent_page.hops_from_seed >= self.scope["max_hops"]): parent_page
and "max_hops" in self.scope
and parent_page.hops_from_seed >= self.scope["max_hops"]
):
return False return False
# enforce reject rules # enforce reject rules
@ -345,6 +373,7 @@ class Site(doublethink.Document, ElapsedMixIn):
# no decision if we reach here # no decision if we reach here
return None return None
class Page(doublethink.Document): class Page(doublethink.Document):
logger = logging.getLogger(__module__ + "." + __qualname__) logger = logging.getLogger(__module__ + "." + __qualname__)
table = "pages" table = "pages"
@ -398,4 +427,3 @@ class Page(doublethink.Document):
if self._canon_hurl is None: if self._canon_hurl is None:
self._canon_hurl = urlcanon.semantic(self.url) self._canon_hurl = urlcanon.semantic(self.url)
return str(self._canon_hurl) return str(self._canon_hurl)

brozzler/pywb.py
View file

@ -1,4 +1,4 @@
''' """
brozzler/pywb.py - pywb customizations for brozzler including rethinkdb index, brozzler/pywb.py - pywb customizations for brozzler including rethinkdb index,
loading from warcs still being written to, canonicalization rules matching loading from warcs still being written to, canonicalization rules matching
brozzler conventions, support for screenshot: and thumbnail: urls brozzler conventions, support for screenshot: and thumbnail: urls
@ -16,10 +16,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
''' """
import sys import sys
import logging import logging
try: try:
import pywb.apps.cli import pywb.apps.cli
import pywb.cdx.cdxdomainspecific import pywb.cdx.cdxdomainspecific
@ -32,7 +33,9 @@ except ImportError as e:
logging.critical( logging.critical(
'%s: %s\n\nYou might need to run "pip install ' '%s: %s\n\nYou might need to run "pip install '
'brozzler[easy]".\nSee README.rst for more information.', 'brozzler[easy]".\nSee README.rst for more information.',
type(e).__name__, e) type(e).__name__,
e,
)
sys.exit(1) sys.exit(1)
import doublethink import doublethink
import rethinkdb as rdb import rethinkdb as rdb
@ -43,6 +46,7 @@ import argparse
r = rdb.RethinkDB() r = rdb.RethinkDB()
class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource): class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
def __init__(self, servers, db, table): def __init__(self, servers, db, table):
self.servers = servers self.servers = servers
@ -67,70 +71,78 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
# XXX inefficient, it gets parsed later, figure out how to # XXX inefficient, it gets parsed later, figure out how to
# short-circuit this step and create the CDXObject directly # short-circuit this step and create the CDXObject directly
blob = { blob = {
'url': record['url'], "url": record["url"],
'status': str(record['response_code']), "status": str(record["response_code"]),
'digest': record['sha1base32'], "digest": record["sha1base32"],
'length': str(record.get('record_length', '-')), "length": str(record.get("record_length", "-")),
'offset': str(record['offset']), "offset": str(record["offset"]),
'filename': record['filename'], "filename": record["filename"],
} }
if record['warc_type'] != 'revisit': if record["warc_type"] != "revisit":
blob['mime'] = record['content_type'] or '-' blob["mime"] = record["content_type"] or "-"
else: else:
blob['mime'] = 'warc/revisit' blob["mime"] = "warc/revisit"
# b'org,archive)/ 20160427215530 {"url": "https://archive.org/", "mime": "text/html", "status": "200", "digest": "VILUFXZD232SLUA6XROZQIMEVUPW6EIE", "length": "16001", "offset": "90144", "filename": "ARCHIVEIT-261-ONE_TIME-JOB209607-20160427215508135-00000.warc.gz"}' # b'org,archive)/ 20160427215530 {"url": "https://archive.org/", "mime": "text/html", "status": "200", "digest": "VILUFXZD232SLUA6XROZQIMEVUPW6EIE", "length": "16001", "offset": "90144", "filename": "ARCHIVEIT-261-ONE_TIME-JOB209607-20160427215508135-00000.warc.gz"}'
cdx_line = '{} {:%Y%m%d%H%M%S} {}'.format( cdx_line = "{} {:%Y%m%d%H%M%S} {}".format(
record['canon_surt'], record['timestamp'], record["canon_surt"], record["timestamp"], json.dumps(blob)
json.dumps(blob)) )
yield cdx_line.encode('utf-8') yield cdx_line.encode("utf-8")
def _query_rethinkdb(self, cdx_query): def _query_rethinkdb(self, cdx_query):
start_key = cdx_query.key.decode('utf-8') start_key = cdx_query.key.decode("utf-8")
end_key = cdx_query.end_key.decode('utf-8') end_key = cdx_query.end_key.decode("utf-8")
reql = self.rr.table(self.table).between( reql = self.rr.table(self.table).between(
[start_key[:150], r.minval], [end_key[:150], r.maxval], [start_key[:150], r.minval],
index='abbr_canon_surt_timestamp', right_bound='closed') [end_key[:150], r.maxval],
reql = reql.order_by(index='abbr_canon_surt_timestamp') index="abbr_canon_surt_timestamp",
right_bound="closed",
)
reql = reql.order_by(index="abbr_canon_surt_timestamp")
# TODO support for POST, etc # TODO support for POST, etc
# http_method='WARCPROX_WRITE_RECORD' for screenshots, thumbnails # http_method='WARCPROX_WRITE_RECORD' for screenshots, thumbnails
reql = reql.filter( reql = reql.filter(
lambda capture: r.expr( lambda capture: r.expr(["WARCPROX_WRITE_RECORD", "GET"]).contains(
['WARCPROX_WRITE_RECORD','GET']).contains( capture["http_method"]
capture['http_method'])) )
)
reql = reql.filter( reql = reql.filter(
lambda capture: (capture['canon_surt'] >= start_key) lambda capture: (capture["canon_surt"] >= start_key)
& (capture['canon_surt'] < end_key)) & (capture["canon_surt"] < end_key)
)
if cdx_query.limit: if cdx_query.limit:
reql = reql.limit(cdx_query.limit) reql = reql.limit(cdx_query.limit)
logging.debug('rethinkdb query: %s', reql) logging.debug("rethinkdb query: %s", reql)
results = reql.run() results = reql.run()
return results return results
class TheGoodUrlCanonicalizer(object): class TheGoodUrlCanonicalizer(object):
''' """
Replacement for pywb.utils.canonicalize.UrlCanonicalizer that produces Replacement for pywb.utils.canonicalize.UrlCanonicalizer that produces
surts with scheme and with trailing comma, and does not "massage" surts with scheme and with trailing comma, and does not "massage"
www.foo.org into foo.org. www.foo.org into foo.org.
''' """
def __init__(self, surt_ordered=True): def __init__(self, surt_ordered=True):
'''We are always surt ordered (surt_ordered param is ignored)''' """We are always surt ordered (surt_ordered param is ignored)"""
self.surt_ordered = True self.surt_ordered = True
def __call__(self, url): def __call__(self, url):
try: try:
key = urlcanon.semantic(url).surt().decode('ascii') key = urlcanon.semantic(url).surt().decode("ascii")
# logging.debug('%s -> %s', url, key) # logging.debug('%s -> %s', url, key)
return key return key
except Exception as e: except Exception as e:
return url return url
def replace_default_canonicalizer(): def replace_default_canonicalizer():
'''Replace parent class of CustomUrlCanonicalizer with this class.''' """Replace parent class of CustomUrlCanonicalizer with this class."""
pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer.__bases__ = ( pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer.__bases__ = (
TheGoodUrlCanonicalizer,) TheGoodUrlCanonicalizer,
)
def good_surts_from_default(default_surt): def good_surts_from_default(default_surt):
''' """
Takes a standard surt without scheme and without trailing comma, and Takes a standard surt without scheme and without trailing comma, and
returns a list of "good" surts that together match the same set of returns a list of "good" surts that together match the same set of
urls. For example: urls. For example:
@ -144,59 +156,64 @@ class TheGoodUrlCanonicalizer(object):
'http://(com,example,www,)/path', 'http://(com,example,www,)/path',
'https://(com,example,www,)/path'] 'https://(com,example,www,)/path']
''' """
if default_surt == '': if default_surt == "":
return [''] return [""]
parts = default_surt.split(')', 1) parts = default_surt.split(")", 1)
if len(parts) == 2: if len(parts) == 2:
orig_host_part, path_part = parts orig_host_part, path_part = parts
good_surts = [ good_surts = [
'http://(%s,)%s' % (orig_host_part, path_part), "http://(%s,)%s" % (orig_host_part, path_part),
'https://(%s,)%s' % (orig_host_part, path_part), "https://(%s,)%s" % (orig_host_part, path_part),
'http://(%s,www,)%s' % (orig_host_part, path_part), "http://(%s,www,)%s" % (orig_host_part, path_part),
'https://(%s,www,)%s' % (orig_host_part, path_part), "https://(%s,www,)%s" % (orig_host_part, path_part),
] ]
else: # no path part else: # no path part
host_part = parts[0] host_part = parts[0]
good_surts = [ good_surts = [
'http://(%s' % host_part, "http://(%s" % host_part,
'https://(%s' % host_part, "https://(%s" % host_part,
] ]
return good_surts return good_surts
def monkey_patch_dsrules_init(): def monkey_patch_dsrules_init():
orig_init = pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__ orig_init = pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__
def cdx_dsrule_init(self, url_prefix, rules): def cdx_dsrule_init(self, url_prefix, rules):
good_surts = [] good_surts = []
url_prefixes = [url_prefix] if isinstance( url_prefixes = [url_prefix] if isinstance(url_prefix, str) else url_prefix
url_prefix, str) else url_prefix
for bad_surt in url_prefixes: for bad_surt in url_prefixes:
good_surts.extend( good_surts.extend(
TheGoodUrlCanonicalizer.good_surts_from_default( TheGoodUrlCanonicalizer.good_surts_from_default(bad_surt)
bad_surt)) )
if 'match' in rules and 'regex' in rules['match']: if "match" in rules and "regex" in rules["match"]:
rules['match']['regex'] = r'https?://\(' + rules['match']['regex'] rules["match"]["regex"] = r"https?://\(" + rules["match"]["regex"]
orig_init(self, good_surts, rules) orig_init(self, good_surts, rules)
pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__ = cdx_dsrule_init pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__ = cdx_dsrule_init
def support_in_progress_warcs(): def support_in_progress_warcs():
''' """
Monkey-patch pywb.warc.pathresolvers.PrefixResolver to include warcs still Monkey-patch pywb.warc.pathresolvers.PrefixResolver to include warcs still
being written to (warcs having ".open" suffix). This way if a cdx entry being written to (warcs having ".open" suffix). This way if a cdx entry
references foo.warc.gz, pywb will try both foo.warc.gz and references foo.warc.gz, pywb will try both foo.warc.gz and
foo.warc.gz.open. foo.warc.gz.open.
''' """
_orig_prefix_resolver_call = pywb.warc.pathresolvers.PrefixResolver.__call__ _orig_prefix_resolver_call = pywb.warc.pathresolvers.PrefixResolver.__call__
def _prefix_resolver_call(self, filename, cdx=None): def _prefix_resolver_call(self, filename, cdx=None):
raw_results = _orig_prefix_resolver_call(self, filename, cdx) raw_results = _orig_prefix_resolver_call(self, filename, cdx)
results = [] results = []
for warc_path in raw_results: for warc_path in raw_results:
results.append(warc_path) results.append(warc_path)
results.append('%s.open' % warc_path) results.append("%s.open" % warc_path)
return results return results
pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call
class SomeWbUrl(pywb.rewrite.wburl.WbUrl): class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
def __init__(self, orig_url): def __init__(self, orig_url):
import re import re
@ -211,14 +228,14 @@ class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
pywb.rewrite.wburl.BaseWbUrl.__init__(self) pywb.rewrite.wburl.BaseWbUrl.__init__(self)
if six.PY2 and isinstance(orig_url, six.text_type): if six.PY2 and isinstance(orig_url, six.text_type):
orig_url = orig_url.encode('utf-8') orig_url = orig_url.encode("utf-8")
orig_url = quote(orig_url) orig_url = quote(orig_url)
self._original_url = orig_url self._original_url = orig_url
if not self._init_query(orig_url): if not self._init_query(orig_url):
if not self._init_replay(orig_url): if not self._init_replay(orig_url):
raise Exception('Invalid WbUrl: ', orig_url) raise Exception("Invalid WbUrl: ", orig_url)
new_uri = WbUrl.to_uri(self.url) new_uri = WbUrl.to_uri(self.url)
@ -227,8 +244,11 @@ class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
self.url = new_uri self.url = new_uri
# begin brozzler changes # begin brozzler changes
if (self.url.startswith('urn:') or self.url.startswith('screenshot:') if (
or self.url.startswith('thumbnail:')): self.url.startswith("urn:")
or self.url.startswith("screenshot:")
or self.url.startswith("thumbnail:")
):
return return
# end brozzler changes # end brozzler changes
@ -253,27 +273,31 @@ class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
self.url = self.DEFAULT_SCHEME + self.url self.url = self.DEFAULT_SCHEME + self.url
else: else:
inx += 2 inx += 2
if inx < len(self.url) and self.url[inx] != '/': if inx < len(self.url) and self.url[inx] != "/":
self.url = self.url[:inx] + '/' + self.url[inx:] self.url = self.url[:inx] + "/" + self.url[inx:]
def _get_wburl_type(self): def _get_wburl_type(self):
return SomeWbUrl return SomeWbUrl
def monkey_patch_wburl(): def monkey_patch_wburl():
pywb.framework.basehandlers.WbUrlHandler.get_wburl_type = _get_wburl_type pywb.framework.basehandlers.WbUrlHandler.get_wburl_type = _get_wburl_type
class BrozzlerWaybackCli(pywb.apps.cli.WaybackCli): class BrozzlerWaybackCli(pywb.apps.cli.WaybackCli):
def _extend_parser(self, arg_parser): def _extend_parser(self, arg_parser):
super()._extend_parser(arg_parser) super()._extend_parser(arg_parser)
arg_parser._actions[4].help = argparse.SUPPRESS # --autoindex arg_parser._actions[4].help = argparse.SUPPRESS # --autoindex
arg_parser.formatter_class = argparse.RawDescriptionHelpFormatter arg_parser.formatter_class = argparse.RawDescriptionHelpFormatter
arg_parser.epilog = ''' arg_parser.epilog = """
Run pywb like so: Run pywb like so:
$ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback $ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback
See README.rst for more information. See README.rst for more information.
''' """
# copied and pasted from cdxdomainspecific.py, only changes are commented as # copied and pasted from cdxdomainspecific.py, only changes are commented as
# such below # such below
@ -284,7 +308,7 @@ def _fuzzy_query_call(self, query):
matched_rule = None matched_rule = None
urlkey = to_native_str(query.key, 'utf-8') urlkey = to_native_str(query.key, "utf-8")
url = query.url url = query.url
filter_ = query.filters filter_ = query.filters
output = query.output output = query.output
@ -306,7 +330,7 @@ def _fuzzy_query_call(self, query):
if not matched_rule: if not matched_rule:
return None return None
repl = '?' repl = "?"
if matched_rule.replace: if matched_rule.replace:
repl = matched_rule.replace repl = matched_rule.replace
@ -315,33 +339,33 @@ def _fuzzy_query_call(self, query):
url = url[: inx + len(repl)] url = url[: inx + len(repl)]
# begin brozzler changes # begin brozzler changes
if matched_rule.match_type == 'domain': if matched_rule.match_type == "domain":
orig_split_url = urlsplit(url) orig_split_url = urlsplit(url)
# remove the subdomain, path, query and fragment # remove the subdomain, path, query and fragment
host = orig_split_url.netloc.split('.', 1)[1] host = orig_split_url.netloc.split(".", 1)[1]
new_split_url = (orig_split_url.scheme, host, '', '', '') new_split_url = (orig_split_url.scheme, host, "", "", "")
url = urlunsplit(new_split_url) url = urlunsplit(new_split_url)
# end brozzler changes # end brozzler changes
params = query.params params = query.params
params.update({'url': url, params.update({"url": url, "matchType": matched_rule.match_type, "filter": filter_})
'matchType': matched_rule.match_type,
'filter': filter_})
if 'reverse' in params: if "reverse" in params:
del params['reverse'] del params["reverse"]
if 'closest' in params: if "closest" in params:
del params['closest'] del params["closest"]
if 'end_key' in params: if "end_key" in params:
del params['end_key'] del params["end_key"]
return params return params
def monkey_patch_fuzzy_query(): def monkey_patch_fuzzy_query():
pywb.cdx.cdxdomainspecific.FuzzyQuery.__call__ = _fuzzy_query_call pywb.cdx.cdxdomainspecific.FuzzyQuery.__call__ = _fuzzy_query_call
# copied and pasted from pywb/utils/canonicalize.py, only changes are commented # copied and pasted from pywb/utils/canonicalize.py, only changes are commented
# as such # as such
def _calc_search_range(url, match_type, surt_ordered=True, url_canon=None): def _calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
@ -361,54 +385,56 @@ def _calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
start_key = url_canon(url) start_key = url_canon(url)
if match_type == 'exact': if match_type == "exact":
end_key = start_key + '!' end_key = start_key + "!"
elif match_type == 'prefix': elif match_type == "prefix":
# add trailing slash if url has it # add trailing slash if url has it
if url.endswith('/') and not start_key.endswith('/'): if url.endswith("/") and not start_key.endswith("/"):
start_key += '/' start_key += "/"
end_key = inc_last_char(start_key) end_key = inc_last_char(start_key)
elif match_type == 'host': elif match_type == "host":
if surt_ordered: if surt_ordered:
host = start_key.split(')/')[0] host = start_key.split(")/")[0]
start_key = host + ')/' start_key = host + ")/"
end_key = host + '*' end_key = host + "*"
else: else:
host = urlparse.urlsplit(url).netloc host = urlparse.urlsplit(url).netloc
start_key = host + '/' start_key = host + "/"
end_key = host + '0' end_key = host + "0"
elif match_type == 'domain': elif match_type == "domain":
if not surt_ordered: if not surt_ordered:
msg = 'matchType=domain unsupported for non-surt' msg = "matchType=domain unsupported for non-surt"
raise UrlCanonicalizeException(msg) raise UrlCanonicalizeException(msg)
host = start_key.split(')/')[0] host = start_key.split(")/")[0]
# if tld, use com, as start_key # if tld, use com, as start_key
# otherwise, stick with com,example)/ # otherwise, stick with com,example)/
if ',' not in host: if "," not in host:
start_key = host + ',' start_key = host + ","
else: else:
start_key = host + ')/' start_key = host + ")/"
# begin brozzler changes # begin brozzler changes
end_key = host + '~' end_key = host + "~"
# end brozzler changes # end brozzler changes
else: else:
raise UrlCanonicalizeException('Invalid match_type: ' + match_type) raise UrlCanonicalizeException("Invalid match_type: " + match_type)
return (start_key, end_key) return (start_key, end_key)
def monkey_patch_calc_search_range(): def monkey_patch_calc_search_range():
pywb.utils.canonicalize.calc_search_range = _calc_search_range pywb.utils.canonicalize.calc_search_range = _calc_search_range
pywb.cdx.query.calc_search_range = _calc_search_range pywb.cdx.query.calc_search_range = _calc_search_range
def main(argv=sys.argv): def main(argv=sys.argv):
brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer() brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init() brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init()
@ -417,7 +443,10 @@ def main(argv=sys.argv):
brozzler.pywb.monkey_patch_fuzzy_query() brozzler.pywb.monkey_patch_fuzzy_query()
brozzler.pywb.monkey_patch_calc_search_range() brozzler.pywb.monkey_patch_calc_search_range()
wayback_cli = BrozzlerWaybackCli( wayback_cli = BrozzlerWaybackCli(
args=argv[1:], default_port=8880, args=argv[1:],
desc=('brozzler-wayback - pywb wayback (monkey-patched for use ' default_port=8880,
'with brozzler)')) desc=(
"brozzler-wayback - pywb wayback (monkey-patched for use " "with brozzler)"
),
)
wayback_cli.run() wayback_cli.run()

brozzler/robots.py
View file

@ -1,4 +1,4 @@
''' """
brozzler/robots.py - robots.txt support brozzler/robots.py - robots.txt support
Uses the reppy library version 0.3.4. Monkey-patches reppy to support substring Uses the reppy library version 0.3.4. Monkey-patches reppy to support substring
@ -20,7 +20,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
''' """
import json import json
import logging import logging
@ -34,30 +34,40 @@ __all__ = ["is_permitted_by_robots"]
# monkey-patch reppy to do substring user-agent matching, see top of file # monkey-patch reppy to do substring user-agent matching, see top of file
reppy.Utility.short_user_agent = lambda strng: strng reppy.Utility.short_user_agent = lambda strng: strng
def _reppy_rules_getitem(self, agent): def _reppy_rules_getitem(self, agent):
''' """
Find the user-agent token matching the supplied full user-agent, using Find the user-agent token matching the supplied full user-agent, using
a case-insensitive substring search. a case-insensitive substring search.
''' """
lc_agent = agent.lower() lc_agent = agent.lower()
for s in self.agents: for s in self.agents:
if s in lc_agent: if s in lc_agent:
return self.agents[s] return self.agents[s]
return self.agents.get('*') return self.agents.get("*")
reppy.parser.Rules.__getitem__ = _reppy_rules_getitem reppy.parser.Rules.__getitem__ = _reppy_rules_getitem
class _SessionRaiseOn420(requests.Session): class _SessionRaiseOn420(requests.Session):
timeout = 60 timeout = 60
def get(self, url, *args, **kwargs): def get(self, url, *args, **kwargs):
res = super().get(url, timeout=self.timeout, *args, **kwargs) res = super().get(url, timeout=self.timeout, *args, **kwargs)
if res.status_code == 420 and 'warcprox-meta' in res.headers: if res.status_code == 420 and "warcprox-meta" in res.headers:
raise brozzler.ReachedLimit( raise brozzler.ReachedLimit(
warcprox_meta=json.loads(res.headers['warcprox-meta']), warcprox_meta=json.loads(res.headers["warcprox-meta"]),
http_payload=res.text) http_payload=res.text,
)
else: else:
return res return res
_robots_caches = {} # {site_id:reppy.cache.RobotsCache} _robots_caches = {} # {site_id:reppy.cache.RobotsCache}
def _robots_cache(site, proxy=None): def _robots_cache(site, proxy=None):
if not site.id in _robots_caches: if not site.id in _robots_caches:
req_sesh = _SessionRaiseOn420() req_sesh = _SessionRaiseOn420()
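To make the patched Rules.__getitem__ above concrete: it walks the agent tokens parsed from robots.txt and returns the first one that appears, case-insensitively, as a substring of the full user-agent, falling back to "*". A plain-dict illustration (the rule values stand in for reppy's per-agent objects):

agents = {"brozzler": "rules-for-brozzler", "*": "default-rules"}

def rules_for(agents, full_user_agent):
    lc_agent = full_user_agent.lower()
    for token in agents:
        if token in lc_agent:
            return agents[token]
    return agents.get("*")

assert rules_for(agents, "Mozilla/5.0 (compatible; Brozzler/1.5)") == "rules-for-brozzler"
assert rules_for(agents, "SomeOtherBot/2.0") == "default-rules"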
@ -68,14 +78,16 @@ def _robots_cache(site, proxy=None):
if site.extra_headers(): if site.extra_headers():
req_sesh.headers.update(site.extra_headers()) req_sesh.headers.update(site.extra_headers())
if site.user_agent: if site.user_agent:
req_sesh.headers['User-Agent'] = site.user_agent req_sesh.headers["User-Agent"] = site.user_agent
_robots_caches[site.id] = reppy.cache.RobotsCache( _robots_caches[site.id] = reppy.cache.RobotsCache(
session=req_sesh, disallow_forbidden=False) session=req_sesh, disallow_forbidden=False
)
return _robots_caches[site.id] return _robots_caches[site.id]
def is_permitted_by_robots(site, url, proxy=None): def is_permitted_by_robots(site, url, proxy=None):
''' """
Checks if `url` is permitted by robots.txt. Checks if `url` is permitted by robots.txt.
Treats any kind of error fetching robots.txt as "allow all". See Treats any kind of error fetching robots.txt as "allow all". See
@ -89,25 +101,28 @@ def is_permitted_by_robots(site, url, proxy=None):
Raises: Raises:
brozzler.ReachedLimit: if warcprox responded with 420 Reached Limit brozzler.ReachedLimit: if warcprox responded with 420 Reached Limit
requests.exceptions.ProxyError: if the proxy is down requests.exceptions.ProxyError: if the proxy is down
''' """
if site.ignore_robots: if site.ignore_robots:
return True return True
try: try:
result = _robots_cache(site, proxy).allowed( result = _robots_cache(site, proxy).allowed(url, site.user_agent or "brozzler")
url, site.user_agent or "brozzler")
return result return result
except Exception as e: except Exception as e:
if isinstance(e, reppy.exceptions.ServerError) and isinstance( if isinstance(e, reppy.exceptions.ServerError) and isinstance(
e.args[0], brozzler.ReachedLimit): e.args[0], brozzler.ReachedLimit
):
raise e.args[0] raise e.args[0]
elif hasattr(e, 'args') and isinstance( elif hasattr(e, "args") and isinstance(
e.args[0], requests.exceptions.ProxyError): e.args[0], requests.exceptions.ProxyError
):
# reppy has wrapped an exception that we want to bubble up # reppy has wrapped an exception that we want to bubble up
raise brozzler.ProxyError(e) raise brozzler.ProxyError(e)
else: else:
logging.warning( logging.warning(
"returning true (permitted) after problem fetching " "returning true (permitted) after problem fetching "
"robots.txt for %r: %r", url, e) "robots.txt for %r: %r",
url,
e,
)
return True return True
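A small caller-side sketch of the error semantics documented above: robots.txt fetch problems default to permitted, while warcprox 420s and proxy outages surface as brozzler exceptions. The handling below is illustrative, not brozzler's actual worker policy:

import brozzler

def robots_precheck(site, url):
    try:
        return brozzler.is_permitted_by_robots(site, url)
    except brozzler.ReachedLimit:
        raise  # warcprox reported a crawl limit; let the frontier finish the site
    except brozzler.ProxyError:
        return False  # proxy is down; caller may want to retry later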

brozzler/worker.py
View file

@ -1,4 +1,4 @@
''' """
brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning
it runs yt-dlp on them, browses them and runs behaviors if appropriate, it runs yt-dlp on them, browses them and runs behaviors if appropriate,
scopes and adds outlinks to the frontier scopes and adds outlinks to the frontier
@ -16,7 +16,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
''' """
import logging import logging
import brozzler import brozzler
@ -39,6 +39,7 @@ from . import ydl
r = rdb.RethinkDB() r = rdb.RethinkDB()
class BrozzlerWorker: class BrozzlerWorker:
logger = logging.getLogger(__module__ + "." + __qualname__) logger = logging.getLogger(__module__ + "." + __qualname__)
@ -50,13 +51,26 @@ class BrozzlerWorker:
SITE_SESSION_MINUTES = 15 SITE_SESSION_MINUTES = 15
def __init__( def __init__(
self, frontier, service_registry=None, max_browsers=1, self,
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None, frontier,
skip_extract_outlinks=False, skip_visit_hashtags=False, service_registry=None,
skip_youtube_dl=False, simpler404=False, screenshot_full_page=False, max_browsers=1,
page_timeout=300, behavior_timeout=900, extract_outlinks_timeout=60, chrome_exe="chromium-browser",
download_throughput=-1, stealth=False, warcprox_auto=False,
window_height=900, window_width=1400): proxy=None,
skip_extract_outlinks=False,
skip_visit_hashtags=False,
skip_youtube_dl=False,
simpler404=False,
screenshot_full_page=False,
page_timeout=300,
behavior_timeout=900,
extract_outlinks_timeout=60,
download_throughput=-1,
stealth=False,
window_height=900,
window_width=1400,
):
self._frontier = frontier self._frontier = frontier
self._service_registry = service_registry self._service_registry = service_registry
self._max_browsers = max_browsers self._max_browsers = max_browsers
@ -79,7 +93,8 @@ class BrozzlerWorker:
self._stealth = stealth self._stealth = stealth
self._browser_pool = brozzler.browser.BrowserPool( self._browser_pool = brozzler.browser.BrowserPool(
max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True) max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True
)
self._browsing_threads = set() self._browsing_threads = set()
self._browsing_threads_lock = threading.Lock() self._browsing_threads_lock = threading.Lock()
@ -88,13 +103,20 @@ class BrozzlerWorker:
self._shutdown = threading.Event() self._shutdown = threading.Event()
def _choose_warcprox(self): def _choose_warcprox(self):
warcproxes = self._service_registry.available_services('warcprox') warcproxes = self._service_registry.available_services("warcprox")
if not warcproxes: if not warcproxes:
return None return None
# .group('proxy').count() makes this query about 99% more efficient # .group('proxy').count() makes this query about 99% more efficient
reql = self._frontier.rr.table('sites').between( reql = (
['ACTIVE', r.minval], ['ACTIVE', r.maxval], self._frontier.rr.table("sites")
index='sites_last_disclaimed').group('proxy').count() .between(
["ACTIVE", r.minval],
["ACTIVE", r.maxval],
index="sites_last_disclaimed",
)
.group("proxy")
.count()
)
# returns results like # returns results like
# { # {
# "wbgrp-svc030.us.archive.org:8000": 148, # "wbgrp-svc030.us.archive.org:8000": 148,
@ -102,10 +124,11 @@ class BrozzlerWorker:
# } # }
proxy_scoreboard = dict(reql.run()) proxy_scoreboard = dict(reql.run())
for warcprox in warcproxes: for warcprox in warcproxes:
address = '%s:%s' % (warcprox['host'], warcprox['port']) address = "%s:%s" % (warcprox["host"], warcprox["port"])
warcprox['assigned_sites'] = proxy_scoreboard.get(address, 0) warcprox["assigned_sites"] = proxy_scoreboard.get(address, 0)
warcproxes.sort(key=lambda warcprox: ( warcproxes.sort(
warcprox['assigned_sites'], warcprox['load'])) key=lambda warcprox: (warcprox["assigned_sites"], warcprox["load"])
)
# XXX make this heuristic more advanced? # XXX make this heuristic more advanced?
return warcproxes[0] return warcproxes[0]
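A self-contained sketch of the selection heuristic above, using invented service-registry entries and a scoreboard shaped like the ReQL result in the comment:

# fewest assigned sites wins; the warcprox-reported load breaks ties
warcproxes = [
    {"host": "wbgrp-svc030.us.archive.org", "port": 8000, "load": 0.7},
    {"host": "wbgrp-svc031.us.archive.org", "port": 8000, "load": 0.2},
]
proxy_scoreboard = {
    "wbgrp-svc030.us.archive.org:8000": 148,
    "wbgrp-svc031.us.archive.org:8000": 145,
}
for warcprox in warcproxes:
    address = "%s:%s" % (warcprox["host"], warcprox["port"])
    warcprox["assigned_sites"] = proxy_scoreboard.get(address, 0)
warcproxes.sort(key=lambda w: (w["assigned_sites"], w["load"]))
print("would choose", warcproxes[0]["host"])  # svc031: fewer assigned sites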
@ -118,13 +141,15 @@ class BrozzlerWorker:
svc = self._choose_warcprox() svc = self._choose_warcprox()
if svc is None: if svc is None:
raise brozzler.ProxyError( raise brozzler.ProxyError(
'no available instances of warcprox in the service ' "no available instances of warcprox in the service " "registry"
'registry') )
site.proxy = '%s:%s' % (svc['host'], svc['port']) site.proxy = "%s:%s" % (svc["host"], svc["port"])
site.save() site.save()
self.logger.info( self.logger.info(
'chose warcprox instance %r from service registry for %r', "chose warcprox instance %r from service registry for %r",
site.proxy, site) site.proxy,
site,
)
return site.proxy return site.proxy
return None return None
@ -132,14 +157,16 @@ class BrozzlerWorker:
if self._proxy: if self._proxy:
if self._proxy_is_warcprox is None: if self._proxy_is_warcprox is None:
try: try:
response = requests.get('http://%s/status' % self._proxy) response = requests.get("http://%s/status" % self._proxy)
status = json.loads(response.text) status = json.loads(response.text)
self._proxy_is_warcprox = (status['role'] == 'warcprox') self._proxy_is_warcprox = status["role"] == "warcprox"
except Exception as e: except Exception as e:
self._proxy_is_warcprox = False self._proxy_is_warcprox = False
logging.info( logging.info(
'%s %s warcprox', self._proxy, "%s %s warcprox",
'IS' if self._proxy_is_warcprox else 'IS NOT') self._proxy,
"IS" if self._proxy_is_warcprox else "IS NOT",
)
return self._proxy_is_warcprox return self._proxy_is_warcprox
else: else:
# I should have commented when I originally wrote this code, but I # I should have commented when I originally wrote this code, but I
@ -148,13 +175,20 @@ class BrozzlerWorker:
return bool(site.proxy or self._warcprox_auto) return bool(site.proxy or self._warcprox_auto)
def _warcprox_write_record( def _warcprox_write_record(
self, warcprox_address, url, warc_type, content_type, self,
payload, extra_headers=None): warcprox_address,
url,
warc_type,
content_type,
payload,
extra_headers=None,
):
headers = {"Content-Type": content_type, "WARC-Type": warc_type, "Host": "N/A"} headers = {"Content-Type": content_type, "WARC-Type": warc_type, "Host": "N/A"}
if extra_headers: if extra_headers:
headers.update(extra_headers) headers.update(extra_headers)
request = urllib.request.Request(url, method="WARCPROX_WRITE_RECORD", request = urllib.request.Request(
headers=headers, data=payload) url, method="WARCPROX_WRITE_RECORD", headers=headers, data=payload
)
# XXX setting request.type="http" is a hack to stop urllib from trying # XXX setting request.type="http" is a hack to stop urllib from trying
# to tunnel if url is https # to tunnel if url is https
@ -166,25 +200,30 @@ class BrozzlerWorker:
if response.getcode() != 204: if response.getcode() != 204:
self.logger.warning( self.logger.warning(
'got "%s %s" response on warcprox ' 'got "%s %s" response on warcprox '
'WARCPROX_WRITE_RECORD request (expected 204)', "WARCPROX_WRITE_RECORD request (expected 204)",
response.getcode(), response.reason) response.getcode(),
response.reason,
)
return request, response return request, response
except urllib.error.HTTPError as e: except urllib.error.HTTPError as e:
self.logger.warning( self.logger.warning(
'got "%s %s" response on warcprox ' 'got "%s %s" response on warcprox '
'WARCPROX_WRITE_RECORD request (expected 204)', "WARCPROX_WRITE_RECORD request (expected 204)",
e.getcode(), e.info()) e.getcode(),
e.info(),
)
return request, None return request, None
except urllib.error.URLError as e: except urllib.error.URLError as e:
raise brozzler.ProxyError( raise brozzler.ProxyError(
'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e "proxy error on WARCPROX_WRITE_RECORD %s" % url
) from e
except ConnectionError as e: except ConnectionError as e:
raise brozzler.ProxyError( raise brozzler.ProxyError(
'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e "proxy error on WARCPROX_WRITE_RECORD %s" % url
) from e
def thumb_jpeg(self, full_jpeg): def thumb_jpeg(self, full_jpeg):
"""Create JPEG thumbnail. """Create JPEG thumbnail."""
"""
img = PIL.Image.open(io.BytesIO(full_jpeg)) img = PIL.Image.open(io.BytesIO(full_jpeg))
thumb_width = 300 thumb_width = 300
thumb_height = (thumb_width / img.size[0]) * img.size[1] thumb_height = (thumb_width / img.size[0]) * img.size[1]
@ -193,8 +232,15 @@ class BrozzlerWorker:
img.save(out, "jpeg", quality=95) img.save(out, "jpeg", quality=95)
return out.getbuffer() return out.getbuffer()
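For illustration, the same scaling thumb_jpeg applies, run on a synthetic image (Pillow only; 1400x900 matches the default browser window size above):

import io
import PIL.Image

img = PIL.Image.new("RGB", (1400, 900), "white")  # stand-in for a screenshot
thumb_width = 300
thumb_height = int((thumb_width / img.size[0]) * img.size[1])  # 192 here
img.thumbnail((thumb_width, thumb_height))
out = io.BytesIO()
img.save(out, "jpeg", quality=95)
print(img.size, len(out.getbuffer()), "bytes")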
def brozzle_page(self, browser, site, page, on_screenshot=None, def brozzle_page(
on_request=None, enable_youtube_dl=True): self,
browser,
site,
page,
on_screenshot=None,
on_request=None,
enable_youtube_dl=True,
):
self.logger.info("brozzling {}".format(page)) self.logger.info("brozzling {}".format(page))
ydl_fetches = None ydl_fetches = None
outlinks = set() outlinks = set()
@ -208,31 +254,38 @@ class BrozzlerWorker:
except brozzler.ProxyError: except brozzler.ProxyError:
raise raise
except Exception as e: except Exception as e:
if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2 if (
and hasattr(e.exc_info[1], 'code') hasattr(e, "exc_info")
and e.exc_info[1].code == 430): and len(e.exc_info) >= 2
and hasattr(e.exc_info[1], "code")
and e.exc_info[1].code == 430
):
self.logger.info( self.logger.info(
'youtube-dl got %s %s processing %s', "youtube-dl got %s %s processing %s",
e.exc_info[1].code, e.exc_info[1].msg, page.url) e.exc_info[1].code,
e.exc_info[1].msg,
page.url,
)
else: else:
self.logger.error( self.logger.error(
'youtube_dl raised exception on %s', page, "youtube_dl raised exception on %s", page, exc_info=True
exc_info=True) )
if self._needs_browsing(page, ydl_fetches): if self._needs_browsing(page, ydl_fetches):
self.logger.info('needs browsing: %s', page) self.logger.info("needs browsing: %s", page)
try: try:
browser_outlinks = self._browse_page( browser_outlinks = self._browse_page(
browser, site, page, on_screenshot, on_request) browser, site, page, on_screenshot, on_request
)
outlinks.update(browser_outlinks) outlinks.update(browser_outlinks)
except brozzler.PageInterstitialShown: except brozzler.PageInterstitialShown:
self.logger.info('page interstitial shown (http auth): %s', page) self.logger.info("page interstitial shown (http auth): %s", page)
else: else:
if not self._already_fetched(page, ydl_fetches): if not self._already_fetched(page, ydl_fetches):
self.logger.info('needs fetch: %s', page) self.logger.info("needs fetch: %s", page)
self._fetch_url(site, page=page) self._fetch_url(site, page=page)
else: else:
self.logger.info('already fetched: %s', page) self.logger.info("already fetched: %s", page)
return outlinks return outlinks
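To make the control flow easier to follow, here is a condensed, non-authoritative sketch of what brozzle_page does once the logging and exception plumbing is stripped away (method names are the ones in this file; callbacks, robots handling, and error handling are omitted):

from brozzler import ydl

def brozzle_page_sketch(worker, browser, site, page, enable_youtube_dl=True):
    outlinks = set()
    ydl_fetches = None
    if enable_youtube_dl:
        # yt-dlp first: captures any media and reports what it fetched
        ydl_fetches, outlinks = ydl.do_youtube_dl(worker, site, page)
    if worker._needs_browsing(page, ydl_fetches):
        # html (or nothing fetched yet): load the page in a real browser
        outlinks.update(worker._browse_page(browser, site, page))
    elif not worker._already_fetched(page, ydl_fetches):
        # non-html that yt-dlp didn't capture: plain GET through the proxy
        worker._fetch_url(site, page=page)
    return outlinks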
@ -243,71 +296,88 @@ class BrozzlerWorker:
if self._using_warcprox(site): if self._using_warcprox(site):
self.logger.info( self.logger.info(
"sending WARCPROX_WRITE_RECORD request to %s with " "sending WARCPROX_WRITE_RECORD request to %s with "
"screenshot for %s", self._proxy_for(site), page) "screenshot for %s",
self._proxy_for(site),
page,
)
thumbnail_jpeg = self.thumb_jpeg(screenshot_jpeg) thumbnail_jpeg = self.thumb_jpeg(screenshot_jpeg)
self._warcprox_write_record( self._warcprox_write_record(
warcprox_address=self._proxy_for(site), warcprox_address=self._proxy_for(site),
url="screenshot:%s" % str(urlcanon.semantic(page.url)), url="screenshot:%s" % str(urlcanon.semantic(page.url)),
warc_type="resource", content_type="image/jpeg", warc_type="resource",
content_type="image/jpeg",
payload=screenshot_jpeg, payload=screenshot_jpeg,
extra_headers=site.extra_headers(page)) extra_headers=site.extra_headers(page),
)
self._warcprox_write_record( self._warcprox_write_record(
warcprox_address=self._proxy_for(site), warcprox_address=self._proxy_for(site),
url="thumbnail:%s" % str(urlcanon.semantic(page.url)), url="thumbnail:%s" % str(urlcanon.semantic(page.url)),
warc_type="resource", content_type="image/jpeg", warc_type="resource",
content_type="image/jpeg",
payload=thumbnail_jpeg, payload=thumbnail_jpeg,
extra_headers=site.extra_headers(page)) extra_headers=site.extra_headers(page),
)
def _on_response(chrome_msg): def _on_response(chrome_msg):
if ('params' in chrome_msg if (
and 'response' in chrome_msg['params'] "params" in chrome_msg
and 'mimeType' in chrome_msg['params']['response'] and "response" in chrome_msg["params"]
and chrome_msg['params']['response'].get('mimeType', '').startswith('video/') and "mimeType" in chrome_msg["params"]["response"]
and chrome_msg["params"]["response"]
.get("mimeType", "")
.startswith("video/")
# skip manifests of DASH segmented video - # skip manifests of DASH segmented video -
# see https://github.com/internetarchive/brozzler/pull/70 # see https://github.com/internetarchive/brozzler/pull/70
and chrome_msg['params']['response']['mimeType'] != 'video/vnd.mpeg.dash.mpd' and chrome_msg["params"]["response"]["mimeType"]
and chrome_msg['params']['response'].get('status') in (200, 206)): != "video/vnd.mpeg.dash.mpd"
and chrome_msg["params"]["response"].get("status") in (200, 206)
):
video = { video = {
'blame': 'browser', "blame": "browser",
'url': chrome_msg['params']['response'].get('url'), "url": chrome_msg["params"]["response"].get("url"),
'response_code': chrome_msg['params']['response']['status'], "response_code": chrome_msg["params"]["response"]["status"],
'content-type': chrome_msg['params']['response']['mimeType'], "content-type": chrome_msg["params"]["response"]["mimeType"],
} }
response_headers = CaseInsensitiveDict( response_headers = CaseInsensitiveDict(
chrome_msg['params']['response']['headers']) chrome_msg["params"]["response"]["headers"]
if 'content-length' in response_headers: )
video['content-length'] = int(response_headers['content-length']) if "content-length" in response_headers:
if 'content-range' in response_headers: video["content-length"] = int(response_headers["content-length"])
video['content-range'] = response_headers['content-range'] if "content-range" in response_headers:
logging.debug('embedded video %s', video) video["content-range"] = response_headers["content-range"]
if not 'videos' in page: logging.debug("embedded video %s", video)
if not "videos" in page:
page.videos = [] page.videos = []
page.videos.append(video) page.videos.append(video)
sw_fetched = set() sw_fetched = set()
def _on_service_worker_version_updated(chrome_msg): def _on_service_worker_version_updated(chrome_msg):
# https://github.com/internetarchive/brozzler/issues/140 # https://github.com/internetarchive/brozzler/issues/140
self.logger.trace('%r', chrome_msg) self.logger.trace("%r", chrome_msg)
if chrome_msg.get('params', {}).get('versions'): if chrome_msg.get("params", {}).get("versions"):
url = chrome_msg.get('params', {}).get('versions')[0]\ url = chrome_msg.get("params", {}).get("versions")[0].get("scriptURL")
.get('scriptURL')
if url and url not in sw_fetched: if url and url not in sw_fetched:
self.logger.info('fetching service worker script %s', url) self.logger.info("fetching service worker script %s", url)
self._fetch_url(site, url=url) self._fetch_url(site, url=url)
sw_fetched.add(url) sw_fetched.add(url)
if not browser.is_running(): if not browser.is_running():
browser.start( browser.start(
proxy=self._proxy_for(site), proxy=self._proxy_for(site),
cookie_db=site.get('cookie_db'), cookie_db=site.get("cookie_db"),
window_height=self._window_height, window_height=self._window_height,
window_width=self._window_width) window_width=self._window_width,
)
final_page_url, outlinks = browser.browse_page( final_page_url, outlinks = browser.browse_page(
page.url, extra_headers=site.extra_headers(page), page.url,
behavior_parameters=site.get('behavior_parameters'), extra_headers=site.extra_headers(page),
username=site.get('username'), password=site.get('password'), behavior_parameters=site.get("behavior_parameters"),
user_agent=site.get('user_agent'), username=site.get("username"),
on_screenshot=_on_screenshot, on_response=_on_response, password=site.get("password"),
user_agent=site.get("user_agent"),
on_screenshot=_on_screenshot,
on_response=_on_response,
on_request=on_request, on_request=on_request,
on_service_worker_version_updated=_on_service_worker_version_updated, on_service_worker_version_updated=_on_service_worker_version_updated,
hashtags=page.hashtags, hashtags=page.hashtags,
@ -320,7 +390,8 @@ class BrozzlerWorker:
behavior_timeout=self._behavior_timeout, behavior_timeout=self._behavior_timeout,
extract_outlinks_timeout=self._extract_outlinks_timeout, extract_outlinks_timeout=self._extract_outlinks_timeout,
download_throughput=self._download_throughput, download_throughput=self._download_throughput,
stealth=self._stealth) stealth=self._stealth,
)
if final_page_url != page.url: if final_page_url != page.url:
page.note_redirect(final_page_url) page.note_redirect(final_page_url)
return outlinks return outlinks
@ -331,19 +402,18 @@ class BrozzlerWorker:
url = page.url url = page.url
if self._proxy_for(site): if self._proxy_for(site):
proxies = { proxies = {
'http': 'http://%s' % self._proxy_for(site), "http": "http://%s" % self._proxy_for(site),
'https': 'http://%s' % self._proxy_for(site), "https": "http://%s" % self._proxy_for(site),
} }
self.logger.info('fetching %s', url) self.logger.info("fetching %s", url)
try: try:
# response is ignored # response is ignored
requests.get( requests.get(
url, proxies=proxies, headers=site.extra_headers(page), url, proxies=proxies, headers=site.extra_headers(page), verify=False
verify=False) )
except requests.exceptions.ProxyError as e: except requests.exceptions.ProxyError as e:
raise brozzler.ProxyError( raise brozzler.ProxyError("proxy error fetching %s" % url) from e
'proxy error fetching %s' % url) from e
def _needs_browsing(self, page, ydl_fetches): def _needs_browsing(self, page, ydl_fetches):
if ydl_fetches: if ydl_fetches:
@ -351,8 +421,10 @@ class BrozzlerWorker:
if not final_bounces: if not final_bounces:
return True return True
for txn in final_bounces: for txn in final_bounces:
if txn['response_headers'].get_content_type() in [ if txn["response_headers"].get_content_type() in [
'text/html', 'application/xhtml+xml']: "text/html",
"application/xhtml+xml",
]:
return True return True
return False return False
else: else:
@ -361,14 +433,13 @@ class BrozzlerWorker:
def _already_fetched(self, page, ydl_fetches): def _already_fetched(self, page, ydl_fetches):
if ydl_fetches: if ydl_fetches:
for fetch in ydl.final_bounces(ydl_fetches, page.url): for fetch in ydl.final_bounces(ydl_fetches, page.url):
if (fetch['method'] == 'GET' and fetch['response_code'] == 200): if fetch["method"] == "GET" and fetch["response_code"] == 200:
return True return True
return False return False
def brozzle_site(self, browser, site): def brozzle_site(self, browser, site):
try: try:
site.last_claimed_by = '%s:%s' % ( site.last_claimed_by = "%s:%s" % (socket.gethostname(), browser.chrome.port)
socket.gethostname(), browser.chrome.port)
site.save() site.save()
start = time.time() start = time.time()
page = None page = None
@ -377,28 +448,28 @@ class BrozzlerWorker:
# _proxy_for() call in log statement can raise brozzler.ProxyError # _proxy_for() call in log statement can raise brozzler.ProxyError
# which is why we honor time limit and stop request first☝🏻 # which is why we honor time limit and stop request first☝🏻
self.logger.info( self.logger.info(
"brozzling site (proxy=%r) %s", "brozzling site (proxy=%r) %s", self._proxy_for(site), site
self._proxy_for(site), site) )
while time.time() - start < self.SITE_SESSION_MINUTES * 60: while time.time() - start < self.SITE_SESSION_MINUTES * 60:
site.refresh() site.refresh()
self._frontier.enforce_time_limit(site) self._frontier.enforce_time_limit(site)
self._frontier.honor_stop_request(site) self._frontier.honor_stop_request(site)
page = self._frontier.claim_page(site, "%s:%s" % ( page = self._frontier.claim_page(
socket.gethostname(), browser.chrome.port)) site, "%s:%s" % (socket.gethostname(), browser.chrome.port)
)
if (page.needs_robots_check and if page.needs_robots_check and not brozzler.is_permitted_by_robots(
not brozzler.is_permitted_by_robots( site, page.url, self._proxy_for(site)
site, page.url, self._proxy_for(site))): ):
logging.warning("page %s is blocked by robots.txt", page.url) logging.warning("page %s is blocked by robots.txt", page.url)
page.blocked_by_robots = True page.blocked_by_robots = True
self._frontier.completed_page(site, page) self._frontier.completed_page(site, page)
else: else:
outlinks = self.brozzle_page( outlinks = self.brozzle_page(
browser, site, page, browser, site, page, enable_youtube_dl=not self._skip_youtube_dl
enable_youtube_dl=not self._skip_youtube_dl) )
self._frontier.completed_page(site, page) self._frontier.completed_page(site, page)
self._frontier.scope_and_schedule_outlinks( self._frontier.scope_and_schedule_outlinks(site, page, outlinks)
site, page, outlinks)
if browser.is_running(): if browser.is_running():
site.cookie_db = browser.chrome.persist_and_read_cookie_db() site.cookie_db = browser.chrome.persist_and_read_cookie_db()
@ -418,31 +489,36 @@ class BrozzlerWorker:
except brozzler.ProxyError as e: except brozzler.ProxyError as e:
if self._warcprox_auto: if self._warcprox_auto:
logging.error( logging.error(
'proxy error (site.proxy=%s), will try to choose a ' "proxy error (site.proxy=%s), will try to choose a "
'healthy instance next time site is brozzled: %s', "healthy instance next time site is brozzled: %s",
site.proxy, e) site.proxy,
e,
)
site.proxy = None site.proxy = None
else: else:
# using brozzler-worker --proxy, nothing to do but try the # using brozzler-worker --proxy, nothing to do but try the
# same proxy again next time # same proxy again next time
logging.error( logging.error("proxy error (self._proxy=%r)", self._proxy, exc_info=1)
'proxy error (self._proxy=%r)', self._proxy, exc_info=1)
except: except:
self.logger.error( self.logger.error(
'unexpected exception site=%r page=%r', site, page, "unexpected exception site=%r page=%r", site, page, exc_info=True
exc_info=True) )
if page: if page:
page.failed_attempts = (page.failed_attempts or 0) + 1 page.failed_attempts = (page.failed_attempts or 0) + 1
if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES: if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES:
self.logger.info( self.logger.info(
'marking page "completed" after %s unexpected ' 'marking page "completed" after %s unexpected '
'exceptions attempting to brozzle %s', "exceptions attempting to brozzle %s",
page.failed_attempts, page) page.failed_attempts,
page,
)
self._frontier.completed_page(site, page) self._frontier.completed_page(site, page)
page = None page = None
finally: finally:
if start: if start:
site.active_brozzling_time = (site.active_brozzling_time or 0) + time.time() - start site.active_brozzling_time = (
(site.active_brozzling_time or 0) + time.time() - start
)
self._frontier.disclaim_site(site, page) self._frontier.disclaim_site(site, page)
def _brozzle_site_thread_target(self, browser, site): def _brozzle_site_thread_target(self, browser, site):
@ -462,21 +538,25 @@ class BrozzlerWorker:
"role": "brozzler-worker", "role": "brozzler-worker",
"ttl": self.HEARTBEAT_INTERVAL * 3, "ttl": self.HEARTBEAT_INTERVAL * 3,
} }
status_info["load"] = 1.0 * self._browser_pool.num_in_use() / self._browser_pool.size status_info["load"] = (
1.0 * self._browser_pool.num_in_use() / self._browser_pool.size
)
status_info["browser_pool_size"] = self._browser_pool.size status_info["browser_pool_size"] = self._browser_pool.size
status_info["browsers_in_use"] = self._browser_pool.num_in_use() status_info["browsers_in_use"] = self._browser_pool.num_in_use()
try: try:
self.status_info = self._service_registry.heartbeat(status_info) self.status_info = self._service_registry.heartbeat(status_info)
self.logger.trace( self.logger.trace("status in service registry: %s", self.status_info)
"status in service registry: %s", self.status_info)
except r.ReqlError as e: except r.ReqlError as e:
self.logger.error( self.logger.error(
"failed to send heartbeat and update service registry " "failed to send heartbeat and update service registry "
"with info %s: %s", status_info, e) "with info %s: %s",
status_info,
e,
)
def _service_heartbeat_if_due(self): def _service_heartbeat_if_due(self):
'''Sends service registry heartbeat if due''' """Sends service registry heartbeat if due"""
due = False due = False
if self._service_registry: if self._service_registry:
if not hasattr(self, "status_info"): if not hasattr(self, "status_info"):
@ -489,15 +569,16 @@ class BrozzlerWorker:
self._service_heartbeat() self._service_heartbeat()
def _start_browsing_some_sites(self): def _start_browsing_some_sites(self):
''' """
Starts browsing some sites. Starts browsing some sites.
Raises: Raises:
NoBrowsersAvailable if none available NoBrowsersAvailable if none available
''' """
# acquire_multi() raises NoBrowsersAvailable if none available # acquire_multi() raises NoBrowsersAvailable if none available
browsers = self._browser_pool.acquire_multi( browsers = self._browser_pool.acquire_multi(
(self._browser_pool.num_available() + 1) // 2) (self._browser_pool.num_available() + 1) // 2
)
try: try:
sites = self._frontier.claim_sites(len(browsers)) sites = self._frontier.claim_sites(len(browsers))
except: except:
@ -510,7 +591,8 @@ class BrozzlerWorker:
target=self._brozzle_site_thread_target, target=self._brozzle_site_thread_target,
args=(browsers[i], sites[i]), args=(browsers[i], sites[i]),
name="BrozzlingThread:%s" % browsers[i].chrome.port, name="BrozzlingThread:%s" % browsers[i].chrome.port,
daemon=True) daemon=True,
)
with self._browsing_threads_lock: with self._browsing_threads_lock:
self._browsing_threads.add(th) self._browsing_threads.add(th)
th.start() th.start()
@ -519,7 +601,8 @@ class BrozzlerWorker:
def run(self): def run(self):
self.logger.notice( self.logger.notice(
'brozzler %s - brozzler-worker starting', brozzler.__version__) "brozzler %s - brozzler-worker starting", brozzler.__version__
)
last_nothing_to_claim = 0 last_nothing_to_claim = 0
try: try:
while not self._shutdown.is_set(): while not self._shutdown.is_set():
@ -528,39 +611,38 @@ class BrozzlerWorker:
try: try:
self._start_browsing_some_sites() self._start_browsing_some_sites()
except brozzler.browser.NoBrowsersAvailable: except brozzler.browser.NoBrowsersAvailable:
logging.trace( logging.trace("all %s browsers are in use", self._max_browsers)
"all %s browsers are in use",
self._max_browsers)
except brozzler.NothingToClaim: except brozzler.NothingToClaim:
last_nothing_to_claim = time.time() last_nothing_to_claim = time.time()
logging.trace( logging.trace(
"nothing to claim, all available active sites " "nothing to claim, all available active sites "
"are already claimed by a brozzler worker") "are already claimed by a brozzler worker"
)
time.sleep(0.5) time.sleep(0.5)
self.logger.notice("shutdown requested") self.logger.notice("shutdown requested")
except r.ReqlError as e: except r.ReqlError as e:
self.logger.error( self.logger.error(
"caught rethinkdb exception, will try to proceed", "caught rethinkdb exception, will try to proceed", exc_info=True
exc_info=True) )
except brozzler.ShutdownRequested: except brozzler.ShutdownRequested:
self.logger.info("shutdown requested") self.logger.info("shutdown requested")
except: except:
self.logger.critical( self.logger.critical(
"thread exiting due to unexpected exception", "thread exiting due to unexpected exception", exc_info=True
exc_info=True) )
finally: finally:
if self._service_registry and hasattr(self, "status_info"): if self._service_registry and hasattr(self, "status_info"):
try: try:
self._service_registry.unregister(self.status_info["id"]) self._service_registry.unregister(self.status_info["id"])
except: except:
self.logger.error( self.logger.error(
"failed to unregister from service registry", "failed to unregister from service registry", exc_info=True
exc_info=True) )
self.logger.info( self.logger.info(
'shutting down %s brozzling threads', "shutting down %s brozzling threads", len(self._browsing_threads)
len(self._browsing_threads)) )
with self._browsing_threads_lock: with self._browsing_threads_lock:
for th in self._browsing_threads: for th in self._browsing_threads:
if th.is_alive(): if th.is_alive():
@ -575,11 +657,10 @@ class BrozzlerWorker:
with self._start_stop_lock: with self._start_stop_lock:
if self._thread: if self._thread:
self.logger.warning( self.logger.warning(
'ignoring start request because self._thread is ' "ignoring start request because self._thread is " "not None"
'not None') )
return return
self._thread = threading.Thread( self._thread = threading.Thread(target=self.run, name="BrozzlerWorker")
target=self.run, name="BrozzlerWorker")
self._thread.start() self._thread.start()
def shutdown_now(self): def shutdown_now(self):
@ -590,4 +671,3 @@ class BrozzlerWorker:
def is_alive(self): def is_alive(self):
return self._thread and self._thread.is_alive() return self._thread and self._thread.is_alive()
brozzler/ydl.py
@ -1,4 +1,4 @@
''' """
brozzler/ydl.py - youtube-dl / yt-dlp support for brozzler brozzler/ydl.py - youtube-dl / yt-dlp support for brozzler
Copyright (C) 2023 Internet Archive Copyright (C) 2023 Internet Archive
@ -14,7 +14,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
''' """
import logging import logging
import yt_dlp import yt_dlp
@ -31,6 +31,7 @@ import threading
thread_local = threading.local() thread_local = threading.local()
class ExtraHeaderAdder(urllib.request.BaseHandler): class ExtraHeaderAdder(urllib.request.BaseHandler):
def __init__(self, extra_headers): def __init__(self, extra_headers):
self.extra_headers = extra_headers self.extra_headers = extra_headers
@ -43,6 +44,7 @@ class ExtraHeaderAdder(urllib.request.BaseHandler):
req.add_header(h, v) req.add_header(h, v)
return req return req
class YoutubeDLSpy(urllib.request.BaseHandler): class YoutubeDLSpy(urllib.request.BaseHandler):
logger = logging.getLogger(__module__ + "." + __qualname__) logger = logging.getLogger(__module__ + "." + __qualname__)
@ -51,10 +53,10 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
def _http_response(self, request, response): def _http_response(self, request, response):
fetch = { fetch = {
'url': request.full_url, "url": request.full_url,
'method': request.get_method(), "method": request.get_method(),
'response_code': response.code, "response_code": response.code,
'response_headers': response.headers, "response_headers": response.headers,
} }
self.fetches.append(fetch) self.fetches.append(fetch)
return response return response
@ -64,6 +66,7 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
def reset(self): def reset(self):
self.fetches = [] self.fetches = []
def final_bounces(fetches, url): def final_bounces(fetches, url):
""" """
Resolves redirect chains in `fetches` and returns a list of fetches Resolves redirect chains in `fetches` and returns a list of fetches
@ -75,24 +78,26 @@ def final_bounces(fetches, url):
for fetch in fetches: for fetch in fetches:
# XXX check http status 301,302,303,307? check for "uri" header # XXX check http status 301,302,303,307? check for "uri" header
# as well as "location"? see urllib.request.HTTPRedirectHandler # as well as "location"? see urllib.request.HTTPRedirectHandler
if 'location' in fetch['response_headers']: if "location" in fetch["response_headers"]:
redirects[fetch['url']] = fetch redirects[fetch["url"]] = fetch
final_url = url final_url = url
while final_url in redirects: while final_url in redirects:
fetch = redirects.pop(final_url) fetch = redirects.pop(final_url)
final_url = urllib.parse.urljoin( final_url = urllib.parse.urljoin(
fetch['url'], fetch['response_headers']['location']) fetch["url"], fetch["response_headers"]["location"]
)
final_bounces = [] final_bounces = []
for fetch in fetches: for fetch in fetches:
if fetch['url'] == final_url: if fetch["url"] == final_url:
final_bounces.append(fetch) final_bounces.append(fetch)
return final_bounces return final_bounces
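A small, self-contained example of final_bounces resolving a redirect chain; the fetch dicts are invented, and a plain dict stands in for the headers object that real fetches carry (final_bounces only needs key lookup):

from brozzler import ydl

# the first URL 302s to the second, which serves the actual video
fetches = [
    {
        "url": "https://example.com/video",
        "method": "GET",
        "response_code": 302,
        "response_headers": {"location": "https://cdn.example.com/video.mp4"},
    },
    {
        "url": "https://cdn.example.com/video.mp4",
        "method": "GET",
        "response_code": 200,
        "response_headers": {},
    },
]
print(ydl.final_bounces(fetches, "https://example.com/video"))
# -> only the fetch of https://cdn.example.com/video.mp4, the end of the chain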
def _build_youtube_dl(worker, destdir, site, page): def _build_youtube_dl(worker, destdir, site, page):
''' """
Builds a yt-dlp `yt_dlp.YoutubeDL` for brozzling `site` with `worker`. Builds a yt-dlp `yt_dlp.YoutubeDL` for brozzling `site` with `worker`.
The `YoutubeDL` instance does a few special brozzler-specific things: The `YoutubeDL` instance does a few special brozzler-specific things:
@ -109,7 +114,7 @@ def _build_youtube_dl(worker, destdir, site, page):
Returns: Returns:
a yt-dlp `yt_dlp.YoutubeDL` instance a yt-dlp `yt_dlp.YoutubeDL` instance
''' """
class _YoutubeDL(yt_dlp.YoutubeDL): class _YoutubeDL(yt_dlp.YoutubeDL):
logger = logging.getLogger(__module__ + "." + __qualname__) logger = logging.getLogger(__module__ + "." + __qualname__)
@ -117,31 +122,38 @@ def _build_youtube_dl(worker, destdir, site, page):
def add_default_extra_info(self, ie_result, ie, url): def add_default_extra_info(self, ie_result, ie, url):
# hook in some logging # hook in some logging
super().add_default_extra_info(ie_result, ie, url) super().add_default_extra_info(ie_result, ie, url)
if ie_result.get('_type') == 'playlist': if ie_result.get("_type") == "playlist":
self.logger.info( self.logger.info("extractor %r found playlist in %s", ie.IE_NAME, url)
'extractor %r found playlist in %s', ie.IE_NAME, url) if ie.IE_NAME in {
if ie.IE_NAME in {'youtube:playlist', 'youtube:tab', 'soundcloud:user', 'instagram:user'}: "youtube:playlist",
"youtube:tab",
"soundcloud:user",
"instagram:user",
}:
# At this point ie_result['entries'] is an iterator that # At this point ie_result['entries'] is an iterator that
# will fetch more metadata from youtube to list all the # will fetch more metadata from youtube to list all the
# videos. We unroll that iterator here partly because # videos. We unroll that iterator here partly because
# otherwise `process_ie_result()` will clobber it, and we # otherwise `process_ie_result()` will clobber it, and we
# use it later to extract the watch pages as outlinks. # use it later to extract the watch pages as outlinks.
try: try:
ie_result['entries_no_dl'] = list(ie_result['entries']) ie_result["entries_no_dl"] = list(ie_result["entries"])
except Exception as e: except Exception as e:
self.logger.warning( self.logger.warning(
"failed to unroll ie_result['entries']? for %s, %s; exception %s", "failed to unroll ie_result['entries']? for %s, %s; exception %s",
ie.IE_NAME, url, e) ie.IE_NAME,
ie_result['entries_no_dl'] =[] url,
ie_result['entries'] = [] e,
)
ie_result["entries_no_dl"] = []
ie_result["entries"] = []
self.logger.info( self.logger.info(
'not downloading %s media files from this ' "not downloading %s media files from this "
'playlist because we expect to capture them from ' "playlist because we expect to capture them from "
'individual watch/track/detail pages', "individual watch/track/detail pages",
len(ie_result['entries_no_dl'])) len(ie_result["entries_no_dl"]),
)
else: else:
self.logger.info( self.logger.info("extractor %r found a download in %s", ie.IE_NAME, url)
'extractor %r found a download in %s', ie.IE_NAME, url)
def _push_video_to_warcprox(self, site, info_dict, postprocessor): def _push_video_to_warcprox(self, site, info_dict, postprocessor):
# 220211 update: does yt-dlp supply content-type? no, not as such # 220211 update: does yt-dlp supply content-type? no, not as such
@ -150,73 +162,96 @@ def _build_youtube_dl(worker, destdir, site, page):
# youtube-dl produces a stitched-up video that /usr/bin/file fails # youtube-dl produces a stitched-up video that /usr/bin/file fails
# to identify (says "application/octet-stream"). `ffprobe` doesn't # to identify (says "application/octet-stream"). `ffprobe` doesn't
# give us a mimetype. # give us a mimetype.
if info_dict.get('ext') == 'mp4': if info_dict.get("ext") == "mp4":
mimetype = 'video/mp4' mimetype = "video/mp4"
else: else:
try: try:
import magic import magic
mimetype = magic.from_file(info_dict['filepath'], mime=True)
mimetype = magic.from_file(info_dict["filepath"], mime=True)
except ImportError as e: except ImportError as e:
mimetype = 'video/%s' % info_dict['ext'] mimetype = "video/%s" % info_dict["ext"]
self.logger.warning( self.logger.warning("guessing mimetype %s because %r", mimetype, e)
'guessing mimetype %s because %r', mimetype, e)
# youtube watch page postprocessor is MoveFiles # youtube watch page postprocessor is MoveFiles
if postprocessor == 'FixupM3u8' or postprocessor == 'Merger': if postprocessor == "FixupM3u8" or postprocessor == "Merger":
url = 'youtube-dl:%05d:%s' % ( url = "youtube-dl:%05d:%s" % (
info_dict.get('playlist_index') or 1, info_dict.get("playlist_index") or 1,
info_dict['webpage_url']) info_dict["webpage_url"],
)
else: else:
url = info_dict.get('url', '') url = info_dict.get("url", "")
# skip urls ending .m3u8, to avoid duplicates handled by FixupM3u8 # skip urls ending .m3u8, to avoid duplicates handled by FixupM3u8
if url.endswith('.m3u8') or url == '': if url.endswith(".m3u8") or url == "":
return return
size = os.path.getsize(info_dict['filepath']) size = os.path.getsize(info_dict["filepath"])
self.logger.info( self.logger.info(
'pushing %r video as %s (%s bytes) to ' "pushing %r video as %s (%s bytes) to " "warcprox at %s with url %s",
'warcprox at %s with url %s', info_dict['format'], info_dict["format"],
mimetype, size, worker._proxy_for(site), url) mimetype,
with open(info_dict['filepath'], 'rb') as f: size,
worker._proxy_for(site),
url,
)
with open(info_dict["filepath"], "rb") as f:
# include content-length header to avoid chunked # include content-length header to avoid chunked
# transfer, which warcprox currently rejects # transfer, which warcprox currently rejects
extra_headers = dict(site.extra_headers()) extra_headers = dict(site.extra_headers())
extra_headers['content-length'] = size extra_headers["content-length"] = size
request, response = worker._warcprox_write_record( request, response = worker._warcprox_write_record(
warcprox_address=worker._proxy_for(site), url=url, warcprox_address=worker._proxy_for(site),
warc_type='resource', content_type=mimetype, payload=f, url=url,
extra_headers=extra_headers) warc_type="resource",
content_type=mimetype,
payload=f,
extra_headers=extra_headers,
)
# consulted by _remember_videos() # consulted by _remember_videos()
ydl.pushed_videos.append({ ydl.pushed_videos.append(
'url': url, {
'response_code': response.code, "url": url,
'content-type': mimetype, "response_code": response.code,
'content-length': size, "content-type": mimetype,
}) "content-length": size,
}
)
def maybe_heartbeat_site_last_claimed(*args, **kwargs): def maybe_heartbeat_site_last_claimed(*args, **kwargs):
# in case yt-dlp takes a long time, heartbeat site.last_claimed # in case yt-dlp takes a long time, heartbeat site.last_claimed
# to prevent another brozzler-worker from claiming the site # to prevent another brozzler-worker from claiming the site
try: try:
if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=worker.SITE_SESSION_MINUTES): if (
site.rr
and doublethink.utcnow() - site.last_claimed
> datetime.timedelta(minutes=worker.SITE_SESSION_MINUTES)
):
worker.logger.debug( worker.logger.debug(
'heartbeating site.last_claimed to prevent another ' "heartbeating site.last_claimed to prevent another "
'brozzler-worker claiming this site id=%r', site.id) "brozzler-worker claiming this site id=%r",
site.id,
)
site.last_claimed = doublethink.utcnow() site.last_claimed = doublethink.utcnow()
site.save() site.save()
except: except:
worker.logger.debug( worker.logger.debug(
'problem heartbeating site.last_claimed site id=%r', "problem heartbeating site.last_claimed site id=%r",
site.id, exc_info=True) site.id,
exc_info=True,
)
def ydl_postprocess_hook(d): def ydl_postprocess_hook(d):
if d['status'] == 'finished': if d["status"] == "finished":
worker.logger.info('[ydl_postprocess_hook] Finished postprocessing') worker.logger.info("[ydl_postprocess_hook] Finished postprocessing")
worker.logger.info('[ydl_postprocess_hook] postprocessor: {}'.format(d['postprocessor'])) worker.logger.info(
"[ydl_postprocess_hook] postprocessor: {}".format(d["postprocessor"])
)
if worker._using_warcprox(site): if worker._using_warcprox(site):
_YoutubeDL._push_video_to_warcprox(_YoutubeDL, site, d['info_dict'], d['postprocessor']) _YoutubeDL._push_video_to_warcprox(
_YoutubeDL, site, d["info_dict"], d["postprocessor"]
)
# default socket_timeout is 20 -- we hit it often when cluster is busy # default socket_timeout is 20 -- we hit it often when cluster is busy
ydl_opts = { ydl_opts = {
@ -230,7 +265,6 @@ def _build_youtube_dl(worker, destdir, site, page):
"socket_timeout": 40, "socket_timeout": 40,
"progress_hooks": [maybe_heartbeat_site_last_claimed], "progress_hooks": [maybe_heartbeat_site_last_claimed],
"postprocessor_hooks": [ydl_postprocess_hook], "postprocessor_hooks": [ydl_postprocess_hook],
# https://github.com/yt-dlp/yt-dlp#format-selection # https://github.com/yt-dlp/yt-dlp#format-selection
# "By default, yt-dlp tries to download the best available quality..." # "By default, yt-dlp tries to download the best available quality..."
# pre-v.2023.07.06: "format_sort": ["ext"], # pre-v.2023.07.06: "format_sort": ["ext"],
@ -241,13 +275,10 @@ def _build_youtube_dl(worker, destdir, site, page):
"format_sort": ["res:720", "vcodec:h264", "acodec:aac"], "format_sort": ["res:720", "vcodec:h264", "acodec:aac"],
# skip live streams # skip live streams
"match_filter": match_filter_func("!is_live"), "match_filter": match_filter_func("!is_live"),
"extractor_args": {"youtube": {"skip": ["dash", "hls"]}},
"extractor_args": {'youtube': {'skip': ['dash', 'hls']}},
# --cache-dir local or.. # --cache-dir local or..
        # this looked like a problem with nfs-mounted homedir, shouldn't be a problem for brozzler on focal?         # this looked like a problem with nfs-mounted homedir, shouldn't be a problem for brozzler on focal?
"cache_dir": "/home/archiveit", "cache_dir": "/home/archiveit",
"logger": logging.getLogger("yt_dlp"), "logger": logging.getLogger("yt_dlp"),
"verbose": False, "verbose": False,
"quiet": False, "quiet": False,
@ -265,49 +296,53 @@ def _build_youtube_dl(worker, destdir, site, page):
ydl._opener.add_handler(ydl.fetch_spy) ydl._opener.add_handler(ydl.fetch_spy)
return ydl return ydl
def _remember_videos(page, fetches, pushed_videos=None): def _remember_videos(page, fetches, pushed_videos=None):
''' """
Saves info about videos captured by yt-dlp in `page.videos`. Saves info about videos captured by yt-dlp in `page.videos`.
''' """
if not 'videos' in page: if not "videos" in page:
page.videos = [] page.videos = []
for fetch in fetches or []: for fetch in fetches or []:
content_type = fetch['response_headers'].get_content_type() content_type = fetch["response_headers"].get_content_type()
if (content_type.startswith('video/') if (
content_type.startswith("video/")
# skip manifests of DASH segmented video - # skip manifests of DASH segmented video -
# see https://github.com/internetarchive/brozzler/pull/70 # see https://github.com/internetarchive/brozzler/pull/70
and content_type != 'video/vnd.mpeg.dash.mpd' and content_type != "video/vnd.mpeg.dash.mpd"
and fetch['method'] == 'GET' and fetch["method"] == "GET"
and fetch['response_code'] in (200, 206)): and fetch["response_code"] in (200, 206)
):
video = { video = {
'blame': 'youtube-dl', "blame": "youtube-dl",
'url': fetch['url'], "url": fetch["url"],
'response_code': fetch['response_code'], "response_code": fetch["response_code"],
'content-type': content_type, "content-type": content_type,
} }
if 'content-length' in fetch['response_headers']: if "content-length" in fetch["response_headers"]:
video['content-length'] = int( video["content-length"] = int(
fetch['response_headers']['content-length']) fetch["response_headers"]["content-length"]
if 'content-range' in fetch['response_headers']: )
if "content-range" in fetch["response_headers"]:
# skip chunked youtube video # skip chunked youtube video
if 'googlevideo.com/videoplayback' in fetch['url']: if "googlevideo.com/videoplayback" in fetch["url"]:
continue continue
video['content-range'] = fetch[ video["content-range"] = fetch["response_headers"]["content-range"]
'response_headers']['content-range'] logging.debug("embedded video %s", video)
logging.debug('embedded video %s', video)
page.videos.append(video) page.videos.append(video)
for pushed_video in pushed_videos or []: for pushed_video in pushed_videos or []:
if pushed_video['content-type'].startswith('video/'): if pushed_video["content-type"].startswith("video/"):
video = { video = {
'blame': 'youtube-dl', "blame": "youtube-dl",
'url': pushed_video['url'], "url": pushed_video["url"],
'response_code': pushed_video['response_code'], "response_code": pushed_video["response_code"],
'content-type': pushed_video['content-type'], "content-type": pushed_video["content-type"],
'content-length': pushed_video['content-length'], "content-length": pushed_video["content-length"],
} }
logging.debug('embedded video %s', video) logging.debug("embedded video %s", video)
page.videos.append(video) page.videos.append(video)
def _try_youtube_dl(worker, ydl, site, page): def _try_youtube_dl(worker, ydl, site, page):
try: try:
logging.info("trying yt-dlp on %s", page) logging.info("trying yt-dlp on %s", page)
@ -317,43 +352,53 @@ def _try_youtube_dl(worker, ydl, site, page):
# no host given>" resulting in ProxyError # no host given>" resulting in ProxyError
# needs automated test # needs automated test
# and yt-dlp needs sanitize_info for extract_info # and yt-dlp needs sanitize_info for extract_info
ie_result = ydl.sanitize_info(ydl.extract_info(str(urlcanon.whatwg(page.url)))) ie_result = ydl.sanitize_info(
ydl.extract_info(str(urlcanon.whatwg(page.url)))
)
_remember_videos(page, ydl.fetch_spy.fetches, ydl.pushed_videos) _remember_videos(page, ydl.fetch_spy.fetches, ydl.pushed_videos)
if worker._using_warcprox(site): if worker._using_warcprox(site):
info_json = json.dumps(ie_result, sort_keys=True, indent=4) info_json = json.dumps(ie_result, sort_keys=True, indent=4)
logging.info( logging.info(
"sending WARCPROX_WRITE_RECORD request to warcprox " "sending WARCPROX_WRITE_RECORD request to warcprox "
"with yt-dlp json for %s", page) "with yt-dlp json for %s",
page,
)
worker._warcprox_write_record( worker._warcprox_write_record(
warcprox_address=worker._proxy_for(site), warcprox_address=worker._proxy_for(site),
url="youtube-dl:%s" % str(urlcanon.semantic(page.url)), url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
warc_type="metadata", warc_type="metadata",
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
payload=info_json.encode("utf-8"), payload=info_json.encode("utf-8"),
extra_headers=site.extra_headers(page)) extra_headers=site.extra_headers(page),
)
return ie_result return ie_result
except brozzler.ShutdownRequested as e: except brozzler.ShutdownRequested as e:
raise raise
except Exception as e: except Exception as e:
if hasattr(e, "exc_info") and e.exc_info[0] == yt_dlp.utils.UnsupportedError: if hasattr(e, "exc_info") and e.exc_info[0] == yt_dlp.utils.UnsupportedError:
return None return None
elif (hasattr(e, "exc_info") elif (
hasattr(e, "exc_info")
and e.exc_info[0] == urllib.error.HTTPError and e.exc_info[0] == urllib.error.HTTPError
and hasattr(e.exc_info[1], "code") and hasattr(e.exc_info[1], "code")
and e.exc_info[1].code == 420): and e.exc_info[1].code == 420
):
raise brozzler.ReachedLimit(e.exc_info[1]) raise brozzler.ReachedLimit(e.exc_info[1])
elif (hasattr(e, 'exc_info') elif (
hasattr(e, "exc_info")
and e.exc_info[0] == urllib.error.URLError and e.exc_info[0] == urllib.error.URLError
and worker._proxy_for(site)): and worker._proxy_for(site)
):
# connection problem when using a proxy == proxy error (XXX?) # connection problem when using a proxy == proxy error (XXX?)
raise brozzler.ProxyError( raise brozzler.ProxyError(
'yt-dlp hit apparent proxy error from ' "yt-dlp hit apparent proxy error from " "%s" % page.url
'%s' % page.url) from e ) from e
else: else:
raise raise
def do_youtube_dl(worker, site, page): def do_youtube_dl(worker, site, page):
''' """
Runs yt-dlp configured for `worker` and `site` to download videos from Runs yt-dlp configured for `worker` and `site` to download videos from
`page`. `page`.
@ -372,15 +417,19 @@ def do_youtube_dl(worker, site, page):
'response_headers': ..., 'response_headers': ...,
}, ...] }, ...]
`list` of `str`: outlink urls `list` of `str`: outlink urls
''' """
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir: with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
ydl = _build_youtube_dl(worker, tempdir, site, page) ydl = _build_youtube_dl(worker, tempdir, site, page)
ie_result = _try_youtube_dl(worker, ydl, site, page) ie_result = _try_youtube_dl(worker, ydl, site, page)
outlinks = set() outlinks = set()
if ie_result and (ie_result.get('extractor') == 'youtube:playlist' or if ie_result and (
ie_result.get('extractor') == 'youtube:tab'): ie_result.get("extractor") == "youtube:playlist"
or ie_result.get("extractor") == "youtube:tab"
):
# youtube watch pages as outlinks # youtube watch pages as outlinks
outlinks = {'https://www.youtube.com/watch?v=%s' % e['id'] outlinks = {
for e in ie_result.get('entries_no_dl', [])} "https://www.youtube.com/watch?v=%s" % e["id"]
for e in ie_result.get("entries_no_dl", [])
}
# any outlinks for other cases? # any outlinks for other cases?
return ydl.fetch_spy.fetches, outlinks return ydl.fetch_spy.fetches, outlinks

setup.py
@ -1,5 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
''' """
setup.py - brozzler setup script setup.py - brozzler setup script
Copyright (C) 2014-2024 Internet Archive Copyright (C) 2014-2024 Internet Archive
@ -15,88 +15,88 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
''' """
import setuptools import setuptools
import os import os
def find_package_data(package): def find_package_data(package):
pkg_data = [] pkg_data = []
depth = len(package.split('.')) depth = len(package.split("."))
path = os.path.join(*package.split('.')) path = os.path.join(*package.split("."))
for dirpath, dirnames, filenames in os.walk(path): for dirpath, dirnames, filenames in os.walk(path):
if not os.path.exists(os.path.join(dirpath, '__init__.py')): if not os.path.exists(os.path.join(dirpath, "__init__.py")):
relpath = os.path.join(*dirpath.split(os.sep)[depth:]) relpath = os.path.join(*dirpath.split(os.sep)[depth:])
pkg_data.extend(os.path.join(relpath, f) for f in filenames) pkg_data.extend(os.path.join(relpath, f) for f in filenames)
return pkg_data return pkg_data
setuptools.setup( setuptools.setup(
name='brozzler', name="brozzler",
version='1.5.45a0', version="1.5.45a1",
description='Distributed web crawling with browsers', description="Distributed web crawling with browsers",
url='https://github.com/internetarchive/brozzler', url="https://github.com/internetarchive/brozzler",
author='Noah Levitt', author="Noah Levitt",
author_email='nlevitt@archive.org', author_email="nlevitt@archive.org",
long_description=open('README.rst', mode='rb').read().decode('UTF-8'), long_description=open("README.rst", mode="rb").read().decode("UTF-8"),
license='Apache License 2.0', license="Apache License 2.0",
packages=['brozzler', 'brozzler.dashboard'], packages=["brozzler", "brozzler.dashboard"],
package_data={ package_data={
'brozzler': [ "brozzler": ["js-templates/*.js*", "behaviors.yaml", "job_schema.yaml"],
'js-templates/*.js*', 'behaviors.yaml', 'job_schema.yaml'], "brozzler.dashboard": find_package_data("brozzler.dashboard"),
'brozzler.dashboard': find_package_data('brozzler.dashboard'),
}, },
entry_points={ entry_points={
'console_scripts': [ "console_scripts": [
'brozzle-page=brozzler.cli:brozzle_page', "brozzle-page=brozzler.cli:brozzle_page",
'brozzler-new-job=brozzler.cli:brozzler_new_job', "brozzler-new-job=brozzler.cli:brozzler_new_job",
'brozzler-new-site=brozzler.cli:brozzler_new_site', "brozzler-new-site=brozzler.cli:brozzler_new_site",
'brozzler-worker=brozzler.cli:brozzler_worker', "brozzler-worker=brozzler.cli:brozzler_worker",
'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables', "brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables",
'brozzler-list-captures=brozzler.cli:brozzler_list_captures', "brozzler-list-captures=brozzler.cli:brozzler_list_captures",
'brozzler-list-jobs=brozzler.cli:brozzler_list_jobs', "brozzler-list-jobs=brozzler.cli:brozzler_list_jobs",
'brozzler-list-sites=brozzler.cli:brozzler_list_sites', "brozzler-list-sites=brozzler.cli:brozzler_list_sites",
'brozzler-list-pages=brozzler.cli:brozzler_list_pages', "brozzler-list-pages=brozzler.cli:brozzler_list_pages",
'brozzler-stop-crawl=brozzler.cli:brozzler_stop_crawl', "brozzler-stop-crawl=brozzler.cli:brozzler_stop_crawl",
'brozzler-purge=brozzler.cli:brozzler_purge', "brozzler-purge=brozzler.cli:brozzler_purge",
'brozzler-dashboard=brozzler.dashboard:main', "brozzler-dashboard=brozzler.dashboard:main",
'brozzler-easy=brozzler.easy:main', "brozzler-easy=brozzler.easy:main",
'brozzler-wayback=brozzler.pywb:main', "brozzler-wayback=brozzler.pywb:main",
], ],
}, },
install_requires=[ install_requires=[
'PyYAML>=5.1', "PyYAML>=5.1",
'reppy==0.3.4', "yt_dlp<2023.11.16",
'requests>=2.21', "reppy==0.3.4",
'websocket-client>=0.39.0,<=0.48.0', "requests>=2.21",
'pillow>=5.2.0', "websocket-client>=0.39.0,<=0.48.0",
'urlcanon>=0.1.dev23', "pillow>=5.2.0",
'doublethink @ git+https://github.com/internetarchive/doublethink.git@Py311', "urlcanon>=0.1.dev23",
'rethinkdb==2.4.9', "doublethink @ git+https://github.com/internetarchive/doublethink.git@Py311",
'cerberus>=1.0.1', "rethinkdb<2.4.10",
'jinja2>=2.10', "cerberus>=1.0.1",
'cryptography>=2.3', "jinja2>=2.10",
'python-magic>=0.4.15', "cryptography>=2.3",
"python-magic>=0.4.15",
], ],
extras_require={ extras_require={
'dashboard': [ "dashboard": ["flask>=1.0", "gunicorn>=19.8.1"],
'flask>=1.0', "easy": [
'gunicorn>=19.8.1' "warcprox>=2.4.31",
], "pywb>=0.33.2,<2",
'easy': [ "flask>=1.0",
'warcprox>=2.4.31', "gunicorn>=19.8.1",
'pywb>=0.33.2,<2',
'flask>=1.0',
'gunicorn>=19.8.1'
], ],
}, },
zip_safe=False, zip_safe=False,
classifiers=[ classifiers=[
'Development Status :: 5 - Production/Stable', "Development Status :: 5 - Production/Stable",
'Environment :: Console', "Environment :: Console",
'License :: OSI Approved :: Apache Software License', "License :: OSI Approved :: Apache Software License",
'Programming Language :: Python :: 3.8', "Programming Language :: Python :: 3.5",
'Programming Language :: Python :: 3.9', "Programming Language :: Python :: 3.6",
'Programming Language :: Python :: 3.10', "Programming Language :: Python :: 3.7",
'Topic :: Internet :: WWW/HTTP', "Topic :: Internet :: WWW/HTTP",
'Topic :: System :: Archiving', "Topic :: System :: Archiving",
]) ],
)
test_brozzling.py
@ -1,5 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
''' """
test_brozzling.py - XXX explain test_brozzling.py - XXX explain
Copyright (C) 2016-2018 Internet Archive Copyright (C) 2016-2018 Internet Archive
@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
''' """
import pytest import pytest
import brozzler import brozzler
@ -34,79 +34,81 @@ args.log_level = logging.INFO
brozzler.cli.configure_logging(args) brozzler.cli.configure_logging(args)
WARCPROX_META_420 = { WARCPROX_META_420 = {
'stats': { "stats": {
'test_limits_bucket': { "test_limits_bucket": {
'total': {'urls': 0, 'wire_bytes': 0}, "total": {"urls": 0, "wire_bytes": 0},
'new': {'urls': 0, 'wire_bytes': 0}, "new": {"urls": 0, "wire_bytes": 0},
'revisit': {'urls': 0, 'wire_bytes': 0}, "revisit": {"urls": 0, "wire_bytes": 0},
'bucket': 'test_limits_bucket' "bucket": "test_limits_bucket",
} }
}, },
'reached-limit': {'test_limits_bucket/total/urls': 0} "reached-limit": {"test_limits_bucket/total/urls": 0},
} }
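
WARCPROX_META_420 above is the JSON blob the test server attaches to a synthetic HTTP 420 response via the Warcprox-Meta header. A minimal sketch of reading that header back on the client side, assuming a hypothetical port (the real tests take the port from the httpd fixture below):

import json
import urllib.error
import urllib.request

url = "http://localhost:8000/420"  # hypothetical port, for illustration only
try:
    urllib.request.urlopen(url)
except urllib.error.HTTPError as e:
    # urllib raises HTTPError for any 4xx status, including 420
    assert e.code == 420
    meta = json.loads(e.headers["Warcprox-Meta"])
    print(meta["reached-limit"])  # e.g. {'test_limits_bucket/total/urls': 0}
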
@pytest.fixture(scope='module')
@pytest.fixture(scope="module")
def httpd(request): def httpd(request):
class RequestHandler(http.server.SimpleHTTPRequestHandler): class RequestHandler(http.server.SimpleHTTPRequestHandler):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
self.extensions_map['.mpd'] = 'video/vnd.mpeg.dash.mpd' self.extensions_map[".mpd"] = "video/vnd.mpeg.dash.mpd"
http.server.SimpleHTTPRequestHandler.__init__(self, *args, **kwargs) http.server.SimpleHTTPRequestHandler.__init__(self, *args, **kwargs)
def do_GET(self): def do_GET(self):
if self.path == '/420': if self.path == "/420":
self.send_response(420, 'Reached limit') self.send_response(420, "Reached limit")
self.send_header('Connection', 'close') self.send_header("Connection", "close")
self.send_header('Warcprox-Meta', json.dumps(WARCPROX_META_420)) self.send_header("Warcprox-Meta", json.dumps(WARCPROX_META_420))
payload = b'request rejected by warcprox: reached limit test_limits_bucket/total/urls=0\n' payload = b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=0\n"
self.send_header('Content-Type', 'text/plain;charset=utf-8') self.send_header("Content-Type", "text/plain;charset=utf-8")
self.send_header('Content-Length', len(payload)) self.send_header("Content-Length", len(payload))
self.end_headers() self.end_headers()
self.wfile.write(payload) self.wfile.write(payload)
elif self.path == '/401': elif self.path == "/401":
self.send_response(401) self.send_response(401)
self.send_header('WWW-Authenticate', 'Basic realm=\"Test\"') self.send_header("WWW-Authenticate", 'Basic realm="Test"')
self.send_header('Content-type', 'text/html') self.send_header("Content-type", "text/html")
self.end_headers() self.end_headers()
self.wfile.write(self.headers.get('Authorization', b'')) self.wfile.write(self.headers.get("Authorization", b""))
self.wfile.write(b'not authenticated') self.wfile.write(b"not authenticated")
else: else:
super().do_GET() super().do_GET()
def do_POST(self): def do_POST(self):
if self.path == '/login-action': if self.path == "/login-action":
self.send_response(200) self.send_response(200)
payload = b'login successful\n' payload = b"login successful\n"
self.send_header('Content-Type', 'text/plain;charset=utf-8') self.send_header("Content-Type", "text/plain;charset=utf-8")
self.send_header('Content-Length', len(payload)) self.send_header("Content-Length", len(payload))
self.end_headers() self.end_headers()
self.wfile.write(payload) self.wfile.write(payload)
else: else:
super().do_POST() super().do_POST()
# SimpleHTTPRequestHandler always uses CWD so we have to chdir # SimpleHTTPRequestHandler always uses CWD so we have to chdir
os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs')) os.chdir(os.path.join(os.path.dirname(__file__), "htdocs"))
httpd = http.server.HTTPServer(('localhost', 0), RequestHandler) httpd = http.server.HTTPServer(("localhost", 0), RequestHandler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever) httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
httpd_thread.start() httpd_thread.start()
def fin(): def fin():
httpd.shutdown() httpd.shutdown()
httpd.server_close() httpd.server_close()
httpd_thread.join() httpd_thread.join()
request.addfinalizer(fin) request.addfinalizer(fin)
return httpd return httpd
def test_httpd(httpd): def test_httpd(httpd):
''' """
Tests that our http server is working as expected, and that two fetches Tests that our http server is working as expected, and that two fetches
of the same url return the same payload, proving it can be used to test of the same url return the same payload, proving it can be used to test
deduplication. deduplication.
''' """
payload1 = content2 = None payload1 = content2 = None
url = 'http://localhost:%s/site1/file1.txt' % httpd.server_port url = "http://localhost:%s/site1/file1.txt" % httpd.server_port
with urllib.request.urlopen(url) as response: with urllib.request.urlopen(url) as response:
assert response.status == 200 assert response.status == 200
payload1 = response.read() payload1 = response.read()
@ -119,123 +121,136 @@ def test_httpd(httpd):
assert payload1 == payload2 assert payload1 == payload2
url = 'http://localhost:%s/420' % httpd.server_port url = "http://localhost:%s/420" % httpd.server_port
with pytest.raises(urllib.error.HTTPError) as excinfo: with pytest.raises(urllib.error.HTTPError) as excinfo:
urllib.request.urlopen(url) urllib.request.urlopen(url)
assert excinfo.value.getcode() == 420 assert excinfo.value.getcode() == 420
def test_aw_snap_hes_dead_jim(): def test_aw_snap_hes_dead_jim():
chrome_exe = brozzler.suggest_default_chrome_exe() chrome_exe = brozzler.suggest_default_chrome_exe()
with brozzler.Browser(chrome_exe=chrome_exe) as browser: with brozzler.Browser(chrome_exe=chrome_exe) as browser:
with pytest.raises(brozzler.BrowsingException): with pytest.raises(brozzler.BrowsingException):
browser.browse_page('chrome://crash') browser.browse_page("chrome://crash")
# chromium's 401 handling changed??? # chromium's 401 handling changed???
@pytest.mark.xfail @pytest.mark.xfail
def test_page_interstitial_exception(httpd): def test_page_interstitial_exception(httpd):
chrome_exe = brozzler.suggest_default_chrome_exe() chrome_exe = brozzler.suggest_default_chrome_exe()
url = 'http://localhost:%s/401' % httpd.server_port url = "http://localhost:%s/401" % httpd.server_port
with brozzler.Browser(chrome_exe=chrome_exe) as browser: with brozzler.Browser(chrome_exe=chrome_exe) as browser:
with pytest.raises(brozzler.PageInterstitialShown): with pytest.raises(brozzler.PageInterstitialShown):
browser.browse_page(url) browser.browse_page(url)
def test_on_response(httpd): def test_on_response(httpd):
response_urls = [] response_urls = []
def on_response(msg): def on_response(msg):
response_urls.append(msg['params']['response']['url']) response_urls.append(msg["params"]["response"]["url"])
chrome_exe = brozzler.suggest_default_chrome_exe() chrome_exe = brozzler.suggest_default_chrome_exe()
url = 'http://localhost:%s/site3/page.html' % httpd.server_port url = "http://localhost:%s/site3/page.html" % httpd.server_port
with brozzler.Browser(chrome_exe=chrome_exe) as browser: with brozzler.Browser(chrome_exe=chrome_exe) as browser:
browser.browse_page(url, on_response=on_response) browser.browse_page(url, on_response=on_response)
assert response_urls[0] == 'http://localhost:%s/site3/page.html' % httpd.server_port assert response_urls[0] == "http://localhost:%s/site3/page.html" % httpd.server_port
assert response_urls[1] == 'http://localhost:%s/site3/brozzler.svg' % httpd.server_port assert (
assert response_urls[2] == 'http://localhost:%s/favicon.ico' % httpd.server_port response_urls[1] == "http://localhost:%s/site3/brozzler.svg" % httpd.server_port
)
assert response_urls[2] == "http://localhost:%s/favicon.ico" % httpd.server_port
def test_420(httpd): def test_420(httpd):
chrome_exe = brozzler.suggest_default_chrome_exe() chrome_exe = brozzler.suggest_default_chrome_exe()
url = 'http://localhost:%s/420' % httpd.server_port url = "http://localhost:%s/420" % httpd.server_port
with brozzler.Browser(chrome_exe=chrome_exe) as browser: with brozzler.Browser(chrome_exe=chrome_exe) as browser:
with pytest.raises(brozzler.ReachedLimit) as excinfo: with pytest.raises(brozzler.ReachedLimit) as excinfo:
browser.browse_page(url) browser.browse_page(url)
assert excinfo.value.warcprox_meta == WARCPROX_META_420 assert excinfo.value.warcprox_meta == WARCPROX_META_420
def test_js_dialogs(httpd): def test_js_dialogs(httpd):
chrome_exe = brozzler.suggest_default_chrome_exe() chrome_exe = brozzler.suggest_default_chrome_exe()
url = 'http://localhost:%s/site4/alert.html' % httpd.server_port url = "http://localhost:%s/site4/alert.html" % httpd.server_port
with brozzler.Browser(chrome_exe=chrome_exe) as browser: with brozzler.Browser(chrome_exe=chrome_exe) as browser:
# before commit d2ed6b97a24 these would hang and eventually raise # before commit d2ed6b97a24 these would hang and eventually raise
# brozzler.browser.BrowsingTimeout, which would cause this test to fail # brozzler.browser.BrowsingTimeout, which would cause this test to fail
browser.browse_page("http://localhost:%s/site4/alert.html" % httpd.server_port)
browser.browse_page( browser.browse_page(
'http://localhost:%s/site4/alert.html' % httpd.server_port) "http://localhost:%s/site4/confirm.html" % httpd.server_port
browser.browse_page( )
'http://localhost:%s/site4/confirm.html' % httpd.server_port) browser.browse_page("http://localhost:%s/site4/prompt.html" % httpd.server_port)
browser.browse_page(
'http://localhost:%s/site4/prompt.html' % httpd.server_port)
# XXX print dialog unresolved # XXX print dialog unresolved
# browser.browse_page( # browser.browse_page(
# 'http://localhost:%s/site4/print.html' % httpd.server_port) # 'http://localhost:%s/site4/print.html' % httpd.server_port)
def test_page_videos(httpd): def test_page_videos(httpd):
# test depends on behavior of youtube-dl and chromium, could fail and need # test depends on behavior of youtube-dl and chromium, could fail and need
# to be adjusted on youtube-dl or chromium updates # to be adjusted on youtube-dl or chromium updates
chrome_exe = brozzler.suggest_default_chrome_exe() chrome_exe = brozzler.suggest_default_chrome_exe()
worker = brozzler.BrozzlerWorker(None) worker = brozzler.BrozzlerWorker(None)
site = brozzler.Site(None, {}) site = brozzler.Site(None, {})
page = brozzler.Page(None, { page = brozzler.Page(
'url':'http://localhost:%s/site6/' % httpd.server_port}) None, {"url": "http://localhost:%s/site6/" % httpd.server_port}
)
with brozzler.Browser(chrome_exe=chrome_exe) as browser: with brozzler.Browser(chrome_exe=chrome_exe) as browser:
worker.brozzle_page(browser, site, page) worker.brozzle_page(browser, site, page)
assert page.videos assert page.videos
assert len(page.videos) == 4 assert len(page.videos) == 4
assert page.videos[0] == { assert page.videos[0] == {
'blame': 'youtube-dl', "blame": "youtube-dl",
'response_code': 200, "response_code": 200,
'content-length': 383631, "content-length": 383631,
'content-type': 'video/mp4', "content-type": "video/mp4",
'url': 'http://localhost:%s/site6/small.mp4' % httpd.server_port, "url": "http://localhost:%s/site6/small.mp4" % httpd.server_port,
} }
assert page.videos[1] == { assert page.videos[1] == {
'blame': 'youtube-dl', "blame": "youtube-dl",
'content-length': 92728, "content-length": 92728,
'content-type': 'video/webm', "content-type": "video/webm",
'response_code': 200, "response_code": 200,
'url': 'http://localhost:%s/site6/small-video_280x160_100k.webm' % httpd.server_port "url": "http://localhost:%s/site6/small-video_280x160_100k.webm"
% httpd.server_port,
} }
assert page.videos[2] == { assert page.videos[2] == {
'blame': 'youtube-dl', "blame": "youtube-dl",
'content-length': 101114, "content-length": 101114,
'content-type': 'video/webm', "content-type": "video/webm",
'response_code': 200, "response_code": 200,
'url': 'http://localhost:%s/site6/small-audio.webm' % httpd.server_port "url": "http://localhost:%s/site6/small-audio.webm" % httpd.server_port,
} }
assert page.videos[3] == { assert page.videos[3] == {
'blame': 'browser', "blame": "browser",
# 'response_code': 206, # 'response_code': 206,
# 'content-range': 'bytes 0-229454/229455', # 'content-range': 'bytes 0-229454/229455',
'response_code': 200, "response_code": 200,
'content-length': 229455, "content-length": 229455,
'content-type': 'video/webm', "content-type": "video/webm",
'url': 'http://localhost:%s/site6/small.webm' % httpd.server_port, "url": "http://localhost:%s/site6/small.webm" % httpd.server_port,
} }
def test_extract_outlinks(httpd): def test_extract_outlinks(httpd):
chrome_exe = brozzler.suggest_default_chrome_exe() chrome_exe = brozzler.suggest_default_chrome_exe()
worker = brozzler.BrozzlerWorker(None) worker = brozzler.BrozzlerWorker(None)
site = brozzler.Site(None, {}) site = brozzler.Site(None, {})
page = brozzler.Page(None, { page = brozzler.Page(
'url':'http://localhost:%s/site8/' % httpd.server_port}) None, {"url": "http://localhost:%s/site8/" % httpd.server_port}
)
with brozzler.Browser(chrome_exe=chrome_exe) as browser: with brozzler.Browser(chrome_exe=chrome_exe) as browser:
outlinks = worker.brozzle_page(browser, site, page) outlinks = worker.brozzle_page(browser, site, page)
assert outlinks == { assert outlinks == {
'http://example.com/offsite', "http://example.com/offsite",
'http://localhost:%s/site8/baz/zuh' % httpd.server_port, "http://localhost:%s/site8/baz/zuh" % httpd.server_port,
'http://localhost:%s/site8/fdjisapofdjisap#1' % httpd.server_port, "http://localhost:%s/site8/fdjisapofdjisap#1" % httpd.server_port,
'http://localhost:%s/site8/fdjisapofdjisap#2' % httpd.server_port "http://localhost:%s/site8/fdjisapofdjisap#2" % httpd.server_port,
} }
def test_proxy_down(): def test_proxy_down():
''' """
Test that browsing raises `brozzler.ProxyError` when proxy is down. Test that browsing raises `brozzler.ProxyError` when proxy is down.
See also `test_proxy_down` in test_units.py. See also `test_proxy_down` in test_units.py.
@ -243,40 +258,41 @@ def test_proxy_down():
Tests two different kinds of connection error: Tests two different kinds of connection error:
- nothing listening on the port (nobody listens on port 4 :)) - nothing listening on the port (nobody listens on port 4 :))
- port bound but not accepting connections - port bound but not accepting connections
''' """
sock = socket.socket() sock = socket.socket()
sock.bind(('127.0.0.1', 0)) sock.bind(("127.0.0.1", 0))
for not_listening_proxy in ( for not_listening_proxy in ("127.0.0.1:4", "127.0.0.1:%s" % sock.getsockname()[1]):
'127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]): site = brozzler.Site(None, {"seed": "http://example.com/"})
site = brozzler.Site(None, {'seed':'http://example.com/'}) page = brozzler.Page(None, {"url": "http://example.com/"})
page = brozzler.Page(None, {'url': 'http://example.com/'})
worker = brozzler.BrozzlerWorker( worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
frontier=None, proxy=not_listening_proxy)
chrome_exe = brozzler.suggest_default_chrome_exe() chrome_exe = brozzler.suggest_default_chrome_exe()
with brozzler.Browser(chrome_exe=chrome_exe) as browser: with brozzler.Browser(chrome_exe=chrome_exe) as browser:
with pytest.raises(brozzler.ProxyError): with pytest.raises(brozzler.ProxyError):
worker.brozzle_page(browser, site, page) worker.brozzle_page(browser, site, page)
def test_try_login(httpd): def test_try_login(httpd):
"""Test try_login behavior. """Test try_login behavior."""
"""
response_urls = [] response_urls = []
def on_response(msg): def on_response(msg):
response_urls.append(msg['params']['response']['url']) response_urls.append(msg["params"]["response"]["url"])
chrome_exe = brozzler.suggest_default_chrome_exe() chrome_exe = brozzler.suggest_default_chrome_exe()
form_url = 'http://localhost:%s/site11/form1.html' % httpd.server_port form_url = "http://localhost:%s/site11/form1.html" % httpd.server_port
form_url_other = 'http://localhost:%s/site11/form2.html' % httpd.server_port form_url_other = "http://localhost:%s/site11/form2.html" % httpd.server_port
favicon_url = 'http://localhost:%s/favicon.ico' % httpd.server_port favicon_url = "http://localhost:%s/favicon.ico" % httpd.server_port
login_url = 'http://localhost:%s/login-action' % httpd.server_port login_url = "http://localhost:%s/login-action" % httpd.server_port
# When username and password are defined and initial page has login form, # When username and password are defined and initial page has login form,
# detect login form, submit login, and then return to the initial page. # detect login form, submit login, and then return to the initial page.
username = 'user1' username = "user1"
password = 'pass1' password = "pass1"
with brozzler.Browser(chrome_exe=chrome_exe) as browser: with brozzler.Browser(chrome_exe=chrome_exe) as browser:
browser.browse_page(form_url, username=username, password=password, browser.browse_page(
on_response=on_response) form_url, username=username, password=password, on_response=on_response
)
assert len(response_urls) == 4 assert len(response_urls) == 4
assert response_urls[0] == form_url assert response_urls[0] == form_url
assert response_urls[1] == favicon_url assert response_urls[1] == favicon_url
@ -285,11 +301,15 @@ def test_try_login(httpd):
# We are now supporting a different type of form, we'll test that here. # We are now supporting a different type of form, we'll test that here.
response_urls = [] response_urls = []
username = 'user1' username = "user1"
password = 'pass1' password = "pass1"
with brozzler.Browser(chrome_exe=chrome_exe) as browser: with brozzler.Browser(chrome_exe=chrome_exe) as browser:
browser.browse_page(form_url_other, username=username, password=password, browser.browse_page(
on_response=on_response) form_url_other,
username=username,
password=password,
on_response=on_response,
)
assert len(response_urls) == 4 assert len(response_urls) == 4
assert response_urls[0] == form_url_other assert response_urls[0] == form_url_other
assert response_urls[1] == favicon_url assert response_urls[1] == favicon_url
@ -306,10 +326,16 @@ def test_try_login(httpd):
# when the page doesn't have a form with username/password, don't submit it # when the page doesn't have a form with username/password, don't submit it
response_urls = [] response_urls = []
form_without_login_url = 'http://localhost:%s/site11/form-no-login.html' % httpd.server_port form_without_login_url = (
"http://localhost:%s/site11/form-no-login.html" % httpd.server_port
)
with brozzler.Browser(chrome_exe=chrome_exe) as browser: with brozzler.Browser(chrome_exe=chrome_exe) as browser:
browser.browse_page(form_without_login_url, username=username, browser.browse_page(
password=password, on_response=on_response) form_without_login_url,
username=username,
password=password,
on_response=on_response,
)
assert len(response_urls) == 2 assert len(response_urls) == 2
assert response_urls[0] == form_without_login_url assert response_urls[0] == form_without_login_url
assert response_urls[1] == favicon_url assert response_urls[1] == favicon_url


@ -1,5 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
''' """
test_cli.py - test brozzler commands test_cli.py - test brozzler commands
Copyright (C) 2017 Internet Archive Copyright (C) 2017 Internet Archive
@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
''' """
import brozzler.cli import brozzler.cli
import pkg_resources import pkg_resources
@ -23,59 +23,62 @@ import pytest
import subprocess import subprocess
import doublethink import doublethink
def cli_commands(): def cli_commands():
commands = set(pkg_resources.get_entry_map( commands = set(pkg_resources.get_entry_map("brozzler")["console_scripts"].keys())
'brozzler')['console_scripts'].keys()) commands.remove("brozzler-wayback")
commands.remove('brozzler-wayback')
try: try:
import gunicorn import gunicorn
except ImportError: except ImportError:
commands.remove('brozzler-dashboard') commands.remove("brozzler-dashboard")
try: try:
import pywb import pywb
except ImportError: except ImportError:
commands.remove('brozzler-easy') commands.remove("brozzler-easy")
return commands return commands
@pytest.mark.parametrize('cmd', cli_commands())
@pytest.mark.parametrize("cmd", cli_commands())
def test_call_entrypoint(capsys, cmd): def test_call_entrypoint(capsys, cmd):
entrypoint = pkg_resources.get_entry_map( entrypoint = pkg_resources.get_entry_map("brozzler")["console_scripts"][cmd]
'brozzler')['console_scripts'][cmd]
callable = entrypoint.resolve() callable = entrypoint.resolve()
with pytest.raises(SystemExit): with pytest.raises(SystemExit):
callable(['/whatever/bin/%s' % cmd, '--version']) callable(["/whatever/bin/%s" % cmd, "--version"])
out, err = capsys.readouterr() out, err = capsys.readouterr()
assert out == 'brozzler %s - %s\n' % (brozzler.__version__, cmd) assert out == "brozzler %s - %s\n" % (brozzler.__version__, cmd)
assert err == '' assert err == ""
@pytest.mark.parametrize('cmd', cli_commands())
@pytest.mark.parametrize("cmd", cli_commands())
def test_run_command(capsys, cmd): def test_run_command(capsys, cmd):
proc = subprocess.Popen( proc = subprocess.Popen(
[cmd, '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) [cmd, "--version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
out, err = proc.communicate() out, err = proc.communicate()
assert err == b'' assert err == b""
assert out == ('brozzler %s - %s\n' % ( assert out == ("brozzler %s - %s\n" % (brozzler.__version__, cmd)).encode("ascii")
brozzler.__version__, cmd)).encode('ascii')
def test_rethinkdb_up(): def test_rethinkdb_up():
'''Check that rethinkdb is up and running.''' """Check that rethinkdb is up and running."""
# check that rethinkdb is listening and looks sane # check that rethinkdb is listening and looks sane
rr = doublethink.Rethinker(db='rethinkdb') # built-in db rr = doublethink.Rethinker(db="rethinkdb") # built-in db
tbls = rr.table_list().run() tbls = rr.table_list().run()
assert len(tbls) > 10 assert len(tbls) > 10
# XXX don't know why this test is failing in travis-ci and vagrant while # XXX don't know why this test is failing in travis-ci and vagrant while
# test_call_entrypoint tests pass :( (also fails with capfd) # test_call_entrypoint tests pass :( (also fails with capfd)
@pytest.mark.xfail @pytest.mark.xfail
def test_stop_nonexistent_crawl(capsys): def test_stop_nonexistent_crawl(capsys):
with pytest.raises(SystemExit): with pytest.raises(SystemExit):
brozzler.cli.brozzler_stop_crawl(['brozzler-stop-crawl', '--site=123']) brozzler.cli.brozzler_stop_crawl(["brozzler-stop-crawl", "--site=123"])
out, err = capsys.readouterr() out, err = capsys.readouterr()
assert err.endswith('site not found with id=123\n') assert err.endswith("site not found with id=123\n")
assert out == '' assert out == ""
with pytest.raises(SystemExit): with pytest.raises(SystemExit):
brozzler.cli.brozzler_stop_crawl(['brozzler-stop-crawl', '--job=abc']) brozzler.cli.brozzler_stop_crawl(["brozzler-stop-crawl", "--job=abc"])
out, err = capsys.readouterr() out, err = capsys.readouterr()
assert err.endswith('''job not found with id='abc'\n''') assert err.endswith("""job not found with id='abc'\n""")
assert out == '' assert out == ""

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -1,5 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
''' """
test_units.py - some unit tests for parts of brozzler amenable to that test_units.py - some unit tests for parts of brozzler amenable to that
Copyright (C) 2016-2017 Internet Archive Copyright (C) 2016-2017 Internet Archive
@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
''' """
import pytest import pytest
import http.server import http.server
@ -37,99 +37,131 @@ import threading
from unittest import mock from unittest import mock
logging.basicConfig( logging.basicConfig(
stream=sys.stderr, level=logging.INFO, format=( stream=sys.stderr,
'%(asctime)s %(process)d %(levelname)s %(threadName)s ' level=logging.INFO,
'%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')) format=(
"%(asctime)s %(process)d %(levelname)s %(threadName)s "
"%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s"
),
)
@pytest.fixture(scope='module')
@pytest.fixture(scope="module")
def httpd(request): def httpd(request):
# SimpleHTTPRequestHandler always uses CWD so we have to chdir # SimpleHTTPRequestHandler always uses CWD so we have to chdir
os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs')) os.chdir(os.path.join(os.path.dirname(__file__), "htdocs"))
httpd = http.server.HTTPServer( httpd = http.server.HTTPServer(
('localhost', 0), http.server.SimpleHTTPRequestHandler) ("localhost", 0), http.server.SimpleHTTPRequestHandler
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever) )
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
httpd_thread.start() httpd_thread.start()
def fin(): def fin():
httpd.shutdown() httpd.shutdown()
httpd.server_close() httpd.server_close()
httpd_thread.join() httpd_thread.join()
request.addfinalizer(fin) request.addfinalizer(fin)
return httpd return httpd
def test_robots(httpd): def test_robots(httpd):
''' """
Basic test of robots.txt user-agent substring matching. Basic test of robots.txt user-agent substring matching.
''' """
url = 'http://localhost:%s/' % httpd.server_port url = "http://localhost:%s/" % httpd.server_port
site = brozzler.Site(None, {'seed':url,'user_agent':'im/a/GoOdbot/yep'}) site = brozzler.Site(None, {"seed": url, "user_agent": "im/a/GoOdbot/yep"})
assert brozzler.is_permitted_by_robots(site, url) assert brozzler.is_permitted_by_robots(site, url)
site = brozzler.Site(None, {'seed':url,'user_agent':'im/a bAdBOt/uh huh'}) site = brozzler.Site(None, {"seed": url, "user_agent": "im/a bAdBOt/uh huh"})
assert not brozzler.is_permitted_by_robots(site, url) assert not brozzler.is_permitted_by_robots(site, url)
def test_robots_http_statuses(): def test_robots_http_statuses():
for status in ( for status in (
200, 204, 400, 401, 402, 403, 404, 405, 200,
500, 501, 502, 503, 504, 505): 204,
400,
401,
402,
403,
404,
405,
500,
501,
502,
503,
504,
505,
):
class Handler(http.server.BaseHTTPRequestHandler): class Handler(http.server.BaseHTTPRequestHandler):
def do_GET(self): def do_GET(self):
response = (('HTTP/1.1 %s Meaningless message\r\n' response = (
+ 'Content-length: 0\r\n' (
+ '\r\n') % status).encode('utf-8') "HTTP/1.1 %s Meaningless message\r\n"
+ "Content-length: 0\r\n"
+ "\r\n"
)
% status
).encode("utf-8")
self.connection.sendall(response) self.connection.sendall(response)
# self.send_response(status) # self.send_response(status)
# self.end_headers() # self.end_headers()
httpd = http.server.HTTPServer(('localhost', 0), Handler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever) httpd = http.server.HTTPServer(("localhost", 0), Handler)
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
httpd_thread.start() httpd_thread.start()
try: try:
url = 'http://localhost:%s/' % httpd.server_port url = "http://localhost:%s/" % httpd.server_port
site = brozzler.Site(None, {'seed': url}) site = brozzler.Site(None, {"seed": url})
assert brozzler.is_permitted_by_robots(site, url) assert brozzler.is_permitted_by_robots(site, url)
finally: finally:
httpd.shutdown() httpd.shutdown()
httpd.server_close() httpd.server_close()
httpd_thread.join() httpd_thread.join()
def test_robots_empty_response(): def test_robots_empty_response():
class Handler(http.server.BaseHTTPRequestHandler): class Handler(http.server.BaseHTTPRequestHandler):
def do_GET(self): def do_GET(self):
self.connection.shutdown(socket.SHUT_RDWR) self.connection.shutdown(socket.SHUT_RDWR)
self.connection.close() self.connection.close()
httpd = http.server.HTTPServer(('localhost', 0), Handler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever) httpd = http.server.HTTPServer(("localhost", 0), Handler)
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
httpd_thread.start() httpd_thread.start()
try: try:
url = 'http://localhost:%s/' % httpd.server_port url = "http://localhost:%s/" % httpd.server_port
site = brozzler.Site(None, {'seed': url}) site = brozzler.Site(None, {"seed": url})
assert brozzler.is_permitted_by_robots(site, url) assert brozzler.is_permitted_by_robots(site, url)
finally: finally:
httpd.shutdown() httpd.shutdown()
httpd.server_close() httpd.server_close()
httpd_thread.join() httpd_thread.join()
def test_robots_socket_timeout(): def test_robots_socket_timeout():
stop_hanging = threading.Event() stop_hanging = threading.Event()
class Handler(http.server.BaseHTTPRequestHandler): class Handler(http.server.BaseHTTPRequestHandler):
def do_GET(self): def do_GET(self):
stop_hanging.wait(60) stop_hanging.wait(60)
self.connection.sendall( self.connection.sendall(b"HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n")
b'HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n')
orig_timeout = brozzler.robots._SessionRaiseOn420.timeout orig_timeout = brozzler.robots._SessionRaiseOn420.timeout
httpd = http.server.HTTPServer(('localhost', 0), Handler) httpd = http.server.HTTPServer(("localhost", 0), Handler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever) httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
httpd_thread.start() httpd_thread.start()
try: try:
url = 'http://localhost:%s/' % httpd.server_port url = "http://localhost:%s/" % httpd.server_port
site = brozzler.Site(None, {'seed': url}) site = brozzler.Site(None, {"seed": url})
brozzler.robots._SessionRaiseOn420.timeout = 2 brozzler.robots._SessionRaiseOn420.timeout = 2
assert brozzler.is_permitted_by_robots(site, url) assert brozzler.is_permitted_by_robots(site, url)
finally: finally:
@ -139,20 +171,24 @@ def test_robots_socket_timeout():
httpd.server_close() httpd.server_close()
httpd_thread.join() httpd_thread.join()
def test_robots_dns_failure(): def test_robots_dns_failure():
# .invalid. is guaranteed nonexistent per rfc 6761 # .invalid. is guaranteed nonexistent per rfc 6761
url = 'http://whatever.invalid./' url = "http://whatever.invalid./"
site = brozzler.Site(None, {'seed': url}) site = brozzler.Site(None, {"seed": url})
assert brozzler.is_permitted_by_robots(site, url) assert brozzler.is_permitted_by_robots(site, url)
def test_robots_connection_failure(): def test_robots_connection_failure():
# .invalid. is guaranteed nonexistent per rfc 6761 # .invalid. is guaranteed nonexistent per rfc 6761
url = 'http://localhost:4/' # nobody listens on port 4 url = "http://localhost:4/" # nobody listens on port 4
site = brozzler.Site(None, {'seed': url}) site = brozzler.Site(None, {"seed": url})
assert brozzler.is_permitted_by_robots(site, url) assert brozzler.is_permitted_by_robots(site, url)
def test_scoping(): def test_scoping():
test_scope = yaml.safe_load(''' test_scope = yaml.safe_load(
"""
max_hops: 100 max_hops: 100
accepts: accepts:
- url_match: REGEX_MATCH - url_match: REGEX_MATCH
@ -169,40 +205,73 @@ blocks:
- domain: twitter.com - domain: twitter.com
url_match: REGEX_MATCH url_match: REGEX_MATCH
value: ^.*lang=(?!en).*$ value: ^.*lang=(?!en).*$
''') """
)
site = brozzler.Site(None, { site = brozzler.Site(
'id': 1, 'seed': 'http://example.com/foo/bar?baz=quux#monkey', None,
'scope': test_scope}) {
page = brozzler.Page(None, { "id": 1,
'url': 'http://example.com/foo/bar?baz=quux#monkey', "seed": "http://example.com/foo/bar?baz=quux#monkey",
'site_id': site.id}) "scope": test_scope,
},
)
page = brozzler.Page(
None, {"url": "http://example.com/foo/bar?baz=quux#monkey", "site_id": site.id}
)
assert site.accept_reject_or_neither('http://example.com/foo/bar', page) is True assert site.accept_reject_or_neither("http://example.com/foo/bar", page) is True
assert site.accept_reject_or_neither('http://example.com/foo/baz', page) is None assert site.accept_reject_or_neither("http://example.com/foo/baz", page) is None
assert site.accept_reject_or_neither('http://foo.com/some.mp3', page) is None assert site.accept_reject_or_neither("http://foo.com/some.mp3", page) is None
assert site.accept_reject_or_neither('http://foo.com/blah/audio_file/some.mp3', page) is True assert (
site.accept_reject_or_neither("http://foo.com/blah/audio_file/some.mp3", page)
is True
)
assert site.accept_reject_or_neither('http://a.b.vimeocdn.com/blahblah', page) is True assert (
assert site.accept_reject_or_neither('https://a.b.vimeocdn.com/blahblah', page) is None site.accept_reject_or_neither("http://a.b.vimeocdn.com/blahblah", page) is True
)
assert (
site.accept_reject_or_neither("https://a.b.vimeocdn.com/blahblah", page) is None
)
assert site.accept_reject_or_neither('https://twitter.com/twit', page) is True assert site.accept_reject_or_neither("https://twitter.com/twit", page) is True
assert site.accept_reject_or_neither('https://twitter.com/twit?lang=en', page) is True assert (
assert site.accept_reject_or_neither('https://twitter.com/twit?lang=es', page) is False site.accept_reject_or_neither("https://twitter.com/twit?lang=en", page) is True
)
assert (
site.accept_reject_or_neither("https://twitter.com/twit?lang=es", page) is False
)
assert site.accept_reject_or_neither('https://www.facebook.com/whatevz', page) is True assert (
site.accept_reject_or_neither("https://www.facebook.com/whatevz", page) is True
)
assert (
site.accept_reject_or_neither(
"https://www.youtube.com/watch?v=dUIn5OAPS5s", page
)
is None
)
yt_user_page = brozzler.Page(
None,
{
"url": "https://www.youtube.com/user/SonoraSantaneraVEVO",
"site_id": site.id,
"hops_from_seed": 10,
},
)
assert (
site.accept_reject_or_neither(
"https://www.youtube.com/watch?v=dUIn5OAPS5s", yt_user_page
)
is True
)
assert site.accept_reject_or_neither(
'https://www.youtube.com/watch?v=dUIn5OAPS5s', page) is None
yt_user_page = brozzler.Page(None, {
'url': 'https://www.youtube.com/user/SonoraSantaneraVEVO',
'site_id': site.id, 'hops_from_seed': 10})
assert site.accept_reject_or_neither(
'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page) is True
def test_proxy_down(): def test_proxy_down():
''' """
Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down. Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.
This test needs to cover every possible fetch through the proxy other than This test needs to cover every possible fetch through the proxy other than
@ -211,24 +280,24 @@ def test_proxy_down():
Tests two different kinds of connection error: Tests two different kinds of connection error:
- nothing listening on the port (nobody listens on port 4 :)) - nothing listening on the port (nobody listens on port 4 :))
- port bound but not accepting connections - port bound but not accepting connections
''' """
sock = socket.socket() sock = socket.socket()
sock.bind(('127.0.0.1', 0)) sock.bind(("127.0.0.1", 0))
for not_listening_proxy in ( for not_listening_proxy in ("127.0.0.1:4", "127.0.0.1:%s" % sock.getsockname()[1]):
'127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]): worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
worker = brozzler.BrozzlerWorker( site = brozzler.Site(
frontier=None, proxy=not_listening_proxy) None, {"id": str(uuid.uuid4()), "seed": "http://example.com/"}
site = brozzler.Site(None, { )
'id': str(uuid.uuid4()), 'seed': 'http://example.com/'}) page = brozzler.Page(None, {"url": "http://example.com/"})
page = brozzler.Page(None, {'url': 'http://example.com/'})
# robots.txt fetch # robots.txt fetch
with pytest.raises(brozzler.ProxyError): with pytest.raises(brozzler.ProxyError):
brozzler.is_permitted_by_robots( brozzler.is_permitted_by_robots(
site, 'http://example.com/', proxy=not_listening_proxy) site, "http://example.com/", proxy=not_listening_proxy
)
# youtube-dl fetch # youtube-dl fetch
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir: with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
with pytest.raises(brozzler.ProxyError): with pytest.raises(brozzler.ProxyError):
brozzler.ydl.do_youtube_dl(worker, site, page) brozzler.ydl.do_youtube_dl(worker, site, page)
@ -240,46 +309,57 @@ def test_proxy_down():
with pytest.raises(brozzler.ProxyError): with pytest.raises(brozzler.ProxyError):
worker._warcprox_write_record( worker._warcprox_write_record(
warcprox_address=not_listening_proxy, warcprox_address=not_listening_proxy,
url='test://proxy_down/warcprox_write_record', url="test://proxy_down/warcprox_write_record",
warc_type='metadata', warc_type="metadata",
content_type='text/plain', content_type="text/plain",
payload=b'''payload doesn't matter here''') payload=b"""payload doesn't matter here""",
)
def test_start_stop_backwards_compat(): def test_start_stop_backwards_compat():
site = brozzler.Site(None, {'seed': 'http://example.com/'}) site = brozzler.Site(None, {"seed": "http://example.com/"})
assert len(site.starts_and_stops) == 1 assert len(site.starts_and_stops) == 1
assert site.starts_and_stops[0]['start'] assert site.starts_and_stops[0]["start"]
assert site.starts_and_stops[0]['stop'] is None assert site.starts_and_stops[0]["stop"] is None
assert not 'start_time' in site assert not "start_time" in site
site = brozzler.Site(None, { site = brozzler.Site(
'seed': 'http://example.com/', None,
'start_time': datetime.datetime(2017,1,1)}) {"seed": "http://example.com/", "start_time": datetime.datetime(2017, 1, 1)},
)
assert len(site.starts_and_stops) == 1 assert len(site.starts_and_stops) == 1
assert site.starts_and_stops[0]['start'] == datetime.datetime(2017, 1, 1) assert site.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
assert site.starts_and_stops[0]['stop'] is None assert site.starts_and_stops[0]["stop"] is None
assert not 'start_time' in site assert not "start_time" in site
job = brozzler.Job(None, {'seeds': [{'url':'https://example.com/'}]}) job = brozzler.Job(None, {"seeds": [{"url": "https://example.com/"}]})
assert job.starts_and_stops[0]['start'] assert job.starts_and_stops[0]["start"]
assert job.starts_and_stops[0]['stop'] is None assert job.starts_and_stops[0]["stop"] is None
assert not 'started' in job assert not "started" in job
assert not 'finished' in job assert not "finished" in job
job = brozzler.Job(
None,
{
"seeds": [{"url": "https://example.com/"}],
"started": datetime.datetime(2017, 1, 1),
"finished": datetime.datetime(2017, 1, 2),
},
)
assert job.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
assert job.starts_and_stops[0]["stop"] == datetime.datetime(2017, 1, 2)
assert not "started" in job
assert not "finished" in job
job = brozzler.Job(None, {
'seeds': [{'url':'https://example.com/'}],
'started': datetime.datetime(2017, 1, 1),
'finished': datetime.datetime(2017, 1, 2)})
assert job.starts_and_stops[0]['start'] == datetime.datetime(2017, 1, 1)
assert job.starts_and_stops[0]['stop'] == datetime.datetime(2017, 1, 2)
assert not 'started' in job
assert not 'finished' in job
class Exception1(Exception): class Exception1(Exception):
pass pass
class Exception2(Exception): class Exception2(Exception):
pass pass
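
The tests below drive brozzler's cross-thread exception mechanism, but the hunk boundaries cut out most of the helper-thread bodies. A minimal usage sketch, assuming the brozzler.thread_accept_exceptions() context manager and brozzler.thread_raise(thread, exc_type) helper behave as these tests exercise them, and that brozzler.sleep() is the interruptible sleep they use:

import threading
import time
import brozzler

caught = []

def worker():
    try:
        # exceptions injected via thread_raise are only delivered inside this block
        with brozzler.thread_accept_exceptions():
            brozzler.sleep(30)
    except Exception as e:
        caught.append(e)

th = threading.Thread(target=worker)
th.start()
time.sleep(0.5)  # crude way to let the worker enter the accepting block
brozzler.thread_raise(th, Exception1)  # Exception1 as defined just above
th.join()
assert caught and isinstance(caught[0], Exception1)
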
def test_thread_raise_not_accept(): def test_thread_raise_not_accept():
def never_accept(): def never_accept():
try: try:
@ -297,6 +377,7 @@ def test_thread_raise_not_accept():
th.join() th.join()
assert thread_caught_exception is None assert thread_caught_exception is None
def test_thread_raise_immediate(): def test_thread_raise_immediate():
def accept_immediately(): def accept_immediately():
try: try:
@ -317,13 +398,17 @@ def test_thread_raise_immediate():
assert isinstance(thread_caught_exception, Exception1) assert isinstance(thread_caught_exception, Exception1)
assert time.time() - start < 1.0 assert time.time() - start < 1.0
def test_thread_raise_safe_exit(): def test_thread_raise_safe_exit():
def delay_context_exit(): def delay_context_exit():
gate = brozzler.thread_accept_exceptions() gate = brozzler.thread_accept_exceptions()
orig_exit = type(gate).__exit__ orig_exit = type(gate).__exit__
try: try:
type(gate).__exit__ = lambda self, et, ev, t: ( type(gate).__exit__ = lambda self, et, ev, t: (
brozzler.sleep(2), orig_exit(self, et, ev, t), False)[-1] brozzler.sleep(2),
orig_exit(self, et, ev, t),
False,
)[-1]
with brozzler.thread_accept_exceptions() as gate: with brozzler.thread_accept_exceptions() as gate:
brozzler.sleep(2) brozzler.sleep(2)
except Exception as e: except Exception as e:
@ -345,6 +430,7 @@ def test_thread_raise_safe_exit():
assert thread_caught_exception assert thread_caught_exception
assert isinstance(thread_caught_exception, Exception1) assert isinstance(thread_caught_exception, Exception1)
def test_thread_raise_pending_exception(): def test_thread_raise_pending_exception():
def accept_eventually(): def accept_eventually():
try: try:
@ -365,6 +451,7 @@ def test_thread_raise_pending_exception():
assert isinstance(thread_caught_exception, Exception1) assert isinstance(thread_caught_exception, Exception1)
assert time.time() - start > 1.0 assert time.time() - start > 1.0
def test_thread_raise_second_with_block(): def test_thread_raise_second_with_block():
def two_with_blocks(): def two_with_blocks():
try: try:
@ -393,52 +480,79 @@ def test_thread_raise_second_with_block():
th.join() th.join()
assert isinstance(thread_caught_exception, Exception2) assert isinstance(thread_caught_exception, Exception2)
def test_needs_browsing(): def test_needs_browsing():
# only one test case here right now, which exposed a bug # only one test case here right now, which exposed a bug
class ConvenientHeaders(http.client.HTTPMessage): class ConvenientHeaders(http.client.HTTPMessage):
def __init__(self, headers): def __init__(self, headers):
http.client.HTTPMessage.__init__(self) http.client.HTTPMessage.__init__(self)
for (k, v) in headers.items(): for k, v in headers.items():
self.add_header(k, v) self.add_header(k, v)
page = brozzler.Page(None, { page = brozzler.Page(None, {"url": "http://example.com/a"})
'url':'http://example.com/a'})
spy = brozzler.ydl.YoutubeDLSpy() spy = brozzler.ydl.YoutubeDLSpy()
spy.fetches.append({ spy.fetches.append(
'url': 'http://example.com/a', {
'method': 'HEAD', "url": "http://example.com/a",
'response_code': 301, "method": "HEAD",
'response_headers': ConvenientHeaders({'Location': '/b'})}) "response_code": 301,
spy.fetches.append({ "response_headers": ConvenientHeaders({"Location": "/b"}),
'url': 'http://example.com/b', }
'method': 'GET', )
'response_code': 200, spy.fetches.append(
'response_headers': ConvenientHeaders({ {
'Content-Type': 'application/pdf'})}) "url": "http://example.com/b",
"method": "GET",
"response_code": 200,
"response_headers": ConvenientHeaders({"Content-Type": "application/pdf"}),
}
)
assert not brozzler.worker.BrozzlerWorker._needs_browsing(None, page, spy.fetches)
assert not brozzler.worker.BrozzlerWorker._needs_browsing(
None, page, spy.fetches)
def test_seed_redirect(): def test_seed_redirect():
site = brozzler.Site(None, {'seed': 'http://foo.com/'}) site = brozzler.Site(None, {"seed": "http://foo.com/"})
site.note_seed_redirect('https://foo.com/a/b/c') site.note_seed_redirect("https://foo.com/a/b/c")
assert site.scope == {'accepts': [ assert site.scope == {
{'ssurt': 'com,foo,//http:/',}, "accepts": [
{'ssurt': 'com,foo,//https:/',}]} {
"ssurt": "com,foo,//http:/",
},
{
"ssurt": "com,foo,//https:/",
},
]
}
site = brozzler.Site(None, {'seed': 'https://foo.com/'}) site = brozzler.Site(None, {"seed": "https://foo.com/"})
site.note_seed_redirect('http://foo.com/a/b/c') site.note_seed_redirect("http://foo.com/a/b/c")
assert site.scope == {'accepts': [ assert site.scope == {
{'ssurt': 'com,foo,//https:/',}, "accepts": [
{'ssurt': 'com,foo,//http:/',}]} {
"ssurt": "com,foo,//https:/",
},
{
"ssurt": "com,foo,//http:/",
},
]
}
site = brozzler.Site(None, {"seed": "http://foo.com/"})
site.note_seed_redirect("https://bar.com/a/b/c")
assert site.scope == {
"accepts": [
{
"ssurt": "com,foo,//http:/",
},
{
"ssurt": "com,bar,//https:/a/b/c",
},
]
}
site = brozzler.Site(None, {'seed': 'http://foo.com/'})
site.note_seed_redirect('https://bar.com/a/b/c')
assert site.scope == {'accepts': [
{'ssurt': 'com,foo,//http:/',},
{'ssurt': 'com,bar,//https:/a/b/c',}]}
def test_limit_failures(): def test_limit_failures():
page = mock.Mock() page = mock.Mock()
@ -446,9 +560,9 @@ def test_limit_failures():
page.brozzle_count = 0 page.brozzle_count = 0
site = mock.Mock() site = mock.Mock()
site.status = 'ACTIVE' site.status = "ACTIVE"
site.active_brozzling_time = 0 site.active_brozzling_time = 0
site.starts_and_stops = [{'start':datetime.datetime.utcnow()}] site.starts_and_stops = [{"start": datetime.datetime.utcnow()}]
rr = mock.Mock() rr = mock.Mock()
rr.servers = [mock.Mock()] rr.servers = [mock.Mock()]
@ -458,9 +572,10 @@ def test_limit_failures():
rr.table = mock.Mock( rr.table = mock.Mock(
return_value=mock.Mock( return_value=mock.Mock(
between=mock.Mock( between=mock.Mock(
return_value=mock.Mock( return_value=mock.Mock(limit=mock.Mock(return_value=rethink_query))
limit=mock.Mock( )
return_value=rethink_query))))) )
)
assert rr.table().between().limit().run() == [] assert rr.table().between().limit().run() == []
frontier = brozzler.RethinkDbFrontier(rr) frontier = brozzler.RethinkDbFrontier(rr)
frontier.enforce_time_limit = mock.Mock() frontier.enforce_time_limit = mock.Mock()
@ -475,20 +590,19 @@ def test_limit_failures():
assert page.failed_attempts is None assert page.failed_attempts is None
assert page.brozzle_count == 0 assert page.brozzle_count == 0
assert site.status == 'ACTIVE' assert site.status == "ACTIVE"
worker.brozzle_site(browser, site) worker.brozzle_site(browser, site)
assert page.failed_attempts == 1 assert page.failed_attempts == 1
assert page.brozzle_count == 0 assert page.brozzle_count == 0
assert site.status == 'ACTIVE' assert site.status == "ACTIVE"
worker.brozzle_site(browser, site) worker.brozzle_site(browser, site)
assert page.failed_attempts == 2 assert page.failed_attempts == 2
assert page.brozzle_count == 0 assert page.brozzle_count == 0
assert site.status == 'ACTIVE' assert site.status == "ACTIVE"
worker.brozzle_site(browser, site) worker.brozzle_site(browser, site)
assert page.failed_attempts == 3 assert page.failed_attempts == 3
assert page.brozzle_count == 1 assert page.brozzle_count == 1
assert site.status == 'FINISHED' assert site.status == "FINISHED"


@ -1,5 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
''' """
vagrant-brozzler-new-job.py - runs brozzler-new-job inside the vagrant vm to vagrant-brozzler-new-job.py - runs brozzler-new-job inside the vagrant vm to
queue a job for your vagrant brozzler deployment. queue a job for your vagrant brozzler deployment.
@ -20,30 +20,39 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
''' """
import sys import sys
import os import os
import argparse import argparse
import subprocess import subprocess
def main(argv=[]): def main(argv=[]):
arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0])) arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0]))
arg_parser.add_argument( arg_parser.add_argument(
'job_conf_file', metavar='JOB_CONF_FILE', "job_conf_file",
help='brozzler job configuration file in yaml') metavar="JOB_CONF_FILE",
help="brozzler job configuration file in yaml",
)
args = arg_parser.parse_args(args=argv[1:]) args = arg_parser.parse_args(args=argv[1:])
# cd to path with Vagrantfile so "vagrant ssh" knows what to do # cd to path with Vagrantfile so "vagrant ssh" knows what to do
os.chdir(os.path.dirname(__file__)) os.chdir(os.path.dirname(__file__))
with open(args.job_conf_file, 'rb') as f: with open(args.job_conf_file, "rb") as f:
subprocess.call([ subprocess.call(
'vagrant', 'ssh', '--', [
'f=`mktemp` && cat > $f && ' "vagrant",
'/home/vagrant/brozzler-ve3/bin/python ' "ssh",
'/home/vagrant/brozzler-ve3/bin/brozzler-new-job $f'], "--",
stdin=f) "f=`mktemp` && cat > $f && "
"/home/vagrant/brozzler-ve3/bin/python "
"/home/vagrant/brozzler-ve3/bin/brozzler-new-job $f",
],
stdin=f,
)
if __name__ == '__main__':
if __name__ == "__main__":
main(sys.argv) main(sys.argv)


@ -1,5 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
''' """
vagrant-brozzler-new-site.py - runs brozzler-new-site inside the vagrant vm to vagrant-brozzler-new-site.py - runs brozzler-new-site inside the vagrant vm to
queue a site for your vagrant brozzler deployment. queue a site for your vagrant brozzler deployment.
@ -23,61 +23,69 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
''' """
import sys import sys
import os import os
import argparse import argparse
import subprocess import subprocess
try: try:
from shlex import quote from shlex import quote
except: except:
from pipes import quote from pipes import quote
def main(argv=[]): def main(argv=[]):
arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0])) arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0]))
arg_parser.add_argument('seed', metavar='SEED', help='seed url') arg_parser.add_argument("seed", metavar="SEED", help="seed url")
arg_parser.add_argument( arg_parser.add_argument(
'--time-limit', dest='time_limit', default=None, "--time-limit",
help='time limit in seconds for this site') dest="time_limit",
default=None,
help="time limit in seconds for this site",
)
arg_parser.add_argument( arg_parser.add_argument(
'--ignore-robots', dest='ignore_robots', action='store_true', "--ignore-robots",
help='ignore robots.txt for this site') dest="ignore_robots",
action="store_true",
help="ignore robots.txt for this site",
)
arg_parser.add_argument( arg_parser.add_argument(
'--warcprox-meta', dest='warcprox_meta', "--warcprox-meta",
dest="warcprox_meta",
help=( help=(
'Warcprox-Meta http request header to send with each request; ' "Warcprox-Meta http request header to send with each request; "
'must be a json blob, ignored unless warcprox features are ' "must be a json blob, ignored unless warcprox features are "
'enabled')) "enabled"
arg_parser.add_argument( ),
'-q', '--quiet', dest='quiet', action='store_true') )
arg_parser.add_argument( arg_parser.add_argument("-q", "--quiet", dest="quiet", action="store_true")
'-v', '--verbose', dest='verbose', action='store_true') arg_parser.add_argument("-v", "--verbose", dest="verbose", action="store_true")
args = arg_parser.parse_args(args=argv[1:]) args = arg_parser.parse_args(args=argv[1:])
options = [] options = []
if args.time_limit: if args.time_limit:
options.append('--time-limit=%s' % args.time_limit) options.append("--time-limit=%s" % args.time_limit)
if args.ignore_robots: if args.ignore_robots:
options.append('--ignore-robots') options.append("--ignore-robots")
if args.warcprox_meta: if args.warcprox_meta:
# I think this shell escaping is correct? # I think this shell escaping is correct?
options.append( options.append("--warcprox-meta=%s" % quote(args.warcprox_meta))
'--warcprox-meta=%s' % quote(args.warcprox_meta))
if args.quiet: if args.quiet:
options.append('--quiet') options.append("--quiet")
if args.verbose: if args.verbose:
options.append('--verbose') options.append("--verbose")
# cd to path with Vagrantfile so "vagrant ssh" knows what to do # cd to path with Vagrantfile so "vagrant ssh" knows what to do
os.chdir(os.path.dirname(__file__)) os.chdir(os.path.dirname(__file__))
cmd = ( cmd = (
'/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site ' "/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site " "%s %s"
'%s %s') % (' '.join(options), args.seed) ) % (" ".join(options), args.seed)
subprocess.call(['vagrant', 'ssh', '--', cmd]) subprocess.call(["vagrant", "ssh", "--", cmd])
if __name__ == '__main__':
if __name__ == "__main__":
main(sys.argv) main(sys.argv)