Merge branch 'logging' into qa

Misty De Méo 2025-02-21 09:27:44 -08:00
commit 5f1e92c23f
15 changed files with 446 additions and 397 deletions

View File

@ -18,6 +18,7 @@ limitations under the License.
"""
import logging
import structlog
from pkg_resources import get_distribution as _get_distribution
__version__ = _get_distribution("brozzler").version
@ -79,32 +80,6 @@ class ReachedLimit(Exception):
return self.__repr__()
# monkey-patch log levels TRACE and NOTICE
logging.TRACE = (logging.NOTSET + logging.DEBUG) // 2
def _logger_trace(self, msg, *args, **kwargs):
if self.isEnabledFor(logging.TRACE):
self._log(logging.TRACE, msg, args, **kwargs)
logging.Logger.trace = _logger_trace
logging.trace = logging.root.trace
logging.addLevelName(logging.TRACE, "TRACE")
logging.NOTICE = (logging.INFO + logging.WARN) // 2
def _logger_notice(self, msg, *args, **kwargs):
if self.isEnabledFor(logging.NOTICE):
self._log(logging.NOTICE, msg, args, **kwargs)
logging.Logger.notice = _logger_notice
logging.notice = logging.root.notice
logging.addLevelName(logging.NOTICE, "NOTICE")
# see https://github.com/internetarchive/brozzler/issues/91
def _logging_handler_handle(self, record):
rv = self.filter(record)
@ -146,7 +121,9 @@ def behavior_script(url, template_parameters=None, behaviors_dir=None):
"""
Returns the javascript behavior string populated with template_parameters.
"""
import re, logging, json
import re, json
logger = structlog.get_logger()
for behavior in behaviors(behaviors_dir=behaviors_dir):
if re.match(behavior["url_regex"], url):
@ -159,18 +136,18 @@ def behavior_script(url, template_parameters=None, behaviors_dir=None):
behavior["behavior_js_template"]
)
script = template.render(parameters)
logging.info(
"using template=%r populated with parameters=%r for %r",
behavior["behavior_js_template"],
json.dumps(parameters),
url,
logger.info(
"rendering template",
template=behavior["behavior_js_template"],
parameters=json.dumps(parameters),
url=url,
)
return script
return None
class ThreadExceptionGate:
logger = logging.getLogger(__module__ + "." + __qualname__)
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
def __init__(self, thread):
self.thread = thread
@ -181,7 +158,9 @@ class ThreadExceptionGate:
def __enter__(self):
assert self.thread == threading.current_thread()
if self.pending_exception:
self.logger.info("raising pending exception %s", self.pending_exception)
self.logger.info(
"raising pending exception", pending_exception=self.pending_exception
)
tmp = self.pending_exception
self.pending_exception = None
raise tmp
@ -198,10 +177,10 @@ class ThreadExceptionGate:
with self.lock:
if self.pending_exception:
self.logger.warning(
"%r already pending for thread %r, discarding %r",
self.pending_exception,
self.thread,
e,
"exception already pending for thread, discarding",
pending_exception=self.pending_exception,
thread=self.thread,
exception=e,
)
else:
self.pending_exception = e
@ -266,7 +245,9 @@ def thread_raise(thread, exctype):
TypeError if `exctype` is not a class
ValueError, SystemError in case of unexpected problems
"""
import ctypes, inspect, threading, logging
import ctypes, inspect, threading, structlog
logger = structlog.get_logger(exctype=exctype, thread=thread)
if not inspect.isclass(exctype):
raise TypeError(
@ -278,7 +259,7 @@ def thread_raise(thread, exctype):
with gate.lock:
if gate.ok_to_raise.is_set() and thread.is_alive():
gate.ok_to_raise.clear()
logging.info("raising %s in thread %s", exctype, thread)
logger.info("raising exception in thread")
res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
ctypes.c_long(thread.ident), ctypes.py_object(exctype)
)
@ -290,7 +271,7 @@ def thread_raise(thread, exctype):
ctypes.pythonapi.PyThreadState_SetAsyncExc(thread.ident, 0)
raise SystemError("PyThreadState_SetAsyncExc failed")
else:
logging.info("queueing %s for thread %s", exctype, thread)
logger.info("queueing exception for thread")
gate.queue_exception(exctype)
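
A note on the pattern shown in the hunk above, which recurs throughout this branch: thread_raise() now binds its context once at logger creation, via structlog.get_logger(exctype=exctype, thread=thread), so those key-value pairs ride along on every event instead of being %-interpolated into each message. A minimal illustrative sketch, not part of this commit (values and rendered output are approximate):

import structlog

log = structlog.get_logger(exctype="ValueError", thread="Thread-1")  # illustrative values
log.info("raising exception in thread")
# renders roughly as:
# 2025-02-21 09:27:44 [info     ] raising exception in thread    exctype=ValueError thread=Thread-1
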

View File

@ -31,6 +31,7 @@ import base64
from ipaddress import AddressValueError
from brozzler.chrome import Chrome
import socket
import structlog
import urlcanon
MAX_UNMATCHED_INVALID_CHECKS = 5
@ -53,7 +54,7 @@ class BrowserPool:
debugging protocol.
"""
logger = logging.getLogger(__module__ + "." + __qualname__)
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
def __init__(self, size=3, **kwargs):
"""
@ -144,7 +145,7 @@ class BrowserPool:
class WebsockReceiverThread(threading.Thread):
logger = logging.getLogger(__module__ + "." + __qualname__)
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
def __init__(self, websock, name=None, daemon=True):
super().__init__(name=name, daemon=daemon)
@ -194,7 +195,7 @@ class WebsockReceiverThread(threading.Thread):
):
self.logger.error("websocket closed, did chrome die?")
else:
self.logger.error("exception from websocket receiver thread", exc_info=1)
self.logger.exception("exception from websocket receiver thread")
brozzler.thread_raise(self.calling_thread, BrowsingException)
def run(self):
@ -214,10 +215,9 @@ class WebsockReceiverThread(threading.Thread):
try:
self._handle_message(websock, message)
except:
self.logger.error(
"uncaught exception in _handle_message message=%s",
message,
exc_info=True,
self.logger.exception(
"uncaught exception in _handle_message",
message=message,
)
def _network_response_received(self, message):
@ -232,7 +232,7 @@ class WebsockReceiverThread(threading.Thread):
]
)
self.reached_limit = brozzler.ReachedLimit(warcprox_meta=warcprox_meta)
self.logger.info("reached limit %s", self.reached_limit)
self.logger.info("reached limit", limit=self.reached_limit)
brozzler.thread_raise(self.calling_thread, brozzler.ReachedLimit)
else:
self.logger.info(
@ -246,7 +246,7 @@ class WebsockReceiverThread(threading.Thread):
self.page_status = status
def _javascript_dialog_opening(self, message):
self.logger.info("javascript dialog opened: %s", message)
self.logger.info("javascript dialog opened", message=message)
if message["params"]["type"] == "alert":
accept = True
else:
@ -293,7 +293,7 @@ class WebsockReceiverThread(threading.Thread):
message["params"]["message"]["text"],
)
elif message["method"] == "Runtime.exceptionThrown":
self.logger.debug("uncaught exception: %s", message)
self.logger.debug("uncaught exception", exception=message)
elif message["method"] == "Page.javascriptDialogOpening":
self._javascript_dialog_opening(message)
elif (
@ -323,7 +323,7 @@ class Browser:
Manages an instance of Chrome for browsing pages.
"""
logger = logging.getLogger(__module__ + "." + __qualname__)
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
def __init__(self, **kwargs):
"""
@ -366,11 +366,10 @@ class Browser:
msg_id = next(self._command_id)
kwargs["id"] = msg_id
msg = json.dumps(kwargs, separators=",:")
logging.log(
logging.TRACE if suppress_logging else logging.DEBUG,
"sending message to %s: %s",
self.websock,
msg,
self.logger.debug(
"sending message",
websock=self.websock,
message=msg,
)
self.websock.send(msg)
return msg_id
@ -398,7 +397,7 @@ class Browser:
# Enable Console & Runtime output only when debugging.
# After all, we just print these events with debug(), we don't use
# them in Brozzler logic.
if self.logger.isEnabledFor(logging.DEBUG):
if self.logger.is_enabled_for(logging.DEBUG):
self.send_to_chrome(method="Console.enable")
self.send_to_chrome(method="Runtime.enable")
self.send_to_chrome(method="ServiceWorker.enable")
@ -433,8 +432,8 @@ class Browser:
try:
self.websock.close()
except BaseException as e:
self.logger.error(
"exception closing websocket %s - %s", self.websock, e
self.logger.exception(
"exception closing websocket", websocket=self.websock
)
self.chrome.stop()
@ -461,7 +460,7 @@ class Browser:
self.websock_url = None
except:
self.logger.error("problem stopping", exc_info=True)
self.logger.exception("problem stopping")
def is_running(self):
return self.websock_url is not None
@ -567,7 +566,7 @@ class Browser:
# if login redirected us, return to page_url
if page_url != self.url().split("#")[0]:
self.logger.debug(
"login navigated away from %s; returning!", page_url
"login navigated away; returning!", page_url=page_url
)
self.navigate_to_page(page_url, timeout=page_timeout)
# If the target page HTTP status is 4xx/5xx, there is no point
@ -611,7 +610,7 @@ class Browser:
# more information, raise that one
raise self.websock_thread.reached_limit
except websocket.WebSocketConnectionClosedException as e:
self.logger.error("websocket closed, did chrome die?")
self.logger.exception("websocket closed, did chrome die?")
raise BrowsingException(e)
finally:
self.is_browsing = False
@ -633,7 +632,7 @@ class Browser:
on_screenshot(jpeg_bytes)
return
except BrowsingTimeout as e:
logging.error("attempt %s/3: %s", i + 1, e)
self.logger.exception("attempt %s/3", i + 1)
def visit_hashtags(self, page_url, hashtags, outlinks):
_hashtags = set(hashtags or [])
@ -647,7 +646,7 @@ class Browser:
# out which hashtags were visited already and skip those
for hashtag in _hashtags:
# navigate_to_hashtag (nothing to wait for so no timeout?)
self.logger.debug("navigating to hashtag %s", hashtag)
self.logger.debug("navigating to hashtag", hashtag=hashtag)
url = urlcanon.whatwg(page_url)
url.hash_sign = b"#"
url.fragment = hashtag[1:].encode("utf-8")
@ -687,7 +686,7 @@ class Browser:
)
def navigate_to_page(self, page_url, timeout=300):
self.logger.info("navigating to page %s", page_url)
self.logger.info("navigating to page", page_url=page_url)
self.websock_thread.got_page_load_event = None
self.websock_thread.page_status = None
self.send_to_chrome(method="Page.navigate", params={"url": page_url})
@ -715,15 +714,13 @@ class Browser:
try:
out.append(str(urlcanon.whatwg(link)))
except AddressValueError:
self.logger.warning("skip invalid outlink: %s", link)
self.logger.warning("skip invalid outlink", outlink=link)
return frozenset(out)
else:
# no links found
return frozenset()
else:
self.logger.error(
"problem extracting outlinks, result message: %s", message
)
self.logger.error("problem extracting outlinks", message=message)
return frozenset()
def screenshot(self, full_page=False, timeout=45):
@ -797,11 +794,11 @@ class Browser:
elapsed = time.time() - start
if elapsed > timeout:
logging.info(
"behavior reached hard timeout after %.1fs and %s valid checks, and %s invalid checks, for url %s",
"behavior reached hard timeout after %.1fs and %s valid checks, and %s invalid checks",
elapsed,
valid_behavior_checks,
invalid_behavior_checks,
page_url,
page_url=page_url,
)
return

View File

@ -16,7 +16,6 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
import logging
import urllib.request
import time
import threading
@ -27,6 +26,7 @@ import select
import re
import signal
import sqlite3
import structlog
import json
import tempfile
import sys
@ -65,7 +65,7 @@ def check_version(chrome_exe):
class Chrome:
logger = logging.getLogger(__module__ + "." + __qualname__)
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
def __init__(self, chrome_exe, port=9222, ignore_cert_errors=False):
"""
@ -97,22 +97,22 @@ class Chrome:
def _init_cookie_db(self, cookie_db):
cookie_dir = os.path.join(self._chrome_user_data_dir, "Default")
cookie_location = os.path.join(cookie_dir, "Cookies")
self.logger.debug("cookie DB provided, writing to %s", cookie_location)
cookie_logger = self.logger.bind(cookie_location=cookie_location)
cookie_logger.debug("cookie DB provided, writing to")
os.makedirs(cookie_dir, exist_ok=True)
try:
with open(cookie_location, "wb") as cookie_file:
cookie_file.write(cookie_db)
except OSError:
self.logger.error(
"exception writing cookie file at %s", cookie_location, exc_info=True
)
cookie_logger.exception("exception writing cookie file")
def persist_and_read_cookie_db(self):
cookie_location = os.path.join(self._chrome_user_data_dir, "Default", "Cookies")
self.logger.debug(
"marking cookies persistent then reading file into memory: %s",
cookie_location,
cookie_logger = self.logger.bind(cookie_location=cookie_location)
cookie_logger.debug(
"marking cookies persistent then reading file into memory",
)
try:
with sqlite3.connect(cookie_location) as conn:
@ -125,20 +125,14 @@ class Chrome:
cur = conn.cursor()
cur.execute("UPDATE cookies SET persistent = 1")
except sqlite3.Error:
self.logger.error(
"exception updating cookie DB %s", cookie_location, exc_info=True
)
cookie_logger.exception("exception updating cookie DB")
cookie_db = None
try:
with open(cookie_location, "rb") as cookie_file:
cookie_db = cookie_file.read()
except OSError:
self.logger.error(
"exception reading from cookie DB file %s",
cookie_location,
exc_info=True,
)
cookie_logger.exception("exception reading from cookie DB file")
return cookie_db
def start(
@ -228,7 +222,7 @@ class Chrome:
if proxy:
chrome_args.append("--proxy-server=%s" % proxy)
chrome_args.append("about:blank")
self.logger.info("running: %r", subprocess.list2cmdline(chrome_args))
self.logger.info("running", chrome_args=subprocess.list2cmdline(chrome_args))
# start_new_session - new process group so we can kill the whole group
self.chrome_process = subprocess.Popen(
chrome_args,
@ -244,12 +238,13 @@ class Chrome:
daemon=True,
)
self._out_reader_thread.start()
self.logger.info("chrome running, pid %s" % self.chrome_process.pid)
self.logger.info("chrome running", pid=self.chrome_process.pid)
return self._websocket_url(timeout_sec=websocket_timeout)
def _websocket_url(self, timeout_sec=60):
json_url = "http://localhost:%s/json" % self.port
url_logger = self.logger.bind(json_url=json_url)
# make this a member variable so that kill -QUIT reports it
self._start = time.time()
self._last_warning = self._start
@ -260,24 +255,21 @@ class Chrome:
debug_info = [x for x in all_debug_info if x["url"] == "about:blank"]
if debug_info and "webSocketDebuggerUrl" in debug_info[0]:
self.logger.debug("%s returned %s", json_url, raw_json)
url_logger.debug("webSocketDebuggerUrl returned", raw_json=raw_json)
url = debug_info[0]["webSocketDebuggerUrl"]
self.logger.info(
"got chrome window websocket debug url %s from %s",
url,
json_url,
url_logger.info(
"got chrome window websocket debug url",
debug_url=url,
)
return url
except brozzler.ShutdownRequested:
raise
except Exception as e:
if time.time() - self._last_warning > 30:
self.logger.warning(
"problem with %s (will keep trying until timeout "
"of %d seconds): %s",
json_url,
timeout_sec,
e,
url_logger.warning(
"problem accessing url (will keep trying until timeout)",
timeout=timeout_sec,
exc_info=True,
)
self._last_warning = time.time()
finally:
@ -323,26 +315,28 @@ class Chrome:
while not self._shutdown.is_set():
buf = readline_nonblock(self.chrome_process.stdout)
if buf:
self.logger.trace(
self.logger.debug(
"chrome pid %s STDOUT %s", self.chrome_process.pid, buf
)
buf = readline_nonblock(self.chrome_process.stderr)
if buf:
self.logger.trace(
self.logger.debug(
"chrome pid %s STDERR %s", self.chrome_process.pid, buf
)
except:
self.logger.error("unexpected exception", exc_info=True)
self.logger.exception("unexpected exception")
def stop(self):
if not self.chrome_process or self._shutdown.is_set():
return
self._shutdown.set()
pid_logger = self.logger.bind(pid=self.chrome_process.pid)
timeout_sec = 300
if self.chrome_process.poll() is None:
self.logger.info("terminating chrome pgid %s", self.chrome_process.pid)
pid_logger.info("terminating chrome")
os.killpg(self.chrome_process.pid, signal.SIGTERM)
t0 = time.time()
@ -352,14 +346,11 @@ class Chrome:
status = self.chrome_process.poll()
if status is not None:
if status == 0:
self.logger.info(
"chrome pid %s exited normally", self.chrome_process.pid
)
pid_logger.info("chrome exited normally")
else:
self.logger.warning(
"chrome pid %s exited with nonzero status %s",
self.chrome_process.pid,
status,
pid_logger.warning(
"chrome exited with nonzero status",
status=status,
)
# XXX I would like to forcefully kill the process group
@ -369,18 +360,16 @@ class Chrome:
return
time.sleep(0.5)
self.logger.warning(
"chrome pid %s still alive %.1f seconds after sending "
pid_logger.warning(
"chrome still alive %.1f seconds after sending "
"SIGTERM, sending SIGKILL",
self.chrome_process.pid,
time.time() - t0,
)
os.killpg(self.chrome_process.pid, signal.SIGKILL)
status = self.chrome_process.wait()
self.logger.warning(
"chrome pid %s reaped (status=%s) after killing with " "SIGKILL",
self.chrome_process.pid,
status,
pid_logger.warning(
"chrome reaped after killing with " "SIGKILL",
status=status,
)
finally:
@ -389,8 +378,8 @@ class Chrome:
try:
self._home_tmpdir.cleanup()
except:
self.logger.error(
"exception deleting %s", self._home_tmpdir, exc_info=True
self.logger.exception(
"exception deleting self._home_tmpdir", tmpdir=self._home_tmpdir
)
self._out_reader_thread.join()
self.chrome_process = None

View File

@ -29,6 +29,8 @@ import requests
import doublethink
import signal
import string
import structlog
import subprocess
import sys
import threading
import time
@ -41,6 +43,8 @@ import rethinkdb as rdb
r = rdb.RethinkDB()
logger = structlog.get_logger()
def add_common_options(arg_parser, argv=None):
argv = argv or sys.argv
@ -50,7 +54,7 @@ def add_common_options(arg_parser, argv=None):
dest="log_level",
action="store_const",
default=logging.INFO,
const=logging.NOTICE,
const=logging.WARN,
help="quiet logging",
)
arg_parser.add_argument(
@ -67,7 +71,7 @@ def add_common_options(arg_parser, argv=None):
dest="log_level",
action="store_const",
default=logging.INFO,
const=logging.TRACE,
const=logging.DEBUG,
help=("very verbose logging"),
)
# arg_parser.add_argument(
@ -108,7 +112,50 @@ def rethinker(args):
return doublethink.Rethinker(servers.split(","), db)
# Decorates the logger name with call location, if provided
def decorate_logger_name(a, b, event_dict):
old_name = event_dict.get("logger_name")
if old_name is None:
return event_dict
try:
filename = event_dict.pop("filename")
func_name = event_dict.pop("func_name")
lineno = event_dict.pop("lineno")
except KeyError:
return event_dict
new_name = f"{old_name}.{func_name}({filename}:{lineno})"
event_dict["logger_name"] = new_name
return event_dict
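
An illustrative before/after of what this processor does to an event_dict, given the callsite keys that CallsiteParameterAdder (configured below) adds; the concrete values here are hypothetical:

# in:  {"logger_name": "brozzler.browser.Browser", "event": "navigating to page",
#       "filename": "browser.py", "func_name": "navigate_to_page", "lineno": 689}
# out: {"logger_name": "brozzler.browser.Browser.navigate_to_page(browser.py:689)",
#       "event": "navigating to page"}
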
def configure_logging(args):
structlog.configure(
processors=[
structlog.contextvars.merge_contextvars,
structlog.processors.add_log_level,
structlog.processors.StackInfoRenderer(),
structlog.dev.set_exc_info,
structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S", utc=False),
structlog.processors.CallsiteParameterAdder(
[
structlog.processors.CallsiteParameter.FILENAME,
structlog.processors.CallsiteParameter.FUNC_NAME,
structlog.processors.CallsiteParameter.LINENO,
],
),
decorate_logger_name,
structlog.dev.ConsoleRenderer(),
],
wrapper_class=structlog.make_filtering_bound_logger(args.log_level),
context_class=dict,
logger_factory=structlog.PrintLoggerFactory(),
cache_logger_on_first_use=False,
)
# We still configure logging for now because its handlers
# are used for the gunicorn spawned by the brozzler dashboard.
logging.basicConfig(
stream=sys.stderr,
level=args.log_level,
@ -126,8 +173,36 @@ def configure_logging(args):
)
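
Because wrapper_class is make_filtering_bound_logger(args.log_level), the -q/-v options above map straight onto structlog filtering: calls below the configured level return before any processors run. A hedged sketch, not part of this commit, assuming configure_logging() was called with -q (log_level=logging.WARN); the job_id value is made up:

import structlog

logger = structlog.get_logger(logger_name="brozzler.cli")
logger.info("brozzler-worker is all done, exiting")  # dropped: INFO is below WARN
logger.warning("no such job with id", job_id=42)     # rendered by ConsoleRenderer
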
def suggest_default_chrome_exe():
# mac os x application executable paths
def mdfind(identifier):
try:
result = subprocess.check_output(
["mdfind", f"kMDItemCFBundleIdentifier == {identifier}"], text=True
)
# Just treat any errors as "couldn't find app"
except subprocess.CalledProcessError:
return None
if result:
return result.rstrip("\n")
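
For reference, a hedged example of what this helper is expected to return on a typical macOS install (paths illustrative; mdfind returns nothing when Spotlight indexing is disabled):

# mdfind("com.google.Chrome")     -> "/Applications/Google Chrome.app"
# mdfind("org.chromium.Chromium") -> "/Applications/Chromium.app"
# suggest_default_chrome_exe_mac() below then appends "/Contents/MacOS/<binary name>" to the bundle path
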
def suggest_default_chrome_exe_mac():
path = None
# Try Chromium first, then Chrome
result = mdfind("org.chromium.Chromium")
if result is not None:
path = f"{result}/Contents/MacOS/Chromium"
result = mdfind("com.google.Chrome")
if result is not None:
path = f"{result}/Contents/MacOS/Google Chrome"
if path is not None and os.path.exists(path):
return path
# Fall back to default paths if mdfind couldn't find it
# (mdfind might fail to find them even in their default paths
# if the system has Spotlight disabled.)
for path in [
"/Applications/Thorium.app/Contents/MacOS/Thorium",
"/Applications/Chromium.app/Contents/MacOS/Chromium",
@ -136,6 +211,14 @@ def suggest_default_chrome_exe():
if os.path.exists(path):
return path
def suggest_default_chrome_exe():
# First ask mdfind, which lets us find it in non-default paths
if sys.platform == "darwin":
path = suggest_default_chrome_exe_mac()
if path is not None:
return path
# "chromium-browser" is the executable on ubuntu trusty
# https://github.com/internetarchive/brozzler/pull/6/files uses "chromium"
# google chrome executable names taken from these packages:
@ -319,7 +402,7 @@ def brozzle_page(argv=None):
)
with open(filename, "wb") as f:
f.write(screenshot_jpeg)
logging.info("wrote screenshot to %s", filename)
logger.info("wrote screenshot", filename=filename)
browser = brozzler.Browser(chrome_exe=args.chrome_exe)
try:
@ -335,11 +418,11 @@ def brozzle_page(argv=None):
on_screenshot=on_screenshot,
enable_youtube_dl=not args.skip_youtube_dl,
)
logging.info("outlinks: \n\t%s", "\n\t".join(sorted(outlinks)))
logger.info("outlinks: \n\t%s", "\n\t".join(sorted(outlinks)))
except brozzler.ReachedLimit as e:
logging.error("reached limit %s", e)
logger.exception("reached limit")
except brozzler.PageInterstitialShown as e:
logging.error("page interstitial shown %s", e)
logger.exception("page interstitial shown")
finally:
browser.stop()
@ -597,11 +680,11 @@ def brozzler_worker(argv=None):
state_strs.append("<???:thread:ident=%s>" % ident)
stack = traceback.format_stack(frames[ident])
state_strs.append("".join(stack))
logging.info(
"dumping state (caught signal %s)\n%s" % (signum, "\n".join(state_strs))
logger.info(
"dumping state (caught signal)\n%s", signal=signum, state=state_strs
)
except BaseException as e:
logging.error("exception dumping state: %s" % e)
logger.exception("exception dumping state")
finally:
signal.signal(signal.SIGQUIT, dump_state)
@ -612,13 +695,13 @@ def brozzler_worker(argv=None):
with open(YTDLP_PROXY_ENDPOINTS_FILE) as endpoints:
ytdlp_proxy_endpoints = [l for l in endpoints.readlines()]
if ytdlp_proxy_endpoints:
logging.info(
"running with ytdlp proxy endpoints file %s"
% YTDLP_PROXY_ENDPOINTS_FILE
logger.info(
"running with ytdlp proxy endpoints file",
ytdlp_proxy_endpoints=YTDLP_PROXY_ENDPOINTS_FILE,
)
except Exception as e:
ytdlp_proxy_endpoints = []
logging.info("running with empty proxy endpoints file")
logger.info("running with empty proxy endpoints file")
return ytdlp_proxy_endpoints
rr = rethinker(args)
@ -650,7 +733,7 @@ def brozzler_worker(argv=None):
th = threading.Thread(target=worker.run, name="BrozzlerWorkerThread")
th.start()
th.join()
logging.info("brozzler-worker is all done, exiting")
logger.info("brozzler-worker is all done, exiting")
def brozzler_ensure_tables(argv=None):
@ -724,18 +807,18 @@ def brozzler_list_jobs(argv=None):
except ValueError:
job_id = args.job
reql = rr.table("jobs").get(job_id)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
result = reql.run()
if result:
results = [reql.run()]
else:
logging.error("no such job with id %r", job_id)
logger.error("no such job with id", job_id=job_id)
sys.exit(1)
else:
reql = rr.table("jobs").order_by("id")
if args.active:
reql = reql.filter({"status": "ACTIVE"})
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
results = reql.run()
if args.yaml:
yaml.dump_all(
@ -800,7 +883,7 @@ def brozzler_list_sites(argv=None):
)
elif args.site:
reql = reql.get_all(args.site)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
results = reql.run()
if args.yaml:
yaml.dump_all(
@ -868,7 +951,7 @@ def brozzler_list_pages(argv=None):
except ValueError:
job_id = args.job
reql = rr.table("sites").get_all(job_id, index="job_id")["id"]
logging.debug("querying rethinkb: %s", reql)
logger.debug("querying rethinkb", query=reql)
site_ids = reql.run()
elif args.site:
try:
@ -897,7 +980,7 @@ def brozzler_list_pages(argv=None):
reql = reql.order_by(index="least_hops")
if args.claimed:
reql = reql.filter({"claimed": True})
logging.debug("querying rethinkb: %s", reql)
logger.debug("querying rethinkb", query=reql)
results = reql.run()
if args.yaml:
yaml.dump_all(
@ -963,20 +1046,20 @@ def brozzler_purge(argv=None):
job_id = args.job
job = brozzler.Job.load(rr, job_id)
if not job:
logging.fatal("no such job %r", job_id)
logger.fatal("no such job", job_id=job_id)
sys.exit(1)
if job.status == "ACTIVE":
if args.force:
logging.warning(
"job %s has status ACTIVE, purging anyway because "
logger.warning(
"job has status ACTIVE, purging anyway because "
"--force was supplied",
job_id,
job_id=job_id,
)
else:
logging.fatal(
"refusing to purge job %s because status is ACTIVE "
logger.fatal(
"refusing to purge job because status is ACTIVE "
"(override with --force)",
job_id,
job_id=job_id,
)
sys.exit(1)
_purge_job(rr, job_id)
@ -984,20 +1067,20 @@ def brozzler_purge(argv=None):
site_id = args.site
site = brozzler.Site.load(rr, site_id)
if not site:
logging.fatal("no such job %r", job_id)
logger.fatal("no such job", job_id=job_id)
sys.exit(1)
if site.status == "ACTIVE":
if args.force:
logging.warning(
"site %s has status ACTIVE, purging anyway because "
logger.warning(
"site has status ACTIVE, purging anyway because "
"--force was supplied",
site_id,
site_id=site_id,
)
else:
logging.fatal(
"refusing to purge site %s because status is ACTIVE "
logger.fatal(
"refusing to purge site because status is ACTIVE "
"(override with --force)",
site_id,
site_id=site_id,
)
sys.exit(1)
_purge_site(rr, site_id)
@ -1016,7 +1099,7 @@ def brozzler_purge(argv=None):
.lt(finished_before)
)
)
logging.debug("retrieving jobs older than %s: %s", finished_before, reql)
logger.debug("retrieving jobs older than %s", finished_before, query=reql)
for job in reql.run():
# logging.info('job %s finished=%s starts_and_stops[-1]["stop"]=%s',
# job['id'], job.get('finished'),
@ -1034,27 +1117,31 @@ def _purge_site(rr, site_id):
)
.delete()
)
logging.debug("purging pages for site %s: %s", site_id, reql)
site_logger = logger.bind(site_id=site_id)
site_logger.debug("purging pages for site", query=reql)
result = reql.run()
logging.info("purged pages for site %s: %s", site_id, result)
site_logger.info("purged pages for site", result=result)
reql = rr.table("sites").get(site_id).delete()
logging.debug("purging site %s: %s", site_id, reql)
site_logger.debug("purging site", query=reql)
result = reql.run()
logging.info("purged site %s: %s", site_id, result)
site_logger.info("purged site", result=result)
def _purge_job(rr, job_id):
job_logger = logger.bind(job_id=job_id)
reql = rr.table("sites").get_all(job_id, index="job_id").get_field("id")
logging.debug("querying rethinkdb: %s", reql)
job_logger.debug("querying rethinkdb", query=reql)
site_ids = list(reql.run())
for site_id in site_ids:
_purge_site(rr, site_id)
reql = rr.table("jobs").get(job_id).delete()
logging.debug("purging job %s: %s", job_id, reql)
job_logger.debug("purging job", query=reql)
result = reql.run()
logging.info("purged job %s: %s", job_id, result)
job_logger.info("purged job", result=result)
def brozzler_list_captures(argv=None):
@ -1101,7 +1188,7 @@ def brozzler_list_captures(argv=None):
if args.url_or_sha1[:5] == "sha1:":
if args.prefix:
logging.warning(
logger.warning(
"ignoring supplied --prefix option which does not apply "
"to lookup by sha1"
)
@ -1112,7 +1199,7 @@ def brozzler_list_captures(argv=None):
[sha1base32, r.maxval, r.maxval],
index="sha1_warc_type",
)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
results = reql.run()
else:
key = urlcanon.semantic(args.url_or_sha1).surt().decode("ascii")
@ -1135,7 +1222,7 @@ def brozzler_list_captures(argv=None):
lambda capture: (capture["canon_surt"] >= key)
& (capture["canon_surt"] <= end_key)
)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
results = reql.run()
if args.yaml:
@ -1180,7 +1267,7 @@ def brozzler_stop_crawl(argv=None):
job_id = args.job_id
job = brozzler.Job.load(rr, job_id)
if not job:
logging.fatal("job not found with id=%r", job_id)
logger.fatal("job not found with", id=job_id)
sys.exit(1)
job.stop_requested = doublethink.utcnow()
job.save()
@ -1191,7 +1278,7 @@ def brozzler_stop_crawl(argv=None):
site_id = args.site_id
site = brozzler.Site.load(rr, site_id)
if not site:
logging.fatal("site not found with id=%r", site_id)
logger.fatal("site not found with", id=site_id)
sys.exit(1)
site.stop_requested = doublethink.utcnow()
site.save()

View File

@ -17,13 +17,15 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
import logging
import structlog
import sys
logger = structlog.get_logger()
try:
import flask
except ImportError as e:
logging.critical(
logger.critical(
'%s: %s\n\nYou might need to run "pip install '
'brozzler[dashboard]".\nSee README.rst for more information.',
type(e).__name__,
@ -77,7 +79,7 @@ def queued_count(site_id):
)
.count()
)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
count = reql.run()
return flask.jsonify(count=count)
@ -85,7 +87,7 @@ def queued_count(site_id):
@app.route("/api/sites/<site_id>/queue")
@app.route("/api/site/<site_id>/queue")
def queue(site_id):
logging.debug("flask.request.args=%s", flask.request.args)
logger.debug("flask.request.args", args=flask.request.args)
start = flask.request.args.get("start", 0)
end = flask.request.args.get("end", start + 90)
reql = rr.table("pages").between(
@ -93,7 +95,7 @@ def queue(site_id):
[site_id, 0, False, r.maxval],
index="priority_by_site",
)[start:end]
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
queue_ = reql.run()
return flask.jsonify(queue_=list(queue_))
@ -112,7 +114,7 @@ def page_count(site_id):
)
.count()
)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
count = reql.run()
return flask.jsonify(count=count)
@ -130,7 +132,7 @@ def pages(site_id):
)
.order_by(index="least_hops")[start:end]
)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
pages_ = reql.run()
return flask.jsonify(pages=list(pages_))
@ -139,7 +141,7 @@ def pages(site_id):
@app.route("/api/page/<page_id>")
def page(page_id):
reql = rr.table("pages").get(page_id)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
page_ = reql.run()
return flask.jsonify(page_)
@ -148,7 +150,7 @@ def page(page_id):
@app.route("/api/page/<page_id>/yaml")
def page_yaml(page_id):
reql = rr.table("pages").get(page_id)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
page_ = reql.run()
return app.response_class(
yaml.dump(page_, default_flow_style=False), mimetype="application/yaml"
@ -159,7 +161,7 @@ def page_yaml(page_id):
@app.route("/api/site/<site_id>")
def site(site_id):
reql = rr.table("sites").get(site_id)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
s = reql.run()
if "cookie_db" in s:
s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
@ -170,7 +172,7 @@ def site(site_id):
@app.route("/api/site/<site_id>/yaml")
def site_yaml(site_id):
reql = rr.table("sites").get(site_id)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
site_ = reql.run()
return app.response_class(
yaml.dump(site_, default_flow_style=False), mimetype="application/yaml"
@ -180,7 +182,7 @@ def site_yaml(site_id):
@app.route("/api/stats/<bucket>")
def stats(bucket):
reql = rr.table("stats").get(bucket)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
stats_ = reql.run()
return flask.jsonify(stats_)
@ -193,7 +195,7 @@ def sites(job_id):
except ValueError:
jid = job_id
reql = rr.table("sites").get_all(jid, index="job_id")
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
sites_ = list(reql.run())
# TypeError: <binary, 7168 bytes, '53 51 4c 69 74 65...'> is not JSON serializable
for s in sites_:
@ -206,7 +208,7 @@ def sites(job_id):
def jobless_sites():
# XXX inefficient (unindexed) query
reql = rr.table("sites").filter(~r.row.has_fields("job_id"))
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
sites_ = list(reql.run())
# TypeError: <binary, 7168 bytes, '53 51 4c 69 74 65...'> is not JSON serializable
for s in sites_:
@ -223,7 +225,7 @@ def job(job_id):
except ValueError:
jid = job_id
reql = rr.table("jobs").get(jid)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
job_ = reql.run()
return flask.jsonify(job_)
@ -236,7 +238,7 @@ def job_yaml(job_id):
except ValueError:
jid = job_id
reql = rr.table("jobs").get(jid)
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
job_ = reql.run()
return app.response_class(
yaml.dump(job_, default_flow_style=False), mimetype="application/yaml"
@ -258,7 +260,7 @@ def services():
@app.route("/api/jobs")
def jobs():
reql = rr.table("jobs").order_by(r.desc("id"))
logging.debug("querying rethinkdb: %s", reql)
logger.debug("querying rethinkdb", query=reql)
jobs_ = list(reql.run())
return flask.jsonify(jobs=jobs_)
@ -313,13 +315,13 @@ try:
return self.application
def run(**options):
logging.info("running brozzler-dashboard using gunicorn")
logger.info("running brozzler-dashboard using gunicorn")
GunicornBrozzlerDashboard(app, options).run()
except ImportError:
def run():
logging.info("running brozzler-dashboard using simple flask app.run")
logger.info("running brozzler-dashboard using simple flask app.run")
app.run(host=SETTINGS["DASHBOARD_INTERFACE"], port=SETTINGS["DASHBOARD_PORT"])

View File

@ -18,8 +18,8 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
import structlog
import sys
import logging
try:
import warcprox
@ -30,11 +30,11 @@ try:
import wsgiref.handlers
import brozzler.dashboard
except ImportError as e:
logging.critical(
structlog.get_logger().critical(
'%s: %s\n\nYou might need to run "pip install '
'brozzler[easy]".\nSee README.rst for more information.',
type(e).__name__,
e,
exc_info=True,
)
sys.exit(1)
import argparse
@ -156,7 +156,7 @@ class ThreadingWSGIServer(
class BrozzlerEasyController:
logger = logging.getLogger(__module__ + "." + __qualname__)
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
def __init__(self, args):
self.stop = threading.Event()
@ -238,11 +238,14 @@ class BrozzlerEasyController:
self.logger.info("starting brozzler-worker")
self.brozzler_worker.start()
self.logger.info("starting pywb at %s:%s", *self.pywb_httpd.server_address)
self.logger.info(
"starting pywb", address="%s:%s" % self.pywb_httpd.server_address
)
threading.Thread(target=self.pywb_httpd.serve_forever).start()
self.logger.info(
"starting brozzler-dashboard at %s:%s", *self.dashboard_httpd.server_address
"starting brozzler-dashboard",
address="%s:%s" % self.dashboard_httpd.server_address,
)
threading.Thread(target=self.dashboard_httpd.serve_forever).start()
@ -307,8 +310,8 @@ class BrozzlerEasyController:
state_strs.append(str(th))
stack = traceback.format_stack(sys._current_frames()[th.ident])
state_strs.append("".join(stack))
logging.warning(
"dumping state (caught signal {})\n{}".format(signum, "\n".join(state_strs))
structlog.get_logger().warning(
"dumping state (caught signal)", signal=signum, state="\n".join(state_strs)
)

View File

@ -16,12 +16,12 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
import logging
import brozzler
import random
import time
import datetime
import rethinkdb as rdb
import structlog
import doublethink
import urlcanon
@ -33,7 +33,7 @@ class UnexpectedDbResult(Exception):
class RethinkDbFrontier:
logger = logging.getLogger(__module__ + "." + __qualname__)
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
def __init__(self, rr, shards=None, replicas=None):
self.rr = rr
@ -42,15 +42,15 @@ class RethinkDbFrontier:
self._ensure_db()
def _ensure_db(self):
db_logger = self.logger.bind(dbname=self.rr.dbname)
dbs = self.rr.db_list().run()
if not self.rr.dbname in dbs:
self.logger.info("creating rethinkdb database %r", self.rr.dbname)
db_logger.info("creating rethinkdb database")
self.rr.db_create(self.rr.dbname).run()
tables = self.rr.table_list().run()
if not "sites" in tables:
self.logger.info(
"creating rethinkdb table 'sites' in database %r", self.rr.dbname
)
db_logger.info("creating rethinkdb table 'sites' in database")
self.rr.table_create(
"sites", shards=self.shards, replicas=self.replicas
).run()
@ -59,9 +59,7 @@ class RethinkDbFrontier:
).run()
self.rr.table("sites").index_create("job_id").run()
if not "pages" in tables:
self.logger.info(
"creating rethinkdb table 'pages' in database %r", self.rr.dbname
)
db_logger.info("creating rethinkdb table 'pages' in database")
self.rr.table_create(
"pages", shards=self.shards, replicas=self.replicas
).run()
@ -81,9 +79,7 @@ class RethinkDbFrontier:
[r.row["site_id"], r.row["brozzle_count"], r.row["hops_from_seed"]],
).run()
if not "jobs" in tables:
self.logger.info(
"creating rethinkdb table 'jobs' in database %r", self.rr.dbname
)
db_logger.info("creating rethinkdb table 'jobs' in database")
self.rr.table_create(
"jobs", shards=self.shards, replicas=self.replicas
).run()
@ -108,7 +104,7 @@ class RethinkDbFrontier:
)
def claim_sites(self, n=1):
self.logger.trace("claiming up to %s sites to brozzle", n)
self.logger.debug("claiming up to %s sites to brozzle", n)
result = (
self.rr.table("sites")
.get_all(
@ -186,10 +182,10 @@ class RethinkDbFrontier:
if result["changes"][i]["old_val"]["claimed"]:
self.logger.warning(
"re-claimed site that was still marked 'claimed' "
"because it was last claimed a long time ago "
"at %s, and presumably some error stopped it from "
"because it was last claimed a long time ago, "
"and presumably some error stopped it from "
"being disclaimed",
result["changes"][i]["old_val"]["last_claimed"],
last_claimed=result["changes"][i]["old_val"]["last_claimed"],
)
site = brozzler.Site(self.rr, result["changes"][i]["new_val"])
sites.append(site)
@ -205,10 +201,10 @@ class RethinkDbFrontier:
"""
if site.time_limit and site.time_limit > 0 and site.elapsed() > site.time_limit:
self.logger.debug(
"site FINISHED_TIME_LIMIT! time_limit=%s " "elapsed=%s %s",
site.time_limit,
site.elapsed(),
site,
"site FINISHED_TIME_LIMIT!",
time_limit=site.time_limit,
elapsed=site.elapsed(),
site=site,
)
raise brozzler.ReachedTimeLimit
@ -273,7 +269,7 @@ class RethinkDbFrontier:
"""Raises brozzler.CrawlStopped if stop has been requested."""
site.refresh()
if site.stop_requested and site.stop_requested <= doublethink.utcnow():
self.logger.info("stop requested for site %s", site.id)
self.logger.info("stop requested for site", site_id=site.id)
raise brozzler.CrawlStopped
if site.job_id:
@ -283,7 +279,7 @@ class RethinkDbFrontier:
and job.stop_requested
and job.stop_requested <= doublethink.utcnow()
):
self.logger.info("stop requested for job %s", site.job_id)
self.logger.info("stop requested for job", job_id=site.job_id)
raise brozzler.CrawlStopped
def _maybe_finish_job(self, job_id):
@ -304,7 +300,7 @@ class RethinkDbFrontier:
return False
n += 1
self.logger.info("all %s sites finished, job %s is FINISHED!", n, job.id)
self.logger.info("all %s sites finished, job is FINISHED!", n, job_id=job.id)
job.finish()
job.save()
return True
@ -320,7 +316,7 @@ class RethinkDbFrontier:
self._maybe_finish_job(site.job_id)
def disclaim_site(self, site, page=None):
self.logger.info("disclaiming %s", site)
self.logger.info("disclaiming", site=site)
site.claimed = False
site.last_disclaimed = doublethink.utcnow()
if not page and not self.has_outstanding_pages(site):
@ -468,17 +464,16 @@ class RethinkDbFrontier:
try:
self.logger.debug("inserting/replacing batch of %s pages", len(batch))
reql = self.rr.table("pages").insert(batch, conflict="replace")
self.logger.trace(
self.logger.debug(
'running query self.rr.table("pages").insert(%r, '
'conflict="replace")',
batch,
)
result = reql.run()
except Exception as e:
self.logger.error(
self.logger.exception(
"problem inserting/replacing batch of %s pages",
len(batch),
exc_info=True,
)
parent_page.outlinks = {}
@ -497,7 +492,7 @@ class RethinkDbFrontier:
)
def reached_limit(self, site, e):
self.logger.info("reached_limit site=%s e=%s", site, e)
self.logger.info("reached_limit", site=site, e=e)
assert isinstance(e, brozzler.ReachedLimit)
if (
site.reached_limit
@ -530,7 +525,7 @@ class RethinkDbFrontier:
)
pages = list(results)
if len(pages) > 1:
self.logger.warning("more than one seed page for site_id %s ?", site_id)
self.logger.warning("more than one seed page?", site_id=site_id)
if len(pages) < 1:
return None
return brozzler.Page(self.rr, pages[0])
@ -550,8 +545,8 @@ class RethinkDbFrontier:
[site_id, 0 if brozzled is False else r.maxval, r.maxval, r.maxval],
index="priority_by_site",
)
self.logger.trace("running query: %r", query)
self.logger.debug("running query", query=query)
results = query.run()
for result in results:
self.logger.trace("yielding result: %r", result)
self.logger.debug("yielding result", result=result)
yield brozzler.Page(self.rr, result)

View File

@ -25,9 +25,9 @@ import datetime
import doublethink
import hashlib
import json
import logging
import os
import re
import structlog
import time
import urlcanon
import urllib
@ -37,6 +37,8 @@ import zlib
from enum import Enum
from typing import Optional
logger = structlog.get_logger()
def load_schema():
schema_file = os.path.join(os.path.dirname(__file__), "job_schema.yaml")
@ -84,7 +86,7 @@ def merge(a, b):
def new_job_file(frontier, job_conf_file):
"""Returns new Job."""
logging.info("loading %s", job_conf_file)
logger.info("loading", job_conf_file=job_conf_file)
with open(job_conf_file) as f:
job_conf = yaml.safe_load(f)
return new_job(frontier, job_conf)
@ -120,12 +122,12 @@ def new_job(frontier, job_conf):
# insert in batches to avoid this error
# rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:
for batch in (pages[i : i + 500] for i in range(0, len(pages), 500)):
logging.info("inserting batch of %s pages", len(batch))
logger.info("inserting batch of %s pages", len(batch))
result = frontier.rr.table("pages").insert(batch).run()
for batch in (sites[i : i + 100] for i in range(0, len(sites), 100)):
logging.info("inserting batch of %s sites", len(batch))
logger.info("inserting batch of %s sites", len(batch))
result = frontier.rr.table("sites").insert(batch).run()
logging.info("job %s fully started", job.id)
logger.info("job fully started", job_id=job.id)
return job
@ -154,7 +156,7 @@ def new_seed_page(frontier, site):
def new_site(frontier, site):
logging.info("new site %s", site)
logger.info("new site", site=site)
site.id = site.id or str(uuid.uuid4())
# insert the Page into the database before the Site, to avoid situation
# where a brozzler worker immediately claims the site, finds no pages
@ -162,7 +164,7 @@ def new_site(frontier, site):
try:
page = new_seed_page(frontier, site)
page.save()
logging.info("queued page %s", page)
logger.info("queued page", page=page)
finally:
# finally block because we want to insert the Site no matter what
site.save()
@ -195,7 +197,7 @@ class ElapsedMixIn(object):
class Job(doublethink.Document, ElapsedMixIn):
logger = logging.getLogger(__module__ + "." + __qualname__)
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
table = "jobs"
def populate_defaults(self):
@ -217,9 +219,9 @@ class Job(doublethink.Document, ElapsedMixIn):
def finish(self):
if self.status == "FINISHED" or self.starts_and_stops[-1]["stop"]:
self.logger.error(
"job is already finished status=%s " "starts_and_stops[-1]['stop']=%s",
self.status,
self.starts_and_stops[-1]["stop"],
"job is already finished",
status=self.status,
stop=self.starts_and_stops[-1]["stop"],
)
self.status = "FINISHED"
self.starts_and_stops[-1]["stop"] = doublethink.utcnow()
@ -246,7 +248,7 @@ class VideoCaptureOptions(Enum):
class Site(doublethink.Document, ElapsedMixIn):
logger = logging.getLogger(__module__ + "." + __qualname__)
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
table = "sites"
def populate_defaults(self):
@ -304,7 +306,7 @@ class Site(doublethink.Document, ElapsedMixIn):
if set(rule.keys()) == {"ssurt"}
)
if not any(ssurt.startswith(ss) for ss in simple_rule_ssurts):
self.logger.info("adding ssurt %s to scope accept rules", ssurt)
self.logger.info("adding ssurt to scope accept rules", ssurt=ssurt)
self.scope["accepts"].append({"ssurt": ssurt})
def note_seed_redirect(self, url):
@ -402,7 +404,7 @@ class Site(doublethink.Document, ElapsedMixIn):
class Page(doublethink.Document):
logger = logging.getLogger(__module__ + "." + __qualname__)
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
table = "pages"
@staticmethod

View File

@ -19,7 +19,9 @@ limitations under the License.
"""
import sys
import logging
import structlog
logger = structlog.get_logger()
try:
import pywb.apps.cli
@ -30,7 +32,7 @@ try:
import pywb.framework.basehandlers
import pywb.rewrite.wburl
except ImportError as e:
logging.critical(
logger.critical(
'%s: %s\n\nYou might need to run "pip install '
'brozzler[easy]".\nSee README.rst for more information.',
type(e).__name__,
@ -111,7 +113,7 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
)
if cdx_query.limit:
reql = reql.limit(cdx_query.limit)
logging.debug("rethinkdb query: %s", reql)
logger.debug("rethinkdb query", query=reql)
results = reql.run()
return results

View File

@ -23,12 +23,12 @@ limitations under the License.
"""
import json
import logging
import brozzler
import reppy
import reppy.cache
import reppy.parser
import requests
import structlog
__all__ = ["is_permitted_by_robots"]
@ -119,10 +119,9 @@ def is_permitted_by_robots(site, url, proxy=None):
# reppy has wrapped an exception that we want to bubble up
raise brozzler.ProxyError(e)
else:
logging.warning(
"returning true (permitted) after problem fetching "
"robots.txt for %r: %r",
url,
e,
structlog.get_logger().warning(
"returning true (permitted) after problem fetching " "robots.txt",
url=url,
exception=e,
)
return True

View File

@ -18,7 +18,6 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
import logging
import brozzler
import brozzler.browser
from brozzler.model import VideoCaptureOptions
@ -32,6 +31,7 @@ import io
import socket
import random
import requests
import structlog
import urllib3
from urllib3.exceptions import TimeoutError, ProxyError
import doublethink
@ -46,7 +46,7 @@ r = rdb.RethinkDB()
class BrozzlerWorker:
logger = logging.getLogger(__module__ + "." + __qualname__)
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
# 3⅓ min heartbeat interval => 10 min ttl
# This is kind of a long time, because `frontier.claim_sites()`, which runs
@ -125,7 +125,7 @@ class BrozzlerWorker:
self._metrics_port, self._registry_url, self._env
)
else:
logging.warning(
self.logger.warning(
"not starting prometheus scrape endpoint: metrics_port is undefined"
)
@ -173,9 +173,9 @@ class BrozzlerWorker:
site.proxy = "%s:%s" % (svc["host"], svc["port"])
site.save()
self.logger.info(
"chose warcprox instance %r from service registry for %r",
site.proxy,
site,
"chose warcprox instance from service registry",
instance=site.proxy,
registry=site,
)
return site.proxy
return None
@ -189,7 +189,7 @@ class BrozzlerWorker:
self._proxy_is_warcprox = status["role"] == "warcprox"
except Exception as e:
self._proxy_is_warcprox = False
logging.info(
self.logger.info(
"%s %s warcprox",
self._proxy,
"IS" if self._proxy_is_warcprox else "IS NOT",
@ -227,18 +227,18 @@ class BrozzlerWorker:
with urllib.request.urlopen(request, timeout=900) as response:
if response.getcode() != 204:
self.logger.warning(
'got "%s %s" response on warcprox '
"got unexpected response on warcprox "
"WARCPROX_WRITE_RECORD request (expected 204)",
response.getcode(),
response.reason,
code=response.getcode(),
reason=response.reason,
)
return request, response
except urllib.error.HTTPError as e:
self.logger.warning(
'got "%s %s" response on warcprox '
"got unexpected response on warcprox "
"WARCPROX_WRITE_RECORD request (expected 204)",
e.getcode(),
e.info(),
code=e.getcode(),
reason=e.info(),
)
return request, None
except urllib.error.URLError as e:
@ -271,26 +271,27 @@ class BrozzlerWorker:
on_request=None,
enable_youtube_dl=True,
):
self.logger.info("brozzling {}".format(page))
page_logger = self.logger.bind(page=page)
page_logger.info("brozzling")
outlinks = set()
page_headers = self._get_page_headers(site, page)
if not self._needs_browsing(page_headers):
self.logger.info("needs fetch: %s", page)
page_logger.info("needs fetch")
if site.pdfs_only and not self._is_pdf(page_headers):
self.logger.info("skipping non-PDF content: PDFs only option enabled")
elif site.video_capture in [
VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value,
] and self._is_video_type(page_headers):
self.logger.info(
page_logger.info(
"skipping video content: video MIME type capture disabled for site"
)
else:
self._fetch_url(site, page=page)
else:
self.logger.info("needs browsing: %s", page)
page_logger.info("needs browsing")
try:
browser_outlinks = self._browse_page(
browser, site, page, on_screenshot, on_request
@ -300,7 +301,7 @@ class BrozzlerWorker:
if status_code in [502, 504]:
raise brozzler.PageConnectionError()
except brozzler.PageInterstitialShown:
self.logger.info("page interstitial shown (http auth): %s", page)
page_logger.info("page interstitial shown (http auth)")
if enable_youtube_dl and ydl.should_ytdlp(site, page, status_code):
try:
@ -316,10 +317,7 @@ class BrozzlerWorker:
except brozzler.ProxyError:
raise
except brozzler.VideoExtractorError as e:
logging.error(
"error extracting video info: %s",
e,
)
self.logger.exception("error extracting video info")
except Exception as e:
if (
hasattr(e, "exc_info")
@ -328,26 +326,25 @@ class BrozzlerWorker:
and e.exc_info[1].code == 430
):
self.logger.info(
"youtube-dl got %s %s processing %s",
e.exc_info[1].code,
e.exc_info[1].msg,
page.url,
"youtube-dl encountered an error",
code=e.exc_info[1].code,
message=e.exc_info[1].msg,
url=page.url,
)
else:
self.logger.error(
"youtube_dl raised exception on %s", page, exc_info=True
)
self.logger.exception("youtube_dl raised exception", page=page)
return outlinks
@metrics.brozzler_header_processing_duration_seconds.time()
@metrics.brozzler_in_progress_headers.track_inprogress()
def _get_page_headers(self, site, page):
url_logger = self.logger.bind(url=page.url)
# bypassing warcprox, requests' stream=True defers downloading the body of the response
# see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
try:
user_agent = site.get("user_agent")
headers = {"User-Agent": user_agent} if user_agent else {}
self.logger.info("getting page headers for %s", page.url)
url_logger.info("getting page headers")
with requests.get(
page.url,
stream=True,
@ -357,11 +354,9 @@ class BrozzlerWorker:
) as r:
return r.headers
except requests.exceptions.Timeout as e:
self.logger.warning(
"Timed out trying to get headers for %s: %s", page.url, e
)
url_logger.warning("Timed out trying to get headers", exc_info=True)
except requests.exceptions.RequestException as e:
self.logger.warning("Failed to get headers for %s: %s", page.url, e)
url_logger.warning("Failed to get headers", exc_info=True)
return {}
def _needs_browsing(self, page_headers) -> bool:
@ -402,10 +397,9 @@ class BrozzlerWorker:
on_screenshot(screenshot_jpeg)
if self._using_warcprox(site):
self.logger.info(
"sending WARCPROX_WRITE_RECORD request to %s with "
"screenshot for %s",
self._proxy_for(site),
page,
"sending WARCPROX_WRITE_RECORD request",
proxy=self._proxy_for(site),
screenshot_for_page=page,
)
thumbnail_jpeg = self.thumb_jpeg(screenshot_jpeg)
self._warcprox_write_record(
@ -452,7 +446,7 @@ class BrozzlerWorker:
video["content-length"] = int(response_headers["content-length"])
if "content-range" in response_headers:
video["content-range"] = response_headers["content-range"]
logging.debug("embedded video %s", video)
self.logger.debug("embedded video", video=video)
if not "videos" in page:
page.videos = []
page.videos.append(video)
@ -461,11 +455,11 @@ class BrozzlerWorker:
def _on_service_worker_version_updated(chrome_msg):
# https://github.com/internetarchive/brozzler/issues/140
self.logger.trace("%r", chrome_msg)
self.logger.debug("service worker updated", chrome_msg=chrome_msg)
if chrome_msg.get("params", {}).get("versions"):
url = chrome_msg.get("params", {}).get("versions")[0].get("scriptURL")
if url and url.startswith("http") and url not in sw_fetched:
self.logger.info("fetching service worker script %s", url)
self.logger.info("fetching service worker script", url=url)
self._fetch_url(site, url=url)
sw_fetched.add(url)
@ -520,7 +514,7 @@ class BrozzlerWorker:
headers = {"User-Agent": user_agent} if user_agent else {}
headers.update(site.extra_headers(page))
self.logger.info("fetching url %s", url)
self.logger.info("fetching url", url=url)
try:
# response is ignored
http.request(
@ -530,17 +524,18 @@ class BrozzlerWorker:
timeout=self.FETCH_URL_TIMEOUT,
retries=False,
)
self.logger.info("Completed fetching url %s", url)
self.logger.info("Completed fetching url", url=url)
except TimeoutError as e:
self.logger.warning("Timed out fetching %s", url)
self.logger.warning("Timed out fetching url", url=url)
raise brozzler.PageConnectionError() from e
except ProxyError as e:
raise brozzler.ProxyError("proxy error fetching %s" % url) from e
except urllib3.exceptions.RequestError as e:
self.logger.warning("Failed to fetch url %s: %s", url, e)
self.logger.warning("Failed to fetch url", url=url, exc_info=True)
raise brozzler.PageConnectionError() from e
def brozzle_site(self, browser, site):
site_logger = self.logger.bind(site=site)
try:
site.last_claimed_by = "%s:%s" % (socket.gethostname(), browser.chrome.port)
site.save()
@ -550,9 +545,7 @@ class BrozzlerWorker:
self._frontier.honor_stop_request(site)
# _proxy_for() call in log statement can raise brozzler.ProxyError
# which is why we honor time limit and stop request first☝🏻
self.logger.info(
"brozzling site (proxy=%r) %s", self._proxy_for(site), site
)
site_logger.info("brozzling site", proxy=self._proxy_for(site))
while time.time() - start < self.SITE_SESSION_MINUTES * 60:
site.refresh()
self._frontier.enforce_time_limit(site)
@ -564,7 +557,7 @@ class BrozzlerWorker:
if page.needs_robots_check and not brozzler.is_permitted_by_robots(
site, page.url, self._proxy_for(site)
):
logging.warning("page %s is blocked by robots.txt", page.url)
self.logger.warning("page is blocked by robots.txt", url=page.url)
page.blocked_by_robots = True
self._frontier.completed_page(site, page)
else:
@ -580,7 +573,7 @@ class BrozzlerWorker:
except brozzler.ShutdownRequested:
self.logger.info("shutdown requested")
except brozzler.NothingToClaim:
self.logger.info("no pages left for site %s", site)
site_logger.info("no pages left for site")
except brozzler.ReachedLimit as e:
self._frontier.reached_limit(site, e)
except brozzler.ReachedTimeLimit as e:
@ -591,29 +584,24 @@ class BrozzlerWorker:
# self.logger.info("{} shut down".format(browser))
except brozzler.ProxyError as e:
if self._warcprox_auto:
logging.error(
"proxy error (site.proxy=%s), will try to choose a "
"healthy instance next time site is brozzled: %s",
site.proxy,
e,
self.logger.exception(
"proxy error, will try to choose a "
"healthy instance next time site is brozzled",
site_proxy=site.proxy,
)
site.proxy = None
else:
# using brozzler-worker --proxy, nothing to do but try the
# same proxy again next time
logging.error("proxy error (self._proxy=%r)", self._proxy, exc_info=1)
self.logger.exception("proxy error", self_proxy=self._proxy)
except (brozzler.PageConnectionError, Exception) as e:
if isinstance(e, brozzler.PageConnectionError):
self.logger.error(
"Page status code possibly indicates connection failure between host and warcprox: site=%r page=%r",
site,
page,
exc_info=True,
site_logger.exception(
"Page status code possibly indicates connection failure between host and warcprox",
page=page,
)
else:
self.logger.error(
"unexpected exception site=%r page=%r", site, page, exc_info=True
)
site_logger.exception("unexpected exception", page=page)
if page:
# Calculate backoff in seconds based on number of failed attempts.
# Minimum of 60, max of 135 giving delays of 60, 90, 135, 135...
@ -624,10 +612,10 @@ class BrozzlerWorker:
page.failed_attempts = (page.failed_attempts or 0) + 1
if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES:
self.logger.info(
'marking page "completed" after %s unexpected '
"exceptions attempting to brozzle %s",
page.failed_attempts,
page,
'marking page "completed" after several unexpected '
"exceptions attempting to brozzle",
failed_attempts=page.failed_attempts,
page=page,
)
self._frontier.completed_page(site, page)
page = None
@ -665,13 +653,11 @@ class BrozzlerWorker:
try:
self.status_info = self._service_registry.heartbeat(status_info)
self.logger.trace("status in service registry: %s", self.status_info)
self.logger.debug("status in service registry", status=self.status_info)
except r.ReqlError as e:
self.logger.error(
"failed to send heartbeat and update service registry "
"with info %s: %s",
status_info,
e,
self.logger.exception(
"failed to send heartbeat and update service registry",
info=status_info,
)
def _service_heartbeat_if_due(self):
@ -719,9 +705,7 @@ class BrozzlerWorker:
self._browser_pool.release(browsers[i])
def run(self):
self.logger.notice(
"brozzler %s - brozzler-worker starting", brozzler.__version__
)
self.logger.warn("brozzler %s - brozzler-worker starting", brozzler.__version__)
last_nothing_to_claim = 0
try:
while not self._shutdown.is_set():
@ -730,20 +714,20 @@ class BrozzlerWorker:
try:
self._start_browsing_some_sites()
except brozzler.browser.NoBrowsersAvailable:
logging.trace("all %s browsers are in use", self._max_browsers)
self.logger.debug(
"all browsers are in use", max_browsers=self._max_browsers
)
except brozzler.NothingToClaim:
last_nothing_to_claim = time.time()
logging.trace(
self.logger.debug(
"nothing to claim, all available active sites "
"are already claimed by a brozzler worker"
)
time.sleep(0.5)
self.logger.notice("shutdown requested")
self.logger.warn("shutdown requested")
except r.ReqlError as e:
self.logger.error(
"caught rethinkdb exception, will try to proceed", exc_info=True
)
self.logger.exception("caught rethinkdb exception, will try to proceed")
except brozzler.ShutdownRequested:
self.logger.info("shutdown requested")
except:
@ -755,12 +739,11 @@ class BrozzlerWorker:
try:
self._service_registry.unregister(self.status_info["id"])
except:
self.logger.error(
"failed to unregister from service registry", exc_info=True
)
self.logger.exception("failed to unregister from service registry")
self.logger.info(
"shutting down %s brozzling threads", len(self._browsing_threads)
"shutting down brozzling threads",
thread_count=len(self._browsing_threads),
)
with self._browsing_threads_lock:
for th in self._browsing_threads:
@ -780,6 +763,7 @@ class BrozzlerWorker:
)
return
self._thread = threading.Thread(target=self.run, name="BrozzlerWorker")
self.logger = self.logger.bind(thread=self._thread)
self._thread.start()
def shutdown_now(self):

View File

@ -16,7 +16,6 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
import logging
import yt_dlp
from yt_dlp.utils import match_filter_func, ExtractorError
import brozzler
@ -31,6 +30,7 @@ import datetime
from . import metrics
import random
import structlog
import threading
import traceback
import doublethink
@ -44,17 +44,20 @@ YTDLP_WAIT = 10
YTDLP_MAX_REDIRECTS = 5
logger = structlog.get_logger()
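# A minimal sketch of the two structlog call styles seen in this module:
# keyword arguments become structured event fields, while %-style positional
# arguments are interpolated into the event string by recent structlog
# versions. Example (hypothetical values):
#   logger.info("skipping ytdlp: non-200 page status", page_status=503)
#   logger.info("skipping ytdlp: non-200 page status %s", 503)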
def should_ytdlp(site, page, page_status):
# called only after we've passed needs_browsing() check
if page_status != 200:
logging.info("skipping ytdlp: non-200 page status %s", page_status)
logger.info("skipping ytdlp: non-200 page status %s", page_status)
return False
if site.video_capture in [
VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value,
]:
logging.info("skipping ytdlp: site has video capture disabled")
logger.info("skipping ytdlp: site has video capture disabled")
return False
ytdlp_url = page.redirect_url if page.redirect_url else page.url
@ -104,7 +107,13 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
"""
class _YoutubeDL(yt_dlp.YoutubeDL):
logger = logging.getLogger(__module__ + "." + __qualname__)
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
def __init__(self, url, params=None, auto_init=True):
super().__init__(params, auto_init)
self.url = url
self.logger = self.logger.bind(url=url)
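# Binding url here means every subsequent log call from this instance carries
# it automatically; a minimal structlog sketch (hypothetical URL, not brozzler
# code):
#   log = structlog.get_logger().bind(url="https://example.com/watch")
#   log.info("found a download")  # event includes url=https://example.com/watch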
def process_ie_result(self, ie_result, download=True, extra_info=None):
if extra_info is None:
@ -114,7 +123,9 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
if result_type in ("url", "url_transparent"):
if "extraction_depth" in extra_info:
self.logger.info(
f"Following redirect URL: {ie_result['url']} extraction_depth: {extra_info['extraction_depth']}"
f"Following redirect",
redirect_url=ie_result["url"],
extraction_depth=extra_info["extraction_depth"],
)
extra_info["extraction_depth"] = 1 + extra_info.get(
"extraction_depth", 0
@ -131,8 +142,9 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
def add_default_extra_info(self, ie_result, ie, url):
# hook in some logging
super().add_default_extra_info(ie_result, ie, url)
extract_context = self.logger.bind(extractor=ie.IE_NAME)
if ie_result.get("_type") == "playlist":
self.logger.info("extractor %r found playlist in %s", ie.IE_NAME, url)
extract_context.info("found playlist")
if ie.IE_NAME in {
"youtube:playlist",
"youtube:tab",
@ -147,22 +159,20 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
try:
ie_result["entries_no_dl"] = list(ie_result["entries"])
except Exception as e:
self.logger.warning(
"failed to unroll ie_result['entries']? for %s, %s; exception %s",
ie.IE_NAME,
url,
e,
extract_context.warning(
"failed to unroll entries ie_result['entries']?",
exc_info=True,
)
ie_result["entries_no_dl"] = []
ie_result["entries"] = []
self.logger.info(
"not downloading %s media files from this "
"not downloading media files from this "
"playlist because we expect to capture them from "
"individual watch/track/detail pages",
len(ie_result["entries_no_dl"]),
media_file_count=len(ie_result["entries_no_dl"]),
)
else:
self.logger.info("extractor %r found a download in %s", ie.IE_NAME, url)
extract_context.info("found a download")
def _push_video_to_warcprox(self, site, info_dict, postprocessor):
# 220211 update: does yt-dlp supply content-type? no, not as such
@ -180,7 +190,11 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
mimetype = magic.from_file(info_dict["filepath"], mime=True)
except ImportError as e:
mimetype = "video/%s" % info_dict["ext"]
self.logger.warning("guessing mimetype %s because %r", mimetype, e)
self.logger.warning(
"guessing mimetype due to error",
mimetype=mimetype,
exc_info=True,
)
# youtube watch page postprocessor is MoveFiles
@ -198,12 +212,11 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
size = os.path.getsize(info_dict["filepath"])
self.logger.info(
"pushing %r video as %s (%s bytes) to " "warcprox at %s with url %s",
info_dict["format"],
mimetype,
size,
worker._proxy_for(site),
url,
"pushing video to warcprox",
format=info_dict["format"],
mimetype=mimetype,
size=size,
warcprox=worker._proxy_for(site),
)
with open(info_dict["filepath"], "rb") as f:
# include content-length header to avoid chunked
@ -240,23 +253,23 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
):
worker.logger.debug(
"heartbeating site.last_claimed to prevent another "
"brozzler-worker claiming this site id=%r",
site.id,
"brozzler-worker claiming this site",
id=site.id,
)
site.last_claimed = doublethink.utcnow()
site.save()
except:
worker.logger.debug(
"problem heartbeating site.last_claimed site id=%r",
site.id,
"problem heartbeating site.last_claimed site",
id=site.id,
exc_info=True,
)
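# yt-dlp invokes hooks like the one below after each postprocessing step,
# passing a dict with "status" and "postprocessor" keys; a sketch of wiring
# one up via yt-dlp's postprocessor_hooks option (how this module actually
# registers it is outside this hunk):
#   ydl_opts = {"postprocessor_hooks": [ydl_postprocess_hook]}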
def ydl_postprocess_hook(d):
if d["status"] == "finished":
worker.logger.info("[ydl_postprocess_hook] Finished postprocessing")
worker.logger.info(
"[ydl_postprocess_hook] postprocessor: {}".format(d["postprocessor"])
"[ydl_postprocess_hook] Finished postprocessing",
postprocessor=d["postprocessor"],
)
is_youtube_host = isyoutubehost(d["info_dict"]["webpage_url"])
@ -290,7 +303,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
# --cache-dir local or..
# this looked like a problem with nsf-mounted homedir, maybe not a problem for brozzler on focal?
"cache_dir": "/home/archiveit",
"logger": logging.getLogger("yt_dlp"),
"logger": logger,
"verbose": False,
"quiet": False,
# recommended to avoid bot detection
@ -306,17 +319,16 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
ytdlp_proxy_for_logs = (
ydl_opts["proxy"].split("@")[1] if "@" in ydl_opts["proxy"] else "@@@"
)
logging.info("using yt-dlp proxy ... %s", ytdlp_proxy_for_logs)
logger.info("using yt-dlp proxy ...", proxy=ytdlp_proxy_for_logs)
# skip warcprox proxying yt-dlp v.2023.07.06: youtube extractor using ranges
# if worker._proxy_for(site):
# ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site))
ydl = _YoutubeDL(ydl_opts)
ydl = _YoutubeDL(ytdlp_url, params=ydl_opts)
if site.extra_headers():
ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page)))
ydl.pushed_videos = []
ydl.url = ytdlp_url
ydl.is_youtube_host = is_youtube_host
return ydl
@ -336,7 +348,7 @@ def _remember_videos(page, pushed_videos=None):
"content-type": pushed_video["content-type"],
"content-length": pushed_video["content-length"],
}
logging.debug("pushed video %s", video)
logger.debug("pushed video", video=video)
page.videos.append(video)
@ -345,7 +357,7 @@ def _try_youtube_dl(worker, ydl, site, page):
attempt = 0
while attempt < max_attempts:
try:
logging.info("trying yt-dlp on %s", ydl.url)
logger.info("trying yt-dlp", url=ydl.url)
# should_download_vid = not ydl.is_youtube_host
# then
# ydl.extract_info(str(urlcanon.whatwg(ydl.url)), download=should_download_vid)
@ -386,15 +398,18 @@ def _try_youtube_dl(worker, ydl, site, page):
# and others...
attempt += 1
if attempt == max_attempts:
logging.warning(
"Failed after %s attempt(s). Error: %s", max_attempts, e
logger.warning(
"Failed after %s attempt(s)",
max_attempts,
attempts=max_attempts,
exc_info=True,
)
raise brozzler.VideoExtractorError(
"yt-dlp hit error extracting info for %s" % ydl.url
)
else:
retry_wait = min(60, YTDLP_WAIT * (1.5 ** (attempt - 1)))
logging.info(
logger.info(
"Attempt %s failed. Retrying in %s seconds...",
attempt,
retry_wait,
@ -405,15 +420,14 @@ def _try_youtube_dl(worker, ydl, site, page):
"yt-dlp hit unknown error extracting info for %s" % ydl.url
)
logging.info("ytdlp completed successfully")
logger.info("ytdlp completed successfully")
_remember_videos(page, ydl.pushed_videos)
if worker._using_warcprox(site):
info_json = json.dumps(ie_result, sort_keys=True, indent=4)
logging.info(
"sending WARCPROX_WRITE_RECORD request to warcprox "
"with yt-dlp json for %s",
ydl.url,
logger.info(
"sending WARCPROX_WRITE_RECORD request to warcprox " "with yt-dlp json",
url=ydl.url,
)
worker._warcprox_write_record(
warcprox_address=worker._proxy_for(site),
@ -444,7 +458,7 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints):
with tempfile.TemporaryDirectory(
prefix="brzl-ydl-", dir=worker._ytdlp_tmpdir
) as tempdir:
logging.info("tempdir for yt-dlp: %s", tempdir)
logger.info("tempdir for yt-dlp", tempdir=tempdir)
ydl = _build_youtube_dl(worker, tempdir, site, page, ytdlp_proxy_endpoints)
ie_result = _try_youtube_dl(worker, ydl, site, page)
outlinks = set()

View File

@ -76,6 +76,7 @@ setuptools.setup(
"cryptography>=2.3",
"python-magic>=0.4.15",
"prometheus-client>=0.20.0",
"structlog>=25.1.0",
],
extras_require={
"yt-dlp": ["yt-dlp>=2024.7.25"],

View File

@ -31,11 +31,14 @@ import datetime
import requests
import subprocess
import http.server
import logging
import structlog
import sys
import warcprox
logger = structlog.get_logger()
# https://stackoverflow.com/questions/166506/finding-local-ip-addresses-using-pythons-stdlib
def _local_address():
import socket
@ -70,11 +73,11 @@ def stop_service(service):
def httpd(request):
class RequestHandler(http.server.SimpleHTTPRequestHandler):
def do_POST(self):
logging.info("\n%s\n%s", self.requestline, self.headers)
logger.info("\n%s\n%s", self.requestline, self.headers)
self.do_GET()
def do_GET(self):
logging.info("\n%s\n%s", self.requestline, self.headers)
logger.info("\n%s\n%s", self.requestline, self.headers)
if self.path == "/site5/redirect/":
self.send_response(303, "See other")
self.send_header("Connection", "close")
@ -270,7 +273,7 @@ def test_proxy_non_warcprox(httpd):
def do_HEAD(self):
if not hasattr(self.server, "requests"):
self.server.requests = []
logging.info("%s %s", self.command, self.path)
logger.info("%s %s", self.command, self.path)
self.server.requests.append("%s %s" % (self.command, self.path))
response = urllib.request.urlopen(self.path)
self.wfile.write(
@ -292,7 +295,7 @@ def test_proxy_non_warcprox(httpd):
def do_WARCPROX_WRITE_RECORD(self):
if not hasattr(self.server, "requests"):
self.server.requests = []
logging.info("%s %s", self.command, self.path)
logger.info("%s %s", self.command, self.path)
self.send_error(400)
proxy = http.server.HTTPServer(("localhost", 0), DumbProxyRequestHandler)
@ -826,7 +829,7 @@ def test_warcprox_outage_resiliency(httpd):
try:
stop_service("warcprox")
except Exception as e:
logging.warning("problem stopping warcprox service: %s", e)
logger.warning("problem stopping warcprox service: %s", exc_info=True)
# queue the site for brozzling
brozzler.new_site(frontier, site)

View File

@ -24,7 +24,6 @@ import os
import brozzler
import brozzler.chrome
import brozzler.ydl
import logging
import yaml
import datetime
import requests
@ -36,15 +35,6 @@ import sys
import threading
from unittest import mock
logging.basicConfig(
stream=sys.stderr,
level=logging.INFO,
format=(
"%(asctime)s %(process)d %(levelname)s %(threadName)s "
"%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s"
),
)
@pytest.fixture(scope="module")
def httpd(request):