From ffeaee7a014c1685d971e74b99980093d91bafc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Tue, 25 Feb 2025 14:18:54 -0800 Subject: [PATCH 1/4] chore: use ruff for formatting There are a few minor changes here compared to black; it flagged unnecessary string concatenations, and has slightly different opinions on line length. --- .github/workflows/python-formatting.yml | 4 ++-- Makefile | 4 ++-- brozzler/chrome.py | 2 +- brozzler/cli.py | 9 ++++----- brozzler/dashboard/__init__.py | 2 +- brozzler/easy.py | 7 +++---- brozzler/pywb.py | 4 +--- brozzler/robots.py | 2 +- brozzler/worker.py | 4 ++-- brozzler/ydl.py | 2 +- vagrant/vagrant-brozzler-new-site.py | 2 +- 11 files changed, 19 insertions(+), 23 deletions(-) diff --git a/.github/workflows/python-formatting.yml b/.github/workflows/python-formatting.yml index afbdff5..1a94232 100644 --- a/.github/workflows/python-formatting.yml +++ b/.github/workflows/python-formatting.yml @@ -22,10 +22,10 @@ jobs: - name: Create virtual environment run: python -m venv venv - - name: Install black + - name: Install ruff run: | ./venv/bin/pip install --upgrade pip - ./venv/bin/pip install black + ./venv/bin/pip install ruff - name: Run formatting check run: make ck-format diff --git a/Makefile b/Makefile index f99dcc9..ae16af0 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ .PHONY: format format: - venv/bin/black -t py35 -t py36 -t py37 -t py38 -t py39 -t py310 -t py311 -t py312 . + venv/bin/ruff format --target-version py37 . .PHONY: ck-format ck-format: - venv/bin/black --check . + venv/bin/ruff format --check --target-version py37 . diff --git a/brozzler/chrome.py b/brozzler/chrome.py index 140ffd7..1b630cc 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -367,7 +367,7 @@ class Chrome: os.killpg(self.chrome_process.pid, signal.SIGKILL) status = self.chrome_process.wait() pid_logger.warning( - "chrome reaped after killing with " "SIGKILL", + "chrome reaped after killing with SIGKILL", status=status, ) diff --git a/brozzler/cli.py b/brozzler/cli.py index e49c5aa..73c002d 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -953,7 +953,7 @@ def brozzler_list_pages(argv=None): "--claimed", dest="claimed", action="store_true", - help=("limit to pages that are currently claimed by a brozzler " "worker"), + help=("limit to pages that are currently claimed by a brozzler worker"), ) add_rethinkdb_options(arg_parser) add_common_options(arg_parser, argv) @@ -1024,22 +1024,21 @@ def brozzler_purge(argv=None): dest="job", metavar="JOB_ID", help=( - "purge crawl state from rethinkdb for a job, including all " - "sites and pages" + "purge crawl state from rethinkdb for a job, including all sites and pages" ), ) group.add_argument( "--site", dest="site", metavar="SITE_ID", - help=("purge crawl state from rethinkdb for a site, including all " "pages"), + help=("purge crawl state from rethinkdb for a site, including all pages"), ) group.add_argument( "--finished-before", dest="finished_before", metavar="YYYY-MM-DD", help=( - "purge crawl state from rethinkdb for a jobs that ended " "before this date" + "purge crawl state from rethinkdb for a jobs that ended before this date" ), ) arg_parser.add_argument( diff --git a/brozzler/dashboard/__init__.py b/brozzler/dashboard/__init__.py index fcac6c5..e623a01 100644 --- a/brozzler/dashboard/__init__.py +++ b/brozzler/dashboard/__init__.py @@ -334,7 +334,7 @@ def main(argv=None): prog=os.path.basename(argv[0]), formatter_class=argparse.RawDescriptionHelpFormatter, description=( - "brozzler-dashboard - web application for viewing brozzler " "crawl status" + "brozzler-dashboard - web application for viewing brozzler crawl status" ), epilog=( "brozzler-dashboard has no command line options, but can be " diff --git a/brozzler/easy.py b/brozzler/easy.py index 28a6c14..05f01a0 100644 --- a/brozzler/easy.py +++ b/brozzler/easy.py @@ -81,8 +81,7 @@ def _build_arg_parser(argv=None): dest="cacert", default="./%s-warcprox-ca.pem" % socket.gethostname(), help=( - "warcprox CA certificate file; if file does not exist, it " - "will be created" + "warcprox CA certificate file; if file does not exist, it will be created" ), ) arg_parser.add_argument( @@ -95,7 +94,7 @@ def _build_arg_parser(argv=None): "--onion-tor-socks-proxy", dest="onion_tor_socks_proxy", default=None, - help=("host:port of tor socks proxy, used only to connect to " ".onion sites"), + help=("host:port of tor socks proxy, used only to connect to .onion sites"), ) # brozzler-worker args @@ -112,7 +111,7 @@ def _build_arg_parser(argv=None): dest="max_browsers", type=int, default=1, - help=("max number of chrome instances simultaneously " "browsing pages"), + help=("max number of chrome instances simultaneously browsing pages"), ) # pywb args diff --git a/brozzler/pywb.py b/brozzler/pywb.py index de0fa44..5d714d0 100644 --- a/brozzler/pywb.py +++ b/brozzler/pywb.py @@ -447,8 +447,6 @@ def main(argv=sys.argv): wayback_cli = BrozzlerWaybackCli( args=argv[1:], default_port=8880, - desc=( - "brozzler-wayback - pywb wayback (monkey-patched for use " "with brozzler)" - ), + desc=("brozzler-wayback - pywb wayback (monkey-patched for use with brozzler)"), ) wayback_cli.run() diff --git a/brozzler/robots.py b/brozzler/robots.py index 0c9d699..2d3751e 100644 --- a/brozzler/robots.py +++ b/brozzler/robots.py @@ -120,7 +120,7 @@ def is_permitted_by_robots(site, url, proxy=None): raise brozzler.ProxyError(e) else: structlog.get_logger(logger_name=__name__).warning( - "returning true (permitted) after problem fetching " "robots.txt", + "returning true (permitted) after problem fetching robots.txt", url=url, raised_exception=e, ) diff --git a/brozzler/worker.py b/brozzler/worker.py index 7a20658..b8befef 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -169,7 +169,7 @@ class BrozzlerWorker: svc = self._choose_warcprox() if svc is None: raise brozzler.ProxyError( - "no available instances of warcprox in the service " "registry" + "no available instances of warcprox in the service registry" ) site.proxy = "%s:%s" % (svc["host"], svc["port"]) site.save() @@ -735,7 +735,7 @@ class BrozzlerWorker: with self._start_stop_lock: if self._thread: self.logger.warning( - "ignoring start request because self._thread is " "not None" + "ignoring start request because self._thread is not None" ) return self._thread = threading.Thread(target=self.run, name="BrozzlerWorker") diff --git a/brozzler/ydl.py b/brozzler/ydl.py index de88dda..7fab1f7 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -434,7 +434,7 @@ def _try_youtube_dl(worker, ydl, site, page): if worker._using_warcprox(site): info_json = json.dumps(ie_result, sort_keys=True, indent=4) logger.info( - "sending WARCPROX_WRITE_RECORD request to warcprox " "with yt-dlp json", + "sending WARCPROX_WRITE_RECORD request to warcprox with yt-dlp json", url=ydl.url, ) worker._warcprox_write_record( diff --git a/vagrant/vagrant-brozzler-new-site.py b/vagrant/vagrant-brozzler-new-site.py index 244bf6b..5e7503a 100755 --- a/vagrant/vagrant-brozzler-new-site.py +++ b/vagrant/vagrant-brozzler-new-site.py @@ -82,7 +82,7 @@ def main(argv=[]): os.chdir(os.path.dirname(__file__)) cmd = ( - "/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site " "%s %s" + "/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site %s %s" ) % (" ".join(options), args.seed) subprocess.call(["vagrant", "ssh", "--", cmd]) From f384d0b8305867ff85017b4be040bb14eef7ed8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Wed, 5 Mar 2025 10:01:27 -0800 Subject: [PATCH 2/4] deps: add dev deps to pyproject.toml --- pyproject.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 9880da2..cefe99c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,3 +25,9 @@ Issues = "https://github.com/internetarchive/brozzler/issues" [build-system] requires = ["setuptools>=61.0"] build-backend = "setuptools.build_meta" + +[dependency-groups] +dev = [ + "pytest>=8.3.5", + "ruff>=0.9.9" +] From af0f3ed378d91e8e9dcb04f2fa315e867b5d8017 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Mon, 24 Feb 2025 14:53:13 -0800 Subject: [PATCH 3/4] CLI: enable log prefixing This adds a commandline option which enables log level prefixing. These prefixes enable log level-based filtering in journalctl when present so long as logs are going to the journal, and `SyslogLevelPrefix=` is set to `true` (which it is by default). For documentation: https://manpages.debian.org/testing/libsystemd-dev/sd-daemon.3.en.html --- brozzler/cli.py | 62 ++++++++++++++++++++++++++++++----------- tests/test_brozzling.py | 4 ++- tests/test_frontier.py | 4 ++- 3 files changed, 52 insertions(+), 18 deletions(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index 73c002d..bf1323d 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -74,6 +74,12 @@ def add_common_options(arg_parser, argv=None): const=logging.DEBUG, help=("very verbose logging"), ) + arg_parser.add_argument( + "--syslogd-log-prefix", + dest="syslogd_log_prefix", + action="store_true", + help="add syslogd log level prefix for journalctl filtering", + ) # arg_parser.add_argument( # '-s', '--silent', dest='log_level', action='store_const', # default=logging.INFO, const=logging.CRITICAL) @@ -131,24 +137,48 @@ def decorate_logger_name(a, b, event_dict): return event_dict +# https://manpages.debian.org/testing/libsystemd-dev/sd-daemon.3.en.html +def _systemd_log_prefix(_, level, log): + SYSLOG_MAP = { + "critical": 2, + "error": 3, + "exception": 3, + "warn": 3, + "warning": 3, + "info": 6, + "debug": 7, + "notset": 7, + } + prefix = SYSLOG_MAP.get(level) + if prefix is not None: + log = f"<{prefix}>{log}" + + return log + + def configure_logging(args): + processors = [ + structlog.contextvars.merge_contextvars, + structlog.processors.add_log_level, + structlog.processors.StackInfoRenderer(), + structlog.dev.set_exc_info, + structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S", utc=True), + structlog.processors.CallsiteParameterAdder( + [ + structlog.processors.CallsiteParameter.FILENAME, + structlog.processors.CallsiteParameter.FUNC_NAME, + structlog.processors.CallsiteParameter.LINENO, + ], + ), + decorate_logger_name, + structlog.dev.ConsoleRenderer(), + ] + + if args.syslogd_log_prefix: + processors.append(_systemd_log_prefix) + structlog.configure( - processors=[ - structlog.contextvars.merge_contextvars, - structlog.processors.add_log_level, - structlog.processors.StackInfoRenderer(), - structlog.dev.set_exc_info, - structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S", utc=True), - structlog.processors.CallsiteParameterAdder( - [ - structlog.processors.CallsiteParameter.FILENAME, - structlog.processors.CallsiteParameter.FUNC_NAME, - structlog.processors.CallsiteParameter.LINENO, - ], - ), - decorate_logger_name, - structlog.dev.ConsoleRenderer(), - ], + processors=processors, wrapper_class=structlog.make_filtering_bound_logger(args.log_level), context_class=dict, logger_factory=structlog.PrintLoggerFactory(), diff --git a/tests/test_brozzling.py b/tests/test_brozzling.py index 744a09a..c187c42 100755 --- a/tests/test_brozzling.py +++ b/tests/test_brozzling.py @@ -29,7 +29,9 @@ import json import threading import socket -args = argparse.Namespace() +arg_parser = argparse.ArgumentParser() +brozzler.cli.add_common_options(arg_parser) +args = arg_parser.parse_args([]) args.log_level = logging.INFO brozzler.cli.configure_logging(args) diff --git a/tests/test_frontier.py b/tests/test_frontier.py index 760962d..31628c4 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -28,7 +28,9 @@ import pytest import brozzler -args = argparse.Namespace() +arg_parser = argparse.ArgumentParser() +brozzler.cli.add_common_options(arg_parser) +args = arg_parser.parse_args([]) args.log_level = logging.INFO brozzler.cli.configure_logging(args) From b45e5dc0960847ddd79d8462b6fe037ae6ecef27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Mon, 24 Feb 2025 14:00:16 -0800 Subject: [PATCH 4/4] CLI: add new --worker-id option This adds a new commandline flag allowing the worker ID to be specified. If present, it will be added to the global context so that it will be included in every logging statement. Previously, we only had some indirect values to tie logging statements to specific workers, so this should make it easier to follow. --- brozzler/cli.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/brozzler/cli.py b/brozzler/cli.py index bf1323d..751f685 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -80,6 +80,11 @@ def add_common_options(arg_parser, argv=None): action="store_true", help="add syslogd log level prefix for journalctl filtering", ) + arg_parser.add_argument( + "--worker-id", + dest="worker_id", + help="ID for this worker, displayed in logs if provided", + ) # arg_parser.add_argument( # '-s', '--silent', dest='log_level', action='store_const', # default=logging.INFO, const=logging.CRITICAL) @@ -185,6 +190,10 @@ def configure_logging(args): cache_logger_on_first_use=False, ) + # Adds the worker ID to the global binding, if supplied + if args.worker_id is not None: + structlog.contextvars.bind_contextvars(worker_id=args.worker_id) + # We still configure logging for now because its handlers # are used for the gunicorn spawned by the brozzler dashboard. logging.basicConfig(