diff --git a/.github/workflows/python-formatting.yml b/.github/workflows/python-formatting.yml index afbdff5..1a94232 100644 --- a/.github/workflows/python-formatting.yml +++ b/.github/workflows/python-formatting.yml @@ -22,10 +22,10 @@ jobs: - name: Create virtual environment run: python -m venv venv - - name: Install black + - name: Install ruff run: | ./venv/bin/pip install --upgrade pip - ./venv/bin/pip install black + ./venv/bin/pip install ruff - name: Run formatting check run: make ck-format diff --git a/Makefile b/Makefile index f99dcc9..ae16af0 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ .PHONY: format format: - venv/bin/black -t py35 -t py36 -t py37 -t py38 -t py39 -t py310 -t py311 -t py312 . + venv/bin/ruff format --target-version py37 . .PHONY: ck-format ck-format: - venv/bin/black --check . + venv/bin/ruff format --check --target-version py37 . diff --git a/brozzler/chrome.py b/brozzler/chrome.py index 140ffd7..1b630cc 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -367,7 +367,7 @@ class Chrome: os.killpg(self.chrome_process.pid, signal.SIGKILL) status = self.chrome_process.wait() pid_logger.warning( - "chrome reaped after killing with " "SIGKILL", + "chrome reaped after killing with SIGKILL", status=status, ) diff --git a/brozzler/cli.py b/brozzler/cli.py index e49c5aa..751f685 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -74,6 +74,17 @@ def add_common_options(arg_parser, argv=None): const=logging.DEBUG, help=("very verbose logging"), ) + arg_parser.add_argument( + "--syslogd-log-prefix", + dest="syslogd_log_prefix", + action="store_true", + help="add syslogd log level prefix for journalctl filtering", + ) + arg_parser.add_argument( + "--worker-id", + dest="worker_id", + help="ID for this worker, displayed in logs if provided", + ) # arg_parser.add_argument( # '-s', '--silent', dest='log_level', action='store_const', # default=logging.INFO, const=logging.CRITICAL) @@ -131,30 +142,58 @@ def decorate_logger_name(a, b, event_dict): return event_dict +# https://manpages.debian.org/testing/libsystemd-dev/sd-daemon.3.en.html +def _systemd_log_prefix(_, level, log): + SYSLOG_MAP = { + "critical": 2, + "error": 3, + "exception": 3, + "warn": 3, + "warning": 3, + "info": 6, + "debug": 7, + "notset": 7, + } + prefix = SYSLOG_MAP.get(level) + if prefix is not None: + log = f"<{prefix}>{log}" + + return log + + def configure_logging(args): + processors = [ + structlog.contextvars.merge_contextvars, + structlog.processors.add_log_level, + structlog.processors.StackInfoRenderer(), + structlog.dev.set_exc_info, + structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S", utc=True), + structlog.processors.CallsiteParameterAdder( + [ + structlog.processors.CallsiteParameter.FILENAME, + structlog.processors.CallsiteParameter.FUNC_NAME, + structlog.processors.CallsiteParameter.LINENO, + ], + ), + decorate_logger_name, + structlog.dev.ConsoleRenderer(), + ] + + if args.syslogd_log_prefix: + processors.append(_systemd_log_prefix) + structlog.configure( - processors=[ - structlog.contextvars.merge_contextvars, - structlog.processors.add_log_level, - structlog.processors.StackInfoRenderer(), - structlog.dev.set_exc_info, - structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S", utc=True), - structlog.processors.CallsiteParameterAdder( - [ - structlog.processors.CallsiteParameter.FILENAME, - structlog.processors.CallsiteParameter.FUNC_NAME, - structlog.processors.CallsiteParameter.LINENO, - ], - ), - decorate_logger_name, - structlog.dev.ConsoleRenderer(), - ], + processors=processors, wrapper_class=structlog.make_filtering_bound_logger(args.log_level), context_class=dict, logger_factory=structlog.PrintLoggerFactory(), cache_logger_on_first_use=False, ) + # Adds the worker ID to the global binding, if supplied + if args.worker_id is not None: + structlog.contextvars.bind_contextvars(worker_id=args.worker_id) + # We still configure logging for now because its handlers # are used for the gunicorn spawned by the brozzler dashboard. logging.basicConfig( @@ -953,7 +992,7 @@ def brozzler_list_pages(argv=None): "--claimed", dest="claimed", action="store_true", - help=("limit to pages that are currently claimed by a brozzler " "worker"), + help=("limit to pages that are currently claimed by a brozzler worker"), ) add_rethinkdb_options(arg_parser) add_common_options(arg_parser, argv) @@ -1024,22 +1063,21 @@ def brozzler_purge(argv=None): dest="job", metavar="JOB_ID", help=( - "purge crawl state from rethinkdb for a job, including all " - "sites and pages" + "purge crawl state from rethinkdb for a job, including all sites and pages" ), ) group.add_argument( "--site", dest="site", metavar="SITE_ID", - help=("purge crawl state from rethinkdb for a site, including all " "pages"), + help=("purge crawl state from rethinkdb for a site, including all pages"), ) group.add_argument( "--finished-before", dest="finished_before", metavar="YYYY-MM-DD", help=( - "purge crawl state from rethinkdb for a jobs that ended " "before this date" + "purge crawl state from rethinkdb for a jobs that ended before this date" ), ) arg_parser.add_argument( diff --git a/brozzler/dashboard/__init__.py b/brozzler/dashboard/__init__.py index fcac6c5..e623a01 100644 --- a/brozzler/dashboard/__init__.py +++ b/brozzler/dashboard/__init__.py @@ -334,7 +334,7 @@ def main(argv=None): prog=os.path.basename(argv[0]), formatter_class=argparse.RawDescriptionHelpFormatter, description=( - "brozzler-dashboard - web application for viewing brozzler " "crawl status" + "brozzler-dashboard - web application for viewing brozzler crawl status" ), epilog=( "brozzler-dashboard has no command line options, but can be " diff --git a/brozzler/easy.py b/brozzler/easy.py index 28a6c14..05f01a0 100644 --- a/brozzler/easy.py +++ b/brozzler/easy.py @@ -81,8 +81,7 @@ def _build_arg_parser(argv=None): dest="cacert", default="./%s-warcprox-ca.pem" % socket.gethostname(), help=( - "warcprox CA certificate file; if file does not exist, it " - "will be created" + "warcprox CA certificate file; if file does not exist, it will be created" ), ) arg_parser.add_argument( @@ -95,7 +94,7 @@ def _build_arg_parser(argv=None): "--onion-tor-socks-proxy", dest="onion_tor_socks_proxy", default=None, - help=("host:port of tor socks proxy, used only to connect to " ".onion sites"), + help=("host:port of tor socks proxy, used only to connect to .onion sites"), ) # brozzler-worker args @@ -112,7 +111,7 @@ def _build_arg_parser(argv=None): dest="max_browsers", type=int, default=1, - help=("max number of chrome instances simultaneously " "browsing pages"), + help=("max number of chrome instances simultaneously browsing pages"), ) # pywb args diff --git a/brozzler/pywb.py b/brozzler/pywb.py index de0fa44..5d714d0 100644 --- a/brozzler/pywb.py +++ b/brozzler/pywb.py @@ -447,8 +447,6 @@ def main(argv=sys.argv): wayback_cli = BrozzlerWaybackCli( args=argv[1:], default_port=8880, - desc=( - "brozzler-wayback - pywb wayback (monkey-patched for use " "with brozzler)" - ), + desc=("brozzler-wayback - pywb wayback (monkey-patched for use with brozzler)"), ) wayback_cli.run() diff --git a/brozzler/robots.py b/brozzler/robots.py index 0c9d699..2d3751e 100644 --- a/brozzler/robots.py +++ b/brozzler/robots.py @@ -120,7 +120,7 @@ def is_permitted_by_robots(site, url, proxy=None): raise brozzler.ProxyError(e) else: structlog.get_logger(logger_name=__name__).warning( - "returning true (permitted) after problem fetching " "robots.txt", + "returning true (permitted) after problem fetching robots.txt", url=url, raised_exception=e, ) diff --git a/brozzler/worker.py b/brozzler/worker.py index 7a20658..b8befef 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -169,7 +169,7 @@ class BrozzlerWorker: svc = self._choose_warcprox() if svc is None: raise brozzler.ProxyError( - "no available instances of warcprox in the service " "registry" + "no available instances of warcprox in the service registry" ) site.proxy = "%s:%s" % (svc["host"], svc["port"]) site.save() @@ -735,7 +735,7 @@ class BrozzlerWorker: with self._start_stop_lock: if self._thread: self.logger.warning( - "ignoring start request because self._thread is " "not None" + "ignoring start request because self._thread is not None" ) return self._thread = threading.Thread(target=self.run, name="BrozzlerWorker") diff --git a/brozzler/ydl.py b/brozzler/ydl.py index de88dda..7fab1f7 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -434,7 +434,7 @@ def _try_youtube_dl(worker, ydl, site, page): if worker._using_warcprox(site): info_json = json.dumps(ie_result, sort_keys=True, indent=4) logger.info( - "sending WARCPROX_WRITE_RECORD request to warcprox " "with yt-dlp json", + "sending WARCPROX_WRITE_RECORD request to warcprox with yt-dlp json", url=ydl.url, ) worker._warcprox_write_record( diff --git a/pyproject.toml b/pyproject.toml index 9880da2..cefe99c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,3 +25,9 @@ Issues = "https://github.com/internetarchive/brozzler/issues" [build-system] requires = ["setuptools>=61.0"] build-backend = "setuptools.build_meta" + +[dependency-groups] +dev = [ + "pytest>=8.3.5", + "ruff>=0.9.9" +] diff --git a/tests/test_brozzling.py b/tests/test_brozzling.py index 744a09a..c187c42 100755 --- a/tests/test_brozzling.py +++ b/tests/test_brozzling.py @@ -29,7 +29,9 @@ import json import threading import socket -args = argparse.Namespace() +arg_parser = argparse.ArgumentParser() +brozzler.cli.add_common_options(arg_parser) +args = arg_parser.parse_args([]) args.log_level = logging.INFO brozzler.cli.configure_logging(args) diff --git a/tests/test_frontier.py b/tests/test_frontier.py index 760962d..31628c4 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -28,7 +28,9 @@ import pytest import brozzler -args = argparse.Namespace() +arg_parser = argparse.ArgumentParser() +brozzler.cli.add_common_options(arg_parser) +args = arg_parser.parse_args([]) args.log_level = logging.INFO brozzler.cli.configure_logging(args) diff --git a/vagrant/vagrant-brozzler-new-site.py b/vagrant/vagrant-brozzler-new-site.py index 244bf6b..5e7503a 100755 --- a/vagrant/vagrant-brozzler-new-site.py +++ b/vagrant/vagrant-brozzler-new-site.py @@ -82,7 +82,7 @@ def main(argv=[]): os.chdir(os.path.dirname(__file__)) cmd = ( - "/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site " "%s %s" + "/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site %s %s" ) % (" ".join(options), args.seed) subprocess.call(["vagrant", "ssh", "--", cmd])