From c59b08df33d5dcfa828ca48a614b27703425a020 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Tue, 4 Mar 2025 09:34:23 -0800 Subject: [PATCH 1/9] test: add CI (#329) This adds two CI runs: a quick one that happens for every pull request and merge to master, and a longer one that happens daily. This also adds a new installation group to setup.py because the `easy` group isn't currently installable, and some of the dependencies specified there need to be present for the tests to run. --- .github/workflows/daily.yaml | 22 +++++++++++++++++++++ .github/workflows/setup/action.yml | 27 ++++++++++++++++++++++++++ .github/workflows/tests.yml | 31 ++++++++++++++++++++++++++++++ brozzler/__init__.py | 4 ++-- brozzler/worker.py | 2 +- setup.py | 3 +++ tests/test_cli.py | 18 +++++++++++++---- tests/test_units.py | 6 ++++++ 8 files changed, 106 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/daily.yaml create mode 100644 .github/workflows/setup/action.yml create mode 100644 .github/workflows/tests.yml diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml new file mode 100644 index 0000000..bc79677 --- /dev/null +++ b/.github/workflows/daily.yaml @@ -0,0 +1,22 @@ +name: Full test suite + +on: + schedule: + - cron: "0 6 * * *" # 10PM Pacific daily + +jobs: + test: + name: Run tests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - uses: ./.github/workflows/setup + + - name: Run tests + run: | + py.test --tb=native --verbose tests diff --git a/.github/workflows/setup/action.yml b/.github/workflows/setup/action.yml new file mode 100644 index 0000000..8367704 --- /dev/null +++ b/.github/workflows/setup/action.yml @@ -0,0 +1,27 @@ +name: Test setup + +runs: + using: composite + steps: + - name: Install apt dependencies + run: | + sudo apt-get update + sudo apt-get install libjpeg-dev chromium-browser + shell: bash + + - name: Set up rethinkdb + run: | + wget -qO- https://download.rethinkdb.com/repository/raw/pubkey.gpg | sudo gpg --dearmor -o /usr/share/keyrings/rethinkdb-archive-keyrings.gpg + echo "deb [signed-by=/usr/share/keyrings/rethinkdb-archive-keyrings.gpg] https://download.rethinkdb.com/repository/ubuntu-$(lsb_release -cs) $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/rethinkdb.list + sudo apt-get update + sudo apt-get install rethinkdb + sudo cp /etc/rethinkdb/default.conf.sample /etc/rethinkdb/instances.d/instance1.conf + sudo /etc/init.d/rethinkdb restart + shell: bash + + - name: Install pip dependencies + run: | + pip install .[rethinkdb,warcprox,yt-dlp] + # setuptools required by rethinkdb==2.4.9 + pip install pytest setuptools + shell: bash diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..fdafbd5 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,31 @@ +name: Tests + +on: + push: + branches: + - main + - master + pull_request: + branches: + - main + - master + +jobs: + test: + name: Run tests + runs-on: ubuntu-latest + strategy: + matrix: + version: ['3.8', '3.12'] + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.version }} + + - uses: ./.github/workflows/setup + + - name: Run tests + run: | + py.test --tb=native --verbose tests/test_cli.py tests/test_units.py diff --git a/brozzler/__init__.py b/brozzler/__init__.py index 7798e5f..7df8671 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ 
-19,9 +19,9 @@ limitations under the License. import logging import structlog -from pkg_resources import get_distribution as _get_distribution +from importlib.metadata import version as _version -__version__ = _get_distribution("brozzler").version +__version__ = _version("brozzler") class ShutdownRequested(Exception): diff --git a/brozzler/worker.py b/brozzler/worker.py index 98a9daa..7a20658 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -581,7 +581,7 @@ class BrozzlerWorker: if page: # Calculate backoff in seconds based on number of failed attempts. # Minimum of 60, max of 135 giving delays of 60, 90, 135, 135... - retry_delay = min(135, 60 * (1.5**page.failed_attempts)) + retry_delay = min(135, 60 * (1.5 ** (page.failed_attempts or 0))) page.retry_after = doublethink.utcnow() + datetime.timedelta( seconds=retry_delay ) diff --git a/setup.py b/setup.py index 21c8b8f..8751e08 100644 --- a/setup.py +++ b/setup.py @@ -81,6 +81,9 @@ setuptools.setup( extras_require={ "yt-dlp": ["yt-dlp>=2024.7.25"], "dashboard": ["flask>=1.0", "gunicorn>=19.8.1"], + "warcprox": [ + "warcprox>=2.4.31", + ], "rethinkdb": [ "rethinkdb==2.4.9", "doublethink==0.4.9", diff --git a/tests/test_cli.py b/tests/test_cli.py index 567260f..076a466 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -18,14 +18,24 @@ limitations under the License. """ import brozzler.cli -import pkg_resources +import importlib.metadata import pytest import subprocess import doublethink +def console_scripts(): + # We do a dict comprehension here because the select filters aren't + # available until Python 3.10's importlib. + return { + ep.name: ep + for ep in importlib.metadata.distribution("brozzler").entry_points + if ep.group == "console_scripts" + } + + def cli_commands(): - commands = set(pkg_resources.get_entry_map("brozzler")["console_scripts"].keys()) + commands = set(console_scripts().keys()) commands.remove("brozzler-wayback") try: import gunicorn @@ -40,8 +50,8 @@ def cli_commands(): @pytest.mark.parametrize("cmd", cli_commands()) def test_call_entrypoint(capsys, cmd): - entrypoint = pkg_resources.get_entry_map("brozzler")["console_scripts"][cmd] - callable = entrypoint.resolve() + entrypoint = console_scripts()[cmd] + callable = entrypoint.load() with pytest.raises(SystemExit): callable(["/whatever/bin/%s" % cmd, "--version"]) out, err = capsys.readouterr() diff --git a/tests/test_units.py b/tests/test_units.py index 27a89a7..096f632 100644 --- a/tests/test_units.py +++ b/tests/test_units.py @@ -260,6 +260,9 @@ blocks: ) +# Some changes to the brozzler ydl interface not represented in this test +# https://github.com/internetarchive/brozzler/issues/330 +@pytest.mark.xfail def test_proxy_down(): """ Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down. @@ -471,6 +474,9 @@ def test_thread_raise_second_with_block(): assert isinstance(thread_caught_exception, Exception2) +# brozzler.ydl.YoutubeDLSpy is missing +# https://github.com/internetarchive/brozzler/issues/330 +@pytest.mark.xfail def test_needs_browsing(): # only one test case here right now, which exposed a bug From ffeaee7a014c1685d971e74b99980093d91bafc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Tue, 25 Feb 2025 14:18:54 -0800 Subject: [PATCH 2/9] chore: use ruff for formatting There are a few minor changes here compared to black; it flagged unnecessary string concatenations, and has slightly different opinions on line length. 
--- .github/workflows/python-formatting.yml | 4 ++-- Makefile | 4 ++-- brozzler/chrome.py | 2 +- brozzler/cli.py | 9 ++++----- brozzler/dashboard/__init__.py | 2 +- brozzler/easy.py | 7 +++---- brozzler/pywb.py | 4 +--- brozzler/robots.py | 2 +- brozzler/worker.py | 4 ++-- brozzler/ydl.py | 2 +- vagrant/vagrant-brozzler-new-site.py | 2 +- 11 files changed, 19 insertions(+), 23 deletions(-) diff --git a/.github/workflows/python-formatting.yml b/.github/workflows/python-formatting.yml index afbdff5..1a94232 100644 --- a/.github/workflows/python-formatting.yml +++ b/.github/workflows/python-formatting.yml @@ -22,10 +22,10 @@ jobs: - name: Create virtual environment run: python -m venv venv - - name: Install black + - name: Install ruff run: | ./venv/bin/pip install --upgrade pip - ./venv/bin/pip install black + ./venv/bin/pip install ruff - name: Run formatting check run: make ck-format diff --git a/Makefile b/Makefile index f99dcc9..ae16af0 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ .PHONY: format format: - venv/bin/black -t py35 -t py36 -t py37 -t py38 -t py39 -t py310 -t py311 -t py312 . + venv/bin/ruff format --target-version py37 . .PHONY: ck-format ck-format: - venv/bin/black --check . + venv/bin/ruff format --check --target-version py37 . diff --git a/brozzler/chrome.py b/brozzler/chrome.py index 140ffd7..1b630cc 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -367,7 +367,7 @@ class Chrome: os.killpg(self.chrome_process.pid, signal.SIGKILL) status = self.chrome_process.wait() pid_logger.warning( - "chrome reaped after killing with " "SIGKILL", + "chrome reaped after killing with SIGKILL", status=status, ) diff --git a/brozzler/cli.py b/brozzler/cli.py index e49c5aa..73c002d 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -953,7 +953,7 @@ def brozzler_list_pages(argv=None): "--claimed", dest="claimed", action="store_true", - help=("limit to pages that are currently claimed by a brozzler " "worker"), + help=("limit to pages that are currently claimed by a brozzler worker"), ) add_rethinkdb_options(arg_parser) add_common_options(arg_parser, argv) @@ -1024,22 +1024,21 @@ def brozzler_purge(argv=None): dest="job", metavar="JOB_ID", help=( - "purge crawl state from rethinkdb for a job, including all " - "sites and pages" + "purge crawl state from rethinkdb for a job, including all sites and pages" ), ) group.add_argument( "--site", dest="site", metavar="SITE_ID", - help=("purge crawl state from rethinkdb for a site, including all " "pages"), + help=("purge crawl state from rethinkdb for a site, including all pages"), ) group.add_argument( "--finished-before", dest="finished_before", metavar="YYYY-MM-DD", help=( - "purge crawl state from rethinkdb for a jobs that ended " "before this date" + "purge crawl state from rethinkdb for a jobs that ended before this date" ), ) arg_parser.add_argument( diff --git a/brozzler/dashboard/__init__.py b/brozzler/dashboard/__init__.py index fcac6c5..e623a01 100644 --- a/brozzler/dashboard/__init__.py +++ b/brozzler/dashboard/__init__.py @@ -334,7 +334,7 @@ def main(argv=None): prog=os.path.basename(argv[0]), formatter_class=argparse.RawDescriptionHelpFormatter, description=( - "brozzler-dashboard - web application for viewing brozzler " "crawl status" + "brozzler-dashboard - web application for viewing brozzler crawl status" ), epilog=( "brozzler-dashboard has no command line options, but can be " diff --git a/brozzler/easy.py b/brozzler/easy.py index 28a6c14..05f01a0 100644 --- a/brozzler/easy.py +++ b/brozzler/easy.py @@ -81,8 
+81,7 @@ def _build_arg_parser(argv=None): dest="cacert", default="./%s-warcprox-ca.pem" % socket.gethostname(), help=( - "warcprox CA certificate file; if file does not exist, it " - "will be created" + "warcprox CA certificate file; if file does not exist, it will be created" ), ) arg_parser.add_argument( @@ -95,7 +94,7 @@ def _build_arg_parser(argv=None): "--onion-tor-socks-proxy", dest="onion_tor_socks_proxy", default=None, - help=("host:port of tor socks proxy, used only to connect to " ".onion sites"), + help=("host:port of tor socks proxy, used only to connect to .onion sites"), ) # brozzler-worker args @@ -112,7 +111,7 @@ def _build_arg_parser(argv=None): dest="max_browsers", type=int, default=1, - help=("max number of chrome instances simultaneously " "browsing pages"), + help=("max number of chrome instances simultaneously browsing pages"), ) # pywb args diff --git a/brozzler/pywb.py b/brozzler/pywb.py index de0fa44..5d714d0 100644 --- a/brozzler/pywb.py +++ b/brozzler/pywb.py @@ -447,8 +447,6 @@ def main(argv=sys.argv): wayback_cli = BrozzlerWaybackCli( args=argv[1:], default_port=8880, - desc=( - "brozzler-wayback - pywb wayback (monkey-patched for use " "with brozzler)" - ), + desc=("brozzler-wayback - pywb wayback (monkey-patched for use with brozzler)"), ) wayback_cli.run() diff --git a/brozzler/robots.py b/brozzler/robots.py index 0c9d699..2d3751e 100644 --- a/brozzler/robots.py +++ b/brozzler/robots.py @@ -120,7 +120,7 @@ def is_permitted_by_robots(site, url, proxy=None): raise brozzler.ProxyError(e) else: structlog.get_logger(logger_name=__name__).warning( - "returning true (permitted) after problem fetching " "robots.txt", + "returning true (permitted) after problem fetching robots.txt", url=url, raised_exception=e, ) diff --git a/brozzler/worker.py b/brozzler/worker.py index 7a20658..b8befef 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -169,7 +169,7 @@ class BrozzlerWorker: svc = self._choose_warcprox() if svc is None: raise brozzler.ProxyError( - "no available instances of warcprox in the service " "registry" + "no available instances of warcprox in the service registry" ) site.proxy = "%s:%s" % (svc["host"], svc["port"]) site.save() @@ -735,7 +735,7 @@ class BrozzlerWorker: with self._start_stop_lock: if self._thread: self.logger.warning( - "ignoring start request because self._thread is " "not None" + "ignoring start request because self._thread is not None" ) return self._thread = threading.Thread(target=self.run, name="BrozzlerWorker") diff --git a/brozzler/ydl.py b/brozzler/ydl.py index de88dda..7fab1f7 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -434,7 +434,7 @@ def _try_youtube_dl(worker, ydl, site, page): if worker._using_warcprox(site): info_json = json.dumps(ie_result, sort_keys=True, indent=4) logger.info( - "sending WARCPROX_WRITE_RECORD request to warcprox " "with yt-dlp json", + "sending WARCPROX_WRITE_RECORD request to warcprox with yt-dlp json", url=ydl.url, ) worker._warcprox_write_record( diff --git a/vagrant/vagrant-brozzler-new-site.py b/vagrant/vagrant-brozzler-new-site.py index 244bf6b..5e7503a 100755 --- a/vagrant/vagrant-brozzler-new-site.py +++ b/vagrant/vagrant-brozzler-new-site.py @@ -82,7 +82,7 @@ def main(argv=[]): os.chdir(os.path.dirname(__file__)) cmd = ( - "/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site " "%s %s" + "/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site %s %s" ) % (" ".join(options), args.seed) subprocess.call(["vagrant", "ssh", "--", cmd]) From 
f384d0b8305867ff85017b4be040bb14eef7ed8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Wed, 5 Mar 2025 10:01:27 -0800 Subject: [PATCH 3/9] deps: add dev deps to pyproject.toml --- pyproject.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 9880da2..cefe99c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,3 +25,9 @@ Issues = "https://github.com/internetarchive/brozzler/issues" [build-system] requires = ["setuptools>=61.0"] build-backend = "setuptools.build_meta" + +[dependency-groups] +dev = [ + "pytest>=8.3.5", + "ruff>=0.9.9" +] From af0f3ed378d91e8e9dcb04f2fa315e867b5d8017 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Mon, 24 Feb 2025 14:53:13 -0800 Subject: [PATCH 4/9] CLI: enable log prefixing This adds a commandline option which enables log level prefixing. These prefixes enable log level-based filtering in journalctl when present so long as logs are going to the journal, and `SyslogLevelPrefix=` is set to `true` (which it is by default). For documentation: https://manpages.debian.org/testing/libsystemd-dev/sd-daemon.3.en.html --- brozzler/cli.py | 62 ++++++++++++++++++++++++++++++----------- tests/test_brozzling.py | 4 ++- tests/test_frontier.py | 4 ++- 3 files changed, 52 insertions(+), 18 deletions(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index 73c002d..bf1323d 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -74,6 +74,12 @@ def add_common_options(arg_parser, argv=None): const=logging.DEBUG, help=("very verbose logging"), ) + arg_parser.add_argument( + "--syslogd-log-prefix", + dest="syslogd_log_prefix", + action="store_true", + help="add syslogd log level prefix for journalctl filtering", + ) # arg_parser.add_argument( # '-s', '--silent', dest='log_level', action='store_const', # default=logging.INFO, const=logging.CRITICAL) @@ -131,24 +137,48 @@ def decorate_logger_name(a, b, event_dict): return event_dict +# https://manpages.debian.org/testing/libsystemd-dev/sd-daemon.3.en.html +def _systemd_log_prefix(_, level, log): + SYSLOG_MAP = { + "critical": 2, + "error": 3, + "exception": 3, + "warn": 3, + "warning": 3, + "info": 6, + "debug": 7, + "notset": 7, + } + prefix = SYSLOG_MAP.get(level) + if prefix is not None: + log = f"<{prefix}>{log}" + + return log + + def configure_logging(args): + processors = [ + structlog.contextvars.merge_contextvars, + structlog.processors.add_log_level, + structlog.processors.StackInfoRenderer(), + structlog.dev.set_exc_info, + structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S", utc=True), + structlog.processors.CallsiteParameterAdder( + [ + structlog.processors.CallsiteParameter.FILENAME, + structlog.processors.CallsiteParameter.FUNC_NAME, + structlog.processors.CallsiteParameter.LINENO, + ], + ), + decorate_logger_name, + structlog.dev.ConsoleRenderer(), + ] + + if args.syslogd_log_prefix: + processors.append(_systemd_log_prefix) + structlog.configure( - processors=[ - structlog.contextvars.merge_contextvars, - structlog.processors.add_log_level, - structlog.processors.StackInfoRenderer(), - structlog.dev.set_exc_info, - structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S", utc=True), - structlog.processors.CallsiteParameterAdder( - [ - structlog.processors.CallsiteParameter.FILENAME, - structlog.processors.CallsiteParameter.FUNC_NAME, - structlog.processors.CallsiteParameter.LINENO, - ], - ), - decorate_logger_name, - structlog.dev.ConsoleRenderer(), - ], + processors=processors, 
wrapper_class=structlog.make_filtering_bound_logger(args.log_level), context_class=dict, logger_factory=structlog.PrintLoggerFactory(), diff --git a/tests/test_brozzling.py b/tests/test_brozzling.py index 744a09a..c187c42 100755 --- a/tests/test_brozzling.py +++ b/tests/test_brozzling.py @@ -29,7 +29,9 @@ import json import threading import socket -args = argparse.Namespace() +arg_parser = argparse.ArgumentParser() +brozzler.cli.add_common_options(arg_parser) +args = arg_parser.parse_args([]) args.log_level = logging.INFO brozzler.cli.configure_logging(args) diff --git a/tests/test_frontier.py b/tests/test_frontier.py index 760962d..31628c4 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -28,7 +28,9 @@ import pytest import brozzler -args = argparse.Namespace() +arg_parser = argparse.ArgumentParser() +brozzler.cli.add_common_options(arg_parser) +args = arg_parser.parse_args([]) args.log_level = logging.INFO brozzler.cli.configure_logging(args) From b45e5dc0960847ddd79d8462b6fe037ae6ecef27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Mon, 24 Feb 2025 14:00:16 -0800 Subject: [PATCH 5/9] CLI: add new --worker-id option This adds a new commandline flag allowing the worker ID to be specified. If present, it will be added to the global context so that it will be included in every logging statement. Previously, we only had some indirect values to tie logging statements to specific workers, so this should make it easier to follow. --- brozzler/cli.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/brozzler/cli.py b/brozzler/cli.py index bf1323d..751f685 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -80,6 +80,11 @@ def add_common_options(arg_parser, argv=None): action="store_true", help="add syslogd log level prefix for journalctl filtering", ) + arg_parser.add_argument( + "--worker-id", + dest="worker_id", + help="ID for this worker, displayed in logs if provided", + ) # arg_parser.add_argument( # '-s', '--silent', dest='log_level', action='store_const', # default=logging.INFO, const=logging.CRITICAL) @@ -185,6 +190,10 @@ def configure_logging(args): cache_logger_on_first_use=False, ) + # Adds the worker ID to the global binding, if supplied + if args.worker_id is not None: + structlog.contextvars.bind_contextvars(worker_id=args.worker_id) + # We still configure logging for now because its handlers # are used for the gunicorn spawned by the brozzler dashboard. logging.basicConfig( From 7b896be372e9c86928a52c22bce9b248ef026fc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Tue, 4 Mar 2025 13:18:42 -0800 Subject: [PATCH 6/9] __init__.py: rework imports Although doublethink is an optional dependency to allow brozzler to be used as a library without it, in practice we had some mandatory import statements that prevented brozzler from being imported without it. This fixes that by gating off some of the imports and exports. If doublethink is available, brozzler works as it is now. But if it isn't, we make a few changes: * brozzler.worker, brozzler.cli and brozzler.model reexports are disabled * One brozzler.cli function, which is used outside brozzler's own cli, has been moved into brozzler's __init__.py. For compatibility, it's reexported from brozzler.cli. 
--- brozzler/__init__.py | 136 ++++++++++++++++++++++++++++++++++--------- brozzler/cli.py | 66 +-------------------- 2 files changed, 112 insertions(+), 90 deletions(-) diff --git a/brozzler/__init__.py b/brozzler/__init__.py index 7df8671..eafadbe 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -321,44 +321,128 @@ def _remove_query(url): # XXX chop off path after last slash?? site_surt_canon = urlcanon.Canonicalizer(urlcanon.semantic.steps + [_remove_query]) -import doublethink -import datetime -EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=doublethink.UTC) +def _mdfind(identifier): + import subprocess + + try: + result = subprocess.check_output( + ["mdfind", f"kMDItemCFBundleIdentifier == {identifier}"], text=True + ) + # Just treat any errors as "couldn't find app" + except subprocess.CalledProcessError: + return None + + if result: + return result.rstrip("\n") + + +def _suggest_default_chrome_exe_mac(): + import os + + path = None + # Try Chromium first, then Chrome + result = _mdfind("org.chromium.Chromium") + if result is not None: + path = f"{result}/Contents/MacOS/Chromium" + + result = _mdfind("com.google.Chrome") + if result is not None: + path = f"{result}/Contents/MacOS/Google Chrome" + + if path is not None and os.path.exists(path): + return path + + # Fall back to default paths if mdfind couldn't find it + # (mdfind might fail to find them even in their default paths + # if the system has Spotlight disabled.) + for path in [ + "/Applications/Chromium.app/Contents/MacOS/Chromium", + "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", + ]: + if os.path.exists(path): + return path + + +def suggest_default_chrome_exe(): + import shutil, sys + + # First ask mdfind, which lets us find it in non-default paths + if sys.platform == "darwin": + path = _suggest_default_chrome_exe_mac() + if path is not None: + return path + + # "chromium-browser" is the executable on ubuntu trusty + # https://github.com/internetarchive/brozzler/pull/6/files uses "chromium" + # google chrome executable names taken from these packages: + # http://www.ubuntuupdates.org/ppa/google_chrome + for exe in [ + "chromium-browser", + "chromium", + "google-chrome", + "google-chrome-stable", + "google-chrome-beta", + "google-chrome-unstable", + ]: + if shutil.which(exe): + return exe + return "chromium-browser" -# we could make this configurable if there's a good reason -MAX_PAGE_FAILURES = 3 -from brozzler.worker import BrozzlerWorker from brozzler.robots import is_permitted_by_robots -from brozzler.frontier import RethinkDbFrontier from brozzler.browser import Browser, BrowserPool, BrowsingException -from brozzler.model import ( - new_job, - new_job_file, - new_site, - Job, - Page, - Site, - InvalidJobConf, -) -from brozzler.cli import suggest_default_chrome_exe __all__ = [ - "Page", - "Site", - "BrozzlerWorker", "is_permitted_by_robots", - "RethinkDbFrontier", "Browser", "BrowserPool", "BrowsingException", - "new_job", - "new_site", - "Job", - "new_job_file", - "InvalidJobConf", "sleep", "thread_accept_exceptions", "thread_raise", + "suggest_default_chrome_exe", ] + +import datetime + +try: + import doublethink + + # Synchronize epoch with doublethink if available + EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=doublethink.UTC) + + # All of these imports use doublethink for real and are unsafe + # to do if doublethink is unavailable. 
+ from brozzler.worker import BrozzlerWorker + from brozzler.frontier import RethinkDbFrontier + from brozzler.model import ( + new_job, + new_job_file, + new_site, + Job, + Page, + Site, + InvalidJobConf, + ) + + __all__.extend( + [ + "Page", + "BrozzlerWorker", + "RethinkDbFrontier", + "Site", + "new_job", + "new_site", + "Job", + "new_job_file", + "InvalidJobConf", + ] + ) +except ImportError: + EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace( + tzinfo=datetime.timezone.utc + ) + +# we could make this configurable if there's a good reason +MAX_PAGE_FAILURES = 3 diff --git a/brozzler/cli.py b/brozzler/cli.py index 751f685..27f8dcf 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -30,17 +30,17 @@ import doublethink import signal import string import structlog -import subprocess import sys import threading import time import traceback import warnings import yaml -import shutil import base64 import rethinkdb as rdb +from brozzler import suggest_default_chrome_exe + r = rdb.RethinkDB() logger = structlog.get_logger(logger_name=__name__) @@ -213,68 +213,6 @@ def configure_logging(args): ) -def mdfind(identifier): - try: - result = subprocess.check_output( - ["mdfind", f"kMDItemCFBundleIdentifier == {identifier}"], text=True - ) - # Just treat any errors as "couldn't find app" - except subprocess.CalledProcessError: - return None - - if result: - return result.rstrip("\n") - - -def suggest_default_chrome_exe_mac(): - path = None - # Try Chromium first, then Chrome - result = mdfind("org.chromium.Chromium") - if result is not None: - path = f"{result}/Contents/MacOS/Chromium" - - result = mdfind("com.google.Chrome") - if result is not None: - path = f"{result}/Contents/MacOS/Google Chrome" - - if path is not None and os.path.exists(path): - return path - - # Fall back to default paths if mdfind couldn't find it - # (mdfind might fail to find them even in their default paths - # if the system has Spotlight disabled.) 
- for path in [ - "/Applications/Chromium.app/Contents/MacOS/Chromium", - "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", - ]: - if os.path.exists(path): - return path - - -def suggest_default_chrome_exe(): - # First ask mdfind, which lets us find it in non-default paths - if sys.platform == "darwin": - path = suggest_default_chrome_exe_mac() - if path is not None: - return path - - # "chromium-browser" is the executable on ubuntu trusty - # https://github.com/internetarchive/brozzler/pull/6/files uses "chromium" - # google chrome executable names taken from these packages: - # http://www.ubuntuupdates.org/ppa/google_chrome - for exe in [ - "chromium-browser", - "chromium", - "google-chrome", - "google-chrome-stable", - "google-chrome-beta", - "google-chrome-unstable", - ]: - if shutil.which(exe): - return exe - return "chromium-browser" - - class BetterArgumentDefaultsHelpFormatter(argparse.ArgumentDefaultsHelpFormatter): """ Like argparse.ArgumentDefaultsHelpFormatter but omits the default value From 01e19fdf70de56a7a19bc257fef1490044b41da7 Mon Sep 17 00:00:00 2001 From: Alex Dempsey Date: Tue, 4 Mar 2025 13:51:07 -0800 Subject: [PATCH 7/9] Make tz-aware datetime of the epoch with stdlib --- brozzler/__init__.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/brozzler/__init__.py b/brozzler/__init__.py index eafadbe..068b904 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -390,6 +390,11 @@ def suggest_default_chrome_exe(): return "chromium-browser" +import datetime + +EPOCH_UTC = datetime.datetime.fromtimestamp(0.0, tz=datetime.timezone.utc) + + from brozzler.robots import is_permitted_by_robots from brozzler.browser import Browser, BrowserPool, BrowsingException @@ -404,14 +409,9 @@ __all__ = [ "suggest_default_chrome_exe", ] -import datetime - try: import doublethink - # Synchronize epoch with doublethink if available - EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=doublethink.UTC) - # All of these imports use doublethink for real and are unsafe # to do if doublethink is unavailable. from brozzler.worker import BrozzlerWorker @@ -440,9 +440,7 @@ try: ] ) except ImportError: - EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace( - tzinfo=datetime.timezone.utc - ) + pass # we could make this configurable if there's a good reason MAX_PAGE_FAILURES = 3 From 72e549694c154c6666b12cbecc1cf084a01c5315 Mon Sep 17 00:00:00 2001 From: Alex Dempsey Date: Tue, 4 Mar 2025 13:58:52 -0800 Subject: [PATCH 8/9] Only import yt-dlp if we're using it --- brozzler/worker.py | 39 ++++++++++++++++++++++++++++++++++++--- brozzler/ydl.py | 33 --------------------------------- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index b8befef..5500e4a 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -39,7 +39,6 @@ import urlcanon from requests.structures import CaseInsensitiveDict import rethinkdb as rdb from . import metrics -from . 
import ydl r = rdb.RethinkDB() @@ -260,6 +259,38 @@ class BrozzlerWorker: img.save(out, "jpeg", quality=95) return out.getbuffer() + def should_ytdlp(self, logger, site, page, page_status, skip_av_seeds): + # called only after we've passed needs_browsing() check + + if page_status != 200: + logger.info("skipping ytdlp: non-200 page status", page_status=page_status) + return False + if site.skip_ytdlp: + logger.info("skipping ytdlp: site marked skip_ytdlp") + return False + + ytdlp_url = page.redirect_url if page.redirect_url else page.url + + if "chrome-error:" in ytdlp_url: + return False + + ytdlp_seed = ( + site["metadata"]["ait_seed_id"] + if "metadata" in site and "ait_seed_id" in site["metadata"] + else None + ) + + # TODO: develop UI and refactor + if ytdlp_seed: + if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds: + logger.info("skipping ytdlp: site in skip_av_seeds") + site.skip_ytdlp = True + return False + else: + site.skip_ytdlp = False + + return True + @metrics.brozzler_page_processing_duration_seconds.time() @metrics.brozzler_in_progress_pages.track_inprogress() def brozzle_page( @@ -293,10 +324,12 @@ class BrozzlerWorker: except brozzler.PageInterstitialShown: page_logger.info("page interstitial shown (http auth)") - if enable_youtube_dl and ydl.should_ytdlp( - site, page, status_code, self._skip_av_seeds + if enable_youtube_dl and self.should_ytdlp( + page_logger, site, page, status_code, self._skip_av_seeds ): try: + from . import ydl + ydl_outlinks = ydl.do_youtube_dl( self, site, page, self._ytdlp_proxy_endpoints ) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 7fab1f7..ae756a0 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -43,39 +43,6 @@ YTDLP_MAX_REDIRECTS = 5 logger = structlog.get_logger(logger_name=__name__) -def should_ytdlp(site, page, page_status, skip_av_seeds): - # called only after we've passed needs_browsing() check - - if page_status != 200: - logger.info("skipping ytdlp: non-200 page status", page_status=page_status) - return False - if site.skip_ytdlp: - logger.info("skipping ytdlp: site marked skip_ytdlp") - return False - - ytdlp_url = page.redirect_url if page.redirect_url else page.url - - if "chrome-error:" in ytdlp_url: - return False - - ytdlp_seed = ( - site["metadata"]["ait_seed_id"] - if "metadata" in site and "ait_seed_id" in site["metadata"] - else None - ) - - # TODO: develop UI and refactor - if ytdlp_seed: - if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds: - logger.info("skipping ytdlp: site in skip_av_seeds") - site.skip_ytdlp = True - return False - else: - site.skip_ytdlp = False - - return True - - def isyoutubehost(url): # split 1 splits scheme from url, split 2 splits path from hostname, split 3 splits query string on hostname return "youtube.com" in url.split("//")[-1].split("/")[0].split("?")[0] From c5aa46174dd7902bac859ec2defe613307d4a2b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Tue, 4 Mar 2025 14:32:32 -0800 Subject: [PATCH 9/9] ydl: never try if extra missing --- brozzler/worker.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/brozzler/worker.py b/brozzler/worker.py index 5500e4a..868ded4 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -95,6 +95,16 @@ class BrozzlerWorker: self._skip_extract_outlinks = skip_extract_outlinks self._skip_visit_hashtags = skip_visit_hashtags self._skip_youtube_dl = skip_youtube_dl + + # We definitely shouldn't ytdlp if the optional extra is missing + try: + import yt_dlp + except ImportError: + 
self.logger.info( + "optional yt-dlp extra not installed; setting skip_youtube_dl to True" + ) + self._skip_youtube_dl = True + self._ytdlp_tmpdir = ytdlp_tmpdir self._simpler404 = simpler404 self._screenshot_full_page = screenshot_full_page
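
The probe in the final patch imports yt_dlp only to learn whether the optional extra is installed. An equivalent check can be made with importlib.util.find_spec, which looks the module up without importing it. The sketch below is illustrative only, not part of the patch series; the module-level skip_youtube_dl name stands in for the worker attribute set in BrozzlerWorker.__init__:

    import importlib.util
    import logging

    logger = logging.getLogger(__name__)

    skip_youtube_dl = False  # stand-in for the value passed to the worker

    # find_spec() locates the module on sys.path without importing it, and
    # returns None when the optional yt-dlp extra is not installed.
    if importlib.util.find_spec("yt_dlp") is None:
        logger.info(
            "optional yt-dlp extra not installed; setting skip_youtube_dl to True"
        )
        skip_youtube_dl = True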