__init__.py: rework imports (#334)

* __init__.py: rework imports

  Although doublethink is an optional dependency to allow brozzler to be used as a library without it, in practice we had some mandatory import statements that prevented brozzler from being imported without it.

  This fixes that by gating off some of the imports and exports. If doublethink is available, brozzler works as it does now. But if it isn't, we make a few changes:

  * brozzler.worker, brozzler.cli and brozzler.model reexports are disabled
  * One brozzler.cli function, which is used outside brozzler's own cli, has been moved into brozzler's __init__.py. For compatibility, it's reexported from brozzler.cli.

* Make tz-aware datetime of the epoch with stdlib
* Only import yt-dlp if we're using it
* ydl: never try if extra missing
* cli: use worker's yt-dlp check

---------

Co-authored-by: Alex Dempsey <avdempsey@archive.org>
2025-07-30 10:08:44 -04:00 · 2025-03-06 14:49:22 -08:00 · 2025-03-06 14:49:22 -08:00 · 21102ca95c
commit 21102ca95c
parent 0f707dc02b
4 changed files with 156 additions and 126 deletions
--- a/brozzler/__init__.py
+++ b/brozzler/__init__.py
@@ -321,44 +321,126 @@ def _remove_query(url):
 # XXX chop off path after last slash??
 site_surt_canon = urlcanon.Canonicalizer(urlcanon.semantic.steps + [_remove_query])

-import doublethink
+
+def _mdfind(identifier):
+    import subprocess
+
+    try:
+        result = subprocess.check_output(
+            ["mdfind", f"kMDItemCFBundleIdentifier == {identifier}"], text=True
+        )
+    # Just treat any errors as "couldn't find app"
+    except subprocess.CalledProcessError:
+        return None
+
+    if result:
+        return result.rstrip("\n")
+
+
+def _suggest_default_chrome_exe_mac():
+    import os
+
+    path = None
+    # Try Chromium first, then Chrome
+    result = _mdfind("org.chromium.Chromium")
+    if result is not None:
+        path = f"{result}/Contents/MacOS/Chromium"
+
+    result = _mdfind("com.google.Chrome")
+    if result is not None:
+        path = f"{result}/Contents/MacOS/Google Chrome"
+
+    if path is not None and os.path.exists(path):
+        return path
+
+    # Fall back to default paths if mdfind couldn't find it
+    # (mdfind might fail to find them even in their default paths
+    # if the system has Spotlight disabled.)
+    for path in [
+        "/Applications/Chromium.app/Contents/MacOS/Chromium",
+        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
+    ]:
+        if os.path.exists(path):
+            return path
+
+
+def suggest_default_chrome_exe():
+    import shutil, sys
+
+    # First ask mdfind, which lets us find it in non-default paths
+    if sys.platform == "darwin":
+        path = _suggest_default_chrome_exe_mac()
+        if path is not None:
+            return path
+
+    # "chromium-browser" is the executable on ubuntu trusty
+    # https://github.com/internetarchive/brozzler/pull/6/files uses "chromium"
+    # google chrome executable names taken from these packages:
+    # http://www.ubuntuupdates.org/ppa/google_chrome
+    for exe in [
+        "chromium-browser",
+        "chromium",
+        "google-chrome",
+        "google-chrome-stable",
+        "google-chrome-beta",
+        "google-chrome-unstable",
+    ]:
+        if shutil.which(exe):
+            return exe
+    return "chromium-browser"
+
+
 import datetime

-EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=doublethink.UTC)
+EPOCH_UTC = datetime.datetime.fromtimestamp(0.0, tz=datetime.timezone.utc)

-# we could make this configurable if there's a good reason
-MAX_PAGE_FAILURES = 3

-from brozzler.worker import BrozzlerWorker
 from brozzler.robots import is_permitted_by_robots
-from brozzler.frontier import RethinkDbFrontier
 from brozzler.browser import Browser, BrowserPool, BrowsingException
-from brozzler.model import (
-    new_job,
-    new_job_file,
-    new_site,
-    Job,
-    Page,
-    Site,
-    InvalidJobConf,
-)
-from brozzler.cli import suggest_default_chrome_exe

 __all__ = [
-    "Page",
-    "Site",
-    "BrozzlerWorker",
    "is_permitted_by_robots",
-    "RethinkDbFrontier",
    "Browser",
    "BrowserPool",
    "BrowsingException",
-    "new_job",
-    "new_site",
-    "Job",
-    "new_job_file",
-    "InvalidJobConf",
    "sleep",
    "thread_accept_exceptions",
    "thread_raise",
+    "suggest_default_chrome_exe",
 ]
+
+try:
+    import doublethink
+
+    # All of these imports use doublethink for real and are unsafe
+    # to do if doublethink is unavailable.
+    from brozzler.worker import BrozzlerWorker
+    from brozzler.frontier import RethinkDbFrontier
+    from brozzler.model import (
+        new_job,
+        new_job_file,
+        new_site,
+        Job,
+        Page,
+        Site,
+        InvalidJobConf,
+    )
+
+    __all__.extend(
+        [
+            "Page",
+            "BrozzlerWorker",
+            "RethinkDbFrontier",
+            "Site",
+            "new_job",
+            "new_site",
+            "Job",
+            "new_job_file",
+            "InvalidJobConf",
+        ]
+    )
+except ImportError:
+    pass
+
+# we could make this configurable if there's a good reason
+MAX_PAGE_FAILURES = 3
--- a/brozzler/cli.py
+++ b/brozzler/cli.py
@@ -30,17 +30,17 @@ import doublethink
 import signal
 import string
 import structlog
-import subprocess
 import sys
 import threading
 import time
 import traceback
 import warnings
 import yaml
-import shutil
 import base64
 import rethinkdb as rdb

+from brozzler import suggest_default_chrome_exe
+
 r = rdb.RethinkDB()

 logger = structlog.get_logger(logger_name=__name__)
@@ -213,68 +213,6 @@ def configure_logging(args):
    )


-def mdfind(identifier):
-    try:
-        result = subprocess.check_output(
-            ["mdfind", f"kMDItemCFBundleIdentifier == {identifier}"], text=True
-        )
-    # Just treat any errors as "couldn't find app"
-    except subprocess.CalledProcessError:
-        return None
-
-    if result:
-        return result.rstrip("\n")
-
-
-def suggest_default_chrome_exe_mac():
-    path = None
-    # Try Chromium first, then Chrome
-    result = mdfind("org.chromium.Chromium")
-    if result is not None:
-        path = f"{result}/Contents/MacOS/Chromium"
-
-    result = mdfind("com.google.Chrome")
-    if result is not None:
-        path = f"{result}/Contents/MacOS/Google Chrome"
-
-    if path is not None and os.path.exists(path):
-        return path
-
-    # Fall back to default paths if mdfind couldn't find it
-    # (mdfind might fail to find them even in their default paths
-    # if the system has Spotlight disabled.)
-    for path in [
-        "/Applications/Chromium.app/Contents/MacOS/Chromium",
-        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
-    ]:
-        if os.path.exists(path):
-            return path
-
-
-def suggest_default_chrome_exe():
-    # First ask mdfind, which lets us find it in non-default paths
-    if sys.platform == "darwin":
-        path = suggest_default_chrome_exe_mac()
-        if path is not None:
-            return path
-
-    # "chromium-browser" is the executable on ubuntu trusty
-    # https://github.com/internetarchive/brozzler/pull/6/files uses "chromium"
-    # google chrome executable names taken from these packages:
-    # http://www.ubuntuupdates.org/ppa/google_chrome
-    for exe in [
-        "chromium-browser",
-        "chromium",
-        "google-chrome",
-        "google-chrome-stable",
-        "google-chrome-beta",
-        "google-chrome-unstable",
-    ]:
-        if shutil.which(exe):
-            return exe
-    return "chromium-browser"
-
-
 class BetterArgumentDefaultsHelpFormatter(argparse.ArgumentDefaultsHelpFormatter):
    """
    Like argparse.ArgumentDefaultsHelpFormatter but omits the default value
@@ -455,7 +393,7 @@ def brozzle_page(argv=None):
            site,
            page,
            on_screenshot=on_screenshot,
-            enable_youtube_dl=not args.skip_youtube_dl,
+            enable_youtube_dl=not worker._skip_youtube_dl,
        )
        logger.info("outlinks", outlinks=sorted(outlinks))
    except brozzler.ReachedLimit as e:
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -39,7 +39,6 @@ import urlcanon
 from requests.structures import CaseInsensitiveDict
 import rethinkdb as rdb
 from . import metrics
-from . import ydl

 r = rdb.RethinkDB()

@@ -96,6 +95,16 @@ class BrozzlerWorker:
        self._skip_extract_outlinks = skip_extract_outlinks
        self._skip_visit_hashtags = skip_visit_hashtags
        self._skip_youtube_dl = skip_youtube_dl
+
+        # We definitely shouldn't ytdlp if the optional extra is missing
+        try:
+            import yt_dlp
+        except ImportError:
+            self.logger.info(
+                "optional yt-dlp extra not installed; setting skip_youtube_dl to True"
+            )
+            self._skip_youtube_dl = True
+
        self._ytdlp_tmpdir = ytdlp_tmpdir
        self._simpler404 = simpler404
        self._screenshot_full_page = screenshot_full_page
@@ -260,6 +269,38 @@ class BrozzlerWorker:
        img.save(out, "jpeg", quality=95)
        return out.getbuffer()

+    def should_ytdlp(self, logger, site, page, page_status, skip_av_seeds):
+        # called only after we've passed needs_browsing() check
+
+        if page_status != 200:
+            logger.info("skipping ytdlp: non-200 page status", page_status=page_status)
+            return False
+        if site.skip_ytdlp:
+            logger.info("skipping ytdlp: site marked skip_ytdlp")
+            return False
+
+        ytdlp_url = page.redirect_url if page.redirect_url else page.url
+
+        if "chrome-error:" in ytdlp_url:
+            return False
+
+        ytdlp_seed = (
+            site["metadata"]["ait_seed_id"]
+            if "metadata" in site and "ait_seed_id" in site["metadata"]
+            else None
+        )
+
+        # TODO: develop UI and refactor
+        if ytdlp_seed:
+            if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds:
+                logger.info("skipping ytdlp: site in skip_av_seeds")
+                site.skip_ytdlp = True
+                return False
+            else:
+                site.skip_ytdlp = False
+
+        return True
+
    @metrics.brozzler_page_processing_duration_seconds.time()
    @metrics.brozzler_in_progress_pages.track_inprogress()
    def brozzle_page(
@@ -293,10 +334,12 @@ class BrozzlerWorker:
            except brozzler.PageInterstitialShown:
                page_logger.info("page interstitial shown (http auth)")

-            if enable_youtube_dl and ydl.should_ytdlp(
-                site, page, status_code, self._skip_av_seeds
+            if enable_youtube_dl and self.should_ytdlp(
+                page_logger, site, page, status_code, self._skip_av_seeds
            ):
                try:
+                    from . import ydl
+
                    ydl_outlinks = ydl.do_youtube_dl(
                        self, site, page, self._ytdlp_proxy_endpoints
                    )
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -43,39 +43,6 @@ YTDLP_MAX_REDIRECTS = 5
 logger = structlog.get_logger(logger_name=__name__)


-def should_ytdlp(site, page, page_status, skip_av_seeds):
-    # called only after we've passed needs_browsing() check
-
-    if page_status != 200:
-        logger.info("skipping ytdlp: non-200 page status", page_status=page_status)
-        return False
-    if site.skip_ytdlp:
-        logger.info("skipping ytdlp: site marked skip_ytdlp")
-        return False
-
-    ytdlp_url = page.redirect_url if page.redirect_url else page.url
-
-    if "chrome-error:" in ytdlp_url:
-        return False
-
-    ytdlp_seed = (
-        site["metadata"]["ait_seed_id"]
-        if "metadata" in site and "ait_seed_id" in site["metadata"]
-        else None
-    )
-
-    # TODO: develop UI and refactor
-    if ytdlp_seed:
-        if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds:
-            logger.info("skipping ytdlp: site in skip_av_seeds")
-            site.skip_ytdlp = True
-            return False
-        else:
-            site.skip_ytdlp = False
-
-    return True
-
-
 def isyoutubehost(url):
    # split 1 splits scheme from url, split 2 splits path from hostname, split 3 splits query string on hostname
    return "youtube.com" in url.split("//")[-1].split("/")[0].split("?")[0]