Merge branch 'doublethink-optional' into qa

2025-04-20 15:55:49 -04:00 · 2025-03-05 11:18:58 -08:00 · 2025-03-05 11:18:58 -08:00 · 96459fe766
commit 96459fe766
parent e7f6c874b7 c5aa46174d
5 changed files with 162 additions and 113 deletions
--- a/brozzler/__init__.py
+++ b/brozzler/__init__.py
@@ -321,44 +321,126 @@ def _remove_query(url):
 # XXX chop off path after last slash??
 site_surt_canon = urlcanon.Canonicalizer(urlcanon.semantic.steps + [_remove_query])

-import doublethink
+
+def _mdfind(identifier):
+    import subprocess
+
+    try:
+        result = subprocess.check_output(
+            ["mdfind", f"kMDItemCFBundleIdentifier == {identifier}"], text=True
+        )
+    # Just treat any errors as "couldn't find app"
+    except subprocess.CalledProcessError:
+        return None
+
+    if result:
+        return result.rstrip("\n")
+
+
+def _suggest_default_chrome_exe_mac():
+    import os
+
+    path = None
+    # Try Chromium first, then Chrome
+    result = _mdfind("org.chromium.Chromium")
+    if result is not None:
+        path = f"{result}/Contents/MacOS/Chromium"
+
+    result = _mdfind("com.google.Chrome")
+    if result is not None:
+        path = f"{result}/Contents/MacOS/Google Chrome"
+
+    if path is not None and os.path.exists(path):
+        return path
+
+    # Fall back to default paths if mdfind couldn't find it
+    # (mdfind might fail to find them even in their default paths
+    # if the system has Spotlight disabled.)
+    for path in [
+        "/Applications/Chromium.app/Contents/MacOS/Chromium",
+        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
+    ]:
+        if os.path.exists(path):
+            return path
+
+
+def suggest_default_chrome_exe():
+    import shutil, sys
+
+    # First ask mdfind, which lets us find it in non-default paths
+    if sys.platform == "darwin":
+        path = _suggest_default_chrome_exe_mac()
+        if path is not None:
+            return path
+
+    # "chromium-browser" is the executable on ubuntu trusty
+    # https://github.com/internetarchive/brozzler/pull/6/files uses "chromium"
+    # google chrome executable names taken from these packages:
+    # http://www.ubuntuupdates.org/ppa/google_chrome
+    for exe in [
+        "chromium-browser",
+        "chromium",
+        "google-chrome",
+        "google-chrome-stable",
+        "google-chrome-beta",
+        "google-chrome-unstable",
+    ]:
+        if shutil.which(exe):
+            return exe
+    return "chromium-browser"
+
+
 import datetime

-EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=doublethink.UTC)
+EPOCH_UTC = datetime.datetime.fromtimestamp(0.0, tz=datetime.timezone.utc)

-# we could make this configurable if there's a good reason
-MAX_PAGE_FAILURES = 3

-from brozzler.worker import BrozzlerWorker
 from brozzler.robots import is_permitted_by_robots
-from brozzler.frontier import RethinkDbFrontier
 from brozzler.browser import Browser, BrowserPool, BrowsingException
-from brozzler.model import (
-    new_job,
-    new_job_file,
-    new_site,
-    Job,
-    Page,
-    Site,
-    InvalidJobConf,
-)
-from brozzler.cli import suggest_default_chrome_exe

 __all__ = [
-    "Page",
-    "Site",
-    "BrozzlerWorker",
    "is_permitted_by_robots",
-    "RethinkDbFrontier",
    "Browser",
    "BrowserPool",
    "BrowsingException",
-    "new_job",
-    "new_site",
-    "Job",
-    "new_job_file",
-    "InvalidJobConf",
    "sleep",
    "thread_accept_exceptions",
    "thread_raise",
+    "suggest_default_chrome_exe",
 ]
+
+try:
+    import doublethink
+
+    # All of these imports use doublethink for real and are unsafe
+    # to do if doublethink is unavailable.
+    from brozzler.worker import BrozzlerWorker
+    from brozzler.frontier import RethinkDbFrontier
+    from brozzler.model import (
+        new_job,
+        new_job_file,
+        new_site,
+        Job,
+        Page,
+        Site,
+        InvalidJobConf,
+    )
+
+    __all__.extend(
+        [
+            "Page",
+            "BrozzlerWorker",
+            "RethinkDbFrontier",
+            "Site",
+            "new_job",
+            "new_site",
+            "Job",
+            "new_job_file",
+            "InvalidJobConf",
+        ]
+    )
+except ImportError:
+    pass
+
+# we could make this configurable if there's a good reason
+MAX_PAGE_FAILURES = 3
--- a/brozzler/cli.py
+++ b/brozzler/cli.py
@@ -30,17 +30,17 @@ import doublethink
 import signal
 import string
 import structlog
-import subprocess
 import sys
 import threading
 import time
 import traceback
 import warnings
 import yaml
-import shutil
 import base64
 import rethinkdb as rdb

+from brozzler import suggest_default_chrome_exe
+
 r = rdb.RethinkDB()

 logger = structlog.get_logger(logger_name=__name__)
@@ -213,69 +213,6 @@ def configure_logging(args):
    )


-def mdfind(identifier):
-    try:
-        result = subprocess.check_output(
-            ["mdfind", f"kMDItemCFBundleIdentifier == {identifier}"], text=True
-        )
-    # Just treat any errors as "couldn't find app"
-    except subprocess.CalledProcessError:
-        return None
-
-    if result:
-        return result.rstrip("\n")
-
-
-def suggest_default_chrome_exe_mac():
-    path = None
-    # Try Chromium first, then Chrome
-    result = mdfind("org.chromium.Chromium")
-    if result is not None:
-        path = f"{result}/Contents/MacOS/Chromium"
-
-    result = mdfind("com.google.Chrome")
-    if result is not None:
-        path = f"{result}/Contents/MacOS/Google Chrome"
-
-    if path is not None and os.path.exists(path):
-        return path
-
-    # Fall back to default paths if mdfind couldn't find it
-    # (mdfind might fail to find them even in their default paths
-    # if the system has Spotlight disabled.)
-    for path in [
-        "/Applications/Thorium.app/Contents/MacOS/Thorium",
-        "/Applications/Chromium.app/Contents/MacOS/Chromium",
-        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
-    ]:
-        if os.path.exists(path):
-            return path
-
-
-def suggest_default_chrome_exe():
-    # First ask mdfind, which lets us find it in non-default paths
-    if sys.platform == "darwin":
-        path = suggest_default_chrome_exe_mac()
-        if path is not None:
-            return path
-
-    # "chromium-browser" is the executable on ubuntu trusty
-    # https://github.com/internetarchive/brozzler/pull/6/files uses "chromium"
-    # google chrome executable names taken from these packages:
-    # http://www.ubuntuupdates.org/ppa/google_chrome
-    for exe in [
-        "chromium-browser",
-        "chromium",
-        "google-chrome",
-        "google-chrome-stable",
-        "google-chrome-beta",
-        "google-chrome-unstable",
-    ]:
-        if shutil.which(exe):
-            return exe
-    return "chromium-browser"
-
-
 class BetterArgumentDefaultsHelpFormatter(argparse.ArgumentDefaultsHelpFormatter):
    """
    Like argparse.ArgumentDefaultsHelpFormatter but omits the default value
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -40,7 +40,6 @@ import urlcanon
 from requests.structures import CaseInsensitiveDict
 import rethinkdb as rdb
 from . import metrics
-from . import ydl

 r = rdb.RethinkDB()

@@ -95,6 +94,16 @@ class BrozzlerWorker:
        self._skip_extract_outlinks = skip_extract_outlinks
        self._skip_visit_hashtags = skip_visit_hashtags
        self._skip_youtube_dl = skip_youtube_dl
+
+        # We definitely shouldn't ytdlp if the optional extra is missing
+        try:
+            import yt_dlp
+        except ImportError:
+            self.logger.info(
+                "optional yt-dlp extra not installed; setting skip_youtube_dl to True"
+            )
+            self._skip_youtube_dl = True
+
        self._ytdlp_tmpdir = ytdlp_tmpdir
        self._simpler404 = simpler404
        self._screenshot_full_page = screenshot_full_page
@@ -260,6 +269,38 @@ class BrozzlerWorker:
        img.save(out, "jpeg", quality=95)
        return out.getbuffer()

+    def should_ytdlp(self, logger, site, page, page_status, skip_av_seeds):
+        # called only after we've passed needs_browsing() check
+
+        if page_status != 200:
+            logger.info("skipping ytdlp: non-200 page status", page_status=page_status)
+            return False
+        if site.skip_ytdlp:
+            logger.info("skipping ytdlp: site marked skip_ytdlp")
+            return False
+
+        ytdlp_url = page.redirect_url if page.redirect_url else page.url
+
+        if "chrome-error:" in ytdlp_url:
+            return False
+
+        ytdlp_seed = (
+            site["metadata"]["ait_seed_id"]
+            if "metadata" in site and "ait_seed_id" in site["metadata"]
+            else None
+        )
+
+        # TODO: develop UI and refactor
+        if ytdlp_seed:
+            if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds:
+                logger.info("skipping ytdlp: site in skip_av_seeds")
+                site.skip_ytdlp = True
+                return False
+            else:
+                site.skip_ytdlp = False
+
+        return True
+
    @metrics.brozzler_page_processing_duration_seconds.time()
    @metrics.brozzler_in_progress_pages.track_inprogress()
    def brozzle_page(
@@ -303,8 +344,12 @@ class BrozzlerWorker:
            except brozzler.PageInterstitialShown:
                page_logger.info("page interstitial shown (http auth)")

-            if enable_youtube_dl and ydl.should_ytdlp(site, page, status_code):
+            if enable_youtube_dl and self.should_ytdlp(
+                page_logger, site, page, status_code, set()
+            ):
                try:
+                    from . import ydl
+
                    ydl_outlinks = ydl.do_youtube_dl(
                        self, site, page, self._ytdlp_proxy_endpoints
                    )
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -47,27 +47,6 @@ YTDLP_MAX_REDIRECTS = 5
 logger = structlog.get_logger(logger_name=__name__)


-def should_ytdlp(site, page, page_status):
-    # called only after we've passed needs_browsing() check
-
-    if page_status != 200:
-        logger.info("skipping ytdlp: non-200 page status %s", page_status)
-        return False
-    if site.video_capture in [
-        VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
-        VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value,
-    ]:
-        logger.info("skipping ytdlp: site has video capture disabled")
-        return False
-
-    ytdlp_url = page.redirect_url if page.redirect_url else page.url
-
-    if "chrome-error:" in ytdlp_url:
-        return False
-
-    return True
-
-
 def isyoutubehost(url):
    # split 1 splits scheme from url, split 2 splits path from hostname, split 3 splits query string on hostname
    return "youtube.com" in url.split("//")[-1].split("/")[0].split("?")[0]
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,3 +25,9 @@ Issues = "https://github.com/internetarchive/brozzler/issues"
 [build-system]
 requires = ["setuptools>=61.0"]
 build-backend = "setuptools.build_meta"
+
+[dependency-groups]
+dev = [
+  "pytest>=8.3.5",
+  "ruff>=0.9.9"
+]