Merge a6c827ebb9ddb447c49032ac4be1b6c7bcd62f26 into 42b4a88c963eb480b3c19117c19eac7e146fa8ff

2025-04-20 15:55:49 -04:00 · 2025-04-05 22:39:21 +00:00 · 2025-04-05 22:39:21 +00:00 · 26abedb28b
commit 26abedb28b
parent 42b4a88c96 a6c827ebb9
6 changed files with 105 additions and 46 deletions
--- a/brozzler/cli.py
+++ b/brozzler/cli.py
@ -665,21 +665,6 @@ def brozzler_worker(argv=None):
        finally:
            signal.signal(signal.SIGQUIT, dump_state)

-    def get_skip_av_seeds():
-        # TODO: develop UI and refactor
-        SKIP_AV_SEEDS_FILE = "/opt/local/brozzler/skip_av_seeds.txt"
-        try:
-            # make set from seed IDs in SKIP_AV_SEEDS_FILE
-            with open(SKIP_AV_SEEDS_FILE) as skips:
-                skip_av_seeds = {int(line) for line in skips.readlines()}
-                logger.info(
-                    "running with skip_av_seeds file", skip_av_seeds=SKIP_AV_SEEDS_FILE
-                )
-        except Exception:
-            skip_av_seeds = set()
-            logger.info("running with empty skip_av_seeds")
-        return skip_av_seeds
-
    def get_ytdlp_proxy_endpoints():
        YTDLP_PROXY_ENDPOINTS_FILE = args.ytdlp_proxy_file
        try:
@ -699,12 +684,10 @@ def brozzler_worker(argv=None):
    rr = rethinker(args)
    frontier = brozzler.RethinkDbFrontier(rr)
    service_registry = doublethink.ServiceRegistry(rr)
-    skip_av_seeds_from_file = get_skip_av_seeds()
    ytdlp_proxy_endpoints_from_file = get_ytdlp_proxy_endpoints()
    worker = brozzler.worker.BrozzlerWorker(
        frontier,
        service_registry,
-        skip_av_seeds=skip_av_seeds_from_file,
        ytdlp_proxy_endpoints=ytdlp_proxy_endpoints_from_file,
        max_browsers=int(args.max_browsers),
        chrome_exe=args.chrome_exe,
--- a/brozzler/job_schema.yaml
+++ b/brozzler/job_schema.yaml
@ -95,8 +95,13 @@ seeds:
      password:
        type: string

+      video_capture:
+        type: string
+
      <<: *multi_level_options

 max_claimed_sites:
  type: integer

+pdfs_only:
+  type: boolean
--- a/brozzler/model.py
+++ b/brozzler/model.py
@ -25,6 +25,7 @@ import os
 import urllib
 import uuid
 import zlib
+from enum import Enum
 from typing import Optional

 import cerberus
@ -101,6 +102,8 @@ def new_job(frontier, job_conf):
        job.id = job_conf["id"]
    if "max_claimed_sites" in job_conf:
        job.max_claimed_sites = job_conf["max_claimed_sites"]
+    if "pdfs_only" in job_conf:
+        job.pdfs_only = job_conf["pdfs_only"]
    job.save()

    sites = []
@ -199,6 +202,8 @@ class Job(doublethink.Document, ElapsedMixIn):
    def populate_defaults(self):
        if "status" not in self:
            self.status = "ACTIVE"
+        if "pdfs_only" not in self:
+            self.pdfs_only = False
        if "starts_and_stops" not in self:
            if self.get("started"):  # backward compatibility
                self.starts_and_stops = [
@ -221,6 +226,26 @@ class Job(doublethink.Document, ElapsedMixIn):
        self.starts_and_stops[-1]["stop"] = doublethink.utcnow()


+class VideoCaptureOptions(Enum):
+    """
+    Enumeration of possible values for the `video_capture` config key.
+        - ENABLE_VIDEO_CAPTURE (default): All video is captured.
+        - DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
+          combination of the next two values.
+        - BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header
+          containing the word "video" is not captured.
+        - DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.
+
+    Note: Ensuring full video MIME type blocking requires an additional entry in the
+          Warcprox-Meta header `mime-type-filters` key.
+    """
+
+    ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE"
+    DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE"
+    BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES"
+    DISABLE_YTDLP_CAPTURE = "DISABLE_YTDLP_CAPTURE"
+
+
 class Site(doublethink.Document, ElapsedMixIn):
    logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
    table = "sites"
@ -236,8 +261,8 @@ class Site(doublethink.Document, ElapsedMixIn):
            self.last_claimed = brozzler.EPOCH_UTC
        if "scope" not in self:
            self.scope = {}
-        if "skip_ytdlp" not in self:
-            self.skip_ytdlp = None
+        if "video_capture" not in self:
+            self.video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value

        # backward compatibility
        if "surt" in self.scope:
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@ -38,6 +38,7 @@ from urllib3.exceptions import ProxyError, TimeoutError

 import brozzler
 import brozzler.browser
+from brozzler.model import VideoCaptureOptions

 from . import metrics

@ -60,7 +61,6 @@ class BrozzlerWorker:
        self,
        frontier,
        service_registry=None,
-        skip_av_seeds=None,
        ytdlp_proxy_endpoints=None,
        max_browsers=1,
        chrome_exe="chromium-browser",
@ -86,7 +86,6 @@ class BrozzlerWorker:
    ):
        self._frontier = frontier
        self._service_registry = service_registry
-        self._skip_av_seeds = skip_av_seeds
        self._ytdlp_proxy_endpoints = ytdlp_proxy_endpoints
        self._max_browsers = max_browsers

@ -278,14 +277,17 @@ class BrozzlerWorker:
        img.save(out, "jpeg", quality=95)
        return out.getbuffer()

-    def should_ytdlp(self, logger, site, page, page_status, skip_av_seeds):
+    def should_ytdlp(self, logger, site, page, page_status):
        # called only after we've passed needs_browsing() check

        if page_status != 200:
            logger.info("skipping ytdlp: non-200 page status", page_status=page_status)
            return False
-        if site.skip_ytdlp:
-            logger.info("skipping ytdlp: site marked skip_ytdlp")
+        if site.video_capture in [
+            VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
+            VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value,
+        ]:
+            logger.info("skipping ytdlp: site has video capture disabled")
            return False

        ytdlp_url = page.redirect_url if page.redirect_url else page.url
@ -293,21 +295,6 @@ class BrozzlerWorker:
        if "chrome-error:" in ytdlp_url:
            return False

-        ytdlp_seed = (
-            site["metadata"]["ait_seed_id"]
-            if "metadata" in site and "ait_seed_id" in site["metadata"]
-            else None
-        )
-
-        # TODO: develop UI and refactor
-        if ytdlp_seed:
-            if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds:
-                logger.info("skipping ytdlp: site in skip_av_seeds")
-                site.skip_ytdlp = True
-                return False
-            else:
-                site.skip_ytdlp = False
-
        return True

    @metrics.brozzler_page_processing_duration_seconds.time()
@ -329,7 +316,17 @@ class BrozzlerWorker:

        if not self._needs_browsing(page_headers):
            page_logger.info("needs fetch")
-            self._fetch_url(site, page=page)
+            if site.pdfs_only and not self._is_pdf(page_headers):
+                page_logger.info("skipping non-PDF content: PDFs only option enabled")
+            elif site.video_capture in [
+                VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
+                VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value,
+            ] and self._is_video_type(page_headers):
+                page_logger.info(
+                    "skipping video content: video MIME type capture disabled for site"
+                )
+            else:
+                self._fetch_url(site, page=page)
        else:
            page_logger.info("needs browsing")
            try:
@ -344,7 +341,7 @@ class BrozzlerWorker:
                page_logger.info("page interstitial shown (http auth)")

            if enable_youtube_dl and self.should_ytdlp(
-                page_logger, site, page, status_code, self._skip_av_seeds
+                page_logger, site, page, status_code
            ):
                try:
                    from . import ydl
@ -403,13 +400,29 @@ class BrozzlerWorker:
            url_logger.warning("Failed to get headers", exc_info=True)
        return {}

-    def _needs_browsing(self, page_headers):
-        if (
+    def _needs_browsing(self, page_headers) -> bool:
+        return not (
            "content-type" in page_headers
            and "html" not in page_headers["content-type"]
-        ):
-            return False
-        return True
+        )
+
+    def _is_video_type(self, page_headers) -> bool:
+        """
+        Determines if the page's Content-Type header specifies that it contains
+        a video.
+        """
+        return (
+            "content-type" in page_headers and "video" in page_headers["content-type"]
+        )
+
+    def _is_pdf(self, page_headers) -> bool:
+        """
+        Determines if the page's Content-Type header specifies that it is a PDF.
+        """
+        return (
+            "content-type" in page_headers
+            and "application/pdf" in page_headers["content-type"]
+        )

    @metrics.brozzler_browsing_duration_seconds.time()
    @metrics.brozzler_in_progress_browses.track_inprogress()
--- a/dev/pre-commit
+++ b/dev/pre-commit
--- a/job-conf.rst
+++ b/job-conf.rst
@ -107,6 +107,20 @@ Puts a cap on the number of sites belonging to a given job that can be brozzled
 simultaneously across the cluster. Addresses the problem of a job with many
 seeds starving out other jobs.

+``pdfs_only``
+~~~~~~~~~~~~~~~~~~~~~
+---------+----------+-----------+
+| type    | required | default   |
+=========+==========+===========+
+| boolean | no       | ``false`` |
+---------+----------+-----------+
+Limits capture to PDFs based on the MIME type set in the HTTP response's
+Content-Type header. This value only impacts processing of outlinks within
+Brozzler.
+
+*Note:* Ensuring comprehensive limiting to only PDFs requires an additional
+entry in the Warcprox-Meta header ``mime-type-filters`` key.
+
 ``seeds``
 ~~~~~~~~~
 +------------------------+----------+---------+
@ -158,6 +172,25 @@ other fields like checkboxes and/or hidden fields, brozzler will leave
 the default values in place. Brozzler submits login forms after page load.
 Then brozzling proceeds as usual.

+``video_capture``
+~~~~~~~~~~~~~~~~~
+--------+----------+--------------------------+
+| type   | required | default                  |
+========+==========+==========================+
+| string | no       | ``ENABLE_VIDEO_CAPTURE`` |
+--------+----------+--------------------------+
+Determines the level of video capture for the seed. This is an enumeration with four possible values:
+
+* ENABLE_VIDEO_CAPTURE (default): All video is captured.
+* DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
+  combination of the next two values.
+* BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header containing
+  the word "video" is not captured.
+* DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.
+
+*Note:* Ensuring full video MIME type blocking requires an additional entry in
+the Warcprox-Meta header ``mime-type-filters`` key.
+
 Seed-level / top-level settings
 -------------------------------
 These are seed settings that can also be specified at the top level, in which