WT-2950 cleaning up video capture options handling; PDFs only handling on outlinks

2025-04-20 23:56:34 -04:00 · 2024-09-20 14:17:07 -07:00 · 2024-09-20 14:17:07 -07:00 · dca9630982
commit dca9630982
parent 8275f3ea16
3 changed files with 39 additions and 7 deletions
--- a/brozzler/model.py
+++ b/brozzler/model.py
@ -222,8 +222,21 @@ class Job(doublethink.Document, ElapsedMixIn):


 class VideoCaptureOptions(Enum):
+    """
+    Enumeration of possible values for the `video_capture` config key.
+        - ENABLE_VIDEO_CAPTURE: All video is captured.
+        - DISABLE_VIDEO_CAPTURE: No video is captured.
+        - BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header
+          containing the word "video" is not captured.
+        - DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.
+
+    Note: Fully blocking videos by MIME type also requires a corresponding
+          entry under the `mime-type-filters` key of the Warcprox-Meta
+          header.
+    """
    ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE"
-    LIMIT_VIDEO_CAPTURE = "LIMIT_VIDEO_CAPTURE"
+    DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE"
+    BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES"
    DISABLE_YTDLP_CAPTURE = "DISABLE_YTDLP_CAPTURE"


--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@ -249,12 +249,16 @@ class BrozzlerWorker:

        if not self._needs_browsing(page_headers):
            self.logger.info("needs fetch: %s", page)
-            if (
-                self._is_video_type(page_headers)
-                and site.video_capture != VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value
-            ):
+            if site.video_capture in [
+                VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
+                VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value,
+            ] and self._is_video_type(page_headers):
                self.logger.info(
-                    "skipping video content: video capture disabled for site"
+                    "skipping video content: video MIME type capture disabled for site"
+                )
+            elif site.pdfs_only and not self._is_pdf(page_headers):
+                self.logger.info(
+                    "skipping non-PDF content: PDFs only option enabled"
                )
            else:
                self._fetch_url(site, page=page)
@ -317,10 +321,22 @@ class BrozzlerWorker:
        )

    def _is_video_type(self, page_headers) -> bool:
+        """
+        Determines if the page's Content-Type header specifies that it contains
+        a video.
+        """
        return (
            "content-type" in page_headers and "video" in page_headers["content-type"]
        )

+    def _is_pdf(self, page_headers) -> bool:
+        """
+        Determines if the page's Content-Type header specifies that it is a PDF.
+        """
+        return (
+            "content-type" in page_headers and "application/pdf" in page_headers["content-type"]
+        )
+
    def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
        def _on_screenshot(screenshot_jpeg):
            if on_screenshot:
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@ -39,7 +39,10 @@ def should_ytdlp(site, page, page_status):
    if page_status != 200:
        logging.info("skipping ytdlp: non-200 page status %s", page_status)
        return False
-    if site.video_capture != VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value:
+    if site.video_capture in [
+        VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
+        VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value,
+    ]:
        logging.info("skipping ytdlp: site has video capture disabled")
        return False