Merge branch 'gmiller/2950-skip-ytdlp' into qa

2025-07-14 10:39:27 -04:00 · 2024-09-30 17:01:04 -07:00 · 2024-09-30 17:01:04 -07:00 · 04c00d21c5
commit 04c00d21c5
parent eb06474bed 36b17d2a66
5 changed files with 110 additions and 81 deletions
--- a/brozzler/cli.py
+++ b/brozzler/cli.py
@ -585,27 +585,12 @@ def brozzler_worker(argv=None):
        finally:
            signal.signal(signal.SIGQUIT, dump_state)

-    def get_skip_av_seeds():
-        # TODO: develop UI and refactor
-        SKIP_AV_SEEDS_FILE = "/opt/local/brozzler/skip_av_seeds.txt"
-        try:
-            # make set from seed IDs in SKIP_AV_SEEDS_FILE
-            with open(SKIP_AV_SEEDS_FILE) as skips:
-                skip_av_seeds = {int(l) for l in skips.readlines()}
-                logging.info("running with skip_av_seeds file %s" % SKIP_AV_SEEDS_FILE)
-        except Exception as e:
-            skip_av_seeds = set()
-            logging.info("running with empty skip_av_seeds")
-        return skip_av_seeds
-
    rr = rethinker(args)
    frontier = brozzler.RethinkDbFrontier(rr)
    service_registry = doublethink.ServiceRegistry(rr)
-    skip_av_seeds_from_file = get_skip_av_seeds()
    worker = brozzler.worker.BrozzlerWorker(
        frontier,
        service_registry,
-        skip_av_seeds=skip_av_seeds_from_file,
        max_browsers=int(args.max_browsers),
        chrome_exe=args.chrome_exe,
        proxy=args.proxy,
--- a/brozzler/model.py
+++ b/brozzler/model.py
@ -34,6 +34,7 @@ import urllib
 import uuid
 import yaml
 import zlib
+from enum import Enum
 from typing import Optional


@ -100,6 +101,8 @@ def new_job(frontier, job_conf):
        job.id = job_conf["id"]
    if "max_claimed_sites" in job_conf:
        job.max_claimed_sites = job_conf["max_claimed_sites"]
+    if "pdfs_only" in job_conf:
+        job.pdfs_only = job_conf["pdfs_only"]
    job.save()

    sites = []
@ -198,6 +201,8 @@ class Job(doublethink.Document, ElapsedMixIn):
    def populate_defaults(self):
        if not "status" in self:
            self.status = "ACTIVE"
+        if "pdfs_only" not in self:
+            self.pdfs_only = False
        if not "starts_and_stops" in self:
            if self.get("started"):  # backward compatibility
                self.starts_and_stops = [
@ -220,33 +225,53 @@ class Job(doublethink.Document, ElapsedMixIn):
        self.starts_and_stops[-1]["stop"] = doublethink.utcnow()


+class VideoCaptureOptions(Enum):
+    """
+    Enumeration of possible values for the `video_capture` config key.
+        - ENABLE_VIDEO_CAPTURE (default): All video is captured.
+        - DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
+          combination of the next two values.
+        - BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header
+          containing the word "video" is not captured.
+        - DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.
+
+    Note: Ensuring full video MIME type blocking requires an additional entry in the
+          Warcprox-Meta header `mime-type-filters` key.
+    """
+
+    ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE"
+    DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE"
+    BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES"
+    DISABLE_YTDLP_CAPTURE = "DISABLE_YTDLP_CAPTURE"
+
+
 class Site(doublethink.Document, ElapsedMixIn):
    logger = logging.getLogger(__module__ + "." + __qualname__)
    table = "sites"

    def populate_defaults(self):
-        if not "status" in self:
+        if "status" not in self:
            self.status = "ACTIVE"
-        if not "claimed" in self:
+        if "claimed" not in self:
            self.claimed = False
-        if not "last_disclaimed" in self:
+        if "last_disclaimed" not in self:
            self.last_disclaimed = brozzler.EPOCH_UTC
-        if not "last_claimed" in self:
+        if "last_claimed" not in self:
            self.last_claimed = brozzler.EPOCH_UTC
-        if not "scope" in self:
+        if "scope" not in self:
            self.scope = {}
-        if not "skip_ytdlp" in self:
-            self.skip_ytdlp = None
+        if "video_capture" not in self:
+            self.video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value

        # backward compatibility
        if "surt" in self.scope:
-            if not "accepts" in self.scope:
+            if "accepts" not in self.scope:
                self.scope["accepts"] = []
            self.scope["accepts"].append({"surt": self.scope["surt"]})
            del self.scope["surt"]

        # backward compatibility
-        if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope:
+        if "max_hops_off_surt" in self.scope and "max_hops_off" not in self.scope:
            self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
        if "max_hops_off_surt" in self.scope:
            del self.scope["max_hops_off_surt"]
@ -256,7 +281,7 @@ class Site(doublethink.Document, ElapsedMixIn):
                brozzler.site_surt_canon(self.seed).ssurt().decode("ascii")
            )

-        if not "starts_and_stops" in self:
+        if "starts_and_stops" not in self:
            if self.get("start_time"):  # backward compatibility
                self.starts_and_stops = [
                    {"start": self.get("start_time"), "stop": None}
@ -271,7 +296,7 @@ class Site(doublethink.Document, ElapsedMixIn):
        return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)

    def _accept_ssurt_if_not_redundant(self, ssurt):
-        if not "accepts" in self.scope:
+        if "accepts" not in self.scope:
            self.scope["accepts"] = []
        simple_rule_ssurts = (
            rule["ssurt"]
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@ -21,6 +21,7 @@ limitations under the License.
 import logging
 import brozzler
 import brozzler.browser
+from brozzler.model import VideoCaptureOptions
 import threading
 import time
 import urllib.request
@ -55,7 +56,6 @@ class BrozzlerWorker:
        self,
        frontier,
        service_registry=None,
-        skip_av_seeds=None,
        max_browsers=1,
        chrome_exe="chromium-browser",
        warcprox_auto=False,
@ -78,7 +78,6 @@ class BrozzlerWorker:
    ):
        self._frontier = frontier
        self._service_registry = service_registry
-        self._skip_av_seeds = skip_av_seeds
        self._max_browsers = max_browsers

        self._warcprox_auto = warcprox_auto
@ -268,7 +267,17 @@ class BrozzlerWorker:

        if not self._needs_browsing(page_headers):
            self.logger.info("needs fetch: %s", page)
-            self._fetch_url(site, page=page)
+            if site.pdfs_only and not self._is_pdf(page_headers):
+                self.logger.info("skipping non-PDF content: PDFs only option enabled")
+            elif site.video_capture in [
+                VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
+                VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value,
+            ] and self._is_video_type(page_headers):
+                self.logger.info(
+                    "skipping video content: video MIME type capture disabled for site"
+                )
+            else:
+                self._fetch_url(site, page=page)
        else:
            self.logger.info("needs browsing: %s", page)
            try:
@ -280,7 +289,7 @@ class BrozzlerWorker:
                self.logger.info("page interstitial shown (http auth): %s", page)

            if enable_youtube_dl and ydl.should_ytdlp(
-                site, page, browser.websock_thread.page_status, self._skip_av_seeds
+                site, page, browser.websock_thread.page_status
            ):
                try:
                    ydl_outlinks = ydl.do_youtube_dl(self, site, page)
@ -327,13 +336,29 @@ class BrozzlerWorker:
            self.logger.warning("Failed to get headers for %s: %s", page.url, e)
            return {}

-    def _needs_browsing(self, page_headers):
-        if (
+    def _needs_browsing(self, page_headers) -> bool:
+        return not bool(
            "content-type" in page_headers
            and "html" not in page_headers["content-type"]
-        ):
-            return False
-        return True
+        )
+
+    def _is_video_type(self, page_headers) -> bool:
+        """
+        Determines if the page's Content-Type header specifies that it contains
+        a video.
+        """
+        return (
+            "content-type" in page_headers and "video" in page_headers["content-type"]
+        )
+
+    def _is_pdf(self, page_headers) -> bool:
+        """
+        Determines if the page's Content-Type header specifies that it is a PDF.
+        """
+        return (
+            "content-type" in page_headers
+            and "application/pdf" in page_headers["content-type"]
+        )

    @metrics.brozzler_page_processing_duration_seconds.time()
    @metrics.brozzler_in_progress_pages.track_inprogress()
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@ -20,6 +20,7 @@ import logging
 import yt_dlp
 from yt_dlp.utils import match_filter_func
 import brozzler
+from brozzler.model import VideoCaptureOptions
 import urllib.request
 import tempfile
 import urlcanon
@ -58,14 +59,17 @@ def _timestamp4datetime(timestamp):
        int(timestamp[-2:])
        )

-def should_ytdlp(site, page, page_status, skip_av_seeds):
+def should_ytdlp(site, page, page_status):
    # called only after we've passed needs_browsing() check

    if page_status != 200:
        logging.info("skipping ytdlp: non-200 page status %s", page_status)
        return False
-    if site.skip_ytdlp:
-        logging.info("skipping ytdlp: site marked skip_ytdlp")
+    if site.video_capture in [
+        VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
+        VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value,
+    ]:
+        logging.info("skipping ytdlp: site has video capture disabled")
        return False

    ytdlp_url = page.redirect_url if page.redirect_url else page.url
@ -73,49 +77,6 @@ def should_ytdlp(site, page, page_status, skip_av_seeds):
    if "chrome-error:" in ytdlp_url:
        return False

-    ytdlp_seed = (
-        site["metadata"]["ait_seed_id"]
-        if "metadata" in site and "ait_seed_id" in site["metadata"]
-        else None
-    )
-
-    # TODO: develop UI and refactor
-    if ytdlp_seed:
-        if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds:
-            logging.info("skipping ytdlp: site in skip_av_seeds")
-            site.skip_ytdlp = True
-            return False
-        else:
-            site.skip_ytdlp = False
-
-    logging.info("checking containing page %s for seed %s", ytdlp_url, ytdlp_seed)
-
-    if ytdlp_seed and "youtube.com/watch?v" in ytdlp_url:
-        logging.info("found youtube watch page %r", ytdlp_url)
-        # connect to bmiller-dev cluster, keyspace video; we can modify default timeout in cassandra.yaml
-        cluster = Cluster(["207.241.235.189"], protocol_version=5)
-        session = cluster.connect("video")
-        containing_page_query = "SELECT * from videos where scope=%s and containing_page_url=%s LIMIT 1"
-        future = session.execute_async(containing_page_query, [f"s:{ytdlp_seed}", str(urlcanon.aggressive(ytdlp_url))])
-        try:
-            rows = future.result()
-        except ReadTimeout:
-            logging.exception("Query timed out:")
-
-        if len(rows.current_rows) == 0:
-            logging.info("no results returned from videos query")
-            return True
-
-        for row in rows:
-            logging.info("video query found %r", row)
-            ytdlp_timestamp = datetime.datetime(*_timestamp4datetime(row.video_timestamp))
-            logging.info("ytdlp_timestamp: %s", ytdlp_timestamp)
-            time_diff = datetime.datetime.now() - ytdlp_timestamp
-            # TODO: make variable for timedelta
-            if time_diff < datetime.timedelta(days = 90):
-                logging.info("skipping ytdlp for %s since there's a recent capture", row.containing_page_url)
-                return False
-
    return True

 def isyoutubehost(url):
--- a/job-conf.rst
+++ b/job-conf.rst
@ -107,6 +107,20 @@ Puts a cap on the number of sites belonging to a given job that can be brozzled
 simultaneously across the cluster. Addresses the problem of a job with many
 seeds starving out other jobs.

+``pdfs_only``
+~~~~~~~~~~~~~
++---------+----------+-----------+
+| type    | required | default   |
++=========+==========+===========+
+| boolean | no       | ``false`` |
++---------+----------+-----------+
+Limits capture to PDFs based on the MIME type set in the HTTP response's
+Content-Type header. This value only impacts processing of outlinks within
+Brozzler.
+
+*Note:* Limiting capture strictly to PDFs also requires an additional entry
+under the ``mime-type-filters`` key of the Warcprox-Meta header.
+
 ``seeds``
 ~~~~~~~~~
 +------------------------+----------+---------+
@ -158,6 +172,25 @@ other fields like checkboxes and/or hidden fields, brozzler will leave
 the default values in place. Brozzler submits login forms after page load.
 Then brozzling proceeds as usual.

+``video_capture``
+~~~~~~~~~~~~~~~~~
++--------+----------+--------------------------+
+| type   | required | default                  |
++========+==========+==========================+
+| string | no       | ``ENABLE_VIDEO_CAPTURE`` |
++--------+----------+--------------------------+
+Determines the level of video capture for the seed. This is an enumeration with four possible values:
+
+* ENABLE_VIDEO_CAPTURE (default): All video is captured.
+* DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
+  combination of the next two values.
+* BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header containing
+  the word "video" is not captured.
+* DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.
+
+*Note:* Fully blocking video MIME types also requires an additional entry
+under the ``mime-type-filters`` key of the Warcprox-Meta header.
+
 Seed-level / top-level settings
 -------------------------------
 These are seed settings that can also be specified at the top level, in which