Merge branch 'gmiller/2950-skip-ytdlp' into qa

2025-02-25 00:59:52 -05:00 · 2024-09-30 17:01:04 -07:00 · 2024-09-30 17:01:04 -07:00 · 04c00d21c5
commit 04c00d21c5
parent eb06474bed 36b17d2a66
5 changed files with 110 additions and 81 deletions
--- a/brozzler/cli.py
+++ b/brozzler/cli.py
@ -585,27 +585,12 @@ def brozzler_worker(argv=None):
        finally:
            signal.signal(signal.SIGQUIT, dump_state)
    def get_skip_av_seeds(
        skip_av_seeds_file="/opt/local/brozzler/skip_av_seeds.txt",
    ):
        """
        Builds the set of integer seed IDs listed in the skip-AV seeds file.

        The file is expected to hold one integer seed ID per line. Blank
        lines (for example a trailing newline at the end of the file) are
        ignored rather than invalidating the whole list.

        Args:
            skip_av_seeds_file: path of the file to read; defaults to the
                fixed deployment location.

        Returns:
            A set of ints. The set is empty when the file cannot be read or
            when any non-blank line is not a valid integer.
        """
        # TODO: develop UI and refactor
        try:
            with open(skip_av_seeds_file) as skips:
                # int() tolerates surrounding whitespace but not an empty
                # string, so filter out blank lines before converting
                skip_av_seeds = {int(line) for line in skips if line.strip()}
        except OSError:
            # a missing/unreadable file is the normal "feature unused" case
            logging.info("running with empty skip_av_seeds")
            return set()
        except ValueError as e:
            # the file exists but is malformed: still best-effort, but make
            # the reason visible instead of looking like a missing file
            logging.warning(
                "ignoring malformed skip_av_seeds file %s: %s", skip_av_seeds_file, e
            )
            logging.info("running with empty skip_av_seeds")
            return set()
        logging.info("running with skip_av_seeds file %s", skip_av_seeds_file)
        return skip_av_seeds
    rr = rethinker(args)
    frontier = brozzler.RethinkDbFrontier(rr)
    service_registry = doublethink.ServiceRegistry(rr)
    skip_av_seeds_from_file = get_skip_av_seeds()
    worker = brozzler.worker.BrozzlerWorker(
        frontier,
        service_registry,
        skip_av_seeds=skip_av_seeds_from_file,
        max_browsers=int(args.max_browsers),
        chrome_exe=args.chrome_exe,
        proxy=args.proxy,
--- a/brozzler/model.py
+++ b/brozzler/model.py
@ -34,6 +34,7 @@ import urllib
 import uuid
 import yaml
 import zlib
 from enum import Enum
 from typing import Optional
@ -100,6 +101,8 @@ def new_job(frontier, job_conf):
        job.id = job_conf["id"]
    if "max_claimed_sites" in job_conf:
        job.max_claimed_sites = job_conf["max_claimed_sites"]
    if "pdfs_only" in job_conf:
        job.pdfs_only = job_conf["pdfs_only"]
    job.save()
    sites = []
@ -198,6 +201,8 @@ class Job(doublethink.Document, ElapsedMixIn):
    def populate_defaults(self):
        if not "status" in self:
            self.status = "ACTIVE"
        if "pdfs_only" not in self:
            self.pdfs_only = False
        if not "starts_and_stops" in self:
            if self.get("started"):  # backward compatibility
                self.starts_and_stops = [
@ -220,33 +225,53 @@ class Job(doublethink.Document, ElapsedMixIn):
        self.starts_and_stops[-1]["stop"] = doublethink.utcnow()
 class VideoCaptureOptions(Enum):
    """
    Enumeration of possible values for the `video_capture` config key.

    Each member's value is its own name as a string. The site document
    stores that string, so callers compare `site.video_capture` against
    `VideoCaptureOptions.<MEMBER>.value` rather than against the member.

        - ENABLE_VIDEO_CAPTURE (default): All video is captured.
        - DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
          combination of the next two values.
        - BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header
          containing the word "video" is not captured.
        - DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.

    Note: Ensuring full video MIME type blocking requires an additional entry in the
          Warcprox-Meta header `mime-type-filters` key.
    """
    # Default assigned by Site.populate_defaults when the key is absent.
    ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE"
    # Checked by both the worker's video MIME type skip and should_ytdlp.
    DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE"
    # Checked only by the worker's video MIME type skip (fetch path).
    BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES"
    # Checked only by should_ytdlp.
    DISABLE_YTDLP_CAPTURE = "DISABLE_YTDLP_CAPTURE"
 class Site(doublethink.Document, ElapsedMixIn):
    logger = logging.getLogger(__module__ + "." + __qualname__)
    table = "sites"
    def populate_defaults(self):
-        if not "status" in self:
+        if "status" not in self:
            self.status = "ACTIVE"
-        if not "claimed" in self:
+        if "claimed" not in self:
            self.claimed = False
-        if not "last_disclaimed" in self:
+        if "last_disclaimed" not in self:
            self.last_disclaimed = brozzler.EPOCH_UTC
-        if not "last_claimed" in self:
+        if "last_claimed" not in self:
            self.last_claimed = brozzler.EPOCH_UTC
-        if not "scope" in self:
+        if "scope" not in self:
            self.scope = {}
-        if not "skip_ytdlp" in self:
+        if "video_capture" not in self:
-            self.skip_ytdlp = None
+            self.video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value
        # backward compatibility
        if "surt" in self.scope:
-            if not "accepts" in self.scope:
+            if "accepts" not in self.scope:
                self.scope["accepts"] = []
            self.scope["accepts"].append({"surt": self.scope["surt"]})
            del self.scope["surt"]
        # backward compatibility
-        if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope:
+        if "max_hops_off_surt" in self.scope and "max_hops_off" not in self.scope:
            self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
        if "max_hops_off_surt" in self.scope:
            del self.scope["max_hops_off_surt"]
@ -256,7 +281,7 @@ class Site(doublethink.Document, ElapsedMixIn):
                brozzler.site_surt_canon(self.seed).ssurt().decode("ascii")
            )
-        if not "starts_and_stops" in self:
+        if "starts_and_stops" not in self:
            if self.get("start_time"):  # backward compatibility
                self.starts_and_stops = [
                    {"start": self.get("start_time"), "stop": None}
@ -271,7 +296,7 @@ class Site(doublethink.Document, ElapsedMixIn):
        return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)
    def _accept_ssurt_if_not_redundant(self, ssurt):
-        if not "accepts" in self.scope:
+        if "accepts" not in self.scope:
            self.scope["accepts"] = []
        simple_rule_ssurts = (
            rule["ssurt"]
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@ -21,6 +21,7 @@ limitations under the License.
 import logging
 import brozzler
 import brozzler.browser
 from brozzler.model import VideoCaptureOptions
 import threading
 import time
 import urllib.request
@ -55,7 +56,6 @@ class BrozzlerWorker:
        self,
        frontier,
        service_registry=None,
        skip_av_seeds=None,
        max_browsers=1,
        chrome_exe="chromium-browser",
        warcprox_auto=False,
@ -78,7 +78,6 @@ class BrozzlerWorker:
    ):
        self._frontier = frontier
        self._service_registry = service_registry
        self._skip_av_seeds = skip_av_seeds
        self._max_browsers = max_browsers
        self._warcprox_auto = warcprox_auto
@ -268,7 +267,17 @@ class BrozzlerWorker:
        if not self._needs_browsing(page_headers):
            self.logger.info("needs fetch: %s", page)
-            self._fetch_url(site, page=page)
+            if site.pdfs_only and not self._is_pdf(page_headers):
                self.logger.info("skipping non-PDF content: PDFs only option enabled")
            elif site.video_capture in [
                VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
                VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value,
            ] and self._is_video_type(page_headers):
                self.logger.info(
                    "skipping video content: video MIME type capture disabled for site"
                )
            else:
                self._fetch_url(site, page=page)
        else:
            self.logger.info("needs browsing: %s", page)
            try:
@ -280,7 +289,7 @@ class BrozzlerWorker:
                self.logger.info("page interstitial shown (http auth): %s", page)
            if enable_youtube_dl and ydl.should_ytdlp(
-                site, page, browser.websock_thread.page_status, self._skip_av_seeds
+                site, page, browser.websock_thread.page_status
            ):
                try:
                    ydl_outlinks = ydl.do_youtube_dl(self, site, page)
@ -327,13 +336,29 @@ class BrozzlerWorker:
            self.logger.warning("Failed to get headers for %s: %s", page.url, e)
            return {}
-    def _needs_browsing(self, page_headers):
+    def _needs_browsing(self, page_headers) -> bool:
-        if (
+        return not bool(
            "content-type" in page_headers
            and "html" not in page_headers["content-type"]
-        ):
+        )
-            return False
+
-        return True
+    def _is_video_type(self, page_headers) -> bool:
        """
        Determines if the page's Content-Type header specifies that it contains
        a video.
        """
        return (
            "content-type" in page_headers and "video" in page_headers["content-type"]
        )
    def _is_pdf(self, page_headers) -> bool:
        """
        Determines if the page's Content-Type header specifies that it is a PDF.
        """
        return (
            "content-type" in page_headers
            and "application/pdf" in page_headers["content-type"]
        )
    @metrics.brozzler_page_processing_duration_seconds.time()
    @metrics.brozzler_in_progress_pages.track_inprogress()
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@ -20,6 +20,7 @@ import logging
 import yt_dlp
 from yt_dlp.utils import match_filter_func
 import brozzler
 from brozzler.model import VideoCaptureOptions
 import urllib.request
 import tempfile
 import urlcanon
@ -58,14 +59,17 @@ def _timestamp4datetime(timestamp):
        int(timestamp[-2:])
        )
-def should_ytdlp(site, page, page_status, skip_av_seeds):
+def should_ytdlp(site, page, page_status):
    # called only after we've passed needs_browsing() check
    if page_status != 200:
        logging.info("skipping ytdlp: non-200 page status %s", page_status)
        return False
-    if site.skip_ytdlp:
+    if site.video_capture in [
-        logging.info("skipping ytdlp: site marked skip_ytdlp")
+        VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
        VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value,
    ]:
        logging.info("skipping ytdlp: site has video capture disabled")
        return False
    ytdlp_url = page.redirect_url if page.redirect_url else page.url
@ -73,49 +77,6 @@ def should_ytdlp(site, page, page_status, skip_av_seeds):
    if "chrome-error:" in ytdlp_url:
        return False
    ytdlp_seed = (
        site["metadata"]["ait_seed_id"]
        if "metadata" in site and "ait_seed_id" in site["metadata"]
        else None
    )
    # TODO: develop UI and refactor
    if ytdlp_seed:
        if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds:
            logging.info("skipping ytdlp: site in skip_av_seeds")
            site.skip_ytdlp = True
            return False
        else:
            site.skip_ytdlp = False
    logging.info("checking containing page %s for seed %s", ytdlp_url, ytdlp_seed)
    if ytdlp_seed and "youtube.com/watch?v" in ytdlp_url:
        logging.info("found youtube watch page %r", ytdlp_url)
        # connect to bmiller-dev cluster, keyspace video; we can modify default timeout in cassandra.yaml
        cluster = Cluster(["207.241.235.189"], protocol_version=5)
        session = cluster.connect("video")
        containing_page_query = "SELECT * from videos where scope=%s and containing_page_url=%s LIMIT 1"
        future = session.execute_async(containing_page_query, [f"s:{ytdlp_seed}", str(urlcanon.aggressive(ytdlp_url))])
        try:
            rows = future.result()
        except ReadTimeout:
            logging.exception("Query timed out:")
        if len(rows.current_rows) == 0:
            logging.info("no results returned from videos query")
            return True
        for row in rows:
            logging.info("video query found %r", row)
            ytdlp_timestamp = datetime.datetime(*_timestamp4datetime(row.video_timestamp))
            logging.info("ytdlp_timestamp: %s", ytdlp_timestamp)
            time_diff = datetime.datetime.now() - ytdlp_timestamp
            # TODO: make variable for timedelta
            if time_diff < datetime.timedelta(days = 90):
                logging.info("skipping ytdlp for %s since there's a recent capture", row.containing_page_url)
                return False
    return True
 def isyoutubehost(url):
--- a/job-conf.rst
+++ b/job-conf.rst
@ -107,6 +107,20 @@ Puts a cap on the number of sites belonging to a given job that can be brozzled
 simultaneously across the cluster. Addresses the problem of a job with many
 seeds starving out other jobs.
 ``pdfs_only``
 ~~~~~~~~~~~~~~~~~~~~~
 +---------+----------+-----------+
 | type    | required | default   |
 +=========+==========+===========+
 | boolean | no       | ``false`` |
 +---------+----------+-----------+
 Limits capture to PDFs based on the MIME type set in the HTTP response's
 Content-Type header. This value only impacts processing of outlinks within
 Brozzler.
 *Note: Ensuring comprehensive limiting to only PDFs requires an additional
 entry in the Warcprox-Meta header* ``mime-type-filters`` *key.*
 ``seeds``
 ~~~~~~~~~
 +------------------------+----------+---------+
@ -158,6 +172,25 @@ other fields like checkboxes and/or hidden fields, brozzler will leave
 the default values in place. Brozzler submits login forms after page load.
 Then brozzling proceeds as usual.
 ``video_capture``
 ~~~~~~~~~~~~~~~~~
 +--------+----------+--------------------------+
 | type   | required | default                  |
 +========+==========+==========================+
 | string | no       | ``ENABLE_VIDEO_CAPTURE`` |
 +--------+----------+--------------------------+
 Determines the level of video capture for the seed. This is an enumeration with four possible values:
 * ENABLE_VIDEO_CAPTURE (default): All video is captured.
 * DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
  combination of the next two values.
 * BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header containing
  the word "video" is not captured.
 * DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.
 *Note: Ensuring full video MIME type blocking requires an additional entry in
 the Warcprox-Meta header* ``mime-type-filters`` *key.*
 Seed-level / top-level settings
 -------------------------------
 These are seed settings that can also be specified at the top level, in which