From 1bbb3197e8fdcff0c1539cb8a4c4da5c70e6b73a Mon Sep 17 00:00:00 2001 From: Gretchen Miller Date: Thu, 12 Sep 2024 16:45:52 -0700 Subject: [PATCH] WT-2950 Implement Seed-level video capture setting handling + Job-level PDF-only option --- brozzler/cli.py | 17 ---------- brozzler/job_schema.yaml | 5 +++ brozzler/model.py | 29 +++++++++++++++-- brozzler/worker.py | 68 +++++++++++++++++++++++----------------- job-conf.rst | 33 +++++++++++++++++++ 5 files changed, 105 insertions(+), 47 deletions(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index f18ad14..d5b7093 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -664,21 +664,6 @@ def brozzler_worker(argv=None): finally: signal.signal(signal.SIGQUIT, dump_state) - def get_skip_av_seeds(): - # TODO: develop UI and refactor - SKIP_AV_SEEDS_FILE = "/opt/local/brozzler/skip_av_seeds.txt" - try: - # make set from seed IDs in SKIP_AV_SEEDS_FILE - with open(SKIP_AV_SEEDS_FILE) as skips: - skip_av_seeds = {int(line) for line in skips.readlines()} - logger.info( - "running with skip_av_seeds file", skip_av_seeds=SKIP_AV_SEEDS_FILE - ) - except Exception: - skip_av_seeds = set() - logger.info("running with empty skip_av_seeds") - return skip_av_seeds - def get_ytdlp_proxy_endpoints(): YTDLP_PROXY_ENDPOINTS_FILE = args.ytdlp_proxy_file try: @@ -698,12 +683,10 @@ def brozzler_worker(argv=None): rr = rethinker(args) frontier = brozzler.RethinkDbFrontier(rr) service_registry = doublethink.ServiceRegistry(rr) - skip_av_seeds_from_file = get_skip_av_seeds() ytdlp_proxy_endpoints_from_file = get_ytdlp_proxy_endpoints() worker = brozzler.worker.BrozzlerWorker( frontier, service_registry, - skip_av_seeds=skip_av_seeds_from_file, ytdlp_proxy_endpoints=ytdlp_proxy_endpoints_from_file, max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe, diff --git a/brozzler/job_schema.yaml b/brozzler/job_schema.yaml index 6069de8..59b831f 100644 --- a/brozzler/job_schema.yaml +++ b/brozzler/job_schema.yaml @@ -95,8 +95,13 @@ seeds: 
password: type: string + video_capture: + type: string + <<: *multi_level_options max_claimed_sites: type: integer +pdfs_only: + type: boolean diff --git a/brozzler/model.py b/brozzler/model.py index 38b371a..eab4b1c 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -25,6 +25,7 @@ import os import urllib import uuid import zlib +from enum import Enum from typing import Optional import cerberus @@ -101,6 +102,8 @@ def new_job(frontier, job_conf): job.id = job_conf["id"] if "max_claimed_sites" in job_conf: job.max_claimed_sites = job_conf["max_claimed_sites"] + if "pdfs_only" in job_conf: + job.pdfs_only = job_conf["pdfs_only"] job.save() sites = [] @@ -199,6 +202,8 @@ class Job(doublethink.Document, ElapsedMixIn): def populate_defaults(self): if "status" not in self: self.status = "ACTIVE" + if "pdfs_only" not in self: + self.pdfs_only = False if "starts_and_stops" not in self: if self.get("started"): # backward compatibility self.starts_and_stops = [ @@ -221,6 +226,26 @@ class Job(doublethink.Document, ElapsedMixIn): self.starts_and_stops[-1]["stop"] = doublethink.utcnow() +class VideoCaptureOptions(Enum): + """ + Enumeration of possible values for the `video_capture` config key. + - ENABLE_VIDEO_CAPTURE (default): All video is captured. + - DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a + combination of the next two values. + - BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header + containing the word "video" is not captured. + - DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled. + + Note: Ensuring full video MIME type blocking requires an additional entry in the + Warcprox-Meta header `mime-type-filters` key. 
+ """ + + ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE" + DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE" + BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES" + DISABLE_YTDLP_CAPTURE = "DISABLE_YTDLP_CAPTURE" + + class Site(doublethink.Document, ElapsedMixIn): logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__) table = "sites" @@ -236,8 +261,8 @@ class Site(doublethink.Document, ElapsedMixIn): self.last_claimed = brozzler.EPOCH_UTC if "scope" not in self: self.scope = {} - if "skip_ytdlp" not in self: - self.skip_ytdlp = None + if "video_capture" not in self: + self.video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value # backward compatibility if "surt" in self.scope: diff --git a/brozzler/worker.py b/brozzler/worker.py index 807195b..4c75f81 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -38,6 +38,7 @@ from urllib3.exceptions import ProxyError, TimeoutError import brozzler import brozzler.browser +from brozzler.model import VideoCaptureOptions from . 
import metrics @@ -60,7 +61,6 @@ class BrozzlerWorker: self, frontier, service_registry=None, - skip_av_seeds=None, ytdlp_proxy_endpoints=None, max_browsers=1, chrome_exe="chromium-browser", @@ -85,7 +85,6 @@ class BrozzlerWorker: ): self._frontier = frontier self._service_registry = service_registry - self._skip_av_seeds = skip_av_seeds self._ytdlp_proxy_endpoints = ytdlp_proxy_endpoints self._max_browsers = max_browsers @@ -274,36 +273,24 @@ class BrozzlerWorker: img.save(out, "jpeg", quality=95) return out.getbuffer() - def should_ytdlp(self, logger, site, page, page_status, skip_av_seeds): + def should_ytdlp(self, logger, site, page, page_status): # called only after we've passed needs_browsing() check if page_status != 200: logger.info("skipping ytdlp: non-200 page status", page_status=page_status) return False - if site.skip_ytdlp: - logger.info("skipping ytdlp: site marked skip_ytdlp") - return False + if site.video_capture in [ + VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value, + VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value, + ]: + logger.info("skipping ytdlp: site has video capture disabled") + return False ytdlp_url = page.redirect_url if page.redirect_url else page.url if "chrome-error:" in ytdlp_url: return False - ytdlp_seed = ( - site["metadata"]["ait_seed_id"] - if "metadata" in site and "ait_seed_id" in site["metadata"] - else None - ) - - # TODO: develop UI and refactor - if ytdlp_seed: - if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds: - logger.info("skipping ytdlp: site in skip_av_seeds") - site.skip_ytdlp = True - return False - else: - site.skip_ytdlp = False - return True @metrics.brozzler_page_processing_duration_seconds.time() @@ -325,7 +312,17 @@ class BrozzlerWorker: if not self._needs_browsing(page_headers): page_logger.info("needs fetch") - self._fetch_url(site, page=page) + if site.pdfs_only and not self._is_pdf(page_headers): + page_logger.info("skipping non-PDF content: PDFs only option enabled") + elif site.video_capture in [ + 
VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value, + VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value, + ] and self._is_video_type(page_headers): + page_logger.info( + "skipping video content: video MIME type capture disabled for site" + ) + else: + self._fetch_url(site, page=page) else: page_logger.info("needs browsing") try: @@ -340,7 +337,7 @@ class BrozzlerWorker: page_logger.info("page interstitial shown (http auth)") if enable_youtube_dl and self.should_ytdlp( - page_logger, site, page, status_code, self._skip_av_seeds + page_logger, site, page, status_code ): try: from . import ydl @@ -399,13 +396,29 @@ class BrozzlerWorker: url_logger.warning("Failed to get headers", exc_info=True) return {} - def _needs_browsing(self, page_headers): - if ( + def _needs_browsing(self, page_headers) -> bool: + return not bool( "content-type" in page_headers and "html" not in page_headers["content-type"] - ): - return False - return True + ) + + def _is_video_type(self, page_headers) -> bool: + """ + Determines if the page's Content-Type header specifies that it contains + a video. + """ + return ( + "content-type" in page_headers and "video" in page_headers["content-type"] + ) + + def _is_pdf(self, page_headers) -> bool: + """ + Determines if the page's Content-Type header specifies that it is a PDF. + """ + return ( + "content-type" in page_headers + and "application/pdf" in page_headers["content-type"] + ) @metrics.brozzler_browsing_duration_seconds.time() @metrics.brozzler_in_progress_browses.track_inprogress() diff --git a/job-conf.rst b/job-conf.rst index 08707b6..5378bac 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -107,6 +107,20 @@ Puts a cap on the number of sites belonging to a given job that can be brozzled simultaneously across the cluster. Addresses the problem of a job with many seeds starving out other jobs. 
+``pdfs_only`` +~~~~~~~~~~~~~~~~~~~~~ ++---------+----------+-----------+ +| type | required | default | ++=========+==========+===========+ +| boolean | no | ``false`` | ++---------+----------+-----------+ +Limits capture to PDFs based on the MIME type set in the HTTP response's +Content-Type header. This value only impacts processing of outlinks within +Brozzler. + +*Note: Ensuring comprehensive limiting to only PDFs requires an additional +entry in the Warcprox-Meta header `mime-type-filters` key.* + ``seeds`` ~~~~~~~~~ +------------------------+----------+---------+ @@ -158,6 +172,25 @@ other fields like checkboxes and/or hidden fields, brozzler will leave the default values in place. Brozzler submits login forms after page load. Then brozzling proceeds as usual. +``video_capture`` +~~~~~~~~~~~~~~~~~ ++--------+----------+--------------------------+ +| type | required | default | ++========+==========+==========================+ +| string | no | ``ENABLE_VIDEO_CAPTURE`` | ++--------+----------+--------------------------+ +Determines the level of video capture for the seed. This is an enumeration with four possible values: + +* ENABLE_VIDEO_CAPTURE (default): All video is captured. +* DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a + combination of the next two values. +* BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header containing + the word "video" is not captured. +* DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled. + +*Note: Ensuring full video MIME type blocking requires an additional entry in +the Warcprox-Meta header `mime-type-filters` key.* + Seed-level / top-level settings ------------------------------- These are seed settings that can also be specified at the top level, in which