From d9ed5c434aacc8dce526ff33695b36a3a0a05ca9 Mon Sep 17 00:00:00 2001 From: Gretchen Miller Date: Thu, 12 Sep 2024 16:45:52 -0700 Subject: [PATCH 01/12] WT-2950 remove skip_av_seeds --- brozzler/cli.py | 15 --------------- brozzler/worker.py | 4 +--- brozzler/ydl.py | 17 +---------------- 3 files changed, 2 insertions(+), 34 deletions(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index bea5153..d896f7b 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -544,27 +544,12 @@ def brozzler_worker(argv=None): finally: signal.signal(signal.SIGQUIT, dump_state) - def get_skip_av_seeds(): - # TODO: develop UI and refactor - SKIP_AV_SEEDS_FILE = "/opt/local/brozzler/skip_av_seeds.txt" - try: - # make set from seed IDs in SKIP_AV_SEEDS_FILE - with open(SKIP_AV_SEEDS_FILE) as skips: - skip_av_seeds = {int(l) for l in skips.readlines()} - logging.info("running with skip_av_seeds file %s" % SKIP_AV_SEEDS_FILE) - except Exception as e: - skip_av_seeds = set() - logging.info("running with empty skip_av_seeds") - return skip_av_seeds - rr = rethinker(args) frontier = brozzler.RethinkDbFrontier(rr) service_registry = doublethink.ServiceRegistry(rr) - skip_av_seeds_from_file = get_skip_av_seeds() worker = brozzler.worker.BrozzlerWorker( frontier, service_registry, - skip_av_seeds=skip_av_seeds_from_file, max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe, proxy=args.proxy, diff --git a/brozzler/worker.py b/brozzler/worker.py index 479dfa7..5f6fbd0 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -54,7 +54,6 @@ class BrozzlerWorker: self, frontier, service_registry=None, - skip_av_seeds=None, max_browsers=1, chrome_exe="chromium-browser", warcprox_auto=False, @@ -74,7 +73,6 @@ class BrozzlerWorker: ): self._frontier = frontier self._service_registry = service_registry - self._skip_av_seeds = skip_av_seeds self._max_browsers = max_browsers self._warcprox_auto = warcprox_auto @@ -262,7 +260,7 @@ class BrozzlerWorker: self.logger.info("page interstitial shown (http auth): %s", page) if enable_youtube_dl and ydl.should_ytdlp( - site, page, browser.websock_thread.page_status, self._skip_av_seeds + site, page, browser.websock_thread.page_status ): try: ydl_outlinks = ydl.do_youtube_dl(self, site, page) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index b292129..7161294 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -32,7 +32,7 @@ import threading thread_local = threading.local() -def should_ytdlp(site, page, page_status, skip_av_seeds): +def should_ytdlp(site, page, page_status): # called only after we've passed needs_browsing() check if page_status != 200: @@ -47,21 +47,6 @@ def should_ytdlp(site, page, page_status, skip_av_seeds): if "chrome-error:" in ytdlp_url: return False - ytdlp_seed = ( - site["metadata"]["ait_seed_id"] - if "metadata" in site and "ait_seed_id" in site["metadata"] - else None - ) - - # TODO: develop UI and refactor - if ytdlp_seed: - if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds: - logging.info("skipping ytdlp: site in skip_av_seeds") - site.skip_ytdlp = True - return False - else: - site.skip_ytdlp = False - return True From eb227b0d4e6ebe4d8b2f64d8ae4df49e1158e89e Mon Sep 17 00:00:00 2001 From: Gretchen Miller Date: Thu, 12 Sep 2024 16:51:48 -0700 Subject: [PATCH 02/12] WT-2950 replace skip_ytdlp with video_capture --- brozzler/model.py | 4 ++-- brozzler/ydl.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/brozzler/model.py b/brozzler/model.py index fe9f8c0..6646aef 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -235,8 +235,8 @@ class Site(doublethink.Document, ElapsedMixIn): self.last_claimed = brozzler.EPOCH_UTC if not "scope" in self: self.scope = {} - if not "skip_ytdlp" in self: - self.skip_ytdlp = None + if "video_capture" not in self: + self.video_capture = "ENABLE_VIDEO_CAPTURE" # backward compatibility if "surt" in self.scope: diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 7161294..a587d2e 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -38,8 +38,8 @@ def should_ytdlp(site, page, page_status): if page_status != 200: logging.info("skipping ytdlp: non-200 page status %s", page_status) return False - if site.skip_ytdlp: - logging.info("skipping ytdlp: site marked skip_ytdlp") + if site.video_capture != "ENABLE_VIDEO_CAPTURE": + logging.info("skipping ytdlp: site has video capture disabled") return False ytdlp_url = page.redirect_url if page.redirect_url else page.url From c3a92b102f71b51a8d20604d3fc9678e3c02d338 Mon Sep 17 00:00:00 2001 From: Gretchen Miller Date: Thu, 12 Sep 2024 16:54:00 -0700 Subject: [PATCH 03/12] WT-2950 invert conditionals to PEP8 preferred code style (E713) --- brozzler/model.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/brozzler/model.py b/brozzler/model.py index 6646aef..1603142 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -225,28 +225,28 @@ class Site(doublethink.Document, ElapsedMixIn): table = "sites" def populate_defaults(self): - if not "status" in self: + if "status" not in self: self.status = "ACTIVE" - if not "claimed" in self: + if "claimed" not in self: self.claimed = False - if not "last_disclaimed" in self: + if "last_disclaimed" not in self: self.last_disclaimed = brozzler.EPOCH_UTC - if not "last_claimed" in self: + if "last_claimed" not in self: self.last_claimed = brozzler.EPOCH_UTC - if not "scope" in self: + if "scope" not in self: self.scope = {} if "video_capture" not in self: self.video_capture = "ENABLE_VIDEO_CAPTURE" # backward compatibility if "surt" in self.scope: - if not "accepts" in self.scope: + if "accepts" not in self.scope: self.scope["accepts"] = [] self.scope["accepts"].append({"surt": self.scope["surt"]}) del self.scope["surt"] # backward compatibility - if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope: + if "max_hops_off_surt" in self.scope and "max_hops_off" not in self.scope: self.scope["max_hops_off"] = self.scope["max_hops_off_surt"] if "max_hops_off_surt" in self.scope: del self.scope["max_hops_off_surt"] @@ -256,7 +256,7 @@ class Site(doublethink.Document, ElapsedMixIn): brozzler.site_surt_canon(self.seed).ssurt().decode("ascii") ) - if not "starts_and_stops" in self: + if "starts_and_stops" not in self: if self.get("start_time"): # backward compatibility self.starts_and_stops = [ {"start": self.get("start_time"), "stop": None} @@ -271,7 +271,7 @@ class Site(doublethink.Document, ElapsedMixIn): return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed) def _accept_ssurt_if_not_redundant(self, ssurt): - if not "accepts" in self.scope: + if "accepts" not in self.scope: self.scope["accepts"] = [] simple_rule_ssurts = ( rule["ssurt"] From c7225496978db23aa60502f4cd23b48b1b360951 Mon Sep 17 00:00:00 2001 From: Gretchen Miller Date: Fri, 13 Sep 2024 13:33:57 -0700 Subject: [PATCH 04/12] WT-2950 exclude video file types if site has disabled video capture --- brozzler/worker.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index 5f6fbd0..e63b59c 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -248,7 +248,10 @@ class BrozzlerWorker: if not self._needs_browsing(page_headers): self.logger.info("needs fetch: %s", page) - self._fetch_url(site, page=page) + if self._is_video_type(page_headers) and site.video_capture != "ENABLE_VIDEO_CAPTURE": + self.logger.info("skipping video content: video capture disabled for site") + else: + self._fetch_url(site, page=page) else: self.logger.info("needs browsing: %s", page) try: @@ -301,13 +304,11 @@ class BrozzlerWorker: self.logger.warning("Failed to get headers for %s: %s", page.url, e) return {} - def _needs_browsing(self, page_headers): - if ( - "content-type" in page_headers - and "html" not in page_headers["content-type"] - ): - return False - return True + def _needs_browsing(self, page_headers) -> bool: + return not bool("content-type" in page_headers and "html" not in page_headers["content-type"]) + + def _is_video_type(self, page_headers) -> bool: + return ("content-type" in page_headers and "video" in page_headers["content-type"]) def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None): def _on_screenshot(screenshot_jpeg): From 77e6b9ed52209ce9e1355c346dd0452039fbcf95 Mon Sep 17 00:00:00 2001 From: Gretchen Miller Date: Fri, 13 Sep 2024 13:38:21 -0700 Subject: [PATCH 05/12] small ruff formatting pass --- brozzler/worker.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index e63b59c..a0a4ea4 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -248,8 +248,13 @@ class BrozzlerWorker: if not self._needs_browsing(page_headers): self.logger.info("needs fetch: %s", page) - if self._is_video_type(page_headers) and site.video_capture != "ENABLE_VIDEO_CAPTURE": - self.logger.info("skipping video content: video capture disabled for site") + if ( + self._is_video_type(page_headers) + and site.video_capture != "ENABLE_VIDEO_CAPTURE" + ): + self.logger.info( + "skipping video content: video capture disabled for site" + ) else: self._fetch_url(site, page=page) else: @@ -305,10 +310,15 @@ class BrozzlerWorker: return {} def _needs_browsing(self, page_headers) -> bool: - return not bool("content-type" in page_headers and "html" not in page_headers["content-type"]) + return not bool( + "content-type" in page_headers + and "html" not in page_headers["content-type"] + ) def _is_video_type(self, page_headers) -> bool: - return ("content-type" in page_headers and "video" in page_headers["content-type"]) + return ( + "content-type" in page_headers and "video" in page_headers["content-type"] + ) def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None): def _on_screenshot(screenshot_jpeg): From 66263f03711c28e4ffbedcff8f4202200bc0da2d Mon Sep 17 00:00:00 2001 From: Gretchen Miller Date: Fri, 13 Sep 2024 14:21:59 -0700 Subject: [PATCH 06/12] WT-2950 video capture options enum --- brozzler/model.py | 7 ++++++- brozzler/worker.py | 3 ++- brozzler/ydl.py | 3 ++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/brozzler/model.py b/brozzler/model.py index 1603142..9766913 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -34,6 +34,7 @@ import urllib import uuid import yaml import zlib +from enum import Enum from typing import Optional @@ -219,6 +220,10 @@ class Job(doublethink.Document, ElapsedMixIn): self.status = "FINISHED" self.starts_and_stops[-1]["stop"] = doublethink.utcnow() +class VideoCaptureOptions(Enum): + ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE" + LIMIT_VIDEO_CAPTURE = "LIMIT_VIDEO_CAPTURE" + DISABLE_YTDLP_CAPTURE = "DISABLE_YTDLP_CAPTURE" class Site(doublethink.Document, ElapsedMixIn): logger = logging.getLogger(__module__ + "." + __qualname__) @@ -236,7 +241,7 @@ class Site(doublethink.Document, ElapsedMixIn): if "scope" not in self: self.scope = {} if "video_capture" not in self: - self.video_capture = "ENABLE_VIDEO_CAPTURE" + self.video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value # backward compatibility if "surt" in self.scope: diff --git a/brozzler/worker.py b/brozzler/worker.py index a0a4ea4..8601046 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -21,6 +21,7 @@ limitations under the License. import logging import brozzler import brozzler.browser +from brozzler.model import VideoCaptureOptions import threading import time import urllib.request @@ -250,7 +251,7 @@ class BrozzlerWorker: self.logger.info("needs fetch: %s", page) if ( self._is_video_type(page_headers) - and site.video_capture != "ENABLE_VIDEO_CAPTURE" + and site.video_capture != VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value ): self.logger.info( "skipping video content: video capture disabled for site" diff --git a/brozzler/ydl.py b/brozzler/ydl.py index a587d2e..c41a92b 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -20,6 +20,7 @@ import logging import yt_dlp from yt_dlp.utils import match_filter_func import brozzler +from brozzler.model import VideoCaptureOptions import urllib.request import tempfile import urlcanon @@ -38,7 +39,7 @@ def should_ytdlp(site, page, page_status): if page_status != 200: logging.info("skipping ytdlp: non-200 page status %s", page_status) return False - if site.video_capture != "ENABLE_VIDEO_CAPTURE": + if site.video_capture != VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value: logging.info("skipping ytdlp: site has video capture disabled") return False From 8275f3ea1677b8b7ea2649b1a294b8b861bc64f7 Mon Sep 17 00:00:00 2001 From: Gretchen Miller Date: Fri, 13 Sep 2024 16:04:58 -0700 Subject: [PATCH 07/12] another tiny ruff format pass --- brozzler/model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/brozzler/model.py b/brozzler/model.py index 9766913..ab802bb 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -220,11 +220,13 @@ class Job(doublethink.Document, ElapsedMixIn): self.status = "FINISHED" self.starts_and_stops[-1]["stop"] = doublethink.utcnow() + class VideoCaptureOptions(Enum): ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE" LIMIT_VIDEO_CAPTURE = "LIMIT_VIDEO_CAPTURE" DISABLE_YTDLP_CAPTURE = "DISABLE_YTDLP_CAPTURE" + class Site(doublethink.Document, ElapsedMixIn): logger = logging.getLogger(__module__ + "." + __qualname__) table = "sites" From dca9630982d049472d0354ff931555cc408d6aca Mon Sep 17 00:00:00 2001 From: Gretchen Miller Date: Fri, 20 Sep 2024 14:17:07 -0700 Subject: [PATCH 08/12] WT-2950 cleaning up video capture options handling; PDFs only handling on outlinks --- brozzler/model.py | 15 ++++++++++++++- brozzler/worker.py | 26 +++++++++++++++++++++----- brozzler/ydl.py | 5 ++++- 3 files changed, 39 insertions(+), 7 deletions(-) diff --git a/brozzler/model.py b/brozzler/model.py index ab802bb..68ba037 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -222,8 +222,21 @@ class Job(doublethink.Document, ElapsedMixIn): class VideoCaptureOptions(Enum): + """ + Enumeration of possible values for the `video_capture` config key. + - ENABLE_VIDEO_CAPTURE: All video is captured. + - DISABLE_VIDEO_CAPTURE: No video is captured. + - BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header + containing the word "video" is not captured. + - DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled. + + Note: Ensuring full video MIME type blocking requires an entry in the + Warcprox-Meta header `mime-type-filters` key to fully block videos by + MIME type. + """ ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE" - LIMIT_VIDEO_CAPTURE = "LIMIT_VIDEO_CAPTURE" + DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE" + BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES" DISABLE_YTDLP_CAPTURE = "DISABLE_YTDLP_CAPTURE" diff --git a/brozzler/worker.py b/brozzler/worker.py index 8601046..ae6c750 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -249,12 +249,16 @@ class BrozzlerWorker: if not self._needs_browsing(page_headers): self.logger.info("needs fetch: %s", page) - if ( - self._is_video_type(page_headers) - and site.video_capture != VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value - ): + if site.video_capture in [ + VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value, + VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value, + ] and self._is_video_type(page_headers): self.logger.info( - "skipping video content: video capture disabled for site" + "skipping video content: video MIME type capture disabled for site" + ) + elif site.pdfs_only and not self._is_pdf(page_headers): + self.logger.info( + "skipping non-PDF content: PDFs only option enabled" ) else: self._fetch_url(site, page=page) @@ -317,10 +321,22 @@ class BrozzlerWorker: ) def _is_video_type(self, page_headers) -> bool: + """ + Determines if the page's Content-Type header specifies that it contains + a video. + """ return ( "content-type" in page_headers and "video" in page_headers["content-type"] ) + def _is_pdf(self, page_headers) -> bool: + """ + Determinse if the page's Content-Type header specifies that it is a PDF. + """ + return ( + "content-type" in page_headers and "application/pdf" in page_headers["content-type"] + ) + def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None): def _on_screenshot(screenshot_jpeg): if on_screenshot: diff --git a/brozzler/ydl.py b/brozzler/ydl.py index c41a92b..0ee82f6 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -39,7 +39,10 @@ def should_ytdlp(site, page, page_status): if page_status != 200: logging.info("skipping ytdlp: non-200 page status %s", page_status) return False - if site.video_capture != VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value: + if site.video_capture in [ + VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value, + VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value, + ]: logging.info("skipping ytdlp: site has video capture disabled") return False From 41aab1a0b027c02e0aba9d148969c09f8ff66513 Mon Sep 17 00:00:00 2001 From: Gretchen Miller Date: Fri, 20 Sep 2024 15:21:21 -0700 Subject: [PATCH 09/12] WT-2950 documentation + better conf handling + linting --- brozzler/model.py | 15 ++++++++++----- brozzler/worker.py | 7 +++---- job-conf.rst | 30 ++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 9 deletions(-) diff --git a/brozzler/model.py b/brozzler/model.py index 68ba037..fbdd6c7 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -101,6 +101,8 @@ def new_job(frontier, job_conf): job.id = job_conf["id"] if "max_claimed_sites" in job_conf: job.max_claimed_sites = job_conf["max_claimed_sites"] + if "pdfs_only" in job_conf: + job.pdfs_only = job_conf["pdfs_only"] job.save() sites = [] @@ -199,6 +201,8 @@ class Job(doublethink.Document, ElapsedMixIn): def populate_defaults(self): if not "status" in self: self.status = "ACTIVE" + if "pdfs_only" not in self: + self.pdfs_only = False if not "starts_and_stops" in self: if self.get("started"): # backward compatibility self.starts_and_stops = [ @@ -224,16 +228,17 @@ class Job(doublethink.Document, ElapsedMixIn): class VideoCaptureOptions(Enum): """ Enumeration of possible values for the `video_capture` config key. - - ENABLE_VIDEO_CAPTURE: All video is captured. - - DISABLE_VIDEO_CAPTURE: No video is captured. + - ENABLE_VIDEO_CAPTURE (default): All video is captured. + - DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a + combination of the next two values. - BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header containing the word "video" is not captured. - DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled. - Note: Ensuring full video MIME type blocking requires an entry in the - Warcprox-Meta header `mime-type-filters` key to fully block videos by - MIME type. + Note: Ensuring full video MIME type blocking requires an additional entry in the + Warcprox-Meta header `mime-type-filters` key. """ + ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE" DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE" BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES" diff --git a/brozzler/worker.py b/brozzler/worker.py index ae6c750..89179e7 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -257,9 +257,7 @@ class BrozzlerWorker: "skipping video content: video MIME type capture disabled for site" ) elif site.pdfs_only and not self._is_pdf(page_headers): - self.logger.info( - "skipping non-PDF content: PDFs only option enabled" - ) + self.logger.info("skipping non-PDF content: PDFs only option enabled") else: self._fetch_url(site, page=page) else: @@ -334,7 +332,8 @@ class BrozzlerWorker: Determinse if the page's Content-Type header specifies that it is a PDF. """ return ( - "content-type" in page_headers and "application/pdf" in page_headers["content-type"] + "content-type" in page_headers + and "application/pdf" in page_headers["content-type"] ) def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None): diff --git a/job-conf.rst b/job-conf.rst index 08707b6..be5a7ae 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -107,6 +107,18 @@ Puts a cap on the number of sites belonging to a given job that can be brozzled simultaneously across the cluster. Addresses the problem of a job with many seeds starving out other jobs. +``pdfs_only`` +~~~~~~~~~~~~~~~~~~~~~ ++---------+----------+-----------+ +| type | required | default | ++=========+==========+===========+ +| boolean | no | ``false`` | ++---------+----------+-----------+ +Limits capture to PDFs based on MIME type. This value will only impact +processing of outlinks within Brozzler. Fully limiting a crawl to only PDFs +requires an additional entry in the Warcprox-Meta header ``mime-type-filters`` +key to fully block videos by MIME type. + ``seeds`` ~~~~~~~~~ +------------------------+----------+---------+ @@ -158,6 +170,24 @@ other fields like checkboxes and/or hidden fields, brozzler will leave the default values in place. Brozzler submits login forms after page load. Then brozzling proceeds as usual. +``video_capture`` +~~~~~~~~~~~~~~~~~ ++--------+----------+--------------------------+ +| type | required | default | ++========+==========+==========================+ +| string | yes | ``ENABLE_VIDEO_CAPTURE`` | ++--------+----------+--------------------------+ +Determines the level of video capture for the seed. This is an enumeration with four possible values: +* ENABLE_VIDEO_CAPTURE (default): All video is captured. +* DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a + combination of the next two values. +* BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header containing + the word "video" is not captured. +* DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled. + +*Note: Ensuring full video MIME type blocking requires an additional entry in +the Warcprox-Meta header `mime-type-filters` key.* + Seed-level / top-level settings ------------------------------- These are seed settings that can also be specified at the top level, in which From 6fdc2b9a57b7f189c927f03b4ec8641173502605 Mon Sep 17 00:00:00 2001 From: Gretchen Miller Date: Fri, 20 Sep 2024 17:16:07 -0700 Subject: [PATCH 10/12] WT-2950 fix RST formatting --- job-conf.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/job-conf.rst b/job-conf.rst index be5a7ae..9873e14 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -178,6 +178,7 @@ Then brozzling proceeds as usual. | string | yes | ``ENABLE_VIDEO_CAPTURE`` | +--------+----------+--------------------------+ Determines the level of video capture for the seed. This is an enumeration with four possible values: + * ENABLE_VIDEO_CAPTURE (default): All video is captured. * DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a combination of the next two values. From 12db06ae8cd1c689d18fab94e5d5c1310b3cb378 Mon Sep 17 00:00:00 2001 From: Gretchen Miller Date: Mon, 23 Sep 2024 16:38:35 -0700 Subject: [PATCH 11/12] WT-2950 fix typos --- brozzler/worker.py | 2 +- job-conf.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index 89179e7..faa7081 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -329,7 +329,7 @@ class BrozzlerWorker: def _is_pdf(self, page_headers) -> bool: """ - Determinse if the page's Content-Type header specifies that it is a PDF. + Determines if the page's Content-Type header specifies that it is a PDF. """ return ( "content-type" in page_headers diff --git a/job-conf.rst b/job-conf.rst index 9873e14..d0428ca 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -117,7 +117,7 @@ seeds starving out other jobs. Limits capture to PDFs based on MIME type. This value will only impact processing of outlinks within Brozzler. Fully limiting a crawl to only PDFs requires an additional entry in the Warcprox-Meta header ``mime-type-filters`` -key to fully block videos by MIME type. +key. ``seeds`` ~~~~~~~~~ From 36b17d2a668cc8e61de17bea73eb3c1fdef00164 Mon Sep 17 00:00:00 2001 From: Gretchen Miller Date: Mon, 30 Sep 2024 16:55:20 -0700 Subject: [PATCH 12/12] WT2590 addressing PR feedback --- brozzler/worker.py | 6 +++--- job-conf.rst | 10 ++++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index faa7081..8416ec0 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -249,15 +249,15 @@ class BrozzlerWorker: if not self._needs_browsing(page_headers): self.logger.info("needs fetch: %s", page) - if site.video_capture in [ + if site.pdfs_only and not self._is_pdf(page_headers): + self.logger.info("skipping non-PDF content: PDFs only option enabled") + elif site.video_capture in [ VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value, VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value, ] and self._is_video_type(page_headers): self.logger.info( "skipping video content: video MIME type capture disabled for site" ) - elif site.pdfs_only and not self._is_pdf(page_headers): - self.logger.info("skipping non-PDF content: PDFs only option enabled") else: self._fetch_url(site, page=page) else: diff --git a/job-conf.rst b/job-conf.rst index d0428ca..5378bac 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -114,10 +114,12 @@ seeds starving out other jobs. +=========+==========+===========+ | boolean | no | ``false`` | +---------+----------+-----------+ -Limits capture to PDFs based on MIME type. This value will only impact -processing of outlinks within Brozzler. Fully limiting a crawl to only PDFs -requires an additional entry in the Warcprox-Meta header ``mime-type-filters`` -key. +Limits capture to PDFs based on the MIME type set in the HTTP response's +Content-Type header. This value only impacts processing of outlinks within +Brozzler. + +*Note: Ensuring comprehensive limiting to only PDFs requires an additional +entry in the Warcprox-Meta header `mime-type-filters` key.* ``seeds`` ~~~~~~~~~