diff --git a/brozzler/cli.py b/brozzler/cli.py index 653e16a..3927bfe 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -585,27 +585,12 @@ def brozzler_worker(argv=None): finally: signal.signal(signal.SIGQUIT, dump_state) - def get_skip_av_seeds(): - # TODO: develop UI and refactor - SKIP_AV_SEEDS_FILE = "/opt/local/brozzler/skip_av_seeds.txt" - try: - # make set from seed IDs in SKIP_AV_SEEDS_FILE - with open(SKIP_AV_SEEDS_FILE) as skips: - skip_av_seeds = {int(l) for l in skips.readlines()} - logging.info("running with skip_av_seeds file %s" % SKIP_AV_SEEDS_FILE) - except Exception as e: - skip_av_seeds = set() - logging.info("running with empty skip_av_seeds") - return skip_av_seeds - rr = rethinker(args) frontier = brozzler.RethinkDbFrontier(rr) service_registry = doublethink.ServiceRegistry(rr) - skip_av_seeds_from_file = get_skip_av_seeds() worker = brozzler.worker.BrozzlerWorker( frontier, service_registry, - skip_av_seeds=skip_av_seeds_from_file, max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe, proxy=args.proxy, diff --git a/brozzler/model.py b/brozzler/model.py index fe9f8c0..fbdd6c7 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -34,6 +34,7 @@ import urllib import uuid import yaml import zlib +from enum import Enum from typing import Optional @@ -100,6 +101,8 @@ def new_job(frontier, job_conf): job.id = job_conf["id"] if "max_claimed_sites" in job_conf: job.max_claimed_sites = job_conf["max_claimed_sites"] + if "pdfs_only" in job_conf: + job.pdfs_only = job_conf["pdfs_only"] job.save() sites = [] @@ -198,6 +201,8 @@ class Job(doublethink.Document, ElapsedMixIn): def populate_defaults(self): if not "status" in self: self.status = "ACTIVE" + if "pdfs_only" not in self: + self.pdfs_only = False if not "starts_and_stops" in self: if self.get("started"): # backward compatibility self.starts_and_stops = [ @@ -220,33 +225,53 @@ class Job(doublethink.Document, ElapsedMixIn): self.starts_and_stops[-1]["stop"] = doublethink.utcnow() 
+class VideoCaptureOptions(Enum): + """ + Enumeration of possible values for the `video_capture` config key. + - ENABLE_VIDEO_CAPTURE (default): All video is captured. + - DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a + combination of the next two values. + - BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header + containing the word "video" is not captured. + - DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled. + + Note: Ensuring full video MIME type blocking requires an additional entry in the + Warcprox-Meta header `mime-type-filters` key. + """ + + ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE" + DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE" + BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES" + DISABLE_YTDLP_CAPTURE = "DISABLE_YTDLP_CAPTURE" + + class Site(doublethink.Document, ElapsedMixIn): logger = logging.getLogger(__module__ + "." + __qualname__) table = "sites" def populate_defaults(self): - if not "status" in self: + if "status" not in self: self.status = "ACTIVE" - if not "claimed" in self: + if "claimed" not in self: self.claimed = False - if not "last_disclaimed" in self: + if "last_disclaimed" not in self: self.last_disclaimed = brozzler.EPOCH_UTC - if not "last_claimed" in self: + if "last_claimed" not in self: self.last_claimed = brozzler.EPOCH_UTC - if not "scope" in self: + if "scope" not in self: self.scope = {} - if not "skip_ytdlp" in self: - self.skip_ytdlp = None + if "video_capture" not in self: + self.video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value # backward compatibility if "surt" in self.scope: - if not "accepts" in self.scope: + if "accepts" not in self.scope: self.scope["accepts"] = [] self.scope["accepts"].append({"surt": self.scope["surt"]}) del self.scope["surt"] # backward compatibility - if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope: + if "max_hops_off_surt" in self.scope and "max_hops_off" not in self.scope: self.scope["max_hops_off"] = 
self.scope["max_hops_off_surt"] if "max_hops_off_surt" in self.scope: del self.scope["max_hops_off_surt"] @@ -256,7 +281,7 @@ class Site(doublethink.Document, ElapsedMixIn): brozzler.site_surt_canon(self.seed).ssurt().decode("ascii") ) - if not "starts_and_stops" in self: + if "starts_and_stops" not in self: if self.get("start_time"): # backward compatibility self.starts_and_stops = [ {"start": self.get("start_time"), "stop": None} @@ -271,7 +296,7 @@ class Site(doublethink.Document, ElapsedMixIn): return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed) def _accept_ssurt_if_not_redundant(self, ssurt): - if not "accepts" in self.scope: + if "accepts" not in self.scope: self.scope["accepts"] = [] simple_rule_ssurts = ( rule["ssurt"] diff --git a/brozzler/worker.py b/brozzler/worker.py index 075844b..abe84de 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -21,6 +21,7 @@ limitations under the License. import logging import brozzler import brozzler.browser +from brozzler.model import VideoCaptureOptions import threading import time import urllib.request @@ -55,7 +56,6 @@ class BrozzlerWorker: self, frontier, service_registry=None, - skip_av_seeds=None, max_browsers=1, chrome_exe="chromium-browser", warcprox_auto=False, @@ -78,7 +78,6 @@ class BrozzlerWorker: ): self._frontier = frontier self._service_registry = service_registry - self._skip_av_seeds = skip_av_seeds self._max_browsers = max_browsers self._warcprox_auto = warcprox_auto @@ -268,7 +267,17 @@ class BrozzlerWorker: if not self._needs_browsing(page_headers): self.logger.info("needs fetch: %s", page) - self._fetch_url(site, page=page) + if site.pdfs_only and not self._is_pdf(page_headers): + self.logger.info("skipping non-PDF content: PDFs only option enabled") + elif site.video_capture in [ + VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value, + VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value, + ] and self._is_video_type(page_headers): + self.logger.info( + "skipping video content: video 
MIME type capture disabled for site" + ) + else: + self._fetch_url(site, page=page) else: self.logger.info("needs browsing: %s", page) try: @@ -280,7 +289,7 @@ class BrozzlerWorker: self.logger.info("page interstitial shown (http auth): %s", page) if enable_youtube_dl and ydl.should_ytdlp( - site, page, browser.websock_thread.page_status, self._skip_av_seeds + site, page, browser.websock_thread.page_status ): try: ydl_outlinks = ydl.do_youtube_dl(self, site, page) @@ -327,13 +336,29 @@ class BrozzlerWorker: self.logger.warning("Failed to get headers for %s: %s", page.url, e) return {} - def _needs_browsing(self, page_headers): - if ( + def _needs_browsing(self, page_headers) -> bool: + return not bool( "content-type" in page_headers and "html" not in page_headers["content-type"] - ): - return False - return True + ) + + def _is_video_type(self, page_headers) -> bool: + """ + Determines if the page's Content-Type header specifies that it contains + a video. + """ + return ( + "content-type" in page_headers and "video" in page_headers["content-type"] + ) + + def _is_pdf(self, page_headers) -> bool: + """ + Determines if the page's Content-Type header specifies that it is a PDF. 
+ """ + return ( + "content-type" in page_headers + and "application/pdf" in page_headers["content-type"] + ) @metrics.brozzler_page_processing_duration_seconds.time() @metrics.brozzler_in_progress_pages.track_inprogress() diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 8691351..d789892 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -20,6 +20,7 @@ import logging import yt_dlp from yt_dlp.utils import match_filter_func import brozzler +from brozzler.model import VideoCaptureOptions import urllib.request import tempfile import urlcanon @@ -58,14 +59,17 @@ def _timestamp4datetime(timestamp): int(timestamp[-2:]) ) -def should_ytdlp(site, page, page_status, skip_av_seeds): +def should_ytdlp(site, page, page_status): # called only after we've passed needs_browsing() check if page_status != 200: logging.info("skipping ytdlp: non-200 page status %s", page_status) return False - if site.skip_ytdlp: - logging.info("skipping ytdlp: site marked skip_ytdlp") + if site.video_capture in [ + VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value, + VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value, + ]: + logging.info("skipping ytdlp: site has video capture disabled") return False ytdlp_url = page.redirect_url if page.redirect_url else page.url @@ -73,49 +77,6 @@ def should_ytdlp(site, page, page_status, skip_av_seeds): if "chrome-error:" in ytdlp_url: return False - ytdlp_seed = ( - site["metadata"]["ait_seed_id"] - if "metadata" in site and "ait_seed_id" in site["metadata"] - else None - ) - - # TODO: develop UI and refactor - if ytdlp_seed: - if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds: - logging.info("skipping ytdlp: site in skip_av_seeds") - site.skip_ytdlp = True - return False - else: - site.skip_ytdlp = False - - logging.info("checking containing page %s for seed %s", ytdlp_url, ytdlp_seed) - - if ytdlp_seed and "youtube.com/watch?v" in ytdlp_url: - logging.info("found youtube watch page %r", ytdlp_url) - # connect to bmiller-dev cluster, keyspace video; 
we can modify default timeout in cassandra.yaml - cluster = Cluster(["207.241.235.189"], protocol_version=5) - session = cluster.connect("video") - containing_page_query = "SELECT * from videos where scope=%s and containing_page_url=%s LIMIT 1" - future = session.execute_async(containing_page_query, [f"s:{ytdlp_seed}", str(urlcanon.aggressive(ytdlp_url))]) - try: - rows = future.result() - except ReadTimeout: - logging.exception("Query timed out:") - - if len(rows.current_rows) == 0: - logging.info("no results returned from videos query") - return True - - for row in rows: - logging.info("video query found %r", row) - ytdlp_timestamp = datetime.datetime(*_timestamp4datetime(row.video_timestamp)) - logging.info("ytdlp_timestamp: %s", ytdlp_timestamp) - time_diff = datetime.datetime.now() - ytdlp_timestamp - # TODO: make variable for timedelta - if time_diff < datetime.timedelta(days = 90): - logging.info("skipping ytdlp for %s since there's a recent capture", row.containing_page_url) - return False - return True def isyoutubehost(url): diff --git a/job-conf.rst b/job-conf.rst index 08707b6..5378bac 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -107,6 +107,20 @@ Puts a cap on the number of sites belonging to a given job that can be brozzled simultaneously across the cluster. Addresses the problem of a job with many seeds starving out other jobs. +``pdfs_only`` +~~~~~~~~~~~~~~~~~~~~~ ++---------+----------+-----------+ +| type | required | default | ++=========+==========+===========+ +| boolean | no | ``false`` | ++---------+----------+-----------+ +Limits capture to PDFs based on the MIME type set in the HTTP response's +Content-Type header. This value only impacts processing of outlinks within +Brozzler. 
+ +*Note: Ensuring comprehensive limiting to only PDFs requires an additional +entry in the Warcprox-Meta header `mime-type-filters` key.* + ``seeds`` ~~~~~~~~~ +------------------------+----------+---------+ @@ -158,6 +172,25 @@ other fields like checkboxes and/or hidden fields, brozzler will leave the default values in place. Brozzler submits login forms after page load. Then brozzling proceeds as usual. +``video_capture`` +~~~~~~~~~~~~~~~~~ ++--------+----------+--------------------------+ +| type | required | default | ++========+==========+==========================+ +| string | no | ``ENABLE_VIDEO_CAPTURE`` | ++--------+----------+--------------------------+ +Determines the level of video capture for the seed. This is an enumeration with four possible values: + +* ENABLE_VIDEO_CAPTURE (default): All video is captured. +* DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a + combination of the next two values. +* BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header containing + the word "video" is not captured. +* DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled. + +*Note: Ensuring full video MIME type blocking requires an additional entry in +the Warcprox-Meta header `mime-type-filters` key.* + Seed-level / top-level settings ------------------------------- These are seed settings that can also be specified at the top level, in which