diff --git a/brozzler/model.py b/brozzler/model.py index ab802bb..68ba037 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -222,8 +222,21 @@ class Job(doublethink.Document, ElapsedMixIn): class VideoCaptureOptions(Enum): + """ + Enumeration of possible values for the `video_capture` config key. + - ENABLE_VIDEO_CAPTURE: All video is captured. + - DISABLE_VIDEO_CAPTURE: No video is captured. + - BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header + containing the word "video" is not captured. + - DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled. + + Note: Ensuring full video MIME type blocking requires an entry in the + Warcprox-Meta header `mime-type-filters` key to fully block videos by + MIME type. + """ ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE" - LIMIT_VIDEO_CAPTURE = "LIMIT_VIDEO_CAPTURE" + DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE" + BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES" DISABLE_YTDLP_CAPTURE = "DISABLE_YTDLP_CAPTURE" diff --git a/brozzler/worker.py b/brozzler/worker.py index 8601046..ae6c750 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -249,12 +249,16 @@ class BrozzlerWorker: if not self._needs_browsing(page_headers): self.logger.info("needs fetch: %s", page) - if ( - self._is_video_type(page_headers) - and site.video_capture != VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value - ): + if site.video_capture in [ + VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value, + VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value, + ] and self._is_video_type(page_headers): self.logger.info( - "skipping video content: video capture disabled for site" + "skipping video content: video MIME type capture disabled for site" + ) + elif site.pdfs_only and not self._is_pdf(page_headers): + self.logger.info( + "skipping non-PDF content: PDFs only option enabled" ) else: self._fetch_url(site, page=page) @@ -317,10 +321,22 @@ class BrozzlerWorker: ) def _is_video_type(self, page_headers) -> bool: + """ + Determines if the 
page's Content-Type header specifies that it contains + a video. + """ return ( "content-type" in page_headers and "video" in page_headers["content-type"] ) + def _is_pdf(self, page_headers) -> bool: + """ + Determines if the page's Content-Type header specifies that it is a PDF. + """ + return ( + "content-type" in page_headers and "application/pdf" in page_headers["content-type"] + ) + def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None): def _on_screenshot(screenshot_jpeg): if on_screenshot: diff --git a/brozzler/ydl.py b/brozzler/ydl.py index c41a92b..0ee82f6 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -39,7 +39,10 @@ def should_ytdlp(site, page, page_status): if page_status != 200: logging.info("skipping ytdlp: non-200 page status %s", page_status) return False - if site.video_capture != VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value: + if site.video_capture in [ + VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value, + VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value, + ]: logging.info("skipping ytdlp: site has video capture disabled") return False