WT-2950 cleaning up video capture options handling; PDFs only handling on outlinks

This commit is contained in:
Gretchen Miller 2024-09-20 14:17:07 -07:00
parent 8275f3ea16
commit dca9630982
3 changed files with 39 additions and 7 deletions

View File

@ -222,8 +222,21 @@ class Job(doublethink.Document, ElapsedMixIn):
class VideoCaptureOptions(Enum):
    """
    Enumeration of possible values for the `video_capture` config key.

    Every member's value is the same string as its name.

    - ENABLE_VIDEO_CAPTURE: All video is captured.
    - DISABLE_VIDEO_CAPTURE: No video is captured.
    - BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header
      containing the word "video" is not captured.
    - DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.

    Note: Fully blocking videos by MIME type also requires an entry in the
    Warcprox-Meta header `mime-type-filters` key.
    """

    ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE"
    # NOTE(review): LIMIT_VIDEO_CAPTURE is not described in the docstring
    # above; its intended meaning cannot be determined from this class alone.
    # Confirm whether it is still a supported value.
    LIMIT_VIDEO_CAPTURE = "LIMIT_VIDEO_CAPTURE"
    DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE"
    BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES"
    DISABLE_YTDLP_CAPTURE = "DISABLE_YTDLP_CAPTURE"

View File

@ -249,12 +249,16 @@ class BrozzlerWorker:
if not self._needs_browsing(page_headers):
self.logger.info("needs fetch: %s", page)
if (
self._is_video_type(page_headers)
and site.video_capture != VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value
):
if site.video_capture in [
VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value,
] and self._is_video_type(page_headers):
self.logger.info(
"skipping video content: video capture disabled for site"
"skipping video content: video MIME type capture disabled for site"
)
elif site.pdfs_only and not self._is_pdf(page_headers):
self.logger.info(
"skipping non-PDF content: PDFs only option enabled"
)
else:
self._fetch_url(site, page=page)
@ -317,10 +321,22 @@ class BrozzlerWorker:
)
def _is_video_type(self, page_headers) -> bool:
"""
Determines if the page's Content-Type header specifies that it contains
a video.
"""
return (
"content-type" in page_headers and "video" in page_headers["content-type"]
)
def _is_pdf(self, page_headers) -> bool:
"""
Determinse if the page's Content-Type header specifies that it is a PDF.
"""
return (
"content-type" in page_headers and "application/pdf" in page_headers["content-type"]
)
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
def _on_screenshot(screenshot_jpeg):
if on_screenshot:

View File

@ -39,7 +39,10 @@ def should_ytdlp(site, page, page_status):
if page_status != 200:
logging.info("skipping ytdlp: non-200 page status %s", page_status)
return False
if site.video_capture != VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value:
if site.video_capture in [
VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value,
]:
logging.info("skipping ytdlp: site has video capture disabled")
return False