mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
WT-2950 cleaning up video capture options handling; PDFs only handling on outlinks
This commit is contained in:
parent
8275f3ea16
commit
dca9630982
@ -222,8 +222,21 @@ class Job(doublethink.Document, ElapsedMixIn):
|
||||
|
||||
|
||||
class VideoCaptureOptions(Enum):
    """
    Enumeration of possible values for the `video_capture` config key.

    - ENABLE_VIDEO_CAPTURE: All video is captured.
    - DISABLE_VIDEO_CAPTURE: No video is captured.
    - BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header
      containing the word "video" is not captured.
    - DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.

    Each member's value is the same string as its name.

    Note: Fully blocking videos by MIME type also requires an entry in the
    Warcprox-Meta header `mime-type-filters` key.
    """

    ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE"
    # NOTE(review): LIMIT_VIDEO_CAPTURE is not described in the docstring and
    # its semantics are not visible here -- TODO confirm whether it is still
    # a supported value.
    LIMIT_VIDEO_CAPTURE = "LIMIT_VIDEO_CAPTURE"
    DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE"
    BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES"
    DISABLE_YTDLP_CAPTURE = "DISABLE_YTDLP_CAPTURE"
|
||||
|
||||
|
||||
|
@ -249,12 +249,16 @@ class BrozzlerWorker:
|
||||
|
||||
if not self._needs_browsing(page_headers):
|
||||
self.logger.info("needs fetch: %s", page)
|
||||
if (
|
||||
self._is_video_type(page_headers)
|
||||
and site.video_capture != VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value
|
||||
):
|
||||
if site.video_capture in [
|
||||
VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
|
||||
VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value,
|
||||
] and self._is_video_type(page_headers):
|
||||
self.logger.info(
|
||||
"skipping video content: video capture disabled for site"
|
||||
"skipping video content: video MIME type capture disabled for site"
|
||||
)
|
||||
elif site.pdfs_only and not self._is_pdf(page_headers):
|
||||
self.logger.info(
|
||||
"skipping non-PDF content: PDFs only option enabled"
|
||||
)
|
||||
else:
|
||||
self._fetch_url(site, page=page)
|
||||
@ -317,10 +321,22 @@ class BrozzlerWorker:
|
||||
)
|
||||
|
||||
def _is_video_type(self, page_headers) -> bool:
|
||||
"""
|
||||
Determines if the page's Content-Type header specifies that it contains
|
||||
a video.
|
||||
"""
|
||||
return (
|
||||
"content-type" in page_headers and "video" in page_headers["content-type"]
|
||||
)
|
||||
|
||||
def _is_pdf(self, page_headers) -> bool:
|
||||
"""
|
||||
Determinse if the page's Content-Type header specifies that it is a PDF.
|
||||
"""
|
||||
return (
|
||||
"content-type" in page_headers and "application/pdf" in page_headers["content-type"]
|
||||
)
|
||||
|
||||
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
|
||||
def _on_screenshot(screenshot_jpeg):
|
||||
if on_screenshot:
|
||||
|
@ -39,7 +39,10 @@ def should_ytdlp(site, page, page_status):
|
||||
if page_status != 200:
|
||||
logging.info("skipping ytdlp: non-200 page status %s", page_status)
|
||||
return False
|
||||
if site.video_capture != VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value:
|
||||
if site.video_capture in [
|
||||
VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
|
||||
VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value,
|
||||
]:
|
||||
logging.info("skipping ytdlp: site has video capture disabled")
|
||||
return False
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user