Merge a6c827ebb9ddb447c49032ac4be1b6c7bcd62f26 into 42b4a88c963eb480b3c19117c19eac7e146fa8ff

This commit is contained in:
Gretchen Leigh Miller 2025-04-05 22:39:21 +00:00 committed by GitHub
commit 26abedb28b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 105 additions and 46 deletions

View File

@ -665,21 +665,6 @@ def brozzler_worker(argv=None):
finally:
signal.signal(signal.SIGQUIT, dump_state)
def get_skip_av_seeds():
    """
    Load the set of seed IDs listed in the skip_av_seeds file.

    The file is expected to hold one integer seed ID per line. Blank lines
    are ignored, so a stray empty line (for example at the end of the file)
    does not cause every other ID to be discarded.

    Returns:
        A set of ints read from the file, or an empty set when the file
        cannot be opened or contains an entry that is not an integer.
    """
    # TODO: develop UI and refactor
    SKIP_AV_SEEDS_FILE = "/opt/local/brozzler/skip_av_seeds.txt"
    try:
        # make set from seed IDs in SKIP_AV_SEEDS_FILE
        with open(SKIP_AV_SEEDS_FILE) as skips:
            # int() tolerates surrounding whitespace but not an empty
            # string, so whitespace-only lines have to be filtered out
            skip_av_seeds = {int(line) for line in skips if line.strip()}
        logger.info(
            "running with skip_av_seeds file", skip_av_seeds=SKIP_AV_SEEDS_FILE
        )
    except Exception:
        # deliberately best-effort: a missing or malformed file must not
        # prevent the worker from starting
        skip_av_seeds = set()
        logger.info("running with empty skip_av_seeds")
    return skip_av_seeds
def get_ytdlp_proxy_endpoints():
YTDLP_PROXY_ENDPOINTS_FILE = args.ytdlp_proxy_file
try:
@ -699,12 +684,10 @@ def brozzler_worker(argv=None):
rr = rethinker(args)
frontier = brozzler.RethinkDbFrontier(rr)
service_registry = doublethink.ServiceRegistry(rr)
skip_av_seeds_from_file = get_skip_av_seeds()
ytdlp_proxy_endpoints_from_file = get_ytdlp_proxy_endpoints()
worker = brozzler.worker.BrozzlerWorker(
frontier,
service_registry,
skip_av_seeds=skip_av_seeds_from_file,
ytdlp_proxy_endpoints=ytdlp_proxy_endpoints_from_file,
max_browsers=int(args.max_browsers),
chrome_exe=args.chrome_exe,

View File

@ -95,8 +95,13 @@ seeds:
password:
type: string
video_capture:
type: string
<<: *multi_level_options
max_claimed_sites:
type: integer
pdfs_only:
type: boolean

View File

@ -25,6 +25,7 @@ import os
import urllib
import uuid
import zlib
from enum import Enum
from typing import Optional
import cerberus
@ -101,6 +102,8 @@ def new_job(frontier, job_conf):
job.id = job_conf["id"]
if "max_claimed_sites" in job_conf:
job.max_claimed_sites = job_conf["max_claimed_sites"]
if "pdfs_only" in job_conf:
job.pdfs_only = job_conf["pdfs_only"]
job.save()
sites = []
@ -199,6 +202,8 @@ class Job(doublethink.Document, ElapsedMixIn):
def populate_defaults(self):
if "status" not in self:
self.status = "ACTIVE"
if "pdfs_only" not in self:
self.pdfs_only = False
if "starts_and_stops" not in self:
if self.get("started"): # backward compatibility
self.starts_and_stops = [
@ -221,6 +226,26 @@ class Job(doublethink.Document, ElapsedMixIn):
self.starts_and_stops[-1]["stop"] = doublethink.utcnow()
class VideoCaptureOptions(Enum):
    """
    Allowed settings for the `video_capture` config key.

    Members:
        ENABLE_VIDEO_CAPTURE: Capture all video. This is the default.
        DISABLE_VIDEO_CAPTURE: Capture no video. Effectively the
            combination of BLOCK_VIDEO_MIME_TYPES and
            DISABLE_YTDLP_CAPTURE.
        BLOCK_VIDEO_MIME_TYPES: Do not capture any response whose
            Content-Type header contains the word "video".
        DISABLE_YTDLP_CAPTURE: Do not capture video via yt-dlp.

    Note: fully blocking video MIME types additionally requires an entry in
    the `mime-type-filters` key of the Warcprox-Meta header.
    """

    # Each member's value is identical to its name; the value is the string
    # stored on and compared against `site.video_capture`.
    ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE"
    DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE"
    BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES"
    DISABLE_YTDLP_CAPTURE = "DISABLE_YTDLP_CAPTURE"
class Site(doublethink.Document, ElapsedMixIn):
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
table = "sites"
@ -236,8 +261,8 @@ class Site(doublethink.Document, ElapsedMixIn):
self.last_claimed = brozzler.EPOCH_UTC
if "scope" not in self:
self.scope = {}
if "skip_ytdlp" not in self:
self.skip_ytdlp = None
if "video_capture" not in self:
self.video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value
# backward compatibility
if "surt" in self.scope:

View File

@ -38,6 +38,7 @@ from urllib3.exceptions import ProxyError, TimeoutError
import brozzler
import brozzler.browser
from brozzler.model import VideoCaptureOptions
from . import metrics
@ -60,7 +61,6 @@ class BrozzlerWorker:
self,
frontier,
service_registry=None,
skip_av_seeds=None,
ytdlp_proxy_endpoints=None,
max_browsers=1,
chrome_exe="chromium-browser",
@ -86,7 +86,6 @@ class BrozzlerWorker:
):
self._frontier = frontier
self._service_registry = service_registry
self._skip_av_seeds = skip_av_seeds
self._ytdlp_proxy_endpoints = ytdlp_proxy_endpoints
self._max_browsers = max_browsers
@ -278,14 +277,17 @@ class BrozzlerWorker:
img.save(out, "jpeg", quality=95)
return out.getbuffer()
def should_ytdlp(self, logger, site, page, page_status, skip_av_seeds):
def should_ytdlp(self, logger, site, page, page_status):
# called only after we've passed needs_browsing() check
if page_status != 200:
logger.info("skipping ytdlp: non-200 page status", page_status=page_status)
return False
if site.skip_ytdlp:
logger.info("skipping ytdlp: site marked skip_ytdlp")
if site.video_capture in [
VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value,
]:
logger.info("skipping ytdlp: site has video capture disabled")
return False
ytdlp_url = page.redirect_url if page.redirect_url else page.url
@ -293,21 +295,6 @@ class BrozzlerWorker:
if "chrome-error:" in ytdlp_url:
return False
ytdlp_seed = (
site["metadata"]["ait_seed_id"]
if "metadata" in site and "ait_seed_id" in site["metadata"]
else None
)
# TODO: develop UI and refactor
if ytdlp_seed:
if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds:
logger.info("skipping ytdlp: site in skip_av_seeds")
site.skip_ytdlp = True
return False
else:
site.skip_ytdlp = False
return True
@metrics.brozzler_page_processing_duration_seconds.time()
@ -329,7 +316,17 @@ class BrozzlerWorker:
if not self._needs_browsing(page_headers):
page_logger.info("needs fetch")
self._fetch_url(site, page=page)
if site.pdfs_only and not self._is_pdf(page_headers):
page_logger.info("skipping non-PDF content: PDFs only option enabled")
elif site.video_capture in [
VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value,
] and self._is_video_type(page_headers):
page_logger.info(
"skipping video content: video MIME type capture disabled for site"
)
else:
self._fetch_url(site, page=page)
else:
page_logger.info("needs browsing")
try:
@ -344,7 +341,7 @@ class BrozzlerWorker:
page_logger.info("page interstitial shown (http auth)")
if enable_youtube_dl and self.should_ytdlp(
page_logger, site, page, status_code, self._skip_av_seeds
page_logger, site, page, status_code
):
try:
from . import ydl
@ -403,13 +400,29 @@ class BrozzlerWorker:
url_logger.warning("Failed to get headers", exc_info=True)
return {}
def _needs_browsing(self, page_headers):
if (
def _needs_browsing(self, page_headers) -> bool:
return not (
"content-type" in page_headers
and "html" not in page_headers["content-type"]
):
return False
return True
)
def _is_video_type(self, page_headers) -> bool:
"""
Determines if the page's Content-Type header specifies that it contains
a video.
"""
return (
"content-type" in page_headers and "video" in page_headers["content-type"]
)
def _is_pdf(self, page_headers) -> bool:
"""
Determines if the page's Content-Type header specifies that it is a PDF.
"""
return (
"content-type" in page_headers
and "application/pdf" in page_headers["content-type"]
)
@metrics.brozzler_browsing_duration_seconds.time()
@metrics.brozzler_in_progress_browses.track_inprogress()

0
dev/pre-commit Normal file → Executable file
View File

View File

@ -107,6 +107,20 @@ Puts a cap on the number of sites belonging to a given job that can be brozzled
simultaneously across the cluster. Addresses the problem of a job with many
seeds starving out other jobs.
``pdfs_only``
~~~~~~~~~~~~~~~~~~~~~
+---------+----------+-----------+
| type | required | default |
+=========+==========+===========+
| boolean | no | ``false`` |
+---------+----------+-----------+
Limits capture to PDFs based on the MIME type set in the HTTP response's
Content-Type header. This value only impacts processing of outlinks within
Brozzler.
**Note:** Ensuring comprehensive limiting to only PDFs requires an additional
entry in the Warcprox-Meta header ``mime-type-filters`` key.
``seeds``
~~~~~~~~~
+------------------------+----------+---------+
@ -158,6 +172,25 @@ other fields like checkboxes and/or hidden fields, brozzler will leave
the default values in place. Brozzler submits login forms after page load.
Then brozzling proceeds as usual.
``video_capture``
~~~~~~~~~~~~~~~~~
+--------+----------+--------------------------+
| type | required | default |
+========+==========+==========================+
| string | no       | ``ENABLE_VIDEO_CAPTURE`` |
+--------+----------+--------------------------+
Determines the level of video capture for the seed. This is an enumeration with four possible values:
* ENABLE_VIDEO_CAPTURE (default): All video is captured.
* DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
combination of the next two values.
* BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header containing
the word "video" is not captured.
* DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.
**Note:** Ensuring full video MIME type blocking requires an additional entry
in the Warcprox-Meta header ``mime-type-filters`` key.
Seed-level / top-level settings
-------------------------------
These are seed settings that can also be specified at the top level, in which