mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 15:55:49 -04:00
Merge a6c827ebb9ddb447c49032ac4be1b6c7bcd62f26 into 42b4a88c963eb480b3c19117c19eac7e146fa8ff
This commit is contained in:
commit
26abedb28b
@ -665,21 +665,6 @@ def brozzler_worker(argv=None):
|
||||
finally:
|
||||
signal.signal(signal.SIGQUIT, dump_state)
|
||||
|
||||
def get_skip_av_seeds():
|
||||
# TODO: develop UI and refactor
|
||||
SKIP_AV_SEEDS_FILE = "/opt/local/brozzler/skip_av_seeds.txt"
|
||||
try:
|
||||
# make set from seed IDs in SKIP_AV_SEEDS_FILE
|
||||
with open(SKIP_AV_SEEDS_FILE) as skips:
|
||||
skip_av_seeds = {int(line) for line in skips.readlines()}
|
||||
logger.info(
|
||||
"running with skip_av_seeds file", skip_av_seeds=SKIP_AV_SEEDS_FILE
|
||||
)
|
||||
except Exception:
|
||||
skip_av_seeds = set()
|
||||
logger.info("running with empty skip_av_seeds")
|
||||
return skip_av_seeds
|
||||
|
||||
def get_ytdlp_proxy_endpoints():
|
||||
YTDLP_PROXY_ENDPOINTS_FILE = args.ytdlp_proxy_file
|
||||
try:
|
||||
@ -699,12 +684,10 @@ def brozzler_worker(argv=None):
|
||||
rr = rethinker(args)
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
service_registry = doublethink.ServiceRegistry(rr)
|
||||
skip_av_seeds_from_file = get_skip_av_seeds()
|
||||
ytdlp_proxy_endpoints_from_file = get_ytdlp_proxy_endpoints()
|
||||
worker = brozzler.worker.BrozzlerWorker(
|
||||
frontier,
|
||||
service_registry,
|
||||
skip_av_seeds=skip_av_seeds_from_file,
|
||||
ytdlp_proxy_endpoints=ytdlp_proxy_endpoints_from_file,
|
||||
max_browsers=int(args.max_browsers),
|
||||
chrome_exe=args.chrome_exe,
|
||||
|
@ -95,8 +95,13 @@ seeds:
|
||||
password:
|
||||
type: string
|
||||
|
||||
video_capture:
|
||||
type: string
|
||||
|
||||
<<: *multi_level_options
|
||||
|
||||
max_claimed_sites:
|
||||
type: integer
|
||||
|
||||
pdfs_only:
|
||||
type: boolean
|
||||
|
@ -25,6 +25,7 @@ import os
|
||||
import urllib
|
||||
import uuid
|
||||
import zlib
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
|
||||
import cerberus
|
||||
@ -101,6 +102,8 @@ def new_job(frontier, job_conf):
|
||||
job.id = job_conf["id"]
|
||||
if "max_claimed_sites" in job_conf:
|
||||
job.max_claimed_sites = job_conf["max_claimed_sites"]
|
||||
if "pdfs_only" in job_conf:
|
||||
job.pdfs_only = job_conf["pdfs_only"]
|
||||
job.save()
|
||||
|
||||
sites = []
|
||||
@ -199,6 +202,8 @@ class Job(doublethink.Document, ElapsedMixIn):
|
||||
def populate_defaults(self):
|
||||
if "status" not in self:
|
||||
self.status = "ACTIVE"
|
||||
if "pdfs_only" not in self:
|
||||
self.pdfs_only = False
|
||||
if "starts_and_stops" not in self:
|
||||
if self.get("started"): # backward compatibility
|
||||
self.starts_and_stops = [
|
||||
@ -221,6 +226,26 @@ class Job(doublethink.Document, ElapsedMixIn):
|
||||
self.starts_and_stops[-1]["stop"] = doublethink.utcnow()
|
||||
|
||||
|
||||
class VideoCaptureOptions(Enum):
|
||||
"""
|
||||
Enumeration of possible values for the `video_capture` config key.
|
||||
- ENABLE_VIDEO_CAPTURE (default): All video is captured.
|
||||
- DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
|
||||
combination of the next two values.
|
||||
- BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header
|
||||
containing the word "video" is not captured.
|
||||
- DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.
|
||||
|
||||
Note: Ensuring full video MIME type blocking requires an additional entry in the
|
||||
Warcprox-Meta header `mime-type-filters` key.
|
||||
"""
|
||||
|
||||
ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE"
|
||||
DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE"
|
||||
BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES"
|
||||
DISABLE_YTDLP_CAPTURE = "DISABLE_YTDLP_CAPTURE"
|
||||
|
||||
|
||||
class Site(doublethink.Document, ElapsedMixIn):
|
||||
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
|
||||
table = "sites"
|
||||
@ -236,8 +261,8 @@ class Site(doublethink.Document, ElapsedMixIn):
|
||||
self.last_claimed = brozzler.EPOCH_UTC
|
||||
if "scope" not in self:
|
||||
self.scope = {}
|
||||
if "skip_ytdlp" not in self:
|
||||
self.skip_ytdlp = None
|
||||
if "video_capture" not in self:
|
||||
self.video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value
|
||||
|
||||
# backward compatibility
|
||||
if "surt" in self.scope:
|
||||
|
@ -38,6 +38,7 @@ from urllib3.exceptions import ProxyError, TimeoutError
|
||||
|
||||
import brozzler
|
||||
import brozzler.browser
|
||||
from brozzler.model import VideoCaptureOptions
|
||||
|
||||
from . import metrics
|
||||
|
||||
@ -60,7 +61,6 @@ class BrozzlerWorker:
|
||||
self,
|
||||
frontier,
|
||||
service_registry=None,
|
||||
skip_av_seeds=None,
|
||||
ytdlp_proxy_endpoints=None,
|
||||
max_browsers=1,
|
||||
chrome_exe="chromium-browser",
|
||||
@ -86,7 +86,6 @@ class BrozzlerWorker:
|
||||
):
|
||||
self._frontier = frontier
|
||||
self._service_registry = service_registry
|
||||
self._skip_av_seeds = skip_av_seeds
|
||||
self._ytdlp_proxy_endpoints = ytdlp_proxy_endpoints
|
||||
self._max_browsers = max_browsers
|
||||
|
||||
@ -278,14 +277,17 @@ class BrozzlerWorker:
|
||||
img.save(out, "jpeg", quality=95)
|
||||
return out.getbuffer()
|
||||
|
||||
def should_ytdlp(self, logger, site, page, page_status, skip_av_seeds):
|
||||
def should_ytdlp(self, logger, site, page, page_status):
|
||||
# called only after we've passed needs_browsing() check
|
||||
|
||||
if page_status != 200:
|
||||
logger.info("skipping ytdlp: non-200 page status", page_status=page_status)
|
||||
return False
|
||||
if site.skip_ytdlp:
|
||||
logger.info("skipping ytdlp: site marked skip_ytdlp")
|
||||
if site.video_capture in [
|
||||
VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
|
||||
VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value,
|
||||
]:
|
||||
logger.info("skipping ytdlp: site has video capture disabled")
|
||||
return False
|
||||
|
||||
ytdlp_url = page.redirect_url if page.redirect_url else page.url
|
||||
@ -293,21 +295,6 @@ class BrozzlerWorker:
|
||||
if "chrome-error:" in ytdlp_url:
|
||||
return False
|
||||
|
||||
ytdlp_seed = (
|
||||
site["metadata"]["ait_seed_id"]
|
||||
if "metadata" in site and "ait_seed_id" in site["metadata"]
|
||||
else None
|
||||
)
|
||||
|
||||
# TODO: develop UI and refactor
|
||||
if ytdlp_seed:
|
||||
if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds:
|
||||
logger.info("skipping ytdlp: site in skip_av_seeds")
|
||||
site.skip_ytdlp = True
|
||||
return False
|
||||
else:
|
||||
site.skip_ytdlp = False
|
||||
|
||||
return True
|
||||
|
||||
@metrics.brozzler_page_processing_duration_seconds.time()
|
||||
@ -329,7 +316,17 @@ class BrozzlerWorker:
|
||||
|
||||
if not self._needs_browsing(page_headers):
|
||||
page_logger.info("needs fetch")
|
||||
self._fetch_url(site, page=page)
|
||||
if site.pdfs_only and not self._is_pdf(page_headers):
|
||||
page_logger.info("skipping non-PDF content: PDFs only option enabled")
|
||||
elif site.video_capture in [
|
||||
VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
|
||||
VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value,
|
||||
] and self._is_video_type(page_headers):
|
||||
page_logger.info(
|
||||
"skipping video content: video MIME type capture disabled for site"
|
||||
)
|
||||
else:
|
||||
self._fetch_url(site, page=page)
|
||||
else:
|
||||
page_logger.info("needs browsing")
|
||||
try:
|
||||
@ -344,7 +341,7 @@ class BrozzlerWorker:
|
||||
page_logger.info("page interstitial shown (http auth)")
|
||||
|
||||
if enable_youtube_dl and self.should_ytdlp(
|
||||
page_logger, site, page, status_code, self._skip_av_seeds
|
||||
page_logger, site, page, status_code
|
||||
):
|
||||
try:
|
||||
from . import ydl
|
||||
@ -403,13 +400,29 @@ class BrozzlerWorker:
|
||||
url_logger.warning("Failed to get headers", exc_info=True)
|
||||
return {}
|
||||
|
||||
def _needs_browsing(self, page_headers):
|
||||
if (
|
||||
def _needs_browsing(self, page_headers) -> bool:
|
||||
return not (
|
||||
"content-type" in page_headers
|
||||
and "html" not in page_headers["content-type"]
|
||||
):
|
||||
return False
|
||||
return True
|
||||
)
|
||||
|
||||
def _is_video_type(self, page_headers) -> bool:
|
||||
"""
|
||||
Determines if the page's Content-Type header specifies that it contains
|
||||
a video.
|
||||
"""
|
||||
return (
|
||||
"content-type" in page_headers and "video" in page_headers["content-type"]
|
||||
)
|
||||
|
||||
def _is_pdf(self, page_headers) -> bool:
|
||||
"""
|
||||
Determines if the page's Content-Type header specifies that it is a PDF.
|
||||
"""
|
||||
return (
|
||||
"content-type" in page_headers
|
||||
and "application/pdf" in page_headers["content-type"]
|
||||
)
|
||||
|
||||
@metrics.brozzler_browsing_duration_seconds.time()
|
||||
@metrics.brozzler_in_progress_browses.track_inprogress()
|
||||
|
0
dev/pre-commit
Normal file → Executable file
0
dev/pre-commit
Normal file → Executable file
33
job-conf.rst
33
job-conf.rst
@ -107,6 +107,20 @@ Puts a cap on the number of sites belonging to a given job that can be brozzled
|
||||
simultaneously across the cluster. Addresses the problem of a job with many
|
||||
seeds starving out other jobs.
|
||||
|
||||
``pdfs_only``
|
||||
~~~~~~~~~~~~~
|
||||
+---------+----------+-----------+
|
||||
| type | required | default |
|
||||
+=========+==========+===========+
|
||||
| boolean | no | ``false`` |
|
||||
+---------+----------+-----------+
|
||||
Limits capture to PDFs based on the MIME type set in the HTTP response's
|
||||
Content-Type header. This value only impacts processing of outlinks within
|
||||
Brozzler.
|
||||
|
||||
*Note: Ensuring comprehensive limiting to only PDFs requires an additional
|
||||
entry in the Warcprox-Meta header ``mime-type-filters`` key.*
|
||||
|
||||
``seeds``
|
||||
~~~~~~~~~
|
||||
+------------------------+----------+---------+
|
||||
@ -158,6 +172,25 @@ other fields like checkboxes and/or hidden fields, brozzler will leave
|
||||
the default values in place. Brozzler submits login forms after page load.
|
||||
Then brozzling proceeds as usual.
|
||||
|
||||
``video_capture``
|
||||
~~~~~~~~~~~~~~~~~
|
||||
+--------+----------+--------------------------+
|
||||
| type | required | default |
|
||||
+========+==========+==========================+
|
||||
| string | no       | ``ENABLE_VIDEO_CAPTURE`` |
|
||||
+--------+----------+--------------------------+
|
||||
Determines the level of video capture for the seed. This is an enumeration with four possible values:
|
||||
|
||||
* ENABLE_VIDEO_CAPTURE (default): All video is captured.
|
||||
* DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
|
||||
combination of the next two values.
|
||||
* BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header containing
|
||||
the word "video" is not captured.
|
||||
* DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.
|
||||
|
||||
*Note: Ensuring full video MIME type blocking requires an additional entry in
|
||||
the Warcprox-Meta header ``mime-type-filters`` key.*
|
||||
|
||||
Seed-level / top-level settings
|
||||
-------------------------------
|
||||
These are seed settings that can also be specified at the top level, in which
|
||||
|
Loading…
x
Reference in New Issue
Block a user