Merge branch 'gmiller/2950-skip-ytdlp' into qa

This commit is contained in:
Gretchen Miller 2024-09-30 17:01:04 -07:00
commit 04c00d21c5
5 changed files with 110 additions and 81 deletions

View File

@ -585,27 +585,12 @@ def brozzler_worker(argv=None):
finally: finally:
signal.signal(signal.SIGQUIT, dump_state) signal.signal(signal.SIGQUIT, dump_state)
def get_skip_av_seeds(skip_av_seeds_file="/opt/local/brozzler/skip_av_seeds.txt"):
    """
    Load the set of seed IDs listed in the skip-AV seeds file.

    The file is read as one integer seed ID per line. Blank lines are
    ignored so that a stray empty line does not discard the whole list.

    Args:
        skip_av_seeds_file: path of the file to read; defaults to the
            fixed location the worker has always used.

    Returns:
        A set of int seed IDs. The set is empty when the file cannot be
        read or when it contains an entry that is not an integer.
    """
    # TODO: develop UI and refactor
    try:
        with open(skip_av_seeds_file) as skips:
            # int() tolerates surrounding whitespace, including the newline
            skip_av_seeds = {int(line) for line in skips if line.strip()}
    except OSError:
        # a missing or unreadable file is the normal "no skip list" case
        logging.info("running with empty skip_av_seeds")
        return set()
    except ValueError:
        # malformed content is unexpected: stay best-effort, but say why
        logging.warning(
            "running with empty skip_av_seeds: invalid entry in %s",
            skip_av_seeds_file,
            exc_info=True,
        )
        return set()
    logging.info("running with skip_av_seeds file %s", skip_av_seeds_file)
    return skip_av_seeds
rr = rethinker(args) rr = rethinker(args)
frontier = brozzler.RethinkDbFrontier(rr) frontier = brozzler.RethinkDbFrontier(rr)
service_registry = doublethink.ServiceRegistry(rr) service_registry = doublethink.ServiceRegistry(rr)
skip_av_seeds_from_file = get_skip_av_seeds()
worker = brozzler.worker.BrozzlerWorker( worker = brozzler.worker.BrozzlerWorker(
frontier, frontier,
service_registry, service_registry,
skip_av_seeds=skip_av_seeds_from_file,
max_browsers=int(args.max_browsers), max_browsers=int(args.max_browsers),
chrome_exe=args.chrome_exe, chrome_exe=args.chrome_exe,
proxy=args.proxy, proxy=args.proxy,

View File

@ -34,6 +34,7 @@ import urllib
import uuid import uuid
import yaml import yaml
import zlib import zlib
from enum import Enum
from typing import Optional from typing import Optional
@ -100,6 +101,8 @@ def new_job(frontier, job_conf):
job.id = job_conf["id"] job.id = job_conf["id"]
if "max_claimed_sites" in job_conf: if "max_claimed_sites" in job_conf:
job.max_claimed_sites = job_conf["max_claimed_sites"] job.max_claimed_sites = job_conf["max_claimed_sites"]
if "pdfs_only" in job_conf:
job.pdfs_only = job_conf["pdfs_only"]
job.save() job.save()
sites = [] sites = []
@ -198,6 +201,8 @@ class Job(doublethink.Document, ElapsedMixIn):
def populate_defaults(self): def populate_defaults(self):
if not "status" in self: if not "status" in self:
self.status = "ACTIVE" self.status = "ACTIVE"
if "pdfs_only" not in self:
self.pdfs_only = False
if not "starts_and_stops" in self: if not "starts_and_stops" in self:
if self.get("started"): # backward compatibility if self.get("started"): # backward compatibility
self.starts_and_stops = [ self.starts_and_stops = [
@ -220,33 +225,53 @@ class Job(doublethink.Document, ElapsedMixIn):
self.starts_and_stops[-1]["stop"] = doublethink.utcnow() self.starts_and_stops[-1]["stop"] = doublethink.utcnow()
class VideoCaptureOptions(str, Enum):
    """
    Enumeration of possible values for the `video_capture` config key.

    - ENABLE_VIDEO_CAPTURE (default): All video is captured.
    - DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
      combination of the next two values.
    - BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header
      containing the word "video" is not captured.
    - DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.

    Members are also `str` instances, so a member compares equal to the plain
    string value kept in the config (`.value` remains available).

    Note: Ensuring full video MIME type blocking requires an additional entry
    in the Warcprox-Meta header `mime-type-filters` key.
    """

    ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE"
    DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE"
    BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES"
    DISABLE_YTDLP_CAPTURE = "DISABLE_YTDLP_CAPTURE"
class Site(doublethink.Document, ElapsedMixIn): class Site(doublethink.Document, ElapsedMixIn):
logger = logging.getLogger(__module__ + "." + __qualname__) logger = logging.getLogger(__module__ + "." + __qualname__)
table = "sites" table = "sites"
def populate_defaults(self): def populate_defaults(self):
if not "status" in self: if "status" not in self:
self.status = "ACTIVE" self.status = "ACTIVE"
if not "claimed" in self: if "claimed" not in self:
self.claimed = False self.claimed = False
if not "last_disclaimed" in self: if "last_disclaimed" not in self:
self.last_disclaimed = brozzler.EPOCH_UTC self.last_disclaimed = brozzler.EPOCH_UTC
if not "last_claimed" in self: if "last_claimed" not in self:
self.last_claimed = brozzler.EPOCH_UTC self.last_claimed = brozzler.EPOCH_UTC
if not "scope" in self: if "scope" not in self:
self.scope = {} self.scope = {}
if not "skip_ytdlp" in self: if "video_capture" not in self:
self.skip_ytdlp = None self.video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value
# backward compatibility # backward compatibility
if "surt" in self.scope: if "surt" in self.scope:
if not "accepts" in self.scope: if "accepts" not in self.scope:
self.scope["accepts"] = [] self.scope["accepts"] = []
self.scope["accepts"].append({"surt": self.scope["surt"]}) self.scope["accepts"].append({"surt": self.scope["surt"]})
del self.scope["surt"] del self.scope["surt"]
# backward compatibility # backward compatibility
if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope: if "max_hops_off_surt" in self.scope and "max_hops_off" not in self.scope:
self.scope["max_hops_off"] = self.scope["max_hops_off_surt"] self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
if "max_hops_off_surt" in self.scope: if "max_hops_off_surt" in self.scope:
del self.scope["max_hops_off_surt"] del self.scope["max_hops_off_surt"]
@ -256,7 +281,7 @@ class Site(doublethink.Document, ElapsedMixIn):
brozzler.site_surt_canon(self.seed).ssurt().decode("ascii") brozzler.site_surt_canon(self.seed).ssurt().decode("ascii")
) )
if not "starts_and_stops" in self: if "starts_and_stops" not in self:
if self.get("start_time"): # backward compatibility if self.get("start_time"): # backward compatibility
self.starts_and_stops = [ self.starts_and_stops = [
{"start": self.get("start_time"), "stop": None} {"start": self.get("start_time"), "stop": None}
@ -271,7 +296,7 @@ class Site(doublethink.Document, ElapsedMixIn):
return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed) return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)
def _accept_ssurt_if_not_redundant(self, ssurt): def _accept_ssurt_if_not_redundant(self, ssurt):
if not "accepts" in self.scope: if "accepts" not in self.scope:
self.scope["accepts"] = [] self.scope["accepts"] = []
simple_rule_ssurts = ( simple_rule_ssurts = (
rule["ssurt"] rule["ssurt"]

View File

@ -21,6 +21,7 @@ limitations under the License.
import logging import logging
import brozzler import brozzler
import brozzler.browser import brozzler.browser
from brozzler.model import VideoCaptureOptions
import threading import threading
import time import time
import urllib.request import urllib.request
@ -55,7 +56,6 @@ class BrozzlerWorker:
self, self,
frontier, frontier,
service_registry=None, service_registry=None,
skip_av_seeds=None,
max_browsers=1, max_browsers=1,
chrome_exe="chromium-browser", chrome_exe="chromium-browser",
warcprox_auto=False, warcprox_auto=False,
@ -78,7 +78,6 @@ class BrozzlerWorker:
): ):
self._frontier = frontier self._frontier = frontier
self._service_registry = service_registry self._service_registry = service_registry
self._skip_av_seeds = skip_av_seeds
self._max_browsers = max_browsers self._max_browsers = max_browsers
self._warcprox_auto = warcprox_auto self._warcprox_auto = warcprox_auto
@ -268,7 +267,17 @@ class BrozzlerWorker:
if not self._needs_browsing(page_headers): if not self._needs_browsing(page_headers):
self.logger.info("needs fetch: %s", page) self.logger.info("needs fetch: %s", page)
self._fetch_url(site, page=page) if site.pdfs_only and not self._is_pdf(page_headers):
self.logger.info("skipping non-PDF content: PDFs only option enabled")
elif site.video_capture in [
VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value,
] and self._is_video_type(page_headers):
self.logger.info(
"skipping video content: video MIME type capture disabled for site"
)
else:
self._fetch_url(site, page=page)
else: else:
self.logger.info("needs browsing: %s", page) self.logger.info("needs browsing: %s", page)
try: try:
@ -280,7 +289,7 @@ class BrozzlerWorker:
self.logger.info("page interstitial shown (http auth): %s", page) self.logger.info("page interstitial shown (http auth): %s", page)
if enable_youtube_dl and ydl.should_ytdlp( if enable_youtube_dl and ydl.should_ytdlp(
site, page, browser.websock_thread.page_status, self._skip_av_seeds site, page, browser.websock_thread.page_status
): ):
try: try:
ydl_outlinks = ydl.do_youtube_dl(self, site, page) ydl_outlinks = ydl.do_youtube_dl(self, site, page)
@ -327,13 +336,29 @@ class BrozzlerWorker:
self.logger.warning("Failed to get headers for %s: %s", page.url, e) self.logger.warning("Failed to get headers for %s: %s", page.url, e)
return {} return {}
def _needs_browsing(self, page_headers): def _needs_browsing(self, page_headers) -> bool:
if ( return not bool(
"content-type" in page_headers "content-type" in page_headers
and "html" not in page_headers["content-type"] and "html" not in page_headers["content-type"]
): )
return False
return True def _is_video_type(self, page_headers) -> bool:
"""
Determines if the page's Content-Type header specifies that it contains
a video.
"""
return (
"content-type" in page_headers and "video" in page_headers["content-type"]
)
def _is_pdf(self, page_headers) -> bool:
"""
Determines if the page's Content-Type header specifies that it is a PDF.
"""
return (
"content-type" in page_headers
and "application/pdf" in page_headers["content-type"]
)
@metrics.brozzler_page_processing_duration_seconds.time() @metrics.brozzler_page_processing_duration_seconds.time()
@metrics.brozzler_in_progress_pages.track_inprogress() @metrics.brozzler_in_progress_pages.track_inprogress()

View File

@ -20,6 +20,7 @@ import logging
import yt_dlp import yt_dlp
from yt_dlp.utils import match_filter_func from yt_dlp.utils import match_filter_func
import brozzler import brozzler
from brozzler.model import VideoCaptureOptions
import urllib.request import urllib.request
import tempfile import tempfile
import urlcanon import urlcanon
@ -58,14 +59,17 @@ def _timestamp4datetime(timestamp):
int(timestamp[-2:]) int(timestamp[-2:])
) )
def should_ytdlp(site, page, page_status, skip_av_seeds): def should_ytdlp(site, page, page_status):
# called only after we've passed needs_browsing() check # called only after we've passed needs_browsing() check
if page_status != 200: if page_status != 200:
logging.info("skipping ytdlp: non-200 page status %s", page_status) logging.info("skipping ytdlp: non-200 page status %s", page_status)
return False return False
if site.skip_ytdlp: if site.video_capture in [
logging.info("skipping ytdlp: site marked skip_ytdlp") VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value,
]:
logging.info("skipping ytdlp: site has video capture disabled")
return False return False
ytdlp_url = page.redirect_url if page.redirect_url else page.url ytdlp_url = page.redirect_url if page.redirect_url else page.url
@ -73,49 +77,6 @@ def should_ytdlp(site, page, page_status, skip_av_seeds):
if "chrome-error:" in ytdlp_url: if "chrome-error:" in ytdlp_url:
return False return False
ytdlp_seed = (
site["metadata"]["ait_seed_id"]
if "metadata" in site and "ait_seed_id" in site["metadata"]
else None
)
# TODO: develop UI and refactor
if ytdlp_seed:
if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds:
logging.info("skipping ytdlp: site in skip_av_seeds")
site.skip_ytdlp = True
return False
else:
site.skip_ytdlp = False
logging.info("checking containing page %s for seed %s", ytdlp_url, ytdlp_seed)
if ytdlp_seed and "youtube.com/watch?v" in ytdlp_url:
logging.info("found youtube watch page %r", ytdlp_url)
# connect to bmiller-dev cluster, keyspace video; we can modify default timeout in cassandra.yaml
cluster = Cluster(["207.241.235.189"], protocol_version=5)
session = cluster.connect("video")
containing_page_query = "SELECT * from videos where scope=%s and containing_page_url=%s LIMIT 1"
future = session.execute_async(containing_page_query, [f"s:{ytdlp_seed}", str(urlcanon.aggressive(ytdlp_url))])
try:
rows = future.result()
except ReadTimeout:
logging.exception("Query timed out:")
if len(rows.current_rows) == 0:
logging.info("no results returned from videos query")
return True
for row in rows:
logging.info("video query found %r", row)
ytdlp_timestamp = datetime.datetime(*_timestamp4datetime(row.video_timestamp))
logging.info("ytdlp_timestamp: %s", ytdlp_timestamp)
time_diff = datetime.datetime.now() - ytdlp_timestamp
# TODO: make variable for timedelta
if time_diff < datetime.timedelta(days = 90):
logging.info("skipping ytdlp for %s since there's a recent capture", row.containing_page_url)
return False
return True return True
def isyoutubehost(url): def isyoutubehost(url):

View File

@ -107,6 +107,20 @@ Puts a cap on the number of sites belonging to a given job that can be brozzled
simultaneously across the cluster. Addresses the problem of a job with many simultaneously across the cluster. Addresses the problem of a job with many
seeds starving out other jobs. seeds starving out other jobs.
``pdfs_only``
~~~~~~~~~~~~~
+---------+----------+-----------+
| type | required | default |
+=========+==========+===========+
| boolean | no | ``false`` |
+---------+----------+-----------+
Limits capture to PDFs based on the MIME type set in the HTTP response's
Content-Type header. This value only impacts processing of outlinks within
Brozzler.
*Note: Ensuring comprehensive limiting to only PDFs requires an additional
entry in the Warcprox-Meta header ``mime-type-filters`` key.*
``seeds`` ``seeds``
~~~~~~~~~ ~~~~~~~~~
+------------------------+----------+---------+ +------------------------+----------+---------+
@ -158,6 +172,25 @@ other fields like checkboxes and/or hidden fields, brozzler will leave
the default values in place. Brozzler submits login forms after page load. the default values in place. Brozzler submits login forms after page load.
Then brozzling proceeds as usual. Then brozzling proceeds as usual.
``video_capture``
~~~~~~~~~~~~~~~~~
+--------+----------+--------------------------+
| type | required | default |
+========+==========+==========================+
| string | no | ``ENABLE_VIDEO_CAPTURE`` |
+--------+----------+--------------------------+
Determines the level of video capture for the seed. This is an enumeration with four possible values:
* ENABLE_VIDEO_CAPTURE (default): All video is captured.
* DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
combination of the next two values.
* BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header containing
the word "video" is not captured.
* DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.
*Note: Ensuring full video MIME type blocking requires an additional entry in
the Warcprox-Meta header ``mime-type-filters`` key.*
Seed-level / top-level settings Seed-level / top-level settings
------------------------------- -------------------------------
These are seed settings that can also be specified at the top level, in which These are seed settings that can also be specified at the top level, in which