Merge branch 'gmiller/2950-skip-ytdlp' into qa

This commit is contained in:
Gretchen Miller 2024-09-30 17:01:04 -07:00
commit 04c00d21c5
5 changed files with 110 additions and 81 deletions

View File

@ -585,27 +585,12 @@ def brozzler_worker(argv=None):
finally:
signal.signal(signal.SIGQUIT, dump_state)
def get_skip_av_seeds(skip_av_seeds_file="/opt/local/brozzler/skip_av_seeds.txt"):
    """
    Load the set of seed IDs listed in the skip-AV seeds file.

    The file is read as one integer seed ID per line. Blank lines are
    ignored, so a trailing empty line no longer discards the whole file.

    Args:
        skip_av_seeds_file: path of the file to read.

    Returns:
        A set of int seed IDs, or an empty set if the file cannot be opened
        or contains a non-blank line that is not an integer.
    """
    # TODO: develop UI and refactor
    try:
        # make set from seed IDs in the skip file
        with open(skip_av_seeds_file) as skips:
            skip_av_seeds = {int(line) for line in skips if line.strip()}
    except (OSError, ValueError) as e:
        # OSError: file missing/unreadable; ValueError: malformed line or
        # undecodable bytes. Either way run without a skip list, but say why.
        logging.info("running with empty skip_av_seeds")
        logging.debug("could not load %s: %s", skip_av_seeds_file, e)
        return set()
    logging.info("running with skip_av_seeds file %s", skip_av_seeds_file)
    return skip_av_seeds
rr = rethinker(args)
frontier = brozzler.RethinkDbFrontier(rr)
service_registry = doublethink.ServiceRegistry(rr)
skip_av_seeds_from_file = get_skip_av_seeds()
worker = brozzler.worker.BrozzlerWorker(
frontier,
service_registry,
skip_av_seeds=skip_av_seeds_from_file,
max_browsers=int(args.max_browsers),
chrome_exe=args.chrome_exe,
proxy=args.proxy,

View File

@ -34,6 +34,7 @@ import urllib
import uuid
import yaml
import zlib
from enum import Enum
from typing import Optional
@ -100,6 +101,8 @@ def new_job(frontier, job_conf):
job.id = job_conf["id"]
if "max_claimed_sites" in job_conf:
job.max_claimed_sites = job_conf["max_claimed_sites"]
if "pdfs_only" in job_conf:
job.pdfs_only = job_conf["pdfs_only"]
job.save()
sites = []
@ -198,6 +201,8 @@ class Job(doublethink.Document, ElapsedMixIn):
def populate_defaults(self):
if not "status" in self:
self.status = "ACTIVE"
if "pdfs_only" not in self:
self.pdfs_only = False
if not "starts_and_stops" in self:
if self.get("started"): # backward compatibility
self.starts_and_stops = [
@ -220,33 +225,53 @@ class Job(doublethink.Document, ElapsedMixIn):
self.starts_and_stops[-1]["stop"] = doublethink.utcnow()
class VideoCaptureOptions(Enum):
    """
    Enumeration of possible values for the `video_capture` config key.

    - ENABLE_VIDEO_CAPTURE (default): All video is captured.
    - DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
      combination of the next two values.
    - BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header
      containing the word "video" is not captured.
    - DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.

    Note: Ensuring full video MIME type blocking requires an additional entry
    in the Warcprox-Meta header `mime-type-filters` key.
    """

    # Each member's value is its own name as a plain string. A site's
    # `video_capture` field holds that string (the `.value`), not the Enum
    # member, so comparisons against a site must use `.value`.
    ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE"
    DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE"
    BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES"
    DISABLE_YTDLP_CAPTURE = "DISABLE_YTDLP_CAPTURE"
class Site(doublethink.Document, ElapsedMixIn):
logger = logging.getLogger(__module__ + "." + __qualname__)
table = "sites"
def populate_defaults(self):
if not "status" in self:
if "status" not in self:
self.status = "ACTIVE"
if not "claimed" in self:
if "claimed" not in self:
self.claimed = False
if not "last_disclaimed" in self:
if "last_disclaimed" not in self:
self.last_disclaimed = brozzler.EPOCH_UTC
if not "last_claimed" in self:
if "last_claimed" not in self:
self.last_claimed = brozzler.EPOCH_UTC
if not "scope" in self:
if "scope" not in self:
self.scope = {}
if not "skip_ytdlp" in self:
self.skip_ytdlp = None
if "video_capture" not in self:
self.video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value
# backward compatibility
if "surt" in self.scope:
if not "accepts" in self.scope:
if "accepts" not in self.scope:
self.scope["accepts"] = []
self.scope["accepts"].append({"surt": self.scope["surt"]})
del self.scope["surt"]
# backward compatibility
if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope:
if "max_hops_off_surt" in self.scope and "max_hops_off" not in self.scope:
self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
if "max_hops_off_surt" in self.scope:
del self.scope["max_hops_off_surt"]
@ -256,7 +281,7 @@ class Site(doublethink.Document, ElapsedMixIn):
brozzler.site_surt_canon(self.seed).ssurt().decode("ascii")
)
if not "starts_and_stops" in self:
if "starts_and_stops" not in self:
if self.get("start_time"): # backward compatibility
self.starts_and_stops = [
{"start": self.get("start_time"), "stop": None}
@ -271,7 +296,7 @@ class Site(doublethink.Document, ElapsedMixIn):
return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)
def _accept_ssurt_if_not_redundant(self, ssurt):
if not "accepts" in self.scope:
if "accepts" not in self.scope:
self.scope["accepts"] = []
simple_rule_ssurts = (
rule["ssurt"]

View File

@ -21,6 +21,7 @@ limitations under the License.
import logging
import brozzler
import brozzler.browser
from brozzler.model import VideoCaptureOptions
import threading
import time
import urllib.request
@ -55,7 +56,6 @@ class BrozzlerWorker:
self,
frontier,
service_registry=None,
skip_av_seeds=None,
max_browsers=1,
chrome_exe="chromium-browser",
warcprox_auto=False,
@ -78,7 +78,6 @@ class BrozzlerWorker:
):
self._frontier = frontier
self._service_registry = service_registry
self._skip_av_seeds = skip_av_seeds
self._max_browsers = max_browsers
self._warcprox_auto = warcprox_auto
@ -268,7 +267,17 @@ class BrozzlerWorker:
if not self._needs_browsing(page_headers):
self.logger.info("needs fetch: %s", page)
self._fetch_url(site, page=page)
if site.pdfs_only and not self._is_pdf(page_headers):
self.logger.info("skipping non-PDF content: PDFs only option enabled")
elif site.video_capture in [
VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value,
] and self._is_video_type(page_headers):
self.logger.info(
"skipping video content: video MIME type capture disabled for site"
)
else:
self._fetch_url(site, page=page)
else:
self.logger.info("needs browsing: %s", page)
try:
@ -280,7 +289,7 @@ class BrozzlerWorker:
self.logger.info("page interstitial shown (http auth): %s", page)
if enable_youtube_dl and ydl.should_ytdlp(
site, page, browser.websock_thread.page_status, self._skip_av_seeds
site, page, browser.websock_thread.page_status
):
try:
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
@ -327,13 +336,29 @@ class BrozzlerWorker:
self.logger.warning("Failed to get headers for %s: %s", page.url, e)
return {}
def _needs_browsing(self, page_headers):
if (
def _needs_browsing(self, page_headers) -> bool:
return not bool(
"content-type" in page_headers
and "html" not in page_headers["content-type"]
):
return False
return True
)
def _is_video_type(self, page_headers) -> bool:
"""
Determines if the page's Content-Type header specifies that it contains
a video.
"""
return (
"content-type" in page_headers and "video" in page_headers["content-type"]
)
def _is_pdf(self, page_headers) -> bool:
"""
Determines if the page's Content-Type header specifies that it is a PDF.
"""
return (
"content-type" in page_headers
and "application/pdf" in page_headers["content-type"]
)
@metrics.brozzler_page_processing_duration_seconds.time()
@metrics.brozzler_in_progress_pages.track_inprogress()

View File

@ -20,6 +20,7 @@ import logging
import yt_dlp
from yt_dlp.utils import match_filter_func
import brozzler
from brozzler.model import VideoCaptureOptions
import urllib.request
import tempfile
import urlcanon
@ -58,14 +59,17 @@ def _timestamp4datetime(timestamp):
int(timestamp[-2:])
)
def should_ytdlp(site, page, page_status, skip_av_seeds):
def should_ytdlp(site, page, page_status):
# called only after we've passed needs_browsing() check
if page_status != 200:
logging.info("skipping ytdlp: non-200 page status %s", page_status)
return False
if site.skip_ytdlp:
logging.info("skipping ytdlp: site marked skip_ytdlp")
if site.video_capture in [
VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value,
]:
logging.info("skipping ytdlp: site has video capture disabled")
return False
ytdlp_url = page.redirect_url if page.redirect_url else page.url
@ -73,49 +77,6 @@ def should_ytdlp(site, page, page_status, skip_av_seeds):
if "chrome-error:" in ytdlp_url:
return False
ytdlp_seed = (
site["metadata"]["ait_seed_id"]
if "metadata" in site and "ait_seed_id" in site["metadata"]
else None
)
# TODO: develop UI and refactor
if ytdlp_seed:
if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds:
logging.info("skipping ytdlp: site in skip_av_seeds")
site.skip_ytdlp = True
return False
else:
site.skip_ytdlp = False
logging.info("checking containing page %s for seed %s", ytdlp_url, ytdlp_seed)
if ytdlp_seed and "youtube.com/watch?v" in ytdlp_url:
logging.info("found youtube watch page %r", ytdlp_url)
# connect to bmiller-dev cluster, keyspace video; we can modify default timeout in cassandra.yaml
cluster = Cluster(["207.241.235.189"], protocol_version=5)
session = cluster.connect("video")
containing_page_query = "SELECT * from videos where scope=%s and containing_page_url=%s LIMIT 1"
future = session.execute_async(containing_page_query, [f"s:{ytdlp_seed}", str(urlcanon.aggressive(ytdlp_url))])
try:
rows = future.result()
except ReadTimeout:
logging.exception("Query timed out:")
if len(rows.current_rows) == 0:
logging.info("no results returned from videos query")
return True
for row in rows:
logging.info("video query found %r", row)
ytdlp_timestamp = datetime.datetime(*_timestamp4datetime(row.video_timestamp))
logging.info("ytdlp_timestamp: %s", ytdlp_timestamp)
time_diff = datetime.datetime.now() - ytdlp_timestamp
# TODO: make variable for timedelta
if time_diff < datetime.timedelta(days = 90):
logging.info("skipping ytdlp for %s since there's a recent capture", row.containing_page_url)
return False
return True
def isyoutubehost(url):

View File

@ -107,6 +107,20 @@ Puts a cap on the number of sites belonging to a given job that can be brozzled
simultaneously across the cluster. Addresses the problem of a job with many
seeds starving out other jobs.
``pdfs_only``
~~~~~~~~~~~~~~~~~~~~~
+---------+----------+-----------+
| type | required | default |
+=========+==========+===========+
| boolean | no | ``false`` |
+---------+----------+-----------+
Limits capture to PDFs based on the MIME type set in the HTTP response's
Content-Type header. This value only impacts processing of outlinks within
Brozzler.
*Note:* Ensuring comprehensive limiting to only PDFs requires an additional
entry in the Warcprox-Meta header ``mime-type-filters`` key.
``seeds``
~~~~~~~~~
+------------------------+----------+---------+
@ -158,6 +172,25 @@ other fields like checkboxes and/or hidden fields, brozzler will leave
the default values in place. Brozzler submits login forms after page load.
Then brozzling proceeds as usual.
``video_capture``
~~~~~~~~~~~~~~~~~
+--------+----------+--------------------------+
| type | required | default |
+========+==========+==========================+
| string | no | ``ENABLE_VIDEO_CAPTURE`` |
+--------+----------+--------------------------+
Determines the level of video capture for the seed. This is an enumeration with four possible values:
* ENABLE_VIDEO_CAPTURE (default): All video is captured.
* DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
combination of the next two values.
* BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header containing
the word "video" is not captured.
* DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.
*Note:* Ensuring full video MIME type blocking requires an additional entry in
the Warcprox-Meta header ``mime-type-filters`` key.
Seed-level / top-level settings
-------------------------------
These are seed settings that can also be specified at the top level, in which