Mirror of https://github.com/internetarchive/brozzler.git, synced 2025-02-25 00:59:52 -05:00

commit 04c00d21c5: Merge branch 'gmiller/2950-skip-ytdlp' into qa
@@ -585,27 +585,12 @@ def brozzler_worker(argv=None):
         finally:
             signal.signal(signal.SIGQUIT, dump_state)

-    def get_skip_av_seeds():
-        # TODO: develop UI and refactor
-        SKIP_AV_SEEDS_FILE = "/opt/local/brozzler/skip_av_seeds.txt"
-        try:
-            # make set from seed IDs in SKIP_AV_SEEDS_FILE
-            with open(SKIP_AV_SEEDS_FILE) as skips:
-                skip_av_seeds = {int(l) for l in skips.readlines()}
-                logging.info("running with skip_av_seeds file %s" % SKIP_AV_SEEDS_FILE)
-        except Exception as e:
-            skip_av_seeds = set()
-            logging.info("running with empty skip_av_seeds")
-        return skip_av_seeds
-
     rr = rethinker(args)
     frontier = brozzler.RethinkDbFrontier(rr)
     service_registry = doublethink.ServiceRegistry(rr)
-    skip_av_seeds_from_file = get_skip_av_seeds()
     worker = brozzler.worker.BrozzlerWorker(
         frontier,
         service_registry,
-        skip_av_seeds=skip_av_seeds_from_file,
         max_browsers=int(args.max_browsers),
         chrome_exe=args.chrome_exe,
         proxy=args.proxy,
@@ -34,6 +34,7 @@ import urllib
 import uuid
 import yaml
 import zlib
+from enum import Enum
 from typing import Optional


@@ -100,6 +101,8 @@ def new_job(frontier, job_conf):
         job.id = job_conf["id"]
     if "max_claimed_sites" in job_conf:
         job.max_claimed_sites = job_conf["max_claimed_sites"]
+    if "pdfs_only" in job_conf:
+        job.pdfs_only = job_conf["pdfs_only"]
     job.save()

     sites = []
@@ -198,6 +201,8 @@ class Job(doublethink.Document, ElapsedMixIn):
     def populate_defaults(self):
         if not "status" in self:
             self.status = "ACTIVE"
+        if "pdfs_only" not in self:
+            self.pdfs_only = False
         if not "starts_and_stops" in self:
             if self.get("started"):  # backward compatibility
                 self.starts_and_stops = [
@@ -220,33 +225,53 @@ class Job(doublethink.Document, ElapsedMixIn):
             self.starts_and_stops[-1]["stop"] = doublethink.utcnow()


+class VideoCaptureOptions(Enum):
+    """
+    Enumeration of possible values for the `video_capture` config key.
+    - ENABLE_VIDEO_CAPTURE (default): All video is captured.
+    - DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
+      combination of the next two values.
+    - BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header
+      containing the word "video" is not captured.
+    - DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.
+
+    Note: Ensuring full video MIME type blocking requires an additional entry in the
+    Warcprox-Meta header `mime-type-filters` key.
+    """
+
+    ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE"
+    DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE"
+    BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES"
+    DISABLE_YTDLP_CAPTURE = "DISABLE_YTDLP_CAPTURE"
+
+
 class Site(doublethink.Document, ElapsedMixIn):
     logger = logging.getLogger(__module__ + "." + __qualname__)
     table = "sites"

     def populate_defaults(self):
-        if not "status" in self:
+        if "status" not in self:
             self.status = "ACTIVE"
-        if not "claimed" in self:
+        if "claimed" not in self:
             self.claimed = False
-        if not "last_disclaimed" in self:
+        if "last_disclaimed" not in self:
             self.last_disclaimed = brozzler.EPOCH_UTC
-        if not "last_claimed" in self:
+        if "last_claimed" not in self:
             self.last_claimed = brozzler.EPOCH_UTC
-        if not "scope" in self:
+        if "scope" not in self:
             self.scope = {}
-        if not "skip_ytdlp" in self:
-            self.skip_ytdlp = None
+        if "video_capture" not in self:
+            self.video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value

         # backward compatibility
         if "surt" in self.scope:
-            if not "accepts" in self.scope:
+            if "accepts" not in self.scope:
                 self.scope["accepts"] = []
             self.scope["accepts"].append({"surt": self.scope["surt"]})
             del self.scope["surt"]

         # backward compatibility
-        if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope:
+        if "max_hops_off_surt" in self.scope and "max_hops_off" not in self.scope:
             self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
         if "max_hops_off_surt" in self.scope:
             del self.scope["max_hops_off_surt"]
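For orientation, here is a minimal standalone sketch (not part of this commit; `effective_video_capture` is a hypothetical helper) showing the defaulting behavior the new `populate_defaults` lines give a site record, plus how the Enum can reject strings outside the four allowed values:

from enum import Enum


class VideoCaptureOptions(Enum):
    ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE"
    DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE"
    BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES"
    DISABLE_YTDLP_CAPTURE = "DISABLE_YTDLP_CAPTURE"


def effective_video_capture(site: dict) -> str:
    # hypothetical helper: mirrors the populate_defaults() fallback above
    configured = site.get(
        "video_capture", VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value
    )
    # Enum lookup by value raises ValueError for strings outside the enum
    return VideoCaptureOptions(configured).value


print(effective_video_capture({}))  # ENABLE_VIDEO_CAPTURE
print(effective_video_capture({"video_capture": "DISABLE_YTDLP_CAPTURE"}))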
@@ -256,7 +281,7 @@ class Site(doublethink.Document, ElapsedMixIn):
             brozzler.site_surt_canon(self.seed).ssurt().decode("ascii")
         )

-        if not "starts_and_stops" in self:
+        if "starts_and_stops" not in self:
             if self.get("start_time"):  # backward compatibility
                 self.starts_and_stops = [
                     {"start": self.get("start_time"), "stop": None}
@@ -271,7 +296,7 @@ class Site(doublethink.Document, ElapsedMixIn):
         return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)

     def _accept_ssurt_if_not_redundant(self, ssurt):
-        if not "accepts" in self.scope:
+        if "accepts" not in self.scope:
             self.scope["accepts"] = []
         simple_rule_ssurts = (
             rule["ssurt"]
@@ -21,6 +21,7 @@ limitations under the License.
 import logging
 import brozzler
 import brozzler.browser
+from brozzler.model import VideoCaptureOptions
 import threading
 import time
 import urllib.request
@@ -55,7 +56,6 @@ class BrozzlerWorker:
         self,
         frontier,
         service_registry=None,
-        skip_av_seeds=None,
         max_browsers=1,
         chrome_exe="chromium-browser",
         warcprox_auto=False,
@@ -78,7 +78,6 @@ class BrozzlerWorker:
     ):
         self._frontier = frontier
         self._service_registry = service_registry
-        self._skip_av_seeds = skip_av_seeds
         self._max_browsers = max_browsers

         self._warcprox_auto = warcprox_auto
@@ -268,6 +267,16 @@ class BrozzlerWorker:

         if not self._needs_browsing(page_headers):
             self.logger.info("needs fetch: %s", page)
+            if site.pdfs_only and not self._is_pdf(page_headers):
+                self.logger.info("skipping non-PDF content: PDFs only option enabled")
+            elif site.video_capture in [
+                VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
+                VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value,
+            ] and self._is_video_type(page_headers):
+                self.logger.info(
+                    "skipping video content: video MIME type capture disabled for site"
+                )
+            else:
                 self._fetch_url(site, page=page)
         else:
             self.logger.info("needs browsing: %s", page)
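The branch added above can be summarized in a standalone sketch that uses plain dicts instead of brozzler's Site object and page headers; `decide_fetch` and its helpers are hypothetical stand-ins, not brozzler APIs:

def _is_pdf(headers: dict) -> bool:
    return "content-type" in headers and "application/pdf" in headers["content-type"]


def _is_video_type(headers: dict) -> bool:
    return "content-type" in headers and "video" in headers["content-type"]


def decide_fetch(site: dict, headers: dict) -> str:
    # pdfs_only wins first: skip anything that is not a PDF
    if site.get("pdfs_only") and not _is_pdf(headers):
        return "skip: non-PDF content"
    # then video MIME blocking, for the two capture modes that imply it
    if site.get("video_capture") in (
        "DISABLE_VIDEO_CAPTURE",
        "BLOCK_VIDEO_MIME_TYPES",
    ) and _is_video_type(headers):
        return "skip: video MIME type capture disabled"
    return "fetch"


print(decide_fetch({"pdfs_only": True}, {"content-type": "text/plain"}))
print(decide_fetch({"video_capture": "BLOCK_VIDEO_MIME_TYPES"}, {"content-type": "video/mp4"}))
print(decide_fetch({}, {"content-type": "application/pdf"}))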
@@ -280,7 +289,7 @@
             self.logger.info("page interstitial shown (http auth): %s", page)

         if enable_youtube_dl and ydl.should_ytdlp(
-            site, page, browser.websock_thread.page_status, self._skip_av_seeds
+            site, page, browser.websock_thread.page_status
         ):
             try:
                 ydl_outlinks = ydl.do_youtube_dl(self, site, page)
@@ -327,13 +336,29 @@
             self.logger.warning("Failed to get headers for %s: %s", page.url, e)
             return {}

-    def _needs_browsing(self, page_headers):
-        if (
+    def _needs_browsing(self, page_headers) -> bool:
+        return not bool(
             "content-type" in page_headers
             and "html" not in page_headers["content-type"]
-        ):
-            return False
-        return True
+        )
+
+    def _is_video_type(self, page_headers) -> bool:
+        """
+        Determines if the page's Content-Type header specifies that it contains
+        a video.
+        """
+        return (
+            "content-type" in page_headers and "video" in page_headers["content-type"]
+        )
+
+    def _is_pdf(self, page_headers) -> bool:
+        """
+        Determines if the page's Content-Type header specifies that it is a PDF.
+        """
+        return (
+            "content-type" in page_headers
+            and "application/pdf" in page_headers["content-type"]
+        )

     @metrics.brozzler_page_processing_duration_seconds.time()
     @metrics.brozzler_in_progress_pages.track_inprogress()
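A quick standalone check (not brozzler code) that the rewritten `_needs_browsing` is equivalent to the old if/else form, i.e. a page needs browsing unless a Content-Type header is present and does not mention HTML:

def needs_browsing_old(headers: dict) -> bool:
    if "content-type" in headers and "html" not in headers["content-type"]:
        return False
    return True


def needs_browsing_new(headers: dict) -> bool:
    return not bool(
        "content-type" in headers and "html" not in headers["content-type"]
    )


for headers in ({}, {"content-type": "text/html"}, {"content-type": "video/mp4"}):
    assert needs_browsing_old(headers) == needs_browsing_new(headers)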
@@ -20,6 +20,7 @@ import logging
 import yt_dlp
 from yt_dlp.utils import match_filter_func
 import brozzler
+from brozzler.model import VideoCaptureOptions
 import urllib.request
 import tempfile
 import urlcanon
@@ -58,14 +59,17 @@ def _timestamp4datetime(timestamp):
         int(timestamp[-2:])
     )

-def should_ytdlp(site, page, page_status, skip_av_seeds):
+def should_ytdlp(site, page, page_status):
     # called only after we've passed needs_browsing() check

     if page_status != 200:
         logging.info("skipping ytdlp: non-200 page status %s", page_status)
         return False
-    if site.skip_ytdlp:
-        logging.info("skipping ytdlp: site marked skip_ytdlp")
+    if site.video_capture in [
+        VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
+        VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value,
+    ]:
+        logging.info("skipping ytdlp: site has video capture disabled")
         return False

     ytdlp_url = page.redirect_url if page.redirect_url else page.url
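The simplified gate above can be sketched on its own (`should_run_ytdlp` is a hypothetical stand-in for `ydl.should_ytdlp`, reduced to the two inputs that matter here): yt-dlp runs only for 200 responses on sites that have not disabled video capture or yt-dlp capture.

def should_run_ytdlp(video_capture: str, page_status: int) -> bool:
    if page_status != 200:
        return False
    if video_capture in ("DISABLE_VIDEO_CAPTURE", "DISABLE_YTDLP_CAPTURE"):
        return False
    return True


assert should_run_ytdlp("ENABLE_VIDEO_CAPTURE", 200)
assert should_run_ytdlp("BLOCK_VIDEO_MIME_TYPES", 200)  # MIME blocking alone does not disable yt-dlp
assert not should_run_ytdlp("DISABLE_YTDLP_CAPTURE", 200)
assert not should_run_ytdlp("ENABLE_VIDEO_CAPTURE", 404)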
@@ -73,49 +77,6 @@ def should_ytdlp(site, page, page_status, skip_av_seeds):
     if "chrome-error:" in ytdlp_url:
         return False

-    ytdlp_seed = (
-        site["metadata"]["ait_seed_id"]
-        if "metadata" in site and "ait_seed_id" in site["metadata"]
-        else None
-    )
-
-    # TODO: develop UI and refactor
-    if ytdlp_seed:
-        if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds:
-            logging.info("skipping ytdlp: site in skip_av_seeds")
-            site.skip_ytdlp = True
-            return False
-        else:
-            site.skip_ytdlp = False
-
-    logging.info("checking containing page %s for seed %s", ytdlp_url, ytdlp_seed)
-
-    if ytdlp_seed and "youtube.com/watch?v" in ytdlp_url:
-        logging.info("found youtube watch page %r", ytdlp_url)
-        # connect to bmiller-dev cluster, keyspace video; we can modify default timeout in cassandra.yaml
-        cluster = Cluster(["207.241.235.189"], protocol_version=5)
-        session = cluster.connect("video")
-        containing_page_query = "SELECT * from videos where scope=%s and containing_page_url=%s LIMIT 1"
-        future = session.execute_async(containing_page_query, [f"s:{ytdlp_seed}", str(urlcanon.aggressive(ytdlp_url))])
-        try:
-            rows = future.result()
-        except ReadTimeout:
-            logging.exception("Query timed out:")
-
-        if len(rows.current_rows) == 0:
-            logging.info("no results returned from videos query")
-            return True
-
-        for row in rows:
-            logging.info("video query found %r", row)
-            ytdlp_timestamp = datetime.datetime(*_timestamp4datetime(row.video_timestamp))
-            logging.info("ytdlp_timestamp: %s", ytdlp_timestamp)
-            time_diff = datetime.datetime.now() - ytdlp_timestamp
-            # TODO: make variable for timedelta
-            if time_diff < datetime.timedelta(days = 90):
-                logging.info("skipping ytdlp for %s since there's a recent capture", row.containing_page_url)
-                return False
-
     return True

 def isyoutubehost(url):
job-conf.rst | 33
@@ -107,6 +107,20 @@ Puts a cap on the number of sites belonging to a given job that can be brozzled
 simultaneously across the cluster. Addresses the problem of a job with many
 seeds starving out other jobs.

+``pdfs_only``
+~~~~~~~~~~~~~~~~~~~~~
++---------+----------+-----------+
+| type    | required | default   |
++=========+==========+===========+
+| boolean | no       | ``false`` |
++---------+----------+-----------+
+Limits capture to PDFs based on the MIME type set in the HTTP response's
+Content-Type header. This value only impacts processing of outlinks within
+Brozzler.
+
+*Note: Ensuring comprehensive limiting to only PDFs requires an additional
+entry in the Warcprox-Meta header `mime-type-filters` key.*
+
 ``seeds``
 ~~~~~~~~~
 +------------------------+----------+---------+
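For reference, a hypothetical job configuration exercising both new settings, shown as the parsed dict that `new_job` receives (the YAML layout itself is documented in job-conf.rst; the id and seed URL here are made up):

job_conf = {
    "id": "example-pdf-job",                  # hypothetical job id
    "pdfs_only": True,                        # job-level: only fetch PDF outlinks
    "seeds": [
        {
            "url": "https://example.com/",    # hypothetical seed
            "video_capture": "BLOCK_VIDEO_MIME_TYPES",  # seed-level video setting
        },
    ],
}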
@@ -158,6 +172,25 @@ other fields like checkboxes and/or hidden fields, brozzler will leave
 the default values in place. Brozzler submits login forms after page load.
 Then brozzling proceeds as usual.

+``video_capture``
+~~~~~~~~~~~~~~~~~
++--------+----------+--------------------------+
+| type   | required | default                  |
++========+==========+==========================+
+| string | yes      | ``ENABLE_VIDEO_CAPTURE`` |
++--------+----------+--------------------------+
+Determines the level of video capture for the seed. This is an enumeration with four possible values:
+
+* ENABLE_VIDEO_CAPTURE (default): All video is captured.
+* DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
+  combination of the next two values.
+* BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header containing
+  the word "video" is not captured.
+* DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.
+
+*Note: Ensuring full video MIME type blocking requires an additional entry in
+the Warcprox-Meta header `mime-type-filters` key.*
+
 Seed-level / top-level settings
 -------------------------------
 These are seed settings that can also be specified at the top level, in which