Merge branch 'master' into gmiller/2950-skip-ytdlp

Gretchen Miller 2024-12-13 08:50:44 -08:00
commit 4d0dfc150f
8 changed files with 76 additions and 17 deletions

brozzler/__init__.py

@@ -47,6 +47,10 @@ class ProxyError(Exception):
     pass
 
 
+class PageConnectionError(Exception):
+    pass
+
+
 class ReachedTimeLimit(Exception):
     pass

brozzler/browser.py

@@ -483,6 +483,7 @@ class Browser:
         skip_extract_outlinks=False,
         skip_visit_hashtags=False,
         skip_youtube_dl=False,
+        ytdlp_tmpdir="/tmp",
         simpler404=False,
         page_timeout=300,
         behavior_timeout=900,
@@ -658,11 +659,9 @@ class Browser:
     ):
         headers = extra_headers or {}
         headers["Accept-Encoding"] = "gzip"  # avoid encodings br, sdch
-        self.websock_thread.expect_result(self._command_id.peek())
         msg_id = self.send_to_chrome(
             method="Network.setExtraHTTPHeaders", params={"headers": headers}
         )
-        self._wait_for(lambda: self.websock_thread.received_result(msg_id), timeout=10)
         if user_agent:
             msg_id = self.send_to_chrome(
                 method="Network.setUserAgentOverride", params={"userAgent": user_agent}

brozzler/cli.py

@@ -265,6 +265,12 @@ def brozzle_page(argv=None):
     arg_parser.add_argument(
         "--skip-youtube-dl", dest="skip_youtube_dl", action="store_true"
     )
+    arg_parser.add_argument(
+        "--ytdlp_tmpdir",
+        dest="ytdlp_tmpdir",
+        default="/tmp",
+        help="specify a temp dir for ytdlp; defaults to /tmp",
+    )
     arg_parser.add_argument("--simpler404", dest="simpler404", action="store_true")
     add_common_options(arg_parser, argv)
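
Note: the new --ytdlp_tmpdir flag is plain argparse; a minimal standalone sketch of how it parses (flag and help text copied from the hunk above, parser construction hypothetical):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ytdlp_tmpdir",
        dest="ytdlp_tmpdir",
        default="/tmp",
        help="specify a temp dir for ytdlp; defaults to /tmp",
    )
    print(parser.parse_args([]).ytdlp_tmpdir)  # -> /tmp
    print(parser.parse_args(["--ytdlp_tmpdir", "/var/tmp"]).ytdlp_tmpdir)  # -> /var/tmp
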
@@ -292,6 +298,7 @@ def brozzle_page(argv=None):
         skip_extract_outlinks=args.skip_extract_outlinks,
         skip_visit_hashtags=args.skip_visit_hashtags,
         skip_youtube_dl=args.skip_youtube_dl,
+        ytdlp_tmpdir=args.ytdlp_tmpdir,
         simpler404=args.simpler404,
         screenshot_full_page=args.screenshot_full_page,
         download_throughput=args.download_throughput,
@@ -533,6 +540,12 @@ def brozzler_worker(argv=None):
         action="store_true",
         help=argparse.SUPPRESS,
     )
+    arg_parser.add_argument(
+        "--ytdlp_tmpdir",
+        dest="ytdlp_tmpdir",
+        default="/tmp",
+        help=argparse.SUPPRESS,
+    )
     arg_parser.add_argument(
         "--stealth",
         dest="stealth",
@@ -598,6 +611,7 @@ def brozzler_worker(argv=None):
         skip_extract_outlinks=args.skip_extract_outlinks,
         skip_visit_hashtags=args.skip_visit_hashtags,
         skip_youtube_dl=args.skip_youtube_dl,
+        ytdlp_tmpdir=args.ytdlp_tmpdir,
         stealth=args.stealth,
         metrics_port=args.metrics_port,
         registry_url=args.registry_url,

brozzler/frontier.py

@@ -138,7 +138,14 @@ class RethinkDbFrontier:
             emit=lambda acc, site, new_acc: r.branch(
                 r.and_(
                     r.or_(
-                        site["claimed"].not_(),
+                        # Avoid tight loop when unclaimed site was recently disclaimed
+                        r.and_(
+                            site["claimed"].not_(),
+                            r.or_(
+                                site.has_fields("last_disclaimed").not_(),
+                                site["last_disclaimed"].lt(r.now().sub(20)),
+                            ),
+                        ),
                         site["last_claimed"].lt(r.now().sub(60 * 60)),
                     ),
                     r.or_(
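
Note: a plain-Python reading of the new claim condition (illustrative sketch only; the real predicate is ReQL and runs inside RethinkDB; `now` is seconds since epoch):

    def site_may_be_claimed(site, now):
        # claimable if unclaimed, unless it was disclaimed within the last
        # 20 seconds (this is the tight claim/disclaim loop being avoided)...
        recently_disclaimed = (
            "last_disclaimed" in site and site["last_disclaimed"] >= now - 20
        )
        unclaimed_ok = not site["claimed"] and not recently_disclaimed
        # ...or if an existing claim looks stale (over an hour old)
        stale_claim = site["last_claimed"] < now - 60 * 60
        return unclaimed_ok or stale_claim
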
@@ -218,6 +225,11 @@ class RethinkDbFrontier:
                 index="priority_by_site",
             )
             .order_by(index=r.desc("priority_by_site"))
+            .filter(
+                lambda page: r.or_(
+                    page.has_fields("retry_after").not_(), r.now() > page["retry_after"]
+                )
+            )
             .limit(1)
             .update(
                 {"claimed": True, "last_claimed_by": worker_id}, return_changes="always"

brozzler/model.py

@@ -411,6 +411,10 @@ class Page(doublethink.Document):
         return hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
 
     def populate_defaults(self):
+        if not "retry_after" in self:
+            self.retry_after = None
+        if not "failed_attempts" in self:
+            self.failed_attempts = 0
         if not "hops_from_seed" in self:
             self.hops_from_seed = 0
         if not "hop_path" in self:

brozzler/worker.py

@@ -22,6 +22,7 @@ import logging
 import brozzler
 import brozzler.browser
 from brozzler.model import VideoCaptureOptions
+import datetime
 import threading
 import time
 import urllib.request
@@ -63,6 +64,7 @@ class BrozzlerWorker:
         skip_extract_outlinks=False,
         skip_visit_hashtags=False,
         skip_youtube_dl=False,
+        ytdlp_tmpdir="/tmp",
         simpler404=False,
         screenshot_full_page=False,
         page_timeout=300,
@@ -87,6 +89,7 @@ class BrozzlerWorker:
         self._skip_extract_outlinks = skip_extract_outlinks
         self._skip_visit_hashtags = skip_visit_hashtags
         self._skip_youtube_dl = skip_youtube_dl
+        self._ytdlp_tmpdir = ytdlp_tmpdir
         self._simpler404 = simpler404
         self._screenshot_full_page = screenshot_full_page
         self._page_timeout = page_timeout
@@ -286,12 +289,13 @@ class BrozzlerWorker:
                 browser, site, page, on_screenshot, on_request
             )
             outlinks.update(browser_outlinks)
+            status_code = browser.websock_thread.page_status
+            if status_code in [502, 504]:
+                raise brozzler.PageConnectionError()
         except brozzler.PageInterstitialShown:
             self.logger.info("page interstitial shown (http auth): %s", page)
 
-        if enable_youtube_dl and ydl.should_ytdlp(
-            site, page, browser.websock_thread.page_status
-        ):
+        if enable_youtube_dl and ydl.should_ytdlp(site, page, status_code):
             try:
                 ydl_outlinks = ydl.do_youtube_dl(self, site, page)
                 metrics.brozzler_ydl_urls_checked.inc(1)
@@ -439,7 +443,7 @@ class BrozzlerWorker:
                 self.logger.trace("%r", chrome_msg)
                 if chrome_msg.get("params", {}).get("versions"):
                     url = chrome_msg.get("params", {}).get("versions")[0].get("scriptURL")
-                    if url and url not in sw_fetched:
+                    if url and url.startswith("http") and url not in sw_fetched:
                         self.logger.info("fetching service worker script %s", url)
                         self._fetch_url(site, url=url)
                         sw_fetched.add(url)
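
Note: the startswith("http") guard exists because Chrome can report service-worker script URLs in schemes that cannot be fetched over HTTP; a quick illustration (example URLs hypothetical):

    for url in (
        "https://example.com/sw.js",         # fetchable, passes the guard
        "blob:https://example.com/1234",     # not fetchable over HTTP, skipped
        "chrome-extension://abcdef/sw.js",   # skipped
    ):
        print(url, url.startswith("http"))
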
@@ -466,6 +470,7 @@ class BrozzlerWorker:
             skip_extract_outlinks=self._skip_extract_outlinks,
             skip_visit_hashtags=self._skip_visit_hashtags,
             skip_youtube_dl=self._skip_youtube_dl,
+            ytdlp_tmpdir=self._ytdlp_tmpdir,
             simpler404=self._simpler404,
             screenshot_full_page=self._screenshot_full_page,
             page_timeout=self._page_timeout,
@@ -560,11 +565,25 @@ class BrozzlerWorker:
                 # using brozzler-worker --proxy, nothing to do but try the
                 # same proxy again next time
                 logging.error("proxy error (self._proxy=%r)", self._proxy, exc_info=1)
-        except:
-            self.logger.error(
-                "unexpected exception site=%r page=%r", site, page, exc_info=True
-            )
+        except (brozzler.PageConnectionError, Exception) as e:
+            if isinstance(e, brozzler.PageConnectionError):
+                self.logger.error(
+                    "Page status code possibly indicates connection failure between host and warcprox: site=%r page=%r",
+                    site,
+                    page,
+                    exc_info=True,
+                )
+            else:
+                self.logger.error(
+                    "unexpected exception site=%r page=%r", site, page, exc_info=True
+                )
             if page:
+                # Calculate backoff in seconds based on number of failed attempts.
+                # Minimum of 60, max of 135 giving delays of 60, 90, 135, 135...
+                retry_delay = min(135, 60 * (1.5**page.failed_attempts))
+                page.retry_after = doublethink.utcnow() + datetime.timedelta(
+                    seconds=retry_delay
+                )
                 page.failed_attempts = (page.failed_attempts or 0) + 1
                 if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES:
                     self.logger.info(
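
Note: the backoff comment can be checked directly; with the cap applied, failed_attempts of 0, 1, 2, 3 yield delays of 60, 90, 135, 135 seconds:

    for failed_attempts in range(4):
        print(min(135, 60 * (1.5**failed_attempts)))
    # -> 60.0, 90.0, 135, 135
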
@@ -575,6 +594,8 @@ class BrozzlerWorker:
                     )
                     self._frontier.completed_page(site, page)
                     page = None
+                else:
+                    page.save()
         finally:
             if start:
                 site.active_brozzling_time = (

brozzler/ydl.py

@@ -34,8 +34,9 @@ import time
 thread_local = threading.local()
 
 YTDLP_PROXY = ""
-MAX_YTDLP_ATTEMPTS = 4
+PROXY_ATTEMPTS = 4
 YTDLP_WAIT = 10
@@ -314,8 +315,9 @@ def _remember_videos(page, pushed_videos=None):
 
 def _try_youtube_dl(worker, ydl, site, page):
+    max_attempts = PROXY_ATTEMPTS if ydl.is_youtube_host else 1
     attempt = 0
-    while attempt < MAX_YTDLP_ATTEMPTS:
+    while attempt < max_attempts:
         try:
             logging.info("trying yt-dlp on %s", ydl.url)
             # should_download_vid = not ydl.is_youtube_host
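
Note: after the rename, only YouTube hosts get the multi-attempt budget, presumably because those failures tend to be transient YTDLP_PROXY rejections worth retrying; every other host now fails fast. A quick check of the logic (standalone sketch):

    PROXY_ATTEMPTS = 4  # value from the hunk above

    def attempts_for(is_youtube_host):
        # mirrors the max_attempts expression in _try_youtube_dl
        return PROXY_ATTEMPTS if is_youtube_host else 1

    print(attempts_for(True), attempts_for(False))  # -> 4 1
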
@@ -353,9 +355,9 @@ def _try_youtube_dl(worker, ydl, site, page):
             # OSError('Tunnel connection failed: 464 Host Not Allowed') (caused by ProxyError...)
             # and others...
             attempt += 1
-            if attempt == MAX_YTDLP_ATTEMPTS:
+            if attempt == max_attempts:
                 logging.warning(
-                    "Failed after %s attempts. Error: %s", MAX_YTDLP_ATTEMPTS, e
+                    "Failed after %s attempt(s). Error: %s", max_attempts, e
                 )
                 raise brozzler.VideoExtractorError(
                     "yt-dlp hit error extracting info for %s" % ydl.url
@@ -409,7 +411,10 @@ def do_youtube_dl(worker, site, page):
     Returns:
         `list` of `str`: outlink urls
     """
-    with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
+    with tempfile.TemporaryDirectory(
+        prefix="brzl-ydl-", dir=worker._ytdlp_tmpdir
+    ) as tempdir:
+        logging.info("tempdir for yt-dlp: %s", tempdir)
         ydl = _build_youtube_dl(worker, tempdir, site, page)
         ie_result = _try_youtube_dl(worker, ydl, site, page)
         outlinks = set()
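
Note: the dir= argument is standard-library behavior: the temporary directory is created under the given parent instead of the platform default, and is still removed when the context exits. A standalone illustration (parent path hypothetical):

    import tempfile

    with tempfile.TemporaryDirectory(prefix="brzl-ydl-", dir="/var/tmp") as tempdir:
        print(tempdir)  # e.g. /var/tmp/brzl-ydl-k3j9x2ab; deleted on exit
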

setup.py

@@ -34,7 +34,7 @@ def find_package_data(package):
 setuptools.setup(
     name="brozzler",
-    version="1.6.3",
+    version="1.6.5",
     description="Distributed web crawling with browsers",
     url="https://github.com/internetarchive/brozzler",
     author="Noah Levitt",