mirror of https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00

Merge branch 'master' into gmiller/2950-skip-ytdlp

commit 4d0dfc150f
@@ -47,6 +47,10 @@ class ProxyError(Exception):
     pass


+class PageConnectionError(Exception):
+    pass
+
+
 class ReachedTimeLimit(Exception):
     pass

@@ -483,6 +483,7 @@ class Browser:
         skip_extract_outlinks=False,
         skip_visit_hashtags=False,
         skip_youtube_dl=False,
+        ytdlp_tmpdir="/tmp",
         simpler404=False,
         page_timeout=300,
         behavior_timeout=900,
@@ -658,11 +659,9 @@ class Browser:
     ):
         headers = extra_headers or {}
         headers["Accept-Encoding"] = "gzip"  # avoid encodings br, sdch
-        self.websock_thread.expect_result(self._command_id.peek())
         msg_id = self.send_to_chrome(
             method="Network.setExtraHTTPHeaders", params={"headers": headers}
         )
-        self._wait_for(lambda: self.websock_thread.received_result(msg_id), timeout=10)
         if user_agent:
             msg_id = self.send_to_chrome(
                 method="Network.setUserAgentOverride", params={"userAgent": user_agent}
@@ -265,6 +265,12 @@ def brozzle_page(argv=None):
     arg_parser.add_argument(
         "--skip-youtube-dl", dest="skip_youtube_dl", action="store_true"
     )
+    arg_parser.add_argument(
+        "--ytdlp_tmpdir",
+        dest="ytdlp_tmpdir",
+        default="/tmp",
+        help="specify a temp dir for ytdlp; defaults to /tmp",
+    )
     arg_parser.add_argument("--simpler404", dest="simpler404", action="store_true")
     add_common_options(arg_parser, argv)

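As a sanity check, the new option can be exercised with a standalone parser; this is a minimal sketch, not brozzler's full argument setup:

import argparse

# mirror of the added option, parsed in isolation
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument(
    "--ytdlp_tmpdir",
    dest="ytdlp_tmpdir",
    default="/tmp",
    help="specify a temp dir for ytdlp; defaults to /tmp",
)
args = arg_parser.parse_args(["--ytdlp_tmpdir", "/var/tmp/ytdlp"])
print(args.ytdlp_tmpdir)  # /var/tmp/ytdlp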
@@ -292,6 +298,7 @@ def brozzle_page(argv=None):
         skip_extract_outlinks=args.skip_extract_outlinks,
         skip_visit_hashtags=args.skip_visit_hashtags,
         skip_youtube_dl=args.skip_youtube_dl,
+        ytdlp_tmpdir=args.ytdlp_tmpdir,
         simpler404=args.simpler404,
         screenshot_full_page=args.screenshot_full_page,
         download_throughput=args.download_throughput,
@@ -533,6 +540,12 @@ def brozzler_worker(argv=None):
         action="store_true",
         help=argparse.SUPPRESS,
     )
+    arg_parser.add_argument(
+        "--ytdlp_tmpdir",
+        dest="ytdlp_tmpdir",
+        default="/tmp",
+        help="argparse.SUPPRESS",
+    )
     arg_parser.add_argument(
         "--stealth",
         dest="stealth",
@@ -598,6 +611,7 @@ def brozzler_worker(argv=None):
         skip_extract_outlinks=args.skip_extract_outlinks,
         skip_visit_hashtags=args.skip_visit_hashtags,
         skip_youtube_dl=args.skip_youtube_dl,
+        ytdlp_tmpdir=args.ytdlp_tmpdir,
         stealth=args.stealth,
         metrics_port=args.metrics_port,
         registry_url=args.registry_url,
@@ -138,7 +138,14 @@ class RethinkDbFrontier:
                 emit=lambda acc, site, new_acc: r.branch(
                     r.and_(
                         r.or_(
-                            site["claimed"].not_(),
+                            # Avoid tight loop when unclaimed site was recently disclaimed
+                            r.and_(
+                                site["claimed"].not_(),
+                                r.or_(
+                                    site.has_fields("last_disclaimed").not_(),
+                                    site["last_disclaimed"].lt(r.now().sub(20)),
+                                ),
+                            ),
                             site["last_claimed"].lt(r.now().sub(60 * 60)),
                         ),
                         r.or_(
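Evaluated client-side as plain Python, the claim condition in the hunk above behaves roughly like this (a sketch covering only the portion shown; `site` is treated as a plain dict and `now` as a Unix timestamp, whereas the real predicate runs server-side in ReQL):

import time

def claimable(site, now=None):
    # a site is claimable if it is unclaimed and was not disclaimed within
    # the last 20 seconds, or if its claim is more than an hour old (stale)
    now = now or time.time()
    recently_disclaimed = (
        "last_disclaimed" in site and site["last_disclaimed"] >= now - 20
    )
    unclaimed_and_settled = not site["claimed"] and not recently_disclaimed
    claim_expired = site["last_claimed"] < now - 60 * 60
    return unclaimed_and_settled or claim_expired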
@@ -218,6 +225,11 @@ class RethinkDbFrontier:
                 index="priority_by_site",
             )
             .order_by(index=r.desc("priority_by_site"))
+            .filter(
+                lambda page: r.or_(
+                    page.has_fields("retry_after").not_(), r.now() > page["retry_after"]
+                )
+            )
             .limit(1)
             .update(
                 {"claimed": True, "last_claimed_by": worker_id}, return_changes="always"
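The added .filter(...) keeps only pages that are out of their retry backoff window. In plain Python the predicate is roughly (a sketch; `page` as a dict and `now` as a datetime, while the real filter runs in ReQL, where has_fields treats null as absent):

import datetime

def eligible(page, now=None):
    # claimable if the page has no pending retry_after timestamp,
    # or that timestamp has already passed
    now = now or datetime.datetime.utcnow()
    return page.get("retry_after") is None or now > page["retry_after"]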
@@ -411,6 +411,10 @@ class Page(doublethink.Document):
         return hashlib.sha1(digest_this.encode("utf-8")).hexdigest()

     def populate_defaults(self):
+        if not "retry_after" in self:
+            self.retry_after = None
+        if not "failed_attempts" in self:
+            self.failed_attempts = 0
         if not "hops_from_seed" in self:
             self.hops_from_seed = 0
         if not "hop_path" in self:
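These defaults make older Page documents, which predate the new fields, behave as if they have never failed. A minimal sketch of the pattern using a plain dict stand-in (the real Page is a doublethink.Document backed by RethinkDB, with attribute-style access):

class FakePage(dict):
    # stand-in for brozzler.Page; defaults applied the same way
    def populate_defaults(self):
        self.setdefault("retry_after", None)
        self.setdefault("failed_attempts", 0)

page = FakePage(url="https://example.com/")
page.populate_defaults()
print(page["failed_attempts"])  # 0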
@@ -22,6 +22,7 @@ import logging
 import brozzler
 import brozzler.browser
 from brozzler.model import VideoCaptureOptions
+import datetime
 import threading
 import time
 import urllib.request
@@ -63,6 +64,7 @@ class BrozzlerWorker:
         skip_extract_outlinks=False,
         skip_visit_hashtags=False,
         skip_youtube_dl=False,
+        ytdlp_tmpdir="/tmp",
         simpler404=False,
         screenshot_full_page=False,
         page_timeout=300,
@@ -87,6 +89,7 @@ class BrozzlerWorker:
         self._skip_extract_outlinks = skip_extract_outlinks
         self._skip_visit_hashtags = skip_visit_hashtags
         self._skip_youtube_dl = skip_youtube_dl
+        self._ytdlp_tmpdir = ytdlp_tmpdir
         self._simpler404 = simpler404
         self._screenshot_full_page = screenshot_full_page
         self._page_timeout = page_timeout
@@ -286,12 +289,13 @@ class BrozzlerWorker:
                     browser, site, page, on_screenshot, on_request
                 )
                 outlinks.update(browser_outlinks)
+                status_code = browser.websock_thread.page_status
+                if status_code in [502, 504]:
+                    raise brozzler.PageConnectionError()
             except brozzler.PageInterstitialShown:
                 self.logger.info("page interstitial shown (http auth): %s", page)

-        if enable_youtube_dl and ydl.should_ytdlp(
-            site, page, browser.websock_thread.page_status
-        ):
+        if enable_youtube_dl and ydl.should_ytdlp(site, page, status_code):
             try:
                 ydl_outlinks = ydl.do_youtube_dl(self, site, page)
                 metrics.brozzler_ydl_urls_checked.inc(1)
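The 502/504 check converts gateway errors (typically the browser failing to get a real response through warcprox) into the typed PageConnectionError added at the top of this commit. A self-contained sketch of the pattern:

class PageConnectionError(Exception):
    pass

def check_page_status(status_code):
    # 502 Bad Gateway / 504 Gateway Timeout suggest the page never loaded
    # through the proxy, so surface that as a typed error
    if status_code in [502, 504]:
        raise PageConnectionError()

check_page_status(200)   # fine
# check_page_status(504) # would raise PageConnectionError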
@@ -439,7 +443,7 @@ class BrozzlerWorker:
             self.logger.trace("%r", chrome_msg)
             if chrome_msg.get("params", {}).get("versions"):
                 url = chrome_msg.get("params", {}).get("versions")[0].get("scriptURL")
-                if url and url not in sw_fetched:
+                if url and url.startswith("http") and url not in sw_fetched:
                     self.logger.info("fetching service worker script %s", url)
                     self._fetch_url(site, url=url)
                     sw_fetched.add(url)
@@ -466,6 +470,7 @@ class BrozzlerWorker:
             skip_extract_outlinks=self._skip_extract_outlinks,
             skip_visit_hashtags=self._skip_visit_hashtags,
             skip_youtube_dl=self._skip_youtube_dl,
+            ytdlp_tmpdir=self._ytdlp_tmpdir,
             simpler404=self._simpler404,
             screenshot_full_page=self._screenshot_full_page,
             page_timeout=self._page_timeout,
@@ -560,11 +565,25 @@ class BrozzlerWorker:
                 # using brozzler-worker --proxy, nothing to do but try the
                 # same proxy again next time
                 logging.error("proxy error (self._proxy=%r)", self._proxy, exc_info=1)
-            except:
-                self.logger.error(
-                    "unexpected exception site=%r page=%r", site, page, exc_info=True
-                )
+            except (brozzler.PageConnectionError, Exception) as e:
+                if isinstance(e, brozzler.PageConnectionError):
+                    self.logger.error(
+                        "Page status code possibly indicates connection failure between host and warcprox: site=%r page=%r",
+                        site,
+                        page,
+                        exc_info=True,
+                    )
+                else:
+                    self.logger.error(
+                        "unexpected exception site=%r page=%r", site, page, exc_info=True
+                    )
                 if page:
+                    # Calculate backoff in seconds based on number of failed attempts.
+                    # Minimum of 60, max of 135 giving delays of 60, 90, 135, 135...
+                    retry_delay = min(135, 60 * (1.5**page.failed_attempts))
+                    page.retry_after = doublethink.utcnow() + datetime.timedelta(
+                        seconds=retry_delay
+                    )
                     page.failed_attempts = (page.failed_attempts or 0) + 1
                     if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES:
                         self.logger.info(
@@ -575,6 +594,8 @@ class BrozzlerWorker:
                         )
                         self._frontier.completed_page(site, page)
                         page = None
+                    else:
+                        page.save()
         finally:
             if start:
                 site.active_brozzling_time = (
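The backoff arithmetic above is easy to verify: min(135, 60 * 1.5**n) yields 60, 90, 135, and then stays capped at 135 for every later attempt. A runnable check:

def retry_delay_seconds(failed_attempts):
    # 60 * 1.5**0 = 60, 60 * 1.5**1 = 90, 60 * 1.5**2 = 135, capped thereafter
    return min(135, 60 * (1.5**failed_attempts))

for n in range(5):
    print(n, retry_delay_seconds(n))
# 0 60.0, 1 90.0, 2 135, 3 135, 4 135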
@@ -34,8 +34,9 @@ import time

 thread_local = threading.local()

+
 YTDLP_PROXY = ""
-MAX_YTDLP_ATTEMPTS = 4
+PROXY_ATTEMPTS = 4
 YTDLP_WAIT = 10


@@ -314,8 +315,9 @@ def _remember_videos(page, pushed_videos=None):


 def _try_youtube_dl(worker, ydl, site, page):
+    max_attempts = PROXY_ATTEMPTS if ydl.is_youtube_host else 1
     attempt = 0
-    while attempt < MAX_YTDLP_ATTEMPTS:
+    while attempt < max_attempts:
         try:
             logging.info("trying yt-dlp on %s", ydl.url)
             # should_download_vid = not ydl.is_youtube_host
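With this change only YouTube hosts, which go through the yt-dlp proxy, get multiple attempts; every other host gets exactly one. A condensed sketch of the loop shape (the extract callable and the sleep between attempts are assumptions for illustration):

import logging
import time

PROXY_ATTEMPTS = 4
YTDLP_WAIT = 10

def try_extract(extract, is_youtube_host):
    # one attempt for ordinary hosts, up to PROXY_ATTEMPTS for YouTube hosts
    max_attempts = PROXY_ATTEMPTS if is_youtube_host else 1
    attempt = 0
    while attempt < max_attempts:
        try:
            return extract()
        except Exception as e:
            attempt += 1
            if attempt == max_attempts:
                logging.warning(
                    "Failed after %s attempt(s). Error: %s", max_attempts, e
                )
                raise
            time.sleep(YTDLP_WAIT)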
@@ -353,9 +355,9 @@ def _try_youtube_dl(worker, ydl, site, page):
             # OSError('Tunnel connection failed: 464 Host Not Allowed') (caused by ProxyError...)
             # and others...
             attempt += 1
-            if attempt == MAX_YTDLP_ATTEMPTS:
+            if attempt == max_attempts:
                 logging.warning(
-                    "Failed after %s attempts. Error: %s", MAX_YTDLP_ATTEMPTS, e
+                    "Failed after %s attempt(s). Error: %s", max_attempts, e
                 )
                 raise brozzler.VideoExtractorError(
                     "yt-dlp hit error extracting info for %s" % ydl.url
@@ -409,7 +411,10 @@ def do_youtube_dl(worker, site, page):
     Returns:
         `list` of `str`: outlink urls
     """
-    with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
+    with tempfile.TemporaryDirectory(
+        prefix="brzl-ydl-", dir=worker._ytdlp_tmpdir
+    ) as tempdir:
+        logging.info("tempdir for yt-dlp: %s", tempdir)
         ydl = _build_youtube_dl(worker, tempdir, site, page)
         ie_result = _try_youtube_dl(worker, ydl, site, page)
         outlinks = set()
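tempfile.TemporaryDirectory accepts a dir= argument that controls the parent directory, which is what lets --ytdlp_tmpdir move yt-dlp scratch space off /tmp. A quick sketch:

import tempfile

# create the scratch dir under a configurable parent instead of the default
with tempfile.TemporaryDirectory(prefix="brzl-ydl-", dir="/tmp") as tempdir:
    print("tempdir for yt-dlp:", tempdir)  # e.g. /tmp/brzl-ydl-k3j2x9
# the directory and its contents are removed when the with block exits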
setup.py (2 lines changed)
@@ -34,7 +34,7 @@ def find_package_data(package):

 setuptools.setup(
     name="brozzler",
-    version="1.6.3",
+    version="1.6.5",
     description="Distributed web crawling with browsers",
     url="https://github.com/internetarchive/brozzler",
     author="Noah Levitt",