From 6a0b0b058d4e1860073b3abd7bd9ab1e481cee04 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 23 Sep 2024 18:37:14 -0700 Subject: [PATCH] updates post-walkthru --- brozzler/__init__.py | 4 ++++ brozzler/cli.py | 12 +++++------ brozzler/worker.py | 14 ++++++++++--- brozzler/ydl.py | 49 ++++++++++++++++++++++---------------------- 4 files changed, 46 insertions(+), 33 deletions(-) diff --git a/brozzler/__init__.py b/brozzler/__init__.py index 5040e69..7dd284d 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -39,6 +39,10 @@ class PageInterstitialShown(Exception): pass +class VideoExtractorError(Exception): + pass + + class ProxyError(Exception): pass diff --git a/brozzler/cli.py b/brozzler/cli.py index 11226fa..3cb7c9a 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -238,7 +238,7 @@ def brozzle_page(argv=None): "--metrics_port", type=int, dest="metrics_port", - default=8888, + default=0, help="Port for brozzler's Prometheus scrape endpoint", ) arg_parser.add_argument( @@ -251,7 +251,7 @@ def brozzle_page(argv=None): "--env", dest="env", default=None, - help="env for Prometheus target registry", + help="deployment environment for this brozzler instance, e.g., prod or qa", ) arg_parser.add_argument( "--screenshot-full-page", dest="screenshot_full_page", action="store_true" @@ -298,7 +298,7 @@ def brozzle_page(argv=None): window_height=args.window_height, window_width=args.window_width, stealth=args.stealth, - metrics_port=int(args.metrics_port), + metrics_port=args.metrics_port, registry_url=args.registry_url, env=args.env, ) @@ -543,7 +543,7 @@ def brozzler_worker(argv=None): "--metrics_port", type=int, dest="metrics_port", - default=8888, + default=0, help="Port for brozzler's Prometheus scrape endpoint", ) arg_parser.add_argument( @@ -556,7 +556,7 @@ def brozzler_worker(argv=None): "--env", dest="env", default=None, - help="env for Prometheus target registry", + help="deployment environment for this brozzler instance, e.g., prod or qa", ) add_common_options(arg_parser, argv) @@ -614,7 +614,7 @@ def brozzler_worker(argv=None): skip_visit_hashtags=args.skip_visit_hashtags, skip_youtube_dl=args.skip_youtube_dl, stealth=args.stealth, - metrics_port=int(args.metrics_port), + metrics_port=args.metrics_port, registry_url=args.registry_url, env=args.env, ) diff --git a/brozzler/worker.py b/brozzler/worker.py index e4f984d..c780644 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -72,7 +72,7 @@ class BrozzlerWorker: stealth=False, window_height=900, window_width=1400, - metrics_port=None, + metrics_port=0, registry_url=None, env=None, ): @@ -111,8 +111,11 @@ class BrozzlerWorker: self._start_stop_lock = threading.Lock() self._shutdown = threading.Event() - # Setup metrics - metrics.register_prom_metrics(self._metrics_port, self._registry_url, self._env) + # set up metrics + if self._metrics_port > 0: + metrics.register_prom_metrics(self._metrics_port, self._registry_url, self._env) + else: + logging.warning("not starting prometheus scrape endpoint: metrics_port is undefined") def _choose_warcprox(self): warcproxes = self._service_registry.available_services("warcprox") @@ -284,6 +287,11 @@ class BrozzlerWorker: raise except brozzler.ProxyError: raise + except brozzler.VideoExtractorError as e: + logging.error( + "error extracting video info: %s", + e, + ) except Exception as e: if ( hasattr(e, "exc_info") diff --git a/brozzler/ydl.py b/brozzler/ydl.py index ebfadce..b1266fd 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -71,6 +71,11 @@ def should_ytdlp(site, page, page_status, skip_av_seeds): return True +def isyoutubehost(url): + # split 1 splits scheme from url, split 2 splits path from hostname, split 3 splits query string on hostname + return "youtube.com" in url.split("//")[-1].split("/")[0].split("?")[0] + + class ExtraHeaderAdder(urllib.request.BaseHandler): def __init__(self, extra_headers): self.extra_headers = extra_headers @@ -237,14 +242,9 @@ def _build_youtube_dl(worker, destdir, site, page): worker.logger.info( "[ydl_postprocess_hook] postprocessor: {}".format(d["postprocessor"]) ) - youtube_host = ( - "youtube.com" - in d["info_dict"]["webpage_url"] - .split("//")[-1] - .split("/")[0] - .split("?")[0] - ) - metrics.brozzler_ydl_download_successes.labels(youtube_host).inc(1) + is_youtube_host = isyoutubehost(d["info_dict"]["webpage_url"]) + + metrics.brozzler_ydl_download_successes.labels(is_youtube_host).inc(1) if worker._using_warcprox(site): _YoutubeDL._push_video_to_warcprox( _YoutubeDL, site, d["info_dict"], d["postprocessor"] @@ -283,15 +283,14 @@ def _build_youtube_dl(worker, destdir, site, page): } ytdlp_url = page.redirect_url if page.redirect_url else page.url - youtube_host = ( - "youtube.com" in ytdlp_url.split("//")[-1].split("/")[0].split("?")[0] - ) - if youtube_host and YTDLP_PROXY: + is_youtube_host = isyoutubehost(ytdlp_url) + if is_youtube_host and YTDLP_PROXY: ydl_opts["proxy"] = YTDLP_PROXY - ytdlp_proxy_for_print = ( + # don't log proxy value secrets + ytdlp_proxy_for_logs = ( YTDLP_PROXY.split("@")[1] if "@" in YTDLP_PROXY else "@@@" ) - logging.info("using yt-dlp proxy ... %s", ytdlp_proxy_for_print) + logging.info("using yt-dlp proxy ... %s", ytdlp_proxy_for_logs) # skip warcprox proxying yt-dlp v.2023.07.06: youtube extractor using ranges # if worker._proxy_for(site): @@ -302,7 +301,7 @@ def _build_youtube_dl(worker, destdir, site, page): ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page))) ydl.pushed_videos = [] ydl.url = ytdlp_url - ydl.youtube_host = youtube_host + ydl.is_youtube_host = is_youtube_host return ydl @@ -330,12 +329,12 @@ def _try_youtube_dl(worker, ydl, site, page): while attempt < MAX_YTDLP_ATTEMPTS: try: logging.info("trying yt-dlp on %s", ydl.url) - # should_download_vid = not ydl.youtube_host + # should_download_vid = not ydl.is_youtube_host # then # ydl.extract_info(str(urlcanon.whatwg(ydl.url)), download=should_download_vid) - # if ydl.youtube_host and ie_result: + # if ydl.is_youtube_host and ie_result: # download_url = ie_result.get("url") - metrics.brozzler_ydl_extract_attempts.labels(ydl.youtube_host).inc(1) + metrics.brozzler_ydl_extract_attempts.labels(ydl.is_youtube_host).inc(1) with brozzler.thread_accept_exceptions(): # we do whatwg canonicalization here to avoid "" resulting in ProxyError @@ -344,7 +343,7 @@ def _try_youtube_dl(worker, ydl, site, page): ie_result = ydl.sanitize_info( ydl.extract_info(str(urlcanon.whatwg(ydl.url))) ) - metrics.brozzler_ydl_extract_successes.labels(ydl.youtube_host).inc(1) + metrics.brozzler_ydl_extract_successes.labels(ydl.is_youtube_host).inc(1) break except brozzler.ShutdownRequested as e: raise @@ -370,18 +369,20 @@ def _try_youtube_dl(worker, ydl, site, page): logging.warning( "Failed after %s attempts. Error: %s", MAX_YTDLP_ATTEMPTS, e ) - raise brozzler.ProxyError( - "yt-dlp hit possible external proxy error from %s" % ydl.url + raise brozzler.VideoExtractorError( + "yt-dlp hit error extracting info for %s" % ydl.url ) else: + retry_wait = min(60, YTDLP_WAIT * (1.5**(attempt - 1))) logging.info( "Attempt %s failed. Retrying in %s seconds...", attempt, - YTDLP_WAIT, + retry_wait, ) - time.sleep(YTDLP_WAIT) + time.sleep(retry_wait) else: - raise brozzler.ProxyError("Proxy attempt(s) failed for unknown reason(s)") + raise brozzler.VideoExtractorError("yt-dlp hit unknown error extracting info for %s" % ydl.url) + logging.info("ytdlp completed successfully") _remember_videos(page, ydl.pushed_videos)