From 7de648345360b2799c7bd89346e2133a4b486262 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 20 Sep 2024 14:43:05 -0700 Subject: [PATCH 1/5] skip brozzler_resources metrics --- brozzler/metrics.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/brozzler/metrics.py b/brozzler/metrics.py index ad5737a..81239bc 100644 --- a/brozzler/metrics.py +++ b/brozzler/metrics.py @@ -22,10 +22,6 @@ brozzler_page_processing_duration_seconds = Histogram("brozzler_page_processing_ brozzler_outlinks_found = Counter("brozzler_outlinks_found", "number of outlinks found by brozzler") brozzler_last_page_crawled_time = Gauge("brozzler_last_page_crawled_time", "time of last page visit, in seconds since UNIX epoch") brozzler_in_progress_pages = Gauge("brozzler_in_progress_pages", "number of pages currently processing with brozzler") -brozzler_resources_requested = Counter("brozzler_resources_requested", "number of resources requested", labelnames=["resource_type"]) -brozzler_resources_fetched = Counter("brozzler_resources_fetched", "number of resources fetched", labelnames=["resource_type", "status_code"]) -brozzler_resources_size_total = Counter("brozzler_resources_size_total", "total size of resources fetched", labelnames=["resource_type"]) -brozzler_resources_fetch_time = Counter("brozzler_resources_fetch_time", "time spent fetching resources", labelnames=["resource_type"]) brozzler_ydl_urls_checked = Counter("brozzler_ydl_urls_checked", "count of urls checked by brozzler yt-dlp") brozzler_ydl_extract_attempts = Counter("brozzler_ydl_extract_attempts", "count of extracts attempted by brozzler yt-dlp", labelnames=["youtube_host"]) brozzler_ydl_extract_successes = Counter("brozzler_ydl_extract_successes", "count of extracts completed by brozzler yt-dlp", labelnames=["youtube_host"]) From 3e9030a376010583ec12e5c982d5d2bfa53da54c Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 23 Sep 2024 15:16:07 -0700 Subject: [PATCH 2/5] rm oddly merged try except block --- brozzler/ydl.py | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index ae56fa7..ebfadce 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -184,31 +184,29 @@ def _build_youtube_dl(worker, destdir, site, page): worker._proxy_for(site), url, ) - try: - with open(info_dict["filepath"], "rb") as f: - # include content-length header to avoid chunked - # transfer, which warcprox currently rejects - extra_headers = dict(site.extra_headers()) - extra_headers["content-length"] = size - request, response = worker._warcprox_write_record( - warcprox_address=worker._proxy_for(site), - url=url, - warc_type="resource", - content_type=mimetype, - payload=f, - extra_headers=extra_headers, - ) - # consulted by _remember_videos() - ydl.pushed_videos.append( - { - "url": url, - "response_code": response.code, - "content-type": mimetype, - "content-length": size, - } + with open(info_dict["filepath"], "rb") as f: + # include content-length header to avoid chunked + # transfer, which warcprox currently rejects + extra_headers = dict(site.extra_headers()) + extra_headers["content-length"] = size + request, response = worker._warcprox_write_record( + warcprox_address=worker._proxy_for(site), + url=url, + warc_type="resource", + content_type=mimetype, + payload=f, + extra_headers=extra_headers, ) - except: - traceback.print_exc() + + # consulted by _remember_videos() + ydl.pushed_videos.append( + { + "url": url, + "response_code": response.code, + "content-type": mimetype, + "content-length": size, + } + ) def maybe_heartbeat_site_last_claimed(*args, **kwargs): # in case yt-dlp takes a long time, heartbeat site.last_claimed From fb43d3f2a49007d8bbd0ae479930b03887005a18 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 23 Sep 2024 15:20:29 -0700 Subject: [PATCH 3/5] bump version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2dc8928..4b556f1 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def find_package_data(package): setuptools.setup( name="brozzler", - version="1.5.55a3", + version="1.5.55", description="Distributed web crawling with browsers", url="https://github.com/internetarchive/brozzler", author="Noah Levitt", From 6a0b0b058d4e1860073b3abd7bd9ab1e481cee04 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 23 Sep 2024 18:37:14 -0700 Subject: [PATCH 4/5] updates post-walkthru --- brozzler/__init__.py | 4 ++++ brozzler/cli.py | 12 +++++------ brozzler/worker.py | 14 ++++++++++--- brozzler/ydl.py | 49 ++++++++++++++++++++++---------------------- 4 files changed, 46 insertions(+), 33 deletions(-) diff --git a/brozzler/__init__.py b/brozzler/__init__.py index 5040e69..7dd284d 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -39,6 +39,10 @@ class PageInterstitialShown(Exception): pass +class VideoExtractorError(Exception): + pass + + class ProxyError(Exception): pass diff --git a/brozzler/cli.py b/brozzler/cli.py index 11226fa..3cb7c9a 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -238,7 +238,7 @@ def brozzle_page(argv=None): "--metrics_port", type=int, dest="metrics_port", - default=8888, + default=0, help="Port for brozzler's Prometheus scrape endpoint", ) arg_parser.add_argument( @@ -251,7 +251,7 @@ def brozzle_page(argv=None): "--env", dest="env", default=None, - help="env for Prometheus target registry", + help="deployment environment for this brozzler instance, e.g., prod or qa", ) arg_parser.add_argument( "--screenshot-full-page", dest="screenshot_full_page", action="store_true" @@ -298,7 +298,7 @@ def brozzle_page(argv=None): window_height=args.window_height, window_width=args.window_width, stealth=args.stealth, - metrics_port=int(args.metrics_port), + metrics_port=args.metrics_port, registry_url=args.registry_url, env=args.env, ) @@ -543,7 +543,7 @@ def brozzler_worker(argv=None): "--metrics_port", type=int, dest="metrics_port", - default=8888, + default=0, help="Port for brozzler's Prometheus scrape endpoint", ) arg_parser.add_argument( @@ -556,7 +556,7 @@ def brozzler_worker(argv=None): "--env", dest="env", default=None, - help="env for Prometheus target registry", + help="deployment environment for this brozzler instance, e.g., prod or qa", ) add_common_options(arg_parser, argv) @@ -614,7 +614,7 @@ def brozzler_worker(argv=None): skip_visit_hashtags=args.skip_visit_hashtags, skip_youtube_dl=args.skip_youtube_dl, stealth=args.stealth, - metrics_port=int(args.metrics_port), + metrics_port=args.metrics_port, registry_url=args.registry_url, env=args.env, ) diff --git a/brozzler/worker.py b/brozzler/worker.py index e4f984d..c780644 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -72,7 +72,7 @@ class BrozzlerWorker: stealth=False, window_height=900, window_width=1400, - metrics_port=None, + metrics_port=0, registry_url=None, env=None, ): @@ -111,8 +111,11 @@ class BrozzlerWorker: self._start_stop_lock = threading.Lock() self._shutdown = threading.Event() - # Setup metrics - metrics.register_prom_metrics(self._metrics_port, self._registry_url, self._env) + # set up metrics + if self._metrics_port > 0: + metrics.register_prom_metrics(self._metrics_port, self._registry_url, self._env) + else: + logging.warning("not starting prometheus scrape endpoint: metrics_port is undefined") def _choose_warcprox(self): warcproxes = self._service_registry.available_services("warcprox") @@ -284,6 +287,11 @@ class BrozzlerWorker: raise except brozzler.ProxyError: raise + except brozzler.VideoExtractorError as e: + logging.error( + "error extracting video info: %s", + e, + ) except Exception as e: if ( hasattr(e, "exc_info") diff --git a/brozzler/ydl.py b/brozzler/ydl.py index ebfadce..b1266fd 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -71,6 +71,11 @@ def should_ytdlp(site, page, page_status, skip_av_seeds): return True +def isyoutubehost(url): + # split 1 splits scheme from url, split 2 splits path from hostname, split 3 splits query string on hostname + return "youtube.com" in url.split("//")[-1].split("/")[0].split("?")[0] + + class ExtraHeaderAdder(urllib.request.BaseHandler): def __init__(self, extra_headers): self.extra_headers = extra_headers @@ -237,14 +242,9 @@ def _build_youtube_dl(worker, destdir, site, page): worker.logger.info( "[ydl_postprocess_hook] postprocessor: {}".format(d["postprocessor"]) ) - youtube_host = ( - "youtube.com" - in d["info_dict"]["webpage_url"] - .split("//")[-1] - .split("/")[0] - .split("?")[0] - ) - metrics.brozzler_ydl_download_successes.labels(youtube_host).inc(1) + is_youtube_host = isyoutubehost(d["info_dict"]["webpage_url"]) + + metrics.brozzler_ydl_download_successes.labels(is_youtube_host).inc(1) if worker._using_warcprox(site): _YoutubeDL._push_video_to_warcprox( _YoutubeDL, site, d["info_dict"], d["postprocessor"] @@ -283,15 +283,14 @@ def _build_youtube_dl(worker, destdir, site, page): } ytdlp_url = page.redirect_url if page.redirect_url else page.url - youtube_host = ( - "youtube.com" in ytdlp_url.split("//")[-1].split("/")[0].split("?")[0] - ) - if youtube_host and YTDLP_PROXY: + is_youtube_host = isyoutubehost(ytdlp_url) + if is_youtube_host and YTDLP_PROXY: ydl_opts["proxy"] = YTDLP_PROXY - ytdlp_proxy_for_print = ( + # don't log proxy value secrets + ytdlp_proxy_for_logs = ( YTDLP_PROXY.split("@")[1] if "@" in YTDLP_PROXY else "@@@" ) - logging.info("using yt-dlp proxy ... %s", ytdlp_proxy_for_print) + logging.info("using yt-dlp proxy ... %s", ytdlp_proxy_for_logs) # skip warcprox proxying yt-dlp v.2023.07.06: youtube extractor using ranges # if worker._proxy_for(site): @@ -302,7 +301,7 @@ def _build_youtube_dl(worker, destdir, site, page): ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page))) ydl.pushed_videos = [] ydl.url = ytdlp_url - ydl.youtube_host = youtube_host + ydl.is_youtube_host = is_youtube_host return ydl @@ -330,12 +329,12 @@ def _try_youtube_dl(worker, ydl, site, page): while attempt < MAX_YTDLP_ATTEMPTS: try: logging.info("trying yt-dlp on %s", ydl.url) - # should_download_vid = not ydl.youtube_host + # should_download_vid = not ydl.is_youtube_host # then # ydl.extract_info(str(urlcanon.whatwg(ydl.url)), download=should_download_vid) - # if ydl.youtube_host and ie_result: + # if ydl.is_youtube_host and ie_result: # download_url = ie_result.get("url") - metrics.brozzler_ydl_extract_attempts.labels(ydl.youtube_host).inc(1) + metrics.brozzler_ydl_extract_attempts.labels(ydl.is_youtube_host).inc(1) with brozzler.thread_accept_exceptions(): # we do whatwg canonicalization here to avoid "" resulting in ProxyError @@ -344,7 +343,7 @@ def _try_youtube_dl(worker, ydl, site, page): ie_result = ydl.sanitize_info( ydl.extract_info(str(urlcanon.whatwg(ydl.url))) ) - metrics.brozzler_ydl_extract_successes.labels(ydl.youtube_host).inc(1) + metrics.brozzler_ydl_extract_successes.labels(ydl.is_youtube_host).inc(1) break except brozzler.ShutdownRequested as e: raise @@ -370,18 +369,20 @@ def _try_youtube_dl(worker, ydl, site, page): logging.warning( "Failed after %s attempts. Error: %s", MAX_YTDLP_ATTEMPTS, e ) - raise brozzler.ProxyError( - "yt-dlp hit possible external proxy error from %s" % ydl.url + raise brozzler.VideoExtractorError( + "yt-dlp hit error extracting info for %s" % ydl.url ) else: + retry_wait = min(60, YTDLP_WAIT * (1.5**(attempt - 1))) logging.info( "Attempt %s failed. Retrying in %s seconds...", attempt, - YTDLP_WAIT, + retry_wait, ) - time.sleep(YTDLP_WAIT) + time.sleep(retry_wait) else: - raise brozzler.ProxyError("Proxy attempt(s) failed for unknown reason(s)") + raise brozzler.VideoExtractorError("yt-dlp hit unknown error extracting info for %s" % ydl.url) + logging.info("ytdlp completed successfully") _remember_videos(page, ydl.pushed_videos) From 9983f43c7508cb469b9757060c785f678eba18ab Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 23 Sep 2024 19:53:03 -0700 Subject: [PATCH 5/5] black'd --- brozzler/worker.py | 8 ++++++-- brozzler/ydl.py | 6 ++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index c780644..6eb5872 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -113,9 +113,13 @@ class BrozzlerWorker: # set up metrics if self._metrics_port > 0: - metrics.register_prom_metrics(self._metrics_port, self._registry_url, self._env) + metrics.register_prom_metrics( + self._metrics_port, self._registry_url, self._env + ) else: - logging.warning("not starting prometheus scrape endpoint: metrics_port is undefined") + logging.warning( + "not starting prometheus scrape endpoint: metrics_port is undefined" + ) def _choose_warcprox(self): warcproxes = self._service_registry.available_services("warcprox") diff --git a/brozzler/ydl.py b/brozzler/ydl.py index b1266fd..f3f21be 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -373,7 +373,7 @@ def _try_youtube_dl(worker, ydl, site, page): "yt-dlp hit error extracting info for %s" % ydl.url ) else: - retry_wait = min(60, YTDLP_WAIT * (1.5**(attempt - 1))) + retry_wait = min(60, YTDLP_WAIT * (1.5 ** (attempt - 1))) logging.info( "Attempt %s failed. Retrying in %s seconds...", attempt, @@ -381,7 +381,9 @@ def _try_youtube_dl(worker, ydl, site, page): ) time.sleep(retry_wait) else: - raise brozzler.VideoExtractorError("yt-dlp hit unknown error extracting info for %s" % ydl.url) + raise brozzler.VideoExtractorError( + "yt-dlp hit unknown error extracting info for %s" % ydl.url + ) logging.info("ytdlp completed successfully")