From 4a60ff3367b5a4c8606c3447ee3027d6cf0df4be Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 13 Sep 2024 17:01:35 -0700 Subject: [PATCH 1/4] post-deploy bug fixes --- brozzler/cli.py | 6 +++--- brozzler/metrics.py | 8 +++++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index 1db25b0..431b57e 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -297,7 +297,7 @@ def brozzle_page(argv=None): window_height=args.window_height, window_width=args.window_width, stealth=args.stealth, - metrics_port=args.metrics_port, + metrics_port=int(args.metrics_port), registry_url=args.registry_url, env=args.env, ) @@ -540,7 +540,7 @@ def brozzler_worker(argv=None): ) arg_parser.add_argument( "--metrics_port", - dest=metrics_port, + dest="metrics_port", default=8888, help="Prometheus metrics port", ) @@ -612,7 +612,7 @@ def brozzler_worker(argv=None): skip_visit_hashtags=args.skip_visit_hashtags, skip_youtube_dl=args.skip_youtube_dl, stealth=args.stealth, - metrics_port=args.metrics_port, + metrics_port=int(args.metrics_port), registry_url=args.registry_url, env=args.env, ) diff --git a/brozzler/metrics.py b/brozzler/metrics.py index 3698de5..adf0184 100644 --- a/brozzler/metrics.py +++ b/brozzler/metrics.py @@ -42,12 +42,18 @@ def register_prom_metrics( if registry_url is None: return + env_for_prom = None + if env == "qa": + env_for_prom = Env.qa + elif env == "prod": + env_for_prom = Env.prod + config = ClientConfig(server_url_base=registry_url) client = Client(config) target = format_self_target(scrape_port=metrics_port) registration = Registration( target=target, - env=env, + env=env_for_prom, scheme=Scheme.http, ) client.keep_registered_threaded(registration) From bb1c3437248b612cd5fbecebf1bed0a5b80205d0 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 17 Sep 2024 17:53:50 -0700 Subject: [PATCH 2/4] updates for review of PR 287 --- brozzler/cli.py | 10 ++++++---- brozzler/metrics.py | 10 ++++++---- brozzler/ydl.py | 6 +++--- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index 431b57e..11226fa 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -236,15 +236,16 @@ def brozzle_page(argv=None): ) arg_parser.add_argument( "--metrics_port", + type=int, dest="metrics_port", default=8888, - help="Prometheus metrics port", + help="Port for brozzler's Prometheus scrape endpoint", ) arg_parser.add_argument( "--registry_url", dest="registry_url", default=None, - help="Prometheus scrape target registry URL", + help="http-sd-registry url, for Prometheus metrics discovery", ) arg_parser.add_argument( "--env", @@ -540,15 +541,16 @@ def brozzler_worker(argv=None): ) arg_parser.add_argument( "--metrics_port", + type=int, dest="metrics_port", default=8888, - help="Prometheus metrics port", + help="Port for brozzler's Prometheus scrape endpoint", ) arg_parser.add_argument( "--registry_url", dest="registry_url", default=None, - help="Prometheus scrape target registry URL", + help="http-sd-registry url, for Prometheus metrics discovery", ) arg_parser.add_argument( "--env", diff --git a/brozzler/metrics.py b/brozzler/metrics.py index adf0184..5ff36ed 100644 --- a/brozzler/metrics.py +++ b/brozzler/metrics.py @@ -10,6 +10,7 @@ try: ) from http_sd_registry.config import ClientConfig except ImportError: + # for users without access to http_sd_registry http_sd_registry = None @@ -19,15 +20,15 @@ from prometheus_client import Counter, Gauge, Histogram, start_http_server brozzler_pages_crawled = Counter("brozzler_pages_crawled", "number of pages visited by brozzler") brozzler_page_processing_duration_seconds = Histogram("brozzler_page_processing_duration_seconds", "time spent processing a page in brozzler") brozzler_outlinks_found = Counter("brozzler_outlinks_found", "number of outlinks found by brozzler") -brozzler_last_page_crawled_time = Gauge("brozzler_last_page_crawled_time", "time of last page visit") +brozzler_last_page_crawled_time = Gauge("brozzler_last_page_crawled_time", "time of last page visit, in seconds since UNIX epoch") brozzler_in_progress_pages = Gauge("brozzler_in_progress_pages", "number of pages currently processing with brozzler") brozzler_resources_requested = Counter("brozzler_resources_requested", "number of resources requested", labelnames=["resource_type"]) brozzler_resources_fetched = Counter("brozzler_resources_fetched", "number of resources fetched", labelnames=["resource_type", "status_code"]) brozzler_resources_size_total = Counter("brozzler_resources_size_total", "total size of resources fetched", labelnames=["resource_type"]) brozzler_resources_fetch_time = Counter("brozzler_resources_fetch_time", "time spent fetching resources", labelnames=["resource_type"]) brozzler_ydl_urls_checked = Counter("brozzler_ydl_urls_checked", "count of urls checked by brozzler yt-dlp") -brozzler_ydl_download_attempts = Counter("brozzler_ydl_download_attempts", "count of download attempted by brozzler yt-dlp", labelnames=["host"]) -brozzler_ydl_download_successes = Counter("brozzler_ydl_download_successes", "count of downloads completed by brozzler yt-dlp", labelnames=["host"]) +brozzler_ydl_download_attempts = Counter("brozzler_ydl_download_attempts", "count of download attempted by brozzler yt-dlp", labelnames=["youtube_host"]) +brozzler_ydl_download_successes = Counter("brozzler_ydl_download_successes", "count of downloads completed by brozzler yt-dlp", labelnames=["youtube_host"]) # fmt: on @@ -42,11 +43,12 @@ def register_prom_metrics( if registry_url is None: return - env_for_prom = None if env == "qa": env_for_prom = Env.qa elif env == "prod": env_for_prom = Env.prod + else: + env_for_prom = Env.qa config = ClientConfig(server_url_base=registry_url) client = Client(config) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 81c1ec4..392bb2d 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -303,12 +303,12 @@ def _remember_videos(page, pushed_videos=None): def _try_youtube_dl(worker, ydl, site, page): ytdlp_url = page.redirect_url if page.redirect_url else page.url - ytdlp_host = ytdlp_url.split("//")[-1].split("/")[0].split("?")[0] + youtube_host = "youtube.com" in ytdlp_url.split("//")[-1].split("/")[0].split("?")[0] attempt = 0 while attempt < MAX_YTDLP_ATTEMPTS: try: logging.info("trying yt-dlp on %s", ytdlp_url) - metrics.brozzler_ydl_download_attempts.labels(ytdlp_host).inc(1) + metrics.brozzler_ydl_download_attempts.labels(youtube_host).inc(1) with brozzler.thread_accept_exceptions(): # we do whatwg canonicalization here to avoid "" resulting in ProxyError @@ -317,7 +317,7 @@ def _try_youtube_dl(worker, ydl, site, page): ie_result = ydl.sanitize_info( ydl.extract_info(str(urlcanon.whatwg(ytdlp_url))) ) - metrics.brozzler_ydl_download_successes.labels(ytdlp_host).inc(1) + metrics.brozzler_ydl_download_successes.labels(youtube_host).inc(1) break except brozzler.ShutdownRequested as e: raise From 8b2c254485afee2c021299d3caa53633e22cef76 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 18 Sep 2024 14:17:10 -0700 Subject: [PATCH 3/4] brozzler_ydl_extract, not download --- brozzler/metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/brozzler/metrics.py b/brozzler/metrics.py index 5ff36ed..4b1277d 100644 --- a/brozzler/metrics.py +++ b/brozzler/metrics.py @@ -27,8 +27,8 @@ brozzler_resources_fetched = Counter("brozzler_resources_fetched", "number of re brozzler_resources_size_total = Counter("brozzler_resources_size_total", "total size of resources fetched", labelnames=["resource_type"]) brozzler_resources_fetch_time = Counter("brozzler_resources_fetch_time", "time spent fetching resources", labelnames=["resource_type"]) brozzler_ydl_urls_checked = Counter("brozzler_ydl_urls_checked", "count of urls checked by brozzler yt-dlp") -brozzler_ydl_download_attempts = Counter("brozzler_ydl_download_attempts", "count of download attempted by brozzler yt-dlp", labelnames=["youtube_host"]) -brozzler_ydl_download_successes = Counter("brozzler_ydl_download_successes", "count of downloads completed by brozzler yt-dlp", labelnames=["youtube_host"]) +brozzler_ydl_extract_attempts = Counter("brozzler_ydl_download_attempts", "count of download attempted by brozzler yt-dlp", labelnames=["youtube_host"]) +brozzler_ydl_extract_successes = Counter("brozzler_ydl_download_successes", "count of downloads completed by brozzler yt-dlp", labelnames=["youtube_host"]) # fmt: on From 62b12434d0a0d626a756e9fa70a14d5a5f7cce52 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 18 Sep 2024 14:51:11 -0700 Subject: [PATCH 4/4] mostly ydl.py updates for new proxyrack testing --- brozzler/ydl.py | 82 +++++++++++++++++-------------------------------- 1 file changed, 28 insertions(+), 54 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 392bb2d..7368534 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -34,7 +34,7 @@ import time thread_local = threading.local() PROXYRACK_PROXY = "@@@" -MAX_YTDLP_ATTEMPTS = 4 +MAX_YTDLP_ATTEMPTS = 3 YTDLP_WAIT = 10 @@ -197,15 +197,15 @@ def _build_youtube_dl(worker, destdir, site, page): payload=f, extra_headers=extra_headers, ) - # consulted by _remember_videos() - ydl.pushed_videos.append( - { - "url": url, - "response_code": response.code, - "content-type": mimetype, - "content-length": size, - } - ) + # consulted by _remember_videos() + ydl.pushed_videos.append( + { + "url": url, + "response_code": response.code, + "content-type": mimetype, + "content-length": size, + } + ) def maybe_heartbeat_site_last_claimed(*args, **kwargs): # in case yt-dlp takes a long time, heartbeat site.last_claimed @@ -268,6 +268,9 @@ def _build_youtube_dl(worker, destdir, site, page): "logger": logging.getLogger("yt_dlp"), "verbose": False, "quiet": False, + # does this make sense when we're generally downloading one at a time? + "sleep_interval": 25, + "max_sleep_interval": 90, "proxy": PROXYRACK_PROXY, } @@ -308,7 +311,12 @@ def _try_youtube_dl(worker, ydl, site, page): while attempt < MAX_YTDLP_ATTEMPTS: try: logging.info("trying yt-dlp on %s", ytdlp_url) - metrics.brozzler_ydl_download_attempts.labels(youtube_host).inc(1) + # should_download_vid = not youtube_host + # then + # ydl.extract_info(str(urlcanon.whatwg(ytdlp_url)), download=should_download_vid) + # if youtube_host and ie_result: + # download_url = ie_result.get("url") + metrics.brozzler_ydl_extract_attempts.labels(youtube_host).inc(1) with brozzler.thread_accept_exceptions(): # we do whatwg canonicalization here to avoid "" resulting in ProxyError @@ -317,7 +325,7 @@ def _try_youtube_dl(worker, ydl, site, page): ie_result = ydl.sanitize_info( ydl.extract_info(str(urlcanon.whatwg(ytdlp_url))) ) - metrics.brozzler_ydl_download_successes.labels(youtube_host).inc(1) + metrics.brozzler_ydl_extract_successes.labels(youtube_host).inc(1) break except brozzler.ShutdownRequested as e: raise @@ -366,48 +374,14 @@ def _try_youtube_dl(worker, ydl, site, page): "with yt-dlp json for %s", ytdlp_url, ) - - attempt = 0 - while attempt < MAX_YTDLP_ATTEMPTS: - try: - worker._warcprox_write_record( - warcprox_address=worker._proxy_for(site), - url="youtube-dl:%s" % str(urlcanon.semantic(ytdlp_url)), - warc_type="metadata", - content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", - payload=info_json.encode("utf-8"), - extra_headers=site.extra_headers(page), - ) - break - except Exception as e: - # connection problem when using a proxy == proxy error - if ( - hasattr(e, "exc_info") - and e.exc_info[0] == urllib.error.URLError - and worker._proxy_for(site) - ): - attempt += 1 - if attempt == MAX_YTDLP_ATTEMPTS: - logging.warning( - "Failed after %s attempts. Error: %s", MAX_YTDLP_ATTEMPTS, e - ) - raise brozzler.ProxyError( - "yt-dlp hit proxy error storing media from %s with " - % ytdlp_url - ) - else: - logging.info( - "Attempt %s failed. Retrying in %s seconds...", - attempt, - YTDLP_WAIT, - ) - time.sleep(YTDLP_WAIT) - else: - raise - else: - raise brozzler.ProxyError( - "Proxy attempt(s) storing media failed for unknown reason(s)" - ) + worker._warcprox_write_record( + warcprox_address=worker._proxy_for(site), + url="youtube-dl:%s" % str(urlcanon.semantic(ytdlp_url)), + warc_type="metadata", + content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", + payload=info_json.encode("utf-8"), + extra_headers=site.extra_headers(page), + ) return ie_result