From edf2f9e51a001fdcabe95a94f6a1e15cc096232f Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 10 Sep 2024 21:58:47 -0700 Subject: [PATCH 01/55] retries for proxyrack and push --- brozzler/ydl.py | 157 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 108 insertions(+), 49 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index b292129..092dab7 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -28,9 +28,14 @@ import json import doublethink import datetime import threading +import time thread_local = threading.local() +PROXYRACK_PROXY = "@@@" +MAX_YTDLP_ATTEMPTS = 4 +YTDLP_WAIT = 10 + def should_ytdlp(site, page, page_status, skip_av_seeds): # called only after we've passed needs_browsing() check @@ -262,6 +267,7 @@ def _build_youtube_dl(worker, destdir, site, page): "logger": logging.getLogger("yt_dlp"), "verbose": False, "quiet": False, + "proxy": PROXYRACK_PROXY, } # skip proxying yt-dlp v.2023.07.06 @@ -296,57 +302,110 @@ def _remember_videos(page, pushed_videos=None): def _try_youtube_dl(worker, ydl, site, page): ytdlp_url = page.redirect_url if page.redirect_url else page.url - try: - logging.info("trying yt-dlp on %s", ytdlp_url) + attempt = 0 + while attempt < MAX_YTDLP_ATTEMPTS: + try: + logging.info("trying yt-dlp on %s", ytdlp_url) - with brozzler.thread_accept_exceptions(): - # we do whatwg canonicalization here to avoid "" resulting in ProxyError - # needs automated test - # and yt-dlp needs sanitize_info for extract_info - ie_result = ydl.sanitize_info( - ydl.extract_info(str(urlcanon.whatwg(ytdlp_url))) - ) - _remember_videos(page, ydl.pushed_videos) - if worker._using_warcprox(site): - info_json = json.dumps(ie_result, sort_keys=True, indent=4) - logging.info( - "sending WARCPROX_WRITE_RECORD request to warcprox " - "with yt-dlp json for %s", - ytdlp_url, - ) - worker._warcprox_write_record( - warcprox_address=worker._proxy_for(site), - url="youtube-dl:%s" % str(urlcanon.semantic(ytdlp_url)), - warc_type="metadata", - content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", - payload=info_json.encode("utf-8"), - extra_headers=site.extra_headers(page), - ) - return ie_result - except brozzler.ShutdownRequested as e: - raise - except Exception as e: - if hasattr(e, "exc_info") and e.exc_info[0] == yt_dlp.utils.UnsupportedError: - return None - elif ( - hasattr(e, "exc_info") - and e.exc_info[0] == urllib.error.HTTPError - and hasattr(e.exc_info[1], "code") - and e.exc_info[1].code == 420 - ): - raise brozzler.ReachedLimit(e.exc_info[1]) - elif ( - hasattr(e, "exc_info") - and e.exc_info[0] == urllib.error.URLError - and worker._proxy_for(site) - ): - # connection problem when using a proxy == proxy error (XXX?) - raise brozzler.ProxyError( - "yt-dlp hit apparent proxy error from " "%s" % ytdlp_url - ) from e - else: + with brozzler.thread_accept_exceptions(): + # we do whatwg canonicalization here to avoid "" resulting in ProxyError + # needs automated test + # and yt-dlp needs sanitize_info for extract_info + ie_result = ydl.sanitize_info( + ydl.extract_info(str(urlcanon.whatwg(ytdlp_url))) + ) + break + except brozzler.ShutdownRequested as e: raise + except Exception as e: + if ( + hasattr(e, "exc_info") + and e.exc_info[0] == yt_dlp.utils.UnsupportedError + ): + return None + elif ( + hasattr(e, "exc_info") + and e.exc_info[0] == urllib.error.HTTPError + and hasattr(e.exc_info[1], "code") + and e.exc_info[1].code == 420 + ): + raise brozzler.ReachedLimit(e.exc_info[1]) + else: + # OSError('Tunnel connection failed: 464 Host Not Allowed') (caused by ProxyError...) + # and others... + attempt += 1 + if attempt == MAX_YTDLP_ATTEMPTS: + logging.warning( + "Failed after %s attempts. Error: %s", MAX_YTDLP_ATTEMPTS, e + ) + raise brozzler.ProxyError( + "yt-dlp hit proxyrack proxy error from %s" % ytdlp_url + ) + else: + logging.info( + "Attempt %s failed. Retrying in %s seconds...", + attempt, + YTDLP_WAIT, + ) + time.sleep(YTDLP_WAIT) + else: + raise brozzler.ProxyError( + "Proxyrack proxy attempt(s) failed for unknown reason(s)" + ) + logging.info("ytdlp completed successfully") + + _remember_videos(page, ydl.pushed_videos) + if worker._using_warcprox(site): + info_json = json.dumps(ie_result, sort_keys=True, indent=4) + logging.info( + "sending WARCPROX_WRITE_RECORD request to warcprox " + "with yt-dlp json for %s", + ytdlp_url, + ) + + attempt = 0 + while attempt < MAX_YTDLP_ATTEMPTS: + try: + worker._warcprox_write_record( + warcprox_address=worker._proxy_for(site), + url="youtube-dl:%s" % str(urlcanon.semantic(ytdlp_url)), + warc_type="metadata", + content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", + payload=info_json.encode("utf-8"), + extra_headers=site.extra_headers(page), + ) + break + except Exception as e: + # connection problem when using a proxy == proxy error + if ( + hasattr(e, "exc_info") + and e.exc_info[0] == urllib.error.URLError + and worker._proxy_for(site) + ): + attempt += 1 + if attempt == MAX_YTDLP_ATTEMPTS: + logging.warning( + "Failed after %s attempts. Error: %s", MAX_YTDLP_ATTEMPTS, e + ) + raise brozzler.ProxyError( + "yt-dlp hit proxy error storing media from %s with " + % ytdlp_url + ) + else: + logging.info( + "Attempt %s failed. Retrying in %s seconds...", + attempt, + YTDLP_WAIT, + ) + time.sleep(YTDLP_WAIT) + else: + raise + else: + raise brozzler.ProxyError( + "Proxy attempt(s) storing media failed for unknown reason(s)" + ) + return ie_result def do_youtube_dl(worker, site, page): From 2d7efba28074e029ff18084a254a7d7944ad58b2 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 11 Sep 2024 16:36:08 -0700 Subject: [PATCH 02/55] initial commit --- brozzler/metrics.py | 48 +++++++++++++++++++++++++++++++++++++++++++++ setup.py | 2 ++ 2 files changed, 50 insertions(+) create mode 100644 brozzler/metrics.py diff --git a/brozzler/metrics.py b/brozzler/metrics.py new file mode 100644 index 0000000..5203c51 --- /dev/null +++ b/brozzler/metrics.py @@ -0,0 +1,48 @@ +from typing import Optional + +""" +from http_sd_registry.client import ( + Client, + Env, + Registration, + Scheme, + format_self_target, +) +from http_sd_registry.config import ClientConfig +""" +from prometheus_client import Counter, Gauge, Histogram, start_http_server + +# fmt: off +brozzler_pages_crawled = Counter("brozzler_pages_crawled", "number of pages visited by brozzler") +brozzler_page_processing_duration_seconds = Histogram("brozzler_page_processing_duration_seconds", "time spent processing a page in brozzler") +brozzler_outlinks_found = Counter("brozzler_urls_found", "number of outlinks found by brozzler") +brozzler_last_page_crawled_time = Gauge("brozzler_last_page_crawled_time", "time of last page visit") +brozzler_in_progress_pages = Gauge("brozzler_in_progress_pages", "number of pages currently processing with brozzler") +brozzler_resources_requested = Counter("brozzler_resources_requested", "number of resources requested", labelnames=["resource_type"]) +brozzler_resources_fetched = Counter("brozzler_resources_fetched", "number of resources fetched", labelnames=["resource_type", "status_code"]) +brozzler_resources_size_total = Counter("brozzler_resources_size_total", "total size of resources fetched", labelnames=["resource_type"]) +brozzler_resources_fetch_time = Counter("brozzler_resources_fetch_time", "time spent fetching resources", labelnames=["resource_type"]) +brozzler_ydl_urls_checked = Counter("brozzler_ydl_urls_checked", "count of urls checked by brozzler yt-dlp") +brozzler_ydl_download_attempts= Counter("brozzler_ydl_download_attempts", "count of download attempted by brozzler yt-dlp") +brozzler_ydl_download_successes= Counter("brozzler_ydl_download_successes", "count of downloads completed by brozzler yt-dlp") +# fmt: on + + +def register_prom_metrics( + registry_url: Optional[str] = None, metrics_port: int = 8888, env: Env = Env.qa +): + # Start metrics endpoint for scraping + start_http_server(metrics_port) + + if registry_url is None: + return + + config = ClientConfig(server_url_base=registry_url) + client = Client(config) + target = format_self_target(scrape_port=metrics_port) + registration = Registration( + target=target, + env=env, + scheme=Scheme.http, + ) + client.keep_registered_threaded(registration) diff --git a/setup.py b/setup.py index c275b2b..edca942 100644 --- a/setup.py +++ b/setup.py @@ -77,6 +77,8 @@ setuptools.setup( "jinja2>=2.10", "cryptography>=2.3", "python-magic>=0.4.15", + "prometheus-client>=0.20.0", + "prometheus-async>=22.2.0", ], extras_require={ "yt-dlp": ["yt-dlp==2024.7.25"], From 7de5b1cbd7799331fe4df23992852910dc6d3062 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 11 Sep 2024 17:37:55 -0700 Subject: [PATCH 03/55] add some labels --- brozzler/metrics.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/brozzler/metrics.py b/brozzler/metrics.py index 5203c51..65c4004 100644 --- a/brozzler/metrics.py +++ b/brozzler/metrics.py @@ -13,9 +13,9 @@ from http_sd_registry.config import ClientConfig from prometheus_client import Counter, Gauge, Histogram, start_http_server # fmt: off -brozzler_pages_crawled = Counter("brozzler_pages_crawled", "number of pages visited by brozzler") -brozzler_page_processing_duration_seconds = Histogram("brozzler_page_processing_duration_seconds", "time spent processing a page in brozzler") -brozzler_outlinks_found = Counter("brozzler_urls_found", "number of outlinks found by brozzler") +brozzler_pages_crawled = Counter("brozzler_pages_crawled", "number of pages visited by brozzler", labelnames=["host"]) +brozzler_page_processing_duration_seconds = Histogram("brozzler_page_processing_duration_seconds", "time spent processing a page in brozzler", labelnames=["host"]) +brozzler_outlinks_found = Counter("brozzler_outlinks_found", "number of outlinks found by brozzler", labelnames=["host"]) brozzler_last_page_crawled_time = Gauge("brozzler_last_page_crawled_time", "time of last page visit") brozzler_in_progress_pages = Gauge("brozzler_in_progress_pages", "number of pages currently processing with brozzler") brozzler_resources_requested = Counter("brozzler_resources_requested", "number of resources requested", labelnames=["resource_type"]) @@ -23,8 +23,8 @@ brozzler_resources_fetched = Counter("brozzler_resources_fetched", "number of re brozzler_resources_size_total = Counter("brozzler_resources_size_total", "total size of resources fetched", labelnames=["resource_type"]) brozzler_resources_fetch_time = Counter("brozzler_resources_fetch_time", "time spent fetching resources", labelnames=["resource_type"]) brozzler_ydl_urls_checked = Counter("brozzler_ydl_urls_checked", "count of urls checked by brozzler yt-dlp") -brozzler_ydl_download_attempts= Counter("brozzler_ydl_download_attempts", "count of download attempted by brozzler yt-dlp") -brozzler_ydl_download_successes= Counter("brozzler_ydl_download_successes", "count of downloads completed by brozzler yt-dlp") +brozzler_ydl_download_attempts= Counter("brozzler_ydl_download_attempts", "count of download attempted by brozzler yt-dlp", labelnames=["host"]) +brozzler_ydl_download_successes= Counter("brozzler_ydl_download_successes", "count of downloads completed by brozzler yt-dlp", labelnames=["host"]) # fmt: on From 1d63793788a90c4e45a52ad85850c8f7fc7442c6 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 12 Sep 2024 11:32:06 -0700 Subject: [PATCH 04/55] add update_page_metrics --- brozzler/worker.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/brozzler/worker.py b/brozzler/worker.py index 479dfa7..6cf5836 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -35,6 +35,7 @@ import tempfile import urlcanon from requests.structures import CaseInsensitiveDict import rethinkdb as rdb +from . import metrics from . import ydl r = rdb.RethinkDB() @@ -312,6 +313,12 @@ class BrozzlerWorker: return True def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None): + def update_page_metrics(page, outlinks): + """Update page-level Prometheus metrics.""" + metrics.brozzler_last_page_crawled_time.set_to_current_time() + metrics.brozzler_pages_crawled.inc(1) + metrics.brozzler_outlinks_found.inc(len(outlinks)) + def _on_screenshot(screenshot_jpeg): if on_screenshot: on_screenshot(screenshot_jpeg) @@ -416,6 +423,7 @@ class BrozzlerWorker: ) if final_page_url != page.url: page.note_redirect(final_page_url) + update_page_metrics(page, outlinks) return outlinks def _fetch_url(self, site, url=None, page=None): From 1d84e72ce77ad871fd096a5a4ce6cec8c0f2e09c Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 12 Sep 2024 13:10:26 -0700 Subject: [PATCH 05/55] update setup.py: skip prometheus-async; update ytdlp --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index edca942..f3698a9 100644 --- a/setup.py +++ b/setup.py @@ -78,10 +78,9 @@ setuptools.setup( "cryptography>=2.3", "python-magic>=0.4.15", "prometheus-client>=0.20.0", - "prometheus-async>=22.2.0", ], extras_require={ - "yt-dlp": ["yt-dlp==2024.7.25"], + "yt-dlp": ["yt-dlp>=2024.7.25"], "dashboard": ["flask>=1.0", "gunicorn>=19.8.1"], "easy": [ "warcprox>=2.4.31", From c82858a9f6f4958fb813563683073a6c399f93f8 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 12 Sep 2024 13:11:41 -0700 Subject: [PATCH 06/55] make http_sd_registry optional --- brozzler/metrics.py | 56 +++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/brozzler/metrics.py b/brozzler/metrics.py index 65c4004..8f9d87a 100644 --- a/brozzler/metrics.py +++ b/brozzler/metrics.py @@ -1,15 +1,17 @@ from typing import Optional -""" -from http_sd_registry.client import ( - Client, - Env, - Registration, - Scheme, - format_self_target, -) -from http_sd_registry.config import ClientConfig -""" +try: + from http_sd_registry.client import ( + Client, + Env, + Registration, + Scheme, + format_self_target, + ) + from http_sd_registry.config import ClientConfig +except ImportError: + http_sd_registry = None + from prometheus_client import Counter, Gauge, Histogram, start_http_server # fmt: off @@ -27,22 +29,22 @@ brozzler_ydl_download_attempts= Counter("brozzler_ydl_download_attempts", "count brozzler_ydl_download_successes= Counter("brozzler_ydl_download_successes", "count of downloads completed by brozzler yt-dlp", labelnames=["host"]) # fmt: on +if http_sd_registry: + def register_prom_metrics( + registry_url: Optional[str] = None, metrics_port: int = 8888, env: Env = Env.qa + ): + # Start metrics endpoint for scraping + start_http_server(metrics_port) -def register_prom_metrics( - registry_url: Optional[str] = None, metrics_port: int = 8888, env: Env = Env.qa -): - # Start metrics endpoint for scraping - start_http_server(metrics_port) + if registry_url is None: + return - if registry_url is None: - return - - config = ClientConfig(server_url_base=registry_url) - client = Client(config) - target = format_self_target(scrape_port=metrics_port) - registration = Registration( - target=target, - env=env, - scheme=Scheme.http, - ) - client.keep_registered_threaded(registration) + config = ClientConfig(server_url_base=registry_url) + client = Client(config) + target = format_self_target(scrape_port=metrics_port) + registration = Registration( + target=target, + env=env, + scheme=Scheme.http, + ) + client.keep_registered_threaded(registration) From 66827fbbcf9f669ca94450a44e72c7bea9cb1eb8 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 12 Sep 2024 14:27:22 -0700 Subject: [PATCH 07/55] more metrics --- brozzler/metrics.py | 10 +++++----- brozzler/worker.py | 2 ++ brozzler/ydl.py | 5 ++++- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/brozzler/metrics.py b/brozzler/metrics.py index 8f9d87a..36ece6e 100644 --- a/brozzler/metrics.py +++ b/brozzler/metrics.py @@ -15,9 +15,9 @@ except ImportError: from prometheus_client import Counter, Gauge, Histogram, start_http_server # fmt: off -brozzler_pages_crawled = Counter("brozzler_pages_crawled", "number of pages visited by brozzler", labelnames=["host"]) -brozzler_page_processing_duration_seconds = Histogram("brozzler_page_processing_duration_seconds", "time spent processing a page in brozzler", labelnames=["host"]) -brozzler_outlinks_found = Counter("brozzler_outlinks_found", "number of outlinks found by brozzler", labelnames=["host"]) +brozzler_pages_crawled = Counter("brozzler_pages_crawled", "number of pages visited by brozzler") +brozzler_page_processing_duration_seconds = Histogram("brozzler_page_processing_duration_seconds", "time spent processing a page in brozzler") +brozzler_outlinks_found = Counter("brozzler_outlinks_found", "number of outlinks found by brozzler") brozzler_last_page_crawled_time = Gauge("brozzler_last_page_crawled_time", "time of last page visit") brozzler_in_progress_pages = Gauge("brozzler_in_progress_pages", "number of pages currently processing with brozzler") brozzler_resources_requested = Counter("brozzler_resources_requested", "number of resources requested", labelnames=["resource_type"]) @@ -25,8 +25,8 @@ brozzler_resources_fetched = Counter("brozzler_resources_fetched", "number of re brozzler_resources_size_total = Counter("brozzler_resources_size_total", "total size of resources fetched", labelnames=["resource_type"]) brozzler_resources_fetch_time = Counter("brozzler_resources_fetch_time", "time spent fetching resources", labelnames=["resource_type"]) brozzler_ydl_urls_checked = Counter("brozzler_ydl_urls_checked", "count of urls checked by brozzler yt-dlp") -brozzler_ydl_download_attempts= Counter("brozzler_ydl_download_attempts", "count of download attempted by brozzler yt-dlp", labelnames=["host"]) -brozzler_ydl_download_successes= Counter("brozzler_ydl_download_successes", "count of downloads completed by brozzler yt-dlp", labelnames=["host"]) +brozzler_ydl_download_attempts = Counter("brozzler_ydl_download_attempts", "count of download attempted by brozzler yt-dlp", labelnames=["host"]) +brozzler_ydl_download_successes = Counter("brozzler_ydl_download_successes", "count of downloads completed by brozzler yt-dlp", labelnames=["host"]) # fmt: on if http_sd_registry: diff --git a/brozzler/worker.py b/brozzler/worker.py index 6cf5836..11f0efb 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -267,6 +267,7 @@ class BrozzlerWorker: ): try: ydl_outlinks = ydl.do_youtube_dl(self, site, page) + metrics.brozzler_ydl_urls_checked.inc(1) outlinks.update(ydl_outlinks) except brozzler.ReachedLimit as e: raise @@ -312,6 +313,7 @@ class BrozzlerWorker: return False return True + @metrics.brozzler_page_processing_duration_seconds.time() def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None): def update_page_metrics(page, outlinks): """Update page-level Prometheus metrics.""" diff --git a/brozzler/ydl.py b/brozzler/ydl.py index b292129..2f07e9d 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -27,6 +27,7 @@ import os import json import doublethink import datetime +from . import metrics import threading thread_local = threading.local() @@ -296,9 +297,10 @@ def _remember_videos(page, pushed_videos=None): def _try_youtube_dl(worker, ydl, site, page): ytdlp_url = page.redirect_url if page.redirect_url else page.url + ytdlp_host = ytdlp_url.split("//")[-1].split("/")[0].split('?')[0] try: logging.info("trying yt-dlp on %s", ytdlp_url) - + metrics.brozzler_ydl_download_attempts.labels(ytdlp_host).inc(1) with brozzler.thread_accept_exceptions(): # we do whatwg canonicalization here to avoid "" resulting in ProxyError @@ -307,6 +309,7 @@ def _try_youtube_dl(worker, ydl, site, page): ie_result = ydl.sanitize_info( ydl.extract_info(str(urlcanon.whatwg(ytdlp_url))) ) + metrics.brozzler_ydl_download_successes.labels(ytdlp_host).inc(1) _remember_videos(page, ydl.pushed_videos) if worker._using_warcprox(site): info_json = json.dumps(ie_result, sort_keys=True, indent=4) From 80ce6c0ea6cecda73cae33d45636c7abb504a544 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 12 Sep 2024 16:12:18 -0700 Subject: [PATCH 08/55] register_prom_metrics working in dev env --- brozzler/metrics.py | 52 ++++++++++++++++++++++++++++++--------------- brozzler/worker.py | 13 ++++++++++++ 2 files changed, 48 insertions(+), 17 deletions(-) diff --git a/brozzler/metrics.py b/brozzler/metrics.py index 36ece6e..9fa3321 100644 --- a/brozzler/metrics.py +++ b/brozzler/metrics.py @@ -11,6 +11,24 @@ try: from http_sd_registry.config import ClientConfig except ImportError: http_sd_registry = None + Client = None + + import enum # type: ignore + + class Env(str, enum.Enum): + """Values of the Prometheus ``env`` label applied to a + :py:class:`.Registration` indicating the deployment environment in which + the the service being advertised is operating. + """ + + qa = "qa" + prod = "prod" + dev = "dev" + + Registration = None + Scheme = None + format_self_target = None + ClientConfig = None from prometheus_client import Counter, Gauge, Histogram, start_http_server @@ -29,22 +47,22 @@ brozzler_ydl_download_attempts = Counter("brozzler_ydl_download_attempts", "coun brozzler_ydl_download_successes = Counter("brozzler_ydl_download_successes", "count of downloads completed by brozzler yt-dlp", labelnames=["host"]) # fmt: on -if http_sd_registry: - def register_prom_metrics( - registry_url: Optional[str] = None, metrics_port: int = 8888, env: Env = Env.qa - ): - # Start metrics endpoint for scraping - start_http_server(metrics_port) - if registry_url is None: - return +def register_prom_metrics( + registry_url: Optional[str] = None, metrics_port: int = 8888, env: Env = Env.qa +): + # Start metrics endpoint for scraping + start_http_server(metrics_port) - config = ClientConfig(server_url_base=registry_url) - client = Client(config) - target = format_self_target(scrape_port=metrics_port) - registration = Registration( - target=target, - env=env, - scheme=Scheme.http, - ) - client.keep_registered_threaded(registration) + if registry_url is None: + return + + config = ClientConfig(server_url_base=registry_url) + client = Client(config) + target = format_self_target(scrape_port=metrics_port) + registration = Registration( + target=target, + env=env, + scheme=Scheme.http, + ) + client.keep_registered_threaded(registration) diff --git a/brozzler/worker.py b/brozzler/worker.py index 11f0efb..3f3c04a 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -28,6 +28,7 @@ import json import PIL.Image import io import socket +import platform import random import requests import doublethink @@ -41,6 +42,18 @@ from . import ydl r = rdb.RethinkDB() +# Setup metrics +registry_url = None +metrics_port = 8090 +env = metrics.Env.dev +hostname = platform.node() +if hostname.endswith("archive.org"): + registry_url = "http://wbgrp-svc283.us.archive.org:8888" + metrics_port = settings.metrics_port + env = metrics.Env.qa +metrics.register_prom_metrics(registry_url, metrics_port, env) + + class BrozzlerWorker: logger = logging.getLogger(__module__ + "." + __qualname__) From 7b6c306d14fe90e63521b38cad666d229a4dda02 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 12 Sep 2024 17:54:37 -0700 Subject: [PATCH 09/55] setup registry_url, metrics_port, env vars and CLI args --- brozzler/cli.py | 56 +++++++++++++++++++++++++++++++++++++++++++++ brozzler/metrics.py | 2 +- brozzler/worker.py | 25 ++++++++++---------- brozzler/ydl.py | 2 +- 4 files changed, 70 insertions(+), 15 deletions(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index bea5153..ba6b702 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -165,6 +165,20 @@ class BetterArgumentDefaultsHelpFormatter(argparse.ArgumentDefaultsHelpFormatter return super()._get_help_string(action) +import enum + + +class Env(str, enum.Enum): + """Values of the Prometheus ``env`` label applied to a + :py:class:`.Registration` indicating the deployment environment in which + the service being advertised is operating. + """ + + qa = "qa" + prod = "prod" + dev = "dev" + + def brozzle_page(argv=None): """ Command line utility entry point for brozzling a single page. Opens url in @@ -234,6 +248,24 @@ def brozzle_page(argv=None): action="store_true", help="Try to avoid web bot detection", ) + arg_parser.add_argument( + "--registry_url", + dest="registry_url", + default=None, + help="Prometheus registry url", + ) + arg_parser.add_argument( + "--metrics_port", + dest=metrics_port, + default=8889, + help="Prometheus metrics port", + ) + arg_parser.add_argument( + "--env", + dest=env, + default=Env.dev, + help="Prometheus env value", + ) arg_parser.add_argument( "--screenshot-full-page", dest="screenshot_full_page", action="store_true" ) @@ -279,6 +311,9 @@ def brozzle_page(argv=None): window_height=args.window_height, window_width=args.window_width, stealth=args.stealth, + registry_url=args.registry_url, + metrics_port=args.metrics_port, + env=args.env, ) def on_screenshot(screenshot_jpeg): @@ -517,6 +552,24 @@ def brozzler_worker(argv=None): action="store_true", help="Try to avoid web bot detection", ) + arg_parser.add_argument( + "--registry_url", + dest="registry_url", + default=None, + help="Prometheus registry url", + ) + arg_parser.add_argument( + "--metrics_port", + dest=metrics_port, + default=8888, + help="Prometheus metrics port", + ) + arg_parser.add_argument( + "--env", + dest=env, + default=Env.qa, + help="Prometheus env value", + ) add_common_options(arg_parser, argv) args = arg_parser.parse_args(args=argv[1:]) @@ -573,6 +626,9 @@ def brozzler_worker(argv=None): skip_visit_hashtags=args.skip_visit_hashtags, skip_youtube_dl=args.skip_youtube_dl, stealth=args.stealth, + registry_url=args.registry_url, + metrics_port=args.metrics_port, + env=args.env, ) signal.signal(signal.SIGQUIT, dump_state) diff --git a/brozzler/metrics.py b/brozzler/metrics.py index 9fa3321..cbfa768 100644 --- a/brozzler/metrics.py +++ b/brozzler/metrics.py @@ -18,7 +18,7 @@ except ImportError: class Env(str, enum.Enum): """Values of the Prometheus ``env`` label applied to a :py:class:`.Registration` indicating the deployment environment in which - the the service being advertised is operating. + the service being advertised is operating. """ qa = "qa" diff --git a/brozzler/worker.py b/brozzler/worker.py index 3f3c04a..0d221c6 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -28,7 +28,6 @@ import json import PIL.Image import io import socket -import platform import random import requests import doublethink @@ -42,18 +41,6 @@ from . import ydl r = rdb.RethinkDB() -# Setup metrics -registry_url = None -metrics_port = 8090 -env = metrics.Env.dev -hostname = platform.node() -if hostname.endswith("archive.org"): - registry_url = "http://wbgrp-svc283.us.archive.org:8888" - metrics_port = settings.metrics_port - env = metrics.Env.qa -metrics.register_prom_metrics(registry_url, metrics_port, env) - - class BrozzlerWorker: logger = logging.getLogger(__module__ + "." + __qualname__) @@ -85,6 +72,9 @@ class BrozzlerWorker: stealth=False, window_height=900, window_width=1400, + registry_url=None, + metrics_port=None, + env=None, ): self._frontier = frontier self._service_registry = service_registry @@ -107,6 +97,9 @@ class BrozzlerWorker: self._window_height = window_height self._window_width = window_width self._stealth = stealth + self._registry_url = registry_url + self._metrics_port = metrics_port + self._env = env self._browser_pool = brozzler.browser.BrowserPool( max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True @@ -118,6 +111,12 @@ class BrozzlerWorker: self._start_stop_lock = threading.Lock() self._shutdown = threading.Event() + # Setup metrics + registry_url = self._registry_url + metrics_port = self._metrics_port + env = self._env + metrics.register_prom_metrics(registry_url, metrics_port, env) + def _choose_warcprox(self): warcproxes = self._service_registry.available_services("warcprox") if not warcproxes: diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 2f07e9d..dcae69b 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -297,7 +297,7 @@ def _remember_videos(page, pushed_videos=None): def _try_youtube_dl(worker, ydl, site, page): ytdlp_url = page.redirect_url if page.redirect_url else page.url - ytdlp_host = ytdlp_url.split("//")[-1].split("/")[0].split('?')[0] + ytdlp_host = ytdlp_url.split("//")[-1].split("/")[0].split("?")[0] try: logging.info("trying yt-dlp on %s", ytdlp_url) metrics.brozzler_ydl_download_attempts.labels(ytdlp_host).inc(1) From 66096453d175b24818417078b7eb5c9212d86c25 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 13 Sep 2024 09:13:50 -0700 Subject: [PATCH 10/55] tidy params for register_prom_metrics --- brozzler/cli.py | 52 +++++++++++---------------------------------- brozzler/metrics.py | 21 +++--------------- brozzler/worker.py | 5 +---- 3 files changed, 16 insertions(+), 62 deletions(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index ba6b702..5bab386 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -165,20 +165,6 @@ class BetterArgumentDefaultsHelpFormatter(argparse.ArgumentDefaultsHelpFormatter return super()._get_help_string(action) -import enum - - -class Env(str, enum.Enum): - """Values of the Prometheus ``env`` label applied to a - :py:class:`.Registration` indicating the deployment environment in which - the service being advertised is operating. - """ - - qa = "qa" - prod = "prod" - dev = "dev" - - def brozzle_page(argv=None): """ Command line utility entry point for brozzling a single page. Opens url in @@ -248,23 +234,17 @@ def brozzle_page(argv=None): action="store_true", help="Try to avoid web bot detection", ) - arg_parser.add_argument( - "--registry_url", - dest="registry_url", - default=None, - help="Prometheus registry url", - ) arg_parser.add_argument( "--metrics_port", - dest=metrics_port, - default=8889, + dest="metrics_port", + default=8888, help="Prometheus metrics port", ) arg_parser.add_argument( - "--env", - dest=env, - default=Env.dev, - help="Prometheus env value", + "--registry_url", + dest="registry_url", + default=None, + help="registry url", ) arg_parser.add_argument( "--screenshot-full-page", dest="screenshot_full_page", action="store_true" @@ -311,9 +291,8 @@ def brozzle_page(argv=None): window_height=args.window_height, window_width=args.window_width, stealth=args.stealth, - registry_url=args.registry_url, metrics_port=args.metrics_port, - env=args.env, + registry_url=args.registry_url, ) def on_screenshot(screenshot_jpeg): @@ -552,12 +531,6 @@ def brozzler_worker(argv=None): action="store_true", help="Try to avoid web bot detection", ) - arg_parser.add_argument( - "--registry_url", - dest="registry_url", - default=None, - help="Prometheus registry url", - ) arg_parser.add_argument( "--metrics_port", dest=metrics_port, @@ -565,10 +538,10 @@ def brozzler_worker(argv=None): help="Prometheus metrics port", ) arg_parser.add_argument( - "--env", - dest=env, - default=Env.qa, - help="Prometheus env value", + "--registry_url", + dest="registry_url", + default=None, + help="registry url", ) add_common_options(arg_parser, argv) @@ -626,9 +599,8 @@ def brozzler_worker(argv=None): skip_visit_hashtags=args.skip_visit_hashtags, skip_youtube_dl=args.skip_youtube_dl, stealth=args.stealth, - registry_url=args.registry_url, metrics_port=args.metrics_port, - env=args.env, + registry_url=args.registry_url, ) signal.signal(signal.SIGQUIT, dump_state) diff --git a/brozzler/metrics.py b/brozzler/metrics.py index cbfa768..3698de5 100644 --- a/brozzler/metrics.py +++ b/brozzler/metrics.py @@ -11,24 +11,7 @@ try: from http_sd_registry.config import ClientConfig except ImportError: http_sd_registry = None - Client = None - import enum # type: ignore - - class Env(str, enum.Enum): - """Values of the Prometheus ``env`` label applied to a - :py:class:`.Registration` indicating the deployment environment in which - the service being advertised is operating. - """ - - qa = "qa" - prod = "prod" - dev = "dev" - - Registration = None - Scheme = None - format_self_target = None - ClientConfig = None from prometheus_client import Counter, Gauge, Histogram, start_http_server @@ -49,7 +32,9 @@ brozzler_ydl_download_successes = Counter("brozzler_ydl_download_successes", "co def register_prom_metrics( - registry_url: Optional[str] = None, metrics_port: int = 8888, env: Env = Env.qa + metrics_port: int = 8888, + registry_url: Optional[str] = None, + env: Optional[str] = None, ): # Start metrics endpoint for scraping start_http_server(metrics_port) diff --git a/brozzler/worker.py b/brozzler/worker.py index 0d221c6..ae5f0c6 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -112,10 +112,7 @@ class BrozzlerWorker: self._shutdown = threading.Event() # Setup metrics - registry_url = self._registry_url - metrics_port = self._metrics_port - env = self._env - metrics.register_prom_metrics(registry_url, metrics_port, env) + metrics.register_prom_metrics(self._metrics_port, self._registry_url) def _choose_warcprox(self): warcproxes = self._service_registry.available_services("warcprox") From 1bc2135462755b2a50d9e308cc4615c61ba51ad8 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 13 Sep 2024 13:12:02 -0700 Subject: [PATCH 11/55] better registry_url help --- brozzler/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index 5bab386..0ae7993 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -244,7 +244,7 @@ def brozzle_page(argv=None): "--registry_url", dest="registry_url", default=None, - help="registry url", + help="Prometheus scrape target registry URL", ) arg_parser.add_argument( "--screenshot-full-page", dest="screenshot_full_page", action="store_true" @@ -541,7 +541,7 @@ def brozzler_worker(argv=None): "--registry_url", dest="registry_url", default=None, - help="registry url", + help="Prometheus scrape target registry URL", ) add_common_options(arg_parser, argv) From 9d7b7c4167bb3c41fa9cf407f38c4518c774c023 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 13 Sep 2024 13:15:57 -0700 Subject: [PATCH 12/55] add @metrics.brozzler_in_progress_pages.track_in_progress() --- brozzler/worker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/brozzler/worker.py b/brozzler/worker.py index ae5f0c6..d470bd7 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -323,6 +323,7 @@ class BrozzlerWorker: return True @metrics.brozzler_page_processing_duration_seconds.time() + @metrics.brozzler_in_progress_pages.track_in_progress() def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None): def update_page_metrics(page, outlinks): """Update page-level Prometheus metrics.""" From 8c20d1dad3c1fe687dba5aceb1ba5f3e12aedda5 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 13 Sep 2024 14:09:06 -0700 Subject: [PATCH 13/55] add (back) env param --- brozzler/cli.py | 14 ++++++++++++++ brozzler/worker.py | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index 0ae7993..1db25b0 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -246,6 +246,12 @@ def brozzle_page(argv=None): default=None, help="Prometheus scrape target registry URL", ) + arg_parser.add_argument( + "--env", + dest="env", + default=None, + help="env for Prometheus target registry", + ) arg_parser.add_argument( "--screenshot-full-page", dest="screenshot_full_page", action="store_true" ) @@ -293,6 +299,7 @@ def brozzle_page(argv=None): stealth=args.stealth, metrics_port=args.metrics_port, registry_url=args.registry_url, + env=args.env, ) def on_screenshot(screenshot_jpeg): @@ -543,6 +550,12 @@ def brozzler_worker(argv=None): default=None, help="Prometheus scrape target registry URL", ) + arg_parser.add_argument( + "--env", + dest="env", + default=None, + help="env for Prometheus target registry", + ) add_common_options(arg_parser, argv) args = arg_parser.parse_args(args=argv[1:]) @@ -601,6 +614,7 @@ def brozzler_worker(argv=None): stealth=args.stealth, metrics_port=args.metrics_port, registry_url=args.registry_url, + env=args.env, ) signal.signal(signal.SIGQUIT, dump_state) diff --git a/brozzler/worker.py b/brozzler/worker.py index d470bd7..2a63dec 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -112,7 +112,7 @@ class BrozzlerWorker: self._shutdown = threading.Event() # Setup metrics - metrics.register_prom_metrics(self._metrics_port, self._registry_url) + metrics.register_prom_metrics(self._metrics_port, self._registry_url, self._env) def _choose_warcprox(self): warcproxes = self._service_registry.available_services("warcprox") From 1a6aeb45dbb929b8d1ab8c471fa5abd697721084 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 13 Sep 2024 14:12:46 -0700 Subject: [PATCH 14/55] track_inprogress (not _in_progress) --- brozzler/worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index 2a63dec..67d627e 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -323,7 +323,7 @@ class BrozzlerWorker: return True @metrics.brozzler_page_processing_duration_seconds.time() - @metrics.brozzler_in_progress_pages.track_in_progress() + @metrics.brozzler_in_progress_pages.track_inprogress() def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None): def update_page_metrics(page, outlinks): """Update page-level Prometheus metrics.""" From 551d1868d10fd91d5c8bca33782f0710370ab078 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 13 Sep 2024 14:16:49 -0700 Subject: [PATCH 15/55] better order --- brozzler/worker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index 67d627e..e4f984d 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -72,8 +72,8 @@ class BrozzlerWorker: stealth=False, window_height=900, window_width=1400, - registry_url=None, metrics_port=None, + registry_url=None, env=None, ): self._frontier = frontier @@ -97,8 +97,8 @@ class BrozzlerWorker: self._window_height = window_height self._window_width = window_width self._stealth = stealth - self._registry_url = registry_url self._metrics_port = metrics_port + self._registry_url = registry_url self._env = env self._browser_pool = brozzler.browser.BrowserPool( From 4a60ff3367b5a4c8606c3447ee3027d6cf0df4be Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 13 Sep 2024 17:01:35 -0700 Subject: [PATCH 16/55] post-deploy bug fixes --- brozzler/cli.py | 6 +++--- brozzler/metrics.py | 8 +++++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index 1db25b0..431b57e 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -297,7 +297,7 @@ def brozzle_page(argv=None): window_height=args.window_height, window_width=args.window_width, stealth=args.stealth, - metrics_port=args.metrics_port, + metrics_port=int(args.metrics_port), registry_url=args.registry_url, env=args.env, ) @@ -540,7 +540,7 @@ def brozzler_worker(argv=None): ) arg_parser.add_argument( "--metrics_port", - dest=metrics_port, + dest="metrics_port", default=8888, help="Prometheus metrics port", ) @@ -612,7 +612,7 @@ def brozzler_worker(argv=None): skip_visit_hashtags=args.skip_visit_hashtags, skip_youtube_dl=args.skip_youtube_dl, stealth=args.stealth, - metrics_port=args.metrics_port, + metrics_port=int(args.metrics_port), registry_url=args.registry_url, env=args.env, ) diff --git a/brozzler/metrics.py b/brozzler/metrics.py index 3698de5..adf0184 100644 --- a/brozzler/metrics.py +++ b/brozzler/metrics.py @@ -42,12 +42,18 @@ def register_prom_metrics( if registry_url is None: return + env_for_prom = None + if env == "qa": + env_for_prom = Env.qa + elif env == "prod": + env_for_prom = Env.prod + config = ClientConfig(server_url_base=registry_url) client = Client(config) target = format_self_target(scrape_port=metrics_port) registration = Registration( target=target, - env=env, + env=env_for_prom, scheme=Scheme.http, ) client.keep_registered_threaded(registration) From bb1c3437248b612cd5fbecebf1bed0a5b80205d0 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 17 Sep 2024 17:53:50 -0700 Subject: [PATCH 17/55] updates for review of PR 287 --- brozzler/cli.py | 10 ++++++---- brozzler/metrics.py | 10 ++++++---- brozzler/ydl.py | 6 +++--- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index 431b57e..11226fa 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -236,15 +236,16 @@ def brozzle_page(argv=None): ) arg_parser.add_argument( "--metrics_port", + type=int, dest="metrics_port", default=8888, - help="Prometheus metrics port", + help="Port for brozzler's Prometheus scrape endpoint", ) arg_parser.add_argument( "--registry_url", dest="registry_url", default=None, - help="Prometheus scrape target registry URL", + help="http-sd-registry url, for Prometheus metrics discovery", ) arg_parser.add_argument( "--env", @@ -540,15 +541,16 @@ def brozzler_worker(argv=None): ) arg_parser.add_argument( "--metrics_port", + type=int, dest="metrics_port", default=8888, - help="Prometheus metrics port", + help="Port for brozzler's Prometheus scrape endpoint", ) arg_parser.add_argument( "--registry_url", dest="registry_url", default=None, - help="Prometheus scrape target registry URL", + help="http-sd-registry url, for Prometheus metrics discovery", ) arg_parser.add_argument( "--env", diff --git a/brozzler/metrics.py b/brozzler/metrics.py index adf0184..5ff36ed 100644 --- a/brozzler/metrics.py +++ b/brozzler/metrics.py @@ -10,6 +10,7 @@ try: ) from http_sd_registry.config import ClientConfig except ImportError: + # for users without access to http_sd_registry http_sd_registry = None @@ -19,15 +20,15 @@ from prometheus_client import Counter, Gauge, Histogram, start_http_server brozzler_pages_crawled = Counter("brozzler_pages_crawled", "number of pages visited by brozzler") brozzler_page_processing_duration_seconds = Histogram("brozzler_page_processing_duration_seconds", "time spent processing a page in brozzler") brozzler_outlinks_found = Counter("brozzler_outlinks_found", "number of outlinks found by brozzler") -brozzler_last_page_crawled_time = Gauge("brozzler_last_page_crawled_time", "time of last page visit") +brozzler_last_page_crawled_time = Gauge("brozzler_last_page_crawled_time", "time of last page visit, in seconds since UNIX epoch") brozzler_in_progress_pages = Gauge("brozzler_in_progress_pages", "number of pages currently processing with brozzler") brozzler_resources_requested = Counter("brozzler_resources_requested", "number of resources requested", labelnames=["resource_type"]) brozzler_resources_fetched = Counter("brozzler_resources_fetched", "number of resources fetched", labelnames=["resource_type", "status_code"]) brozzler_resources_size_total = Counter("brozzler_resources_size_total", "total size of resources fetched", labelnames=["resource_type"]) brozzler_resources_fetch_time = Counter("brozzler_resources_fetch_time", "time spent fetching resources", labelnames=["resource_type"]) brozzler_ydl_urls_checked = Counter("brozzler_ydl_urls_checked", "count of urls checked by brozzler yt-dlp") -brozzler_ydl_download_attempts = Counter("brozzler_ydl_download_attempts", "count of download attempted by brozzler yt-dlp", labelnames=["host"]) -brozzler_ydl_download_successes = Counter("brozzler_ydl_download_successes", "count of downloads completed by brozzler yt-dlp", labelnames=["host"]) +brozzler_ydl_download_attempts = Counter("brozzler_ydl_download_attempts", "count of download attempted by brozzler yt-dlp", labelnames=["youtube_host"]) +brozzler_ydl_download_successes = Counter("brozzler_ydl_download_successes", "count of downloads completed by brozzler yt-dlp", labelnames=["youtube_host"]) # fmt: on @@ -42,11 +43,12 @@ def register_prom_metrics( if registry_url is None: return - env_for_prom = None if env == "qa": env_for_prom = Env.qa elif env == "prod": env_for_prom = Env.prod + else: + env_for_prom = Env.qa config = ClientConfig(server_url_base=registry_url) client = Client(config) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 81c1ec4..392bb2d 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -303,12 +303,12 @@ def _remember_videos(page, pushed_videos=None): def _try_youtube_dl(worker, ydl, site, page): ytdlp_url = page.redirect_url if page.redirect_url else page.url - ytdlp_host = ytdlp_url.split("//")[-1].split("/")[0].split("?")[0] + youtube_host = "youtube.com" in ytdlp_url.split("//")[-1].split("/")[0].split("?")[0] attempt = 0 while attempt < MAX_YTDLP_ATTEMPTS: try: logging.info("trying yt-dlp on %s", ytdlp_url) - metrics.brozzler_ydl_download_attempts.labels(ytdlp_host).inc(1) + metrics.brozzler_ydl_download_attempts.labels(youtube_host).inc(1) with brozzler.thread_accept_exceptions(): # we do whatwg canonicalization here to avoid "" resulting in ProxyError @@ -317,7 +317,7 @@ def _try_youtube_dl(worker, ydl, site, page): ie_result = ydl.sanitize_info( ydl.extract_info(str(urlcanon.whatwg(ytdlp_url))) ) - metrics.brozzler_ydl_download_successes.labels(ytdlp_host).inc(1) + metrics.brozzler_ydl_download_successes.labels(youtube_host).inc(1) break except brozzler.ShutdownRequested as e: raise From 8b2c254485afee2c021299d3caa53633e22cef76 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 18 Sep 2024 14:17:10 -0700 Subject: [PATCH 18/55] brozzler_ydl_extract, not download --- brozzler/metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/brozzler/metrics.py b/brozzler/metrics.py index 5ff36ed..4b1277d 100644 --- a/brozzler/metrics.py +++ b/brozzler/metrics.py @@ -27,8 +27,8 @@ brozzler_resources_fetched = Counter("brozzler_resources_fetched", "number of re brozzler_resources_size_total = Counter("brozzler_resources_size_total", "total size of resources fetched", labelnames=["resource_type"]) brozzler_resources_fetch_time = Counter("brozzler_resources_fetch_time", "time spent fetching resources", labelnames=["resource_type"]) brozzler_ydl_urls_checked = Counter("brozzler_ydl_urls_checked", "count of urls checked by brozzler yt-dlp") -brozzler_ydl_download_attempts = Counter("brozzler_ydl_download_attempts", "count of download attempted by brozzler yt-dlp", labelnames=["youtube_host"]) -brozzler_ydl_download_successes = Counter("brozzler_ydl_download_successes", "count of downloads completed by brozzler yt-dlp", labelnames=["youtube_host"]) +brozzler_ydl_extract_attempts = Counter("brozzler_ydl_download_attempts", "count of download attempted by brozzler yt-dlp", labelnames=["youtube_host"]) +brozzler_ydl_extract_successes = Counter("brozzler_ydl_download_successes", "count of downloads completed by brozzler yt-dlp", labelnames=["youtube_host"]) # fmt: on From 62b12434d0a0d626a756e9fa70a14d5a5f7cce52 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 18 Sep 2024 14:51:11 -0700 Subject: [PATCH 19/55] mostly ydl.py updates for new proxyrack testing --- brozzler/ydl.py | 82 +++++++++++++++++-------------------------------- 1 file changed, 28 insertions(+), 54 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 392bb2d..7368534 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -34,7 +34,7 @@ import time thread_local = threading.local() PROXYRACK_PROXY = "@@@" -MAX_YTDLP_ATTEMPTS = 4 +MAX_YTDLP_ATTEMPTS = 3 YTDLP_WAIT = 10 @@ -197,15 +197,15 @@ def _build_youtube_dl(worker, destdir, site, page): payload=f, extra_headers=extra_headers, ) - # consulted by _remember_videos() - ydl.pushed_videos.append( - { - "url": url, - "response_code": response.code, - "content-type": mimetype, - "content-length": size, - } - ) + # consulted by _remember_videos() + ydl.pushed_videos.append( + { + "url": url, + "response_code": response.code, + "content-type": mimetype, + "content-length": size, + } + ) def maybe_heartbeat_site_last_claimed(*args, **kwargs): # in case yt-dlp takes a long time, heartbeat site.last_claimed @@ -268,6 +268,9 @@ def _build_youtube_dl(worker, destdir, site, page): "logger": logging.getLogger("yt_dlp"), "verbose": False, "quiet": False, + # does this make sense when we're generally downloading one at a time? + "sleep_interval": 25, + "max_sleep_interval": 90, "proxy": PROXYRACK_PROXY, } @@ -308,7 +311,12 @@ def _try_youtube_dl(worker, ydl, site, page): while attempt < MAX_YTDLP_ATTEMPTS: try: logging.info("trying yt-dlp on %s", ytdlp_url) - metrics.brozzler_ydl_download_attempts.labels(youtube_host).inc(1) + # should_download_vid = not youtube_host + # then + # ydl.extract_info(str(urlcanon.whatwg(ytdlp_url)), download=should_download_vid) + # if youtube_host and ie_result: + # download_url = ie_result.get("url") + metrics.brozzler_ydl_extract_attempts.labels(youtube_host).inc(1) with brozzler.thread_accept_exceptions(): # we do whatwg canonicalization here to avoid "" resulting in ProxyError @@ -317,7 +325,7 @@ def _try_youtube_dl(worker, ydl, site, page): ie_result = ydl.sanitize_info( ydl.extract_info(str(urlcanon.whatwg(ytdlp_url))) ) - metrics.brozzler_ydl_download_successes.labels(youtube_host).inc(1) + metrics.brozzler_ydl_extract_successes.labels(youtube_host).inc(1) break except brozzler.ShutdownRequested as e: raise @@ -366,48 +374,14 @@ def _try_youtube_dl(worker, ydl, site, page): "with yt-dlp json for %s", ytdlp_url, ) - - attempt = 0 - while attempt < MAX_YTDLP_ATTEMPTS: - try: - worker._warcprox_write_record( - warcprox_address=worker._proxy_for(site), - url="youtube-dl:%s" % str(urlcanon.semantic(ytdlp_url)), - warc_type="metadata", - content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", - payload=info_json.encode("utf-8"), - extra_headers=site.extra_headers(page), - ) - break - except Exception as e: - # connection problem when using a proxy == proxy error - if ( - hasattr(e, "exc_info") - and e.exc_info[0] == urllib.error.URLError - and worker._proxy_for(site) - ): - attempt += 1 - if attempt == MAX_YTDLP_ATTEMPTS: - logging.warning( - "Failed after %s attempts. Error: %s", MAX_YTDLP_ATTEMPTS, e - ) - raise brozzler.ProxyError( - "yt-dlp hit proxy error storing media from %s with " - % ytdlp_url - ) - else: - logging.info( - "Attempt %s failed. Retrying in %s seconds...", - attempt, - YTDLP_WAIT, - ) - time.sleep(YTDLP_WAIT) - else: - raise - else: - raise brozzler.ProxyError( - "Proxy attempt(s) storing media failed for unknown reason(s)" - ) + worker._warcprox_write_record( + warcprox_address=worker._proxy_for(site), + url="youtube-dl:%s" % str(urlcanon.semantic(ytdlp_url)), + warc_type="metadata", + content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", + payload=info_json.encode("utf-8"), + extra_headers=site.extra_headers(page), + ) return ie_result From 27cb104b4566f548e924354102a6f1a364c81f73 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 18 Sep 2024 15:57:47 -0700 Subject: [PATCH 20/55] more extract (less download) --- brozzler/metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/brozzler/metrics.py b/brozzler/metrics.py index 4b1277d..0ca3e43 100644 --- a/brozzler/metrics.py +++ b/brozzler/metrics.py @@ -27,8 +27,8 @@ brozzler_resources_fetched = Counter("brozzler_resources_fetched", "number of re brozzler_resources_size_total = Counter("brozzler_resources_size_total", "total size of resources fetched", labelnames=["resource_type"]) brozzler_resources_fetch_time = Counter("brozzler_resources_fetch_time", "time spent fetching resources", labelnames=["resource_type"]) brozzler_ydl_urls_checked = Counter("brozzler_ydl_urls_checked", "count of urls checked by brozzler yt-dlp") -brozzler_ydl_extract_attempts = Counter("brozzler_ydl_download_attempts", "count of download attempted by brozzler yt-dlp", labelnames=["youtube_host"]) -brozzler_ydl_extract_successes = Counter("brozzler_ydl_download_successes", "count of downloads completed by brozzler yt-dlp", labelnames=["youtube_host"]) +brozzler_ydl_extract_attempts = Counter("brozzler_ydl_extract_attempts", "count of extracts attempted by brozzler yt-dlp", labelnames=["youtube_host"]) +brozzler_ydl_extract_successes = Counter("brozzler_ydl_extract_successes", "count of extracts completed by brozzler yt-dlp", labelnames=["youtube_host"]) # fmt: on From 2aa17886372c9fb1ff3488ad17ada00b962a6782 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 18 Sep 2024 15:58:19 -0700 Subject: [PATCH 21/55] mostly black'd --- brozzler/ydl.py | 51 +++++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 7368534..bc59ffd 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -34,7 +34,7 @@ import time thread_local = threading.local() PROXYRACK_PROXY = "@@@" -MAX_YTDLP_ATTEMPTS = 3 +MAX_YTDLP_ATTEMPTS = 4 YTDLP_WAIT = 10 @@ -184,28 +184,31 @@ def _build_youtube_dl(worker, destdir, site, page): worker._proxy_for(site), url, ) - with open(info_dict["filepath"], "rb") as f: - # include content-length header to avoid chunked - # transfer, which warcprox currently rejects - extra_headers = dict(site.extra_headers()) - extra_headers["content-length"] = size - request, response = worker._warcprox_write_record( - warcprox_address=worker._proxy_for(site), - url=url, - warc_type="resource", - content_type=mimetype, - payload=f, - extra_headers=extra_headers, + try: + with open(info_dict["filepath"], "rb") as f: + # include content-length header to avoid chunked + # transfer, which warcprox currently rejects + extra_headers = dict(site.extra_headers()) + extra_headers["content-length"] = size + request, response = worker._warcprox_write_record( + warcprox_address=worker._proxy_for(site), + url=url, + warc_type="resource", + content_type=mimetype, + payload=f, + extra_headers=extra_headers, + ) + # consulted by _remember_videos() + ydl.pushed_videos.append( + { + "url": url, + "response_code": response.code, + "content-type": mimetype, + "content-length": size, + } ) - # consulted by _remember_videos() - ydl.pushed_videos.append( - { - "url": url, - "response_code": response.code, - "content-type": mimetype, - "content-length": size, - } - ) + except: + traceback.print_exc() def maybe_heartbeat_site_last_claimed(*args, **kwargs): # in case yt-dlp takes a long time, heartbeat site.last_claimed @@ -306,7 +309,9 @@ def _remember_videos(page, pushed_videos=None): def _try_youtube_dl(worker, ydl, site, page): ytdlp_url = page.redirect_url if page.redirect_url else page.url - youtube_host = "youtube.com" in ytdlp_url.split("//")[-1].split("/")[0].split("?")[0] + youtube_host = ( + "youtube.com" in ytdlp_url.split("//")[-1].split("/")[0].split("?")[0] + ) attempt = 0 while attempt < MAX_YTDLP_ATTEMPTS: try: From f624e7ee8ed297ffbe9a797d54d77f19c9e08a0f Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 18 Sep 2024 17:26:23 -0700 Subject: [PATCH 22/55] bump qa-ish version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index f3698a9..2dc8928 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def find_package_data(package): setuptools.setup( name="brozzler", - version="1.5.54", + version="1.5.55a3", description="Distributed web crawling with browsers", url="https://github.com/internetarchive/brozzler", author="Noah Levitt", From 7fbee54eaae81e2f33cf42941ef3413dcc063079 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 19 Sep 2024 10:45:01 -0700 Subject: [PATCH 23/55] proxyrack only for youtube.com --- brozzler/ydl.py | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index bc59ffd..63c8670 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -33,7 +33,7 @@ import time thread_local = threading.local() -PROXYRACK_PROXY = "@@@" +PROXYRACK_PROXY = "" MAX_YTDLP_ATTEMPTS = 4 YTDLP_WAIT = 10 @@ -271,13 +271,19 @@ def _build_youtube_dl(worker, destdir, site, page): "logger": logging.getLogger("yt_dlp"), "verbose": False, "quiet": False, - # does this make sense when we're generally downloading one at a time? + # recommended to avoid bot detection "sleep_interval": 25, "max_sleep_interval": 90, - "proxy": PROXYRACK_PROXY, } - # skip proxying yt-dlp v.2023.07.06 + ytdlp_url = page.redirect_url if page.redirect_url else page.url + youtube_host = ( + "youtube.com" in ytdlp_url.split("//")[-1].split("/")[0].split("?")[0] + ) + if youtube_host: + ydl_opts["proxy"] = PROXYRACK_PROXY + + # skip warcprox proxying yt-dlp v.2023.07.06 # if worker._proxy_for(site): # ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site)) @@ -285,6 +291,8 @@ def _build_youtube_dl(worker, destdir, site, page): if site.extra_headers(): ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page))) ydl.pushed_videos = [] + ydl.url = ytdlp_url + ydl.youtube_host = youtube_host return ydl @@ -308,29 +316,25 @@ def _remember_videos(page, pushed_videos=None): def _try_youtube_dl(worker, ydl, site, page): - ytdlp_url = page.redirect_url if page.redirect_url else page.url - youtube_host = ( - "youtube.com" in ytdlp_url.split("//")[-1].split("/")[0].split("?")[0] - ) attempt = 0 while attempt < MAX_YTDLP_ATTEMPTS: try: - logging.info("trying yt-dlp on %s", ytdlp_url) - # should_download_vid = not youtube_host + logging.info("trying yt-dlp on %s", ydl.url) + # should_download_vid = not ydl.youtube_host # then - # ydl.extract_info(str(urlcanon.whatwg(ytdlp_url)), download=should_download_vid) - # if youtube_host and ie_result: + # ydl.extract_info(str(urlcanon.whatwg(ydl.url)), download=should_download_vid) + # if ydl.youtube_host and ie_result: # download_url = ie_result.get("url") - metrics.brozzler_ydl_extract_attempts.labels(youtube_host).inc(1) + metrics.brozzler_ydl_extract_attempts.labels(ydl.youtube_host).inc(1) with brozzler.thread_accept_exceptions(): # we do whatwg canonicalization here to avoid "" resulting in ProxyError # needs automated test # and yt-dlp needs sanitize_info for extract_info ie_result = ydl.sanitize_info( - ydl.extract_info(str(urlcanon.whatwg(ytdlp_url))) + ydl.extract_info(str(urlcanon.whatwg(ydl.url))) ) - metrics.brozzler_ydl_extract_successes.labels(youtube_host).inc(1) + metrics.brozzler_ydl_extract_successes.labels(ydl.youtube_host).inc(1) break except brozzler.ShutdownRequested as e: raise @@ -348,6 +352,7 @@ def _try_youtube_dl(worker, ydl, site, page): ): raise brozzler.ReachedLimit(e.exc_info[1]) else: + # todo: other errors to handle separately? # OSError('Tunnel connection failed: 464 Host Not Allowed') (caused by ProxyError...) # and others... attempt += 1 @@ -356,7 +361,7 @@ def _try_youtube_dl(worker, ydl, site, page): "Failed after %s attempts. Error: %s", MAX_YTDLP_ATTEMPTS, e ) raise brozzler.ProxyError( - "yt-dlp hit proxyrack proxy error from %s" % ytdlp_url + "yt-dlp hit possible proxyrack proxy error from %s" % ydl.url ) else: logging.info( @@ -377,11 +382,11 @@ def _try_youtube_dl(worker, ydl, site, page): logging.info( "sending WARCPROX_WRITE_RECORD request to warcprox " "with yt-dlp json for %s", - ytdlp_url, + ydl.url, ) worker._warcprox_write_record( warcprox_address=worker._proxy_for(site), - url="youtube-dl:%s" % str(urlcanon.semantic(ytdlp_url)), + url="youtube-dl:%s" % str(urlcanon.semantic(ydl.url)), warc_type="metadata", content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", payload=info_json.encode("utf-8"), From 229d53d4b16fb6245a6327c13004d41830e23cf2 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 19 Sep 2024 16:45:22 -0700 Subject: [PATCH 24/55] add brozzler_ydl_download_successes metric --- brozzler/metrics.py | 1 + brozzler/ydl.py | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/brozzler/metrics.py b/brozzler/metrics.py index 0ca3e43..ad5737a 100644 --- a/brozzler/metrics.py +++ b/brozzler/metrics.py @@ -29,6 +29,7 @@ brozzler_resources_fetch_time = Counter("brozzler_resources_fetch_time", "time s brozzler_ydl_urls_checked = Counter("brozzler_ydl_urls_checked", "count of urls checked by brozzler yt-dlp") brozzler_ydl_extract_attempts = Counter("brozzler_ydl_extract_attempts", "count of extracts attempted by brozzler yt-dlp", labelnames=["youtube_host"]) brozzler_ydl_extract_successes = Counter("brozzler_ydl_extract_successes", "count of extracts completed by brozzler yt-dlp", labelnames=["youtube_host"]) +brozzler_ydl_download_successes = Counter("brozzler_ydl_download_successes", "count of downloads completed by brozzler yt-dlp", labelnames=["youtube_host"]) # fmt: on diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 63c8670..747fa64 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -33,7 +33,7 @@ import time thread_local = threading.local() -PROXYRACK_PROXY = "" +YTDLP_PROXY = "" MAX_YTDLP_ATTEMPTS = 4 YTDLP_WAIT = 10 @@ -239,6 +239,14 @@ def _build_youtube_dl(worker, destdir, site, page): worker.logger.info( "[ydl_postprocess_hook] postprocessor: {}".format(d["postprocessor"]) ) + youtube_host = ( + "youtube.com" + in d["info_dict"]["webpage_url"] + .split("//")[-1] + .split("/")[0] + .split("?")[0] + ) + metrics.brozzler_ydl_download_successes.labels(youtube_host).inc(1) if worker._using_warcprox(site): _YoutubeDL._push_video_to_warcprox( _YoutubeDL, site, d["info_dict"], d["postprocessor"] @@ -281,7 +289,8 @@ def _build_youtube_dl(worker, destdir, site, page): "youtube.com" in ytdlp_url.split("//")[-1].split("/")[0].split("?")[0] ) if youtube_host: - ydl_opts["proxy"] = PROXYRACK_PROXY + ydl_opts["proxy"] = YTDLP_PROXY + logging.info("using yt-dlp proxy %s", YTDLP_PROXY) # skip warcprox proxying yt-dlp v.2023.07.06 # if worker._proxy_for(site): @@ -361,7 +370,7 @@ def _try_youtube_dl(worker, ydl, site, page): "Failed after %s attempts. Error: %s", MAX_YTDLP_ATTEMPTS, e ) raise brozzler.ProxyError( - "yt-dlp hit possible proxyrack proxy error from %s" % ydl.url + "yt-dlp hit possible external proxy error from %s" % ydl.url ) else: logging.info( @@ -371,9 +380,7 @@ def _try_youtube_dl(worker, ydl, site, page): ) time.sleep(YTDLP_WAIT) else: - raise brozzler.ProxyError( - "Proxyrack proxy attempt(s) failed for unknown reason(s)" - ) + raise brozzler.ProxyError("Proxy attempt(s) failed for unknown reason(s)") logging.info("ytdlp completed successfully") _remember_videos(page, ydl.pushed_videos) From c74d9ad90fa6e8427e81505bc93e4c8ad7a452f0 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 19 Sep 2024 20:53:03 -0700 Subject: [PATCH 25/55] limit proxy logging --- brozzler/ydl.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 747fa64..ae56fa7 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -288,11 +288,14 @@ def _build_youtube_dl(worker, destdir, site, page): youtube_host = ( "youtube.com" in ytdlp_url.split("//")[-1].split("/")[0].split("?")[0] ) - if youtube_host: + if youtube_host and YTDLP_PROXY: ydl_opts["proxy"] = YTDLP_PROXY - logging.info("using yt-dlp proxy %s", YTDLP_PROXY) + ytdlp_proxy_for_print = ( + YTDLP_PROXY.split("@")[1] if "@" in YTDLP_PROXY else "@@@" + ) + logging.info("using yt-dlp proxy ... %s", ytdlp_proxy_for_print) - # skip warcprox proxying yt-dlp v.2023.07.06 + # skip warcprox proxying yt-dlp v.2023.07.06: youtube extractor using ranges # if worker._proxy_for(site): # ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site)) From 7de648345360b2799c7bd89346e2133a4b486262 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 20 Sep 2024 14:43:05 -0700 Subject: [PATCH 26/55] skip brozzler_resources metrics --- brozzler/metrics.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/brozzler/metrics.py b/brozzler/metrics.py index ad5737a..81239bc 100644 --- a/brozzler/metrics.py +++ b/brozzler/metrics.py @@ -22,10 +22,6 @@ brozzler_page_processing_duration_seconds = Histogram("brozzler_page_processing_ brozzler_outlinks_found = Counter("brozzler_outlinks_found", "number of outlinks found by brozzler") brozzler_last_page_crawled_time = Gauge("brozzler_last_page_crawled_time", "time of last page visit, in seconds since UNIX epoch") brozzler_in_progress_pages = Gauge("brozzler_in_progress_pages", "number of pages currently processing with brozzler") -brozzler_resources_requested = Counter("brozzler_resources_requested", "number of resources requested", labelnames=["resource_type"]) -brozzler_resources_fetched = Counter("brozzler_resources_fetched", "number of resources fetched", labelnames=["resource_type", "status_code"]) -brozzler_resources_size_total = Counter("brozzler_resources_size_total", "total size of resources fetched", labelnames=["resource_type"]) -brozzler_resources_fetch_time = Counter("brozzler_resources_fetch_time", "time spent fetching resources", labelnames=["resource_type"]) brozzler_ydl_urls_checked = Counter("brozzler_ydl_urls_checked", "count of urls checked by brozzler yt-dlp") brozzler_ydl_extract_attempts = Counter("brozzler_ydl_extract_attempts", "count of extracts attempted by brozzler yt-dlp", labelnames=["youtube_host"]) brozzler_ydl_extract_successes = Counter("brozzler_ydl_extract_successes", "count of extracts completed by brozzler yt-dlp", labelnames=["youtube_host"]) From 3e9030a376010583ec12e5c982d5d2bfa53da54c Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 23 Sep 2024 15:16:07 -0700 Subject: [PATCH 27/55] rm oddly merged try except block --- brozzler/ydl.py | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index ae56fa7..ebfadce 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -184,31 +184,29 @@ def _build_youtube_dl(worker, destdir, site, page): worker._proxy_for(site), url, ) - try: - with open(info_dict["filepath"], "rb") as f: - # include content-length header to avoid chunked - # transfer, which warcprox currently rejects - extra_headers = dict(site.extra_headers()) - extra_headers["content-length"] = size - request, response = worker._warcprox_write_record( - warcprox_address=worker._proxy_for(site), - url=url, - warc_type="resource", - content_type=mimetype, - payload=f, - extra_headers=extra_headers, - ) - # consulted by _remember_videos() - ydl.pushed_videos.append( - { - "url": url, - "response_code": response.code, - "content-type": mimetype, - "content-length": size, - } + with open(info_dict["filepath"], "rb") as f: + # include content-length header to avoid chunked + # transfer, which warcprox currently rejects + extra_headers = dict(site.extra_headers()) + extra_headers["content-length"] = size + request, response = worker._warcprox_write_record( + warcprox_address=worker._proxy_for(site), + url=url, + warc_type="resource", + content_type=mimetype, + payload=f, + extra_headers=extra_headers, ) - except: - traceback.print_exc() + + # consulted by _remember_videos() + ydl.pushed_videos.append( + { + "url": url, + "response_code": response.code, + "content-type": mimetype, + "content-length": size, + } + ) def maybe_heartbeat_site_last_claimed(*args, **kwargs): # in case yt-dlp takes a long time, heartbeat site.last_claimed From fb43d3f2a49007d8bbd0ae479930b03887005a18 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 23 Sep 2024 15:20:29 -0700 Subject: [PATCH 28/55] bump version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2dc8928..4b556f1 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def find_package_data(package): setuptools.setup( name="brozzler", - version="1.5.55a3", + version="1.5.55", description="Distributed web crawling with browsers", url="https://github.com/internetarchive/brozzler", author="Noah Levitt", From 6a0b0b058d4e1860073b3abd7bd9ab1e481cee04 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 23 Sep 2024 18:37:14 -0700 Subject: [PATCH 29/55] updates post-walkthru --- brozzler/__init__.py | 4 ++++ brozzler/cli.py | 12 +++++------ brozzler/worker.py | 14 ++++++++++--- brozzler/ydl.py | 49 ++++++++++++++++++++++---------------------- 4 files changed, 46 insertions(+), 33 deletions(-) diff --git a/brozzler/__init__.py b/brozzler/__init__.py index 5040e69..7dd284d 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -39,6 +39,10 @@ class PageInterstitialShown(Exception): pass +class VideoExtractorError(Exception): + pass + + class ProxyError(Exception): pass diff --git a/brozzler/cli.py b/brozzler/cli.py index 11226fa..3cb7c9a 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -238,7 +238,7 @@ def brozzle_page(argv=None): "--metrics_port", type=int, dest="metrics_port", - default=8888, + default=0, help="Port for brozzler's Prometheus scrape endpoint", ) arg_parser.add_argument( @@ -251,7 +251,7 @@ def brozzle_page(argv=None): "--env", dest="env", default=None, - help="env for Prometheus target registry", + help="deployment environment for this brozzler instance, e.g., prod or qa", ) arg_parser.add_argument( "--screenshot-full-page", dest="screenshot_full_page", action="store_true" @@ -298,7 +298,7 @@ def brozzle_page(argv=None): window_height=args.window_height, window_width=args.window_width, stealth=args.stealth, - metrics_port=int(args.metrics_port), + metrics_port=args.metrics_port, registry_url=args.registry_url, env=args.env, ) @@ -543,7 +543,7 @@ def brozzler_worker(argv=None): "--metrics_port", type=int, dest="metrics_port", - default=8888, + default=0, help="Port for brozzler's Prometheus scrape endpoint", ) arg_parser.add_argument( @@ -556,7 +556,7 @@ def brozzler_worker(argv=None): "--env", dest="env", default=None, - help="env for Prometheus target registry", + help="deployment environment for this brozzler instance, e.g., prod or qa", ) add_common_options(arg_parser, argv) @@ -614,7 +614,7 @@ def brozzler_worker(argv=None): skip_visit_hashtags=args.skip_visit_hashtags, skip_youtube_dl=args.skip_youtube_dl, stealth=args.stealth, - metrics_port=int(args.metrics_port), + metrics_port=args.metrics_port, registry_url=args.registry_url, env=args.env, ) diff --git a/brozzler/worker.py b/brozzler/worker.py index e4f984d..c780644 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -72,7 +72,7 @@ class BrozzlerWorker: stealth=False, window_height=900, window_width=1400, - metrics_port=None, + metrics_port=0, registry_url=None, env=None, ): @@ -111,8 +111,11 @@ class BrozzlerWorker: self._start_stop_lock = threading.Lock() self._shutdown = threading.Event() - # Setup metrics - metrics.register_prom_metrics(self._metrics_port, self._registry_url, self._env) + # set up metrics + if self._metrics_port > 0: + metrics.register_prom_metrics(self._metrics_port, self._registry_url, self._env) + else: + logging.warning("not starting prometheus scrape endpoint: metrics_port is undefined") def _choose_warcprox(self): warcproxes = self._service_registry.available_services("warcprox") @@ -284,6 +287,11 @@ class BrozzlerWorker: raise except brozzler.ProxyError: raise + except brozzler.VideoExtractorError as e: + logging.error( + "error extracting video info: %s", + e, + ) except Exception as e: if ( hasattr(e, "exc_info") diff --git a/brozzler/ydl.py b/brozzler/ydl.py index ebfadce..b1266fd 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -71,6 +71,11 @@ def should_ytdlp(site, page, page_status, skip_av_seeds): return True +def isyoutubehost(url): + # split 1 splits scheme from url, split 2 splits path from hostname, split 3 splits query string on hostname + return "youtube.com" in url.split("//")[-1].split("/")[0].split("?")[0] + + class ExtraHeaderAdder(urllib.request.BaseHandler): def __init__(self, extra_headers): self.extra_headers = extra_headers @@ -237,14 +242,9 @@ def _build_youtube_dl(worker, destdir, site, page): worker.logger.info( "[ydl_postprocess_hook] postprocessor: {}".format(d["postprocessor"]) ) - youtube_host = ( - "youtube.com" - in d["info_dict"]["webpage_url"] - .split("//")[-1] - .split("/")[0] - .split("?")[0] - ) - metrics.brozzler_ydl_download_successes.labels(youtube_host).inc(1) + is_youtube_host = isyoutubehost(d["info_dict"]["webpage_url"]) + + metrics.brozzler_ydl_download_successes.labels(is_youtube_host).inc(1) if worker._using_warcprox(site): _YoutubeDL._push_video_to_warcprox( _YoutubeDL, site, d["info_dict"], d["postprocessor"] @@ -283,15 +283,14 @@ def _build_youtube_dl(worker, destdir, site, page): } ytdlp_url = page.redirect_url if page.redirect_url else page.url - youtube_host = ( - "youtube.com" in ytdlp_url.split("//")[-1].split("/")[0].split("?")[0] - ) - if youtube_host and YTDLP_PROXY: + is_youtube_host = isyoutubehost(ytdlp_url) + if is_youtube_host and YTDLP_PROXY: ydl_opts["proxy"] = YTDLP_PROXY - ytdlp_proxy_for_print = ( + # don't log proxy value secrets + ytdlp_proxy_for_logs = ( YTDLP_PROXY.split("@")[1] if "@" in YTDLP_PROXY else "@@@" ) - logging.info("using yt-dlp proxy ... %s", ytdlp_proxy_for_print) + logging.info("using yt-dlp proxy ... %s", ytdlp_proxy_for_logs) # skip warcprox proxying yt-dlp v.2023.07.06: youtube extractor using ranges # if worker._proxy_for(site): @@ -302,7 +301,7 @@ def _build_youtube_dl(worker, destdir, site, page): ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page))) ydl.pushed_videos = [] ydl.url = ytdlp_url - ydl.youtube_host = youtube_host + ydl.is_youtube_host = is_youtube_host return ydl @@ -330,12 +329,12 @@ def _try_youtube_dl(worker, ydl, site, page): while attempt < MAX_YTDLP_ATTEMPTS: try: logging.info("trying yt-dlp on %s", ydl.url) - # should_download_vid = not ydl.youtube_host + # should_download_vid = not ydl.is_youtube_host # then # ydl.extract_info(str(urlcanon.whatwg(ydl.url)), download=should_download_vid) - # if ydl.youtube_host and ie_result: + # if ydl.is_youtube_host and ie_result: # download_url = ie_result.get("url") - metrics.brozzler_ydl_extract_attempts.labels(ydl.youtube_host).inc(1) + metrics.brozzler_ydl_extract_attempts.labels(ydl.is_youtube_host).inc(1) with brozzler.thread_accept_exceptions(): # we do whatwg canonicalization here to avoid "" resulting in ProxyError @@ -344,7 +343,7 @@ def _try_youtube_dl(worker, ydl, site, page): ie_result = ydl.sanitize_info( ydl.extract_info(str(urlcanon.whatwg(ydl.url))) ) - metrics.brozzler_ydl_extract_successes.labels(ydl.youtube_host).inc(1) + metrics.brozzler_ydl_extract_successes.labels(ydl.is_youtube_host).inc(1) break except brozzler.ShutdownRequested as e: raise @@ -370,18 +369,20 @@ def _try_youtube_dl(worker, ydl, site, page): logging.warning( "Failed after %s attempts. Error: %s", MAX_YTDLP_ATTEMPTS, e ) - raise brozzler.ProxyError( - "yt-dlp hit possible external proxy error from %s" % ydl.url + raise brozzler.VideoExtractorError( + "yt-dlp hit error extracting info for %s" % ydl.url ) else: + retry_wait = min(60, YTDLP_WAIT * (1.5**(attempt - 1))) logging.info( "Attempt %s failed. Retrying in %s seconds...", attempt, - YTDLP_WAIT, + retry_wait, ) - time.sleep(YTDLP_WAIT) + time.sleep(retry_wait) else: - raise brozzler.ProxyError("Proxy attempt(s) failed for unknown reason(s)") + raise brozzler.VideoExtractorError("yt-dlp hit unknown error extracting info for %s" % ydl.url) + logging.info("ytdlp completed successfully") _remember_videos(page, ydl.pushed_videos) From 9983f43c7508cb469b9757060c785f678eba18ab Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 23 Sep 2024 19:53:03 -0700 Subject: [PATCH 30/55] black'd --- brozzler/worker.py | 8 ++++++-- brozzler/ydl.py | 6 ++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index c780644..6eb5872 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -113,9 +113,13 @@ class BrozzlerWorker: # set up metrics if self._metrics_port > 0: - metrics.register_prom_metrics(self._metrics_port, self._registry_url, self._env) + metrics.register_prom_metrics( + self._metrics_port, self._registry_url, self._env + ) else: - logging.warning("not starting prometheus scrape endpoint: metrics_port is undefined") + logging.warning( + "not starting prometheus scrape endpoint: metrics_port is undefined" + ) def _choose_warcprox(self): warcproxes = self._service_registry.available_services("warcprox") diff --git a/brozzler/ydl.py b/brozzler/ydl.py index b1266fd..f3f21be 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -373,7 +373,7 @@ def _try_youtube_dl(worker, ydl, site, page): "yt-dlp hit error extracting info for %s" % ydl.url ) else: - retry_wait = min(60, YTDLP_WAIT * (1.5**(attempt - 1))) + retry_wait = min(60, YTDLP_WAIT * (1.5 ** (attempt - 1))) logging.info( "Attempt %s failed. Retrying in %s seconds...", attempt, @@ -381,7 +381,9 @@ def _try_youtube_dl(worker, ydl, site, page): ) time.sleep(retry_wait) else: - raise brozzler.VideoExtractorError("yt-dlp hit unknown error extracting info for %s" % ydl.url) + raise brozzler.VideoExtractorError( + "yt-dlp hit unknown error extracting info for %s" % ydl.url + ) logging.info("ytdlp completed successfully") From 91850240508bf63780ee670e2c6df475b17a9002 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 2 Oct 2024 16:55:25 -0700 Subject: [PATCH 31/55] add skip_utf8_validation --- brozzler/browser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index ca92ffa..c873d58 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -201,7 +201,7 @@ class WebsockReceiverThread(threading.Thread): # in addition to its documented purpose, and must have a value to avoid # hangs in certain situations self.websock.run_forever( - sockopt=((socket.IPPROTO_TCP, socket.TCP_NODELAY, 1),), ping_timeout=0.5 + sockopt=((socket.IPPROTO_TCP, socket.TCP_NODELAY, 1),), ping_timeout=0.5, skip_utf8_validation=True, ) def _on_message(self, websock, message): From c40eb17935fc7934e74f43d215b76545c516e935 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 2 Oct 2024 17:34:30 -0700 Subject: [PATCH 32/55] black'd --- brozzler/browser.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index c873d58..88490c2 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -201,7 +201,9 @@ class WebsockReceiverThread(threading.Thread): # in addition to its documented purpose, and must have a value to avoid # hangs in certain situations self.websock.run_forever( - sockopt=((socket.IPPROTO_TCP, socket.TCP_NODELAY, 1),), ping_timeout=0.5, skip_utf8_validation=True, + sockopt=((socket.IPPROTO_TCP, socket.TCP_NODELAY, 1),), + ping_timeout=0.5, + skip_utf8_validation=True, ) def _on_message(self, websock, message): From 28b8149963a638a3a7b86c5a70a680e4aac9fd04 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 2 Oct 2024 17:38:30 -0700 Subject: [PATCH 33/55] add comment re fix --- brozzler/browser.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/brozzler/browser.py b/brozzler/browser.py index 88490c2..9c72589 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -200,6 +200,9 @@ class WebsockReceiverThread(threading.Thread): # ping_timeout is used as the timeout for the call to select.select() # in addition to its documented purpose, and must have a value to avoid # hangs in certain situations + # + # skip_ut8_validation is a recommended performance improvement: + # https://websocket-client.readthedocs.io/en/latest/faq.html#why-is-this-library-slow self.websock.run_forever( sockopt=((socket.IPPROTO_TCP, socket.TCP_NODELAY, 1),), ping_timeout=0.5, From 151777298a31219656e166d304a1d44087e3995c Mon Sep 17 00:00:00 2001 From: Barbara Miller <3253863+galgeek@users.noreply.github.com> Date: Thu, 3 Oct 2024 15:50:20 -0700 Subject: [PATCH 34/55] bump version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4b556f1..57fb4ac 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def find_package_data(package): setuptools.setup( name="brozzler", - version="1.5.55", + version="1.5.56", description="Distributed web crawling with browsers", url="https://github.com/internetarchive/brozzler", author="Noah Levitt", From f72fa2b102c4b5d9868186854bb6b375e5479956 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 7 Oct 2024 15:01:53 -0700 Subject: [PATCH 35/55] updated prometheus metrics --- brozzler/metrics.py | 11 ++++++++--- brozzler/worker.py | 8 ++++++-- brozzler/ydl.py | 3 ++- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/brozzler/metrics.py b/brozzler/metrics.py index 81239bc..8dd10c0 100644 --- a/brozzler/metrics.py +++ b/brozzler/metrics.py @@ -17,13 +17,18 @@ except ImportError: from prometheus_client import Counter, Gauge, Histogram, start_http_server # fmt: off -brozzler_pages_crawled = Counter("brozzler_pages_crawled", "number of pages visited by brozzler") +brozzler_in_progress_pages = Gauge("brozzler_in_progress_pages", "number of pages currently processing with brozzler") brozzler_page_processing_duration_seconds = Histogram("brozzler_page_processing_duration_seconds", "time spent processing a page in brozzler") +brozzler_in_progress_headers = Gauge("brozzler_in_progress_headers", "number of headers currently processing with brozzler") +brozzler_header_processing_duration_seconds = Histogram("brozzler_header_processing_duration_seconds", "time spent processing one page's headers in brozzler") +brozzler_in_progress_browses = Gauge("brozzler_in_progress_browse", "number of pages currently browsing with brozzler") +brozzler_browsing_duration_seconds = Histogram("brozzler_browsing_duration_seconds", "time spent browsing a page in brozzler") +brozzler_in_progress_ytdlps = Gauge("brozzler_in_progress_ytdlp", "number of ytdlp sessions currently in progress with brozzler") +brozzler_ytdlp_duration_seconds = Histogram("brozzler_ytdlp_duration_seconds", "time spent running ytdlp for a page in brozzler") +brozzler_pages_crawled = Counter("brozzler_pages_crawled", "number of pages visited by brozzler") brozzler_outlinks_found = Counter("brozzler_outlinks_found", "number of outlinks found by brozzler") brozzler_last_page_crawled_time = Gauge("brozzler_last_page_crawled_time", "time of last page visit, in seconds since UNIX epoch") -brozzler_in_progress_pages = Gauge("brozzler_in_progress_pages", "number of pages currently processing with brozzler") brozzler_ydl_urls_checked = Counter("brozzler_ydl_urls_checked", "count of urls checked by brozzler yt-dlp") -brozzler_ydl_extract_attempts = Counter("brozzler_ydl_extract_attempts", "count of extracts attempted by brozzler yt-dlp", labelnames=["youtube_host"]) brozzler_ydl_extract_successes = Counter("brozzler_ydl_extract_successes", "count of extracts completed by brozzler yt-dlp", labelnames=["youtube_host"]) brozzler_ydl_download_successes = Counter("brozzler_ydl_download_successes", "count of downloads completed by brozzler yt-dlp", labelnames=["youtube_host"]) # fmt: on diff --git a/brozzler/worker.py b/brozzler/worker.py index 6eb5872..d289911 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -251,6 +251,8 @@ class BrozzlerWorker: img.save(out, "jpeg", quality=95) return out.getbuffer() + @metrics.brozzler_page_processing_duration_seconds.time() + @metrics.brozzler_in_progress_pages.track_inprogress() def brozzle_page( self, browser, @@ -315,6 +317,8 @@ class BrozzlerWorker: ) return outlinks + @metrics.brozzler_in_progress_headers.time() + @metrics.brozzler_header_processing_duration_seconds.track_inprogress() def _get_page_headers(self, page): # bypassing warcprox, requests' stream=True defers downloading the body of the response # see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow @@ -334,8 +338,8 @@ class BrozzlerWorker: return False return True - @metrics.brozzler_page_processing_duration_seconds.time() - @metrics.brozzler_in_progress_pages.track_inprogress() + @metrics.brozzler_in_progress_browses.time() + @metrics.brozzler_browsing_duration_seconds.track_inprogress() def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None): def update_page_metrics(page, outlinks): """Update page-level Prometheus metrics.""" diff --git a/brozzler/ydl.py b/brozzler/ydl.py index f3f21be..a72b904 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -334,7 +334,6 @@ def _try_youtube_dl(worker, ydl, site, page): # ydl.extract_info(str(urlcanon.whatwg(ydl.url)), download=should_download_vid) # if ydl.is_youtube_host and ie_result: # download_url = ie_result.get("url") - metrics.brozzler_ydl_extract_attempts.labels(ydl.is_youtube_host).inc(1) with brozzler.thread_accept_exceptions(): # we do whatwg canonicalization here to avoid "" resulting in ProxyError @@ -406,6 +405,8 @@ def _try_youtube_dl(worker, ydl, site, page): return ie_result +@metrics.brozzler_in_progress_ytdlps.time() +@metrics.brozzler_ytdlp_duration_seconds.track_inprogress() def do_youtube_dl(worker, site, page): """ Runs yt-dlp configured for `worker` and `site` to download videos from From d7c6c579083e34de6c3cf16d8fb4d886ad8ad418 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 7 Oct 2024 15:26:18 -0700 Subject: [PATCH 36/55] fix plural typo --- brozzler/metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/brozzler/metrics.py b/brozzler/metrics.py index 8dd10c0..c2c7665 100644 --- a/brozzler/metrics.py +++ b/brozzler/metrics.py @@ -21,9 +21,9 @@ brozzler_in_progress_pages = Gauge("brozzler_in_progress_pages", "number of page brozzler_page_processing_duration_seconds = Histogram("brozzler_page_processing_duration_seconds", "time spent processing a page in brozzler") brozzler_in_progress_headers = Gauge("brozzler_in_progress_headers", "number of headers currently processing with brozzler") brozzler_header_processing_duration_seconds = Histogram("brozzler_header_processing_duration_seconds", "time spent processing one page's headers in brozzler") -brozzler_in_progress_browses = Gauge("brozzler_in_progress_browse", "number of pages currently browsing with brozzler") +brozzler_in_progress_browses = Gauge("brozzler_in_progress_browses", "number of pages currently browsing with brozzler") brozzler_browsing_duration_seconds = Histogram("brozzler_browsing_duration_seconds", "time spent browsing a page in brozzler") -brozzler_in_progress_ytdlps = Gauge("brozzler_in_progress_ytdlp", "number of ytdlp sessions currently in progress with brozzler") +brozzler_in_progress_ytdlps = Gauge("brozzler_in_progress_ytdlps", "number of ytdlp sessions currently in progress with brozzler") brozzler_ytdlp_duration_seconds = Histogram("brozzler_ytdlp_duration_seconds", "time spent running ytdlp for a page in brozzler") brozzler_pages_crawled = Counter("brozzler_pages_crawled", "number of pages visited by brozzler") brozzler_outlinks_found = Counter("brozzler_outlinks_found", "number of outlinks found by brozzler") From de7a325377719886fb51f3260844938592664c16 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 7 Oct 2024 15:50:32 -0700 Subject: [PATCH 37/55] histograms track_inprogress ... --- brozzler/worker.py | 8 ++++---- brozzler/ydl.py | 5 ++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index d289911..116bb75 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -317,8 +317,8 @@ class BrozzlerWorker: ) return outlinks - @metrics.brozzler_in_progress_headers.time() - @metrics.brozzler_header_processing_duration_seconds.track_inprogress() + @metrics.brozzler_header_processing_duration_seconds.time() + @metrics.brozzler_in_progress_headers.track_inprogress() def _get_page_headers(self, page): # bypassing warcprox, requests' stream=True defers downloading the body of the response # see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow @@ -338,8 +338,8 @@ class BrozzlerWorker: return False return True - @metrics.brozzler_in_progress_browses.time() - @metrics.brozzler_browsing_duration_seconds.track_inprogress() + @metrics.brozzler_browsing_duration_seconds.time() + @metrics.brozzler_in_progress_browses.track_inprogress() def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None): def update_page_metrics(page, outlinks): """Update page-level Prometheus metrics.""" diff --git a/brozzler/ydl.py b/brozzler/ydl.py index a72b904..682b099 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -404,9 +404,8 @@ def _try_youtube_dl(worker, ydl, site, page): ) return ie_result - -@metrics.brozzler_in_progress_ytdlps.time() -@metrics.brozzler_ytdlp_duration_seconds.track_inprogress() +@metrics.brozzler_ytdlp_duration_seconds.time() +@metrics.brozzler_in_progress_ytdlps.track_inprogress() def do_youtube_dl(worker, site, page): """ Runs yt-dlp configured for `worker` and `site` to download videos from From 93d256a7b5330a7b88e14778228331dfa140a31a Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 7 Oct 2024 16:17:49 -0700 Subject: [PATCH 38/55] black'd --- brozzler/ydl.py | 1 + 1 file changed, 1 insertion(+) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 682b099..09593f1 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -404,6 +404,7 @@ def _try_youtube_dl(worker, ydl, site, page): ) return ie_result + @metrics.brozzler_ytdlp_duration_seconds.time() @metrics.brozzler_in_progress_ytdlps.track_inprogress() def do_youtube_dl(worker, site, page): From 8007d61a0f4a4a97d6ce2418b51e86b770b583cd Mon Sep 17 00:00:00 2001 From: Barbara Miller <3253863+galgeek@users.noreply.github.com> Date: Mon, 7 Oct 2024 16:21:04 -0700 Subject: [PATCH 39/55] bump version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 57fb4ac..ebea44c 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def find_package_data(package): setuptools.setup( name="brozzler", - version="1.5.56", + version="1.5.57", description="Distributed web crawling with browsers", url="https://github.com/internetarchive/brozzler", author="Noah Levitt", From f39cc632ed24334d8d3bd2b670eeccdfb1cf3df4 Mon Sep 17 00:00:00 2001 From: vbanos Date: Thu, 17 Oct 2024 17:44:17 +0200 Subject: [PATCH 40/55] Add support for Chrome > 110 --- brozzler/chrome.py | 1 + 1 file changed, 1 insertion(+) diff --git a/brozzler/chrome.py b/brozzler/chrome.py index f3a4841..a83632b 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -184,6 +184,7 @@ class Chrome: self.chrome_exe, "-v", "--remote-debugging-port=%s" % self.port, + "--remote-allow-origins=http://localhost:%s" % self.port, "--use-mock-keychain", # mac thing "--user-data-dir=%s" % self._chrome_user_data_dir, "--disable-background-networking", From 631a7f40c2a76cd61ecb77d1f8e18f09fdd439b8 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 17 Oct 2024 09:22:30 -0700 Subject: [PATCH 41/55] move doublethink to extras_require --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ebea44c..0effd89 100644 --- a/setup.py +++ b/setup.py @@ -71,7 +71,6 @@ setuptools.setup( "websocket-client==1.8.0", "pillow>=5.2.0", "urlcanon>=0.1.dev23", - "doublethink @ git+https://github.com/internetarchive/doublethink.git@Py311", "rethinkdb==2.4.9", "cerberus>=1.0.1", "jinja2>=2.10", @@ -81,6 +80,7 @@ setuptools.setup( ], extras_require={ "yt-dlp": ["yt-dlp>=2024.7.25"], + "doublethink": ["doublethink @ git+https://github.com/internetarchive/doublethink.git@Py311"], "dashboard": ["flask>=1.0", "gunicorn>=19.8.1"], "easy": [ "warcprox>=2.4.31", From 5ac681cb37b859d0813ea5f71e626a185421bde3 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 17 Oct 2024 09:24:21 -0700 Subject: [PATCH 42/55] black'd --- setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0effd89..e31567d 100644 --- a/setup.py +++ b/setup.py @@ -80,7 +80,9 @@ setuptools.setup( ], extras_require={ "yt-dlp": ["yt-dlp>=2024.7.25"], - "doublethink": ["doublethink @ git+https://github.com/internetarchive/doublethink.git@Py311"], + "doublethink": [ + "doublethink @ git+https://github.com/internetarchive/doublethink.git@Py311" + ], "dashboard": ["flask>=1.0", "gunicorn>=19.8.1"], "easy": [ "warcprox>=2.4.31", From 3a8b9d2d05718bed1e6ee5466357eaf2c226fa41 Mon Sep 17 00:00:00 2001 From: Barbara Miller <3253863+galgeek@users.noreply.github.com> Date: Thu, 17 Oct 2024 09:49:46 -0700 Subject: [PATCH 43/55] bump version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e31567d..a2997af 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def find_package_data(package): setuptools.setup( name="brozzler", - version="1.5.57", + version="1.5.58", description="Distributed web crawling with browsers", url="https://github.com/internetarchive/brozzler", author="Noah Levitt", From 741a30feee3da7693a1e4049ba6c5d3620e19eb3 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 30 Oct 2024 12:02:17 -0700 Subject: [PATCH 44/55] draft workflow to publish build artifacts --- .github/workflows/publish-artifacts.yml | 32 +++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 .github/workflows/publish-artifacts.yml diff --git a/.github/workflows/publish-artifacts.yml b/.github/workflows/publish-artifacts.yml new file mode 100644 index 0000000..d2175ab --- /dev/null +++ b/.github/workflows/publish-artifacts.yml @@ -0,0 +1,32 @@ +name: Publish Artifacts + +on: + push: + branches: + - main + - master + +jobs: + build: + name: Build distribution 📦 + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Set up Python 3.8 + uses: actions/setup-python@v5 + with: + python-version: "3.8" + - name: Install pypa/build + run: >- + python3 -m + pip install + build + --user + - name: Build a binary wheel and a source tarball + run: python3 -m build + - name: Store the distribution packages + uses: actions/upload-artifact@v4 + with: + name: python-package-distributions + path: dist/ \ No newline at end of file From b07f939c3733afe8e144121e4c6e24b104db7896 Mon Sep 17 00:00:00 2001 From: Barbara Miller <3253863+galgeek@users.noreply.github.com> Date: Wed, 30 Oct 2024 15:22:45 -0700 Subject: [PATCH 45/55] bump version bump version, to 1.6, for setting up publishing build artifacts... --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a2997af..68aca64 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def find_package_data(package): setuptools.setup( name="brozzler", - version="1.5.58", + version="1.6", description="Distributed web crawling with browsers", url="https://github.com/internetarchive/brozzler", author="Noah Levitt", From 4ce5f7a61f2bbc98f6cc97c5afd9b00ffcf34bd9 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 30 Oct 2024 15:44:27 -0700 Subject: [PATCH 46/55] update README, yt-dlp, not youtube-dl --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index b2fe7f2..b7d9dab 100644 --- a/README.rst +++ b/README.rst @@ -10,7 +10,7 @@ Brozzler is a distributed web crawler (爬虫) that uses a real browser (Chrome or Chromium) to fetch pages and embedded URLs and to extract links. It employs -`youtube-dl `_ to enhance media capture +`yt-dlp `_ (formerly youtube-dl) to enhance media capture capabilities and `rethinkdb `_ to manage crawl state. From 39a3c7cc8b5a46e8f307b5aa9a5b572e43f87a35 Mon Sep 17 00:00:00 2001 From: Barbara Miller <3253863+galgeek@users.noreply.github.com> Date: Wed, 30 Oct 2024 15:56:02 -0700 Subject: [PATCH 47/55] Update workflow publish-artifacts.yml --- .github/workflows/publish-artifacts.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/publish-artifacts.yml b/.github/workflows/publish-artifacts.yml index d2175ab..47c96a3 100644 --- a/.github/workflows/publish-artifacts.yml +++ b/.github/workflows/publish-artifacts.yml @@ -5,6 +5,10 @@ on: branches: - main - master + pull_request: + branches: + - main + - master jobs: build: @@ -29,4 +33,4 @@ jobs: uses: actions/upload-artifact@v4 with: name: python-package-distributions - path: dist/ \ No newline at end of file + path: dist/ From d1e6e0f632da8435f1e939918db3be01111746e6 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 31 Oct 2024 11:04:40 -0700 Subject: [PATCH 48/55] update doublethink dependency --- setup.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/setup.py b/setup.py index a2997af..0da1fe7 100644 --- a/setup.py +++ b/setup.py @@ -72,6 +72,7 @@ setuptools.setup( "pillow>=5.2.0", "urlcanon>=0.1.dev23", "rethinkdb==2.4.9", + "doublethink==0.4.9", "cerberus>=1.0.1", "jinja2>=2.10", "cryptography>=2.3", @@ -80,9 +81,6 @@ setuptools.setup( ], extras_require={ "yt-dlp": ["yt-dlp>=2024.7.25"], - "doublethink": [ - "doublethink @ git+https://github.com/internetarchive/doublethink.git@Py311" - ], "dashboard": ["flask>=1.0", "gunicorn>=19.8.1"], "easy": [ "warcprox>=2.4.31", From 09ec0b10788c755decd56f52b97c9ffd45f2011f Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Sat, 2 Nov 2024 09:33:28 -0700 Subject: [PATCH 49/55] update copyright notice --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index b7d9dab..13ab3d4 100644 --- a/README.rst +++ b/README.rst @@ -190,7 +190,7 @@ this has not yet been extensively tested. License ------- -Copyright 2015-2018 Internet Archive +Copyright 2015-2024 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this software except in compliance with the License. You may From 2b4f40a3e6b97674206be37e709739473ec0b971 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Sat, 2 Nov 2024 09:35:03 -0700 Subject: [PATCH 50/55] facilitate library use --- __init.py__ | 0 setup.py | 8 +++----- 2 files changed, 3 insertions(+), 5 deletions(-) create mode 100644 __init.py__ diff --git a/__init.py__ b/__init.py__ new file mode 100644 index 0000000..e69de29 diff --git a/setup.py b/setup.py index d4f2edb..41c29be 100644 --- a/setup.py +++ b/setup.py @@ -71,8 +71,6 @@ setuptools.setup( "websocket-client==1.8.0", "pillow>=5.2.0", "urlcanon>=0.1.dev23", - "rethinkdb==2.4.9", - "doublethink==0.4.9", "cerberus>=1.0.1", "jinja2>=2.10", "cryptography>=2.3", @@ -87,6 +85,8 @@ setuptools.setup( "pywb>=0.33.2,<2", "flask>=1.0", "gunicorn>=19.8.1", + "rethinkdb==2.4.9", + "doublethink==0.4.9", ], }, zip_safe=False, @@ -94,9 +94,7 @@ setuptools.setup( "Development Status :: 5 - Production/Stable", "Environment :: Console", "License :: OSI Approved :: Apache Software License", - "Programming Language :: Python :: 3.5", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", "Topic :: Internet :: WWW/HTTP", "Topic :: System :: Archiving", ], From 413fc131202c5d9c99f3501c43961823c1f9bc40 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Sat, 2 Nov 2024 09:36:12 -0700 Subject: [PATCH 51/55] for pypi --- pyproject.toml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..8335651 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,27 @@ +[project] +name = "brozzler_ia" +authors = [ + { name="Noah Levitt", email="nlevitt@archive.org" }, +] +maintainers = [ + { name="Vangelis Banos", email="vangelis@archive.org" }, + { name="Adam Miller", email="adam@archive.org" }, + { name="Barbara Miller", email="barbara@archive.org" }, + { name="Alex Dempsey", email="avdempsey@archive.org" }, +] +description = "Distributed web crawling with browsers" +readme = "README.rst" +requires-python = ">=3.8" +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", +] +dynamic = [ "version", "license", "scripts", "dependencies", "optional-dependencies" ] + +[project.urls] +Homepage = "https://github.com/internetarchive/brozzler" +Issues = "https://github.com/internetarchive/brozzler/issues" +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" From 07a6b9845aa8f93f2999c6284ff8150aee0516de Mon Sep 17 00:00:00 2001 From: Barbara Miller <3253863+galgeek@users.noreply.github.com> Date: Sat, 2 Nov 2024 10:20:13 -0700 Subject: [PATCH 52/55] bump version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 41c29be..eea2d5d 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def find_package_data(package): setuptools.setup( name="brozzler", - version="1.6", + version="1.6.1", description="Distributed web crawling with browsers", url="https://github.com/internetarchive/brozzler", author="Noah Levitt", From c4e5dc63fc9ace56ce8340e9e85250a8d3d5ab49 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 5 Nov 2024 17:26:59 -0800 Subject: [PATCH 53/55] minor updates for v.1.6.2 (for pypi) --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8335651..9880da2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [project] -name = "brozzler_ia" +name = "brozzler" authors = [ { name="Noah Levitt", email="nlevitt@archive.org" }, ] diff --git a/setup.py b/setup.py index eea2d5d..25419d5 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def find_package_data(package): setuptools.setup( name="brozzler", - version="1.6.1", + version="1.6.2", description="Distributed web crawling with browsers", url="https://github.com/internetarchive/brozzler", author="Noah Levitt", From a52765d2ca1fd2a562ad13f05756261e38315f44 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Tue, 12 Nov 2024 21:59:49 +0200 Subject: [PATCH 54/55] Disable automatic http to https transformation Chrome 130 automatically converts http to https even if the target URL is http. We disable this behavior because some target sites simply don't have https. --- brozzler/chrome.py | 1 + 1 file changed, 1 insertion(+) diff --git a/brozzler/chrome.py b/brozzler/chrome.py index a83632b..6194a1a 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -199,6 +199,7 @@ class Chrome: "--disable-first-run-ui", "--no-first-run", "--homepage=about:blank", + "--disable-features=HttpsUpgrades", "--disable-direct-npapi-requests", "--disable-web-security", "--disable-notifications", From d9d09996b0a0f18da40caba11b7b899016dd331a Mon Sep 17 00:00:00 2001 From: Barbara Miller <3253863+galgeek@users.noreply.github.com> Date: Tue, 12 Nov 2024 16:22:53 -0800 Subject: [PATCH 55/55] bump version to 1.6.3 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 25419d5..a8633e1 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def find_package_data(package): setuptools.setup( name="brozzler", - version="1.6.2", + version="1.6.3", description="Distributed web crawling with browsers", url="https://github.com/internetarchive/brozzler", author="Noah Levitt",