diff --git a/brozzler/cli.py b/brozzler/cli.py index 946d464..0ee399e 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -234,6 +234,25 @@ def brozzle_page(argv=None): action="store_true", help="Try to avoid web bot detection", ) + arg_parser.add_argument( + "--metrics_port", + dest="metrics_port", + type=int, + default=8888, + help="Prometheus metrics port", + ) + arg_parser.add_argument( + "--registry_url", + dest="registry_url", + default=None, + help="Prometheus scrape target registry URL", + ) + arg_parser.add_argument( + "--env", + dest="env", + default=None, + help="env for Prometheus target registry", + ) arg_parser.add_argument( "--screenshot-full-page", dest="screenshot_full_page", action="store_true" ) @@ -279,6 +298,9 @@ def brozzle_page(argv=None): window_height=args.window_height, window_width=args.window_width, stealth=args.stealth, + metrics_port=args.metrics_port, + registry_url=args.registry_url, + env=args.env, ) def on_screenshot(screenshot_jpeg): @@ -517,6 +539,25 @@ def brozzler_worker(argv=None): action="store_true", help="Try to avoid web bot detection", ) + arg_parser.add_argument( + "--metrics_port", + dest="metrics_port", + type=int, + default=8888, + help="Prometheus metrics port", + ) + arg_parser.add_argument( + "--registry_url", + dest="registry_url", + default=None, + help="Prometheus scrape target registry URL", + ) + arg_parser.add_argument( + "--env", + dest="env", + default=None, + help="env for Prometheus target registry", + ) add_common_options(arg_parser, argv) args = arg_parser.parse_args(args=argv[1:]) @@ -573,6 +614,9 @@ def brozzler_worker(argv=None): skip_visit_hashtags=args.skip_visit_hashtags, skip_youtube_dl=args.skip_youtube_dl, stealth=args.stealth, + metrics_port=args.metrics_port, + registry_url=args.registry_url, + env=args.env, ) signal.signal(signal.SIGQUIT, dump_state) diff --git a/brozzler/metrics.py b/brozzler/metrics.py new file mode 100644 index 0000000..3698de5 --- /dev/null +++ b/brozzler/metrics.py @@ -0,0 +1,53 @@ +from 
typing import Optional + +try: + from http_sd_registry.client import ( + Client, + Env, + Registration, + Scheme, + format_self_target, + ) + from http_sd_registry.config import ClientConfig +except ImportError: + Client = None # http_sd_registry is optional; register_prom_metrics checks for it + + +from prometheus_client import Counter, Gauge, Histogram, start_http_server + +# fmt: off +brozzler_pages_crawled = Counter("brozzler_pages_crawled", "number of pages visited by brozzler") +brozzler_page_processing_duration_seconds = Histogram("brozzler_page_processing_duration_seconds", "time spent processing a page in brozzler") +brozzler_outlinks_found = Counter("brozzler_outlinks_found", "number of outlinks found by brozzler") +brozzler_last_page_crawled_time = Gauge("brozzler_last_page_crawled_time", "time of last page visit") +brozzler_in_progress_pages = Gauge("brozzler_in_progress_pages", "number of pages currently processing with brozzler") +brozzler_resources_requested = Counter("brozzler_resources_requested", "number of resources requested", labelnames=["resource_type"]) +brozzler_resources_fetched = Counter("brozzler_resources_fetched", "number of resources fetched", labelnames=["resource_type", "status_code"]) +brozzler_resources_size_total = Counter("brozzler_resources_size_total", "total size of resources fetched", labelnames=["resource_type"]) +brozzler_resources_fetch_time = Counter("brozzler_resources_fetch_time", "time spent fetching resources", labelnames=["resource_type"]) +brozzler_ydl_urls_checked = Counter("brozzler_ydl_urls_checked", "count of urls checked by brozzler yt-dlp") +brozzler_ydl_download_attempts = Counter("brozzler_ydl_download_attempts", "count of download attempted by brozzler yt-dlp", labelnames=["host"]) +brozzler_ydl_download_successes = Counter("brozzler_ydl_download_successes", "count of downloads completed by brozzler yt-dlp", labelnames=["host"]) +# fmt: on + + +def register_prom_metrics( + metrics_port: int = 8888, + registry_url: Optional[str] = None, + env: Optional[str] = None, 
+): + """Start the Prometheus scrape endpoint and, if registry_url is set, register it.""" + start_http_server(metrics_port) + if registry_url is None: + return + if Client is None: + raise ImportError("registry_url requires the http_sd_registry package") + config = ClientConfig(server_url_base=registry_url) + client = Client(config) + target = format_self_target(scrape_port=metrics_port) + registration = Registration( + target=target, + env=env, + scheme=Scheme.http, + ) + client.keep_registered_threaded(registration) diff --git a/brozzler/worker.py b/brozzler/worker.py index 6d3ea12..5c49237 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -35,6 +35,7 @@ import tempfile import urlcanon from requests.structures import CaseInsensitiveDict import rethinkdb as rdb +from . import metrics from . import ydl r = rdb.RethinkDB() @@ -71,6 +72,9 @@ class BrozzlerWorker: stealth=False, window_height=900, window_width=1400, + metrics_port=None, + registry_url=None, + env=None, ): self._frontier = frontier self._service_registry = service_registry @@ -93,6 +97,9 @@ class BrozzlerWorker: self._window_height = window_height self._window_width = window_width self._stealth = stealth + self._metrics_port = metrics_port + self._registry_url = registry_url + self._env = env self._browser_pool = brozzler.browser.BrowserPool( max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True @@ -104,6 +111,9 @@ class BrozzlerWorker: self._start_stop_lock = threading.Lock() self._shutdown = threading.Event() + if self._metrics_port is not None: # expose metrics only when a port is configured + metrics.register_prom_metrics(self._metrics_port, self._registry_url, self._env) + def _choose_warcprox(self): warcproxes = self._service_registry.available_services("warcprox") if not warcproxes: @@ -267,6 +277,7 @@ class BrozzlerWorker: ): try: ydl_outlinks = ydl.do_youtube_dl(self, site, page) + metrics.brozzler_ydl_urls_checked.inc(1) outlinks.update(ydl_outlinks) except brozzler.ReachedLimit as e: raise @@ -312,7 +323,15 @@ class BrozzlerWorker: return False return True + @metrics.brozzler_page_processing_duration_seconds.time() + @metrics.brozzler_in_progress_pages.track_inprogress() def 
_browse_page(self, browser, site, page, on_screenshot=None, on_request=None): + def update_page_metrics(page, outlinks): + """Update page-level Prometheus metrics.""" + metrics.brozzler_last_page_crawled_time.set_to_current_time() + metrics.brozzler_pages_crawled.inc(1) + metrics.brozzler_outlinks_found.inc(len(outlinks)) + def _on_screenshot(screenshot_jpeg): if on_screenshot: on_screenshot(screenshot_jpeg) @@ -417,6 +436,7 @@ class BrozzlerWorker: ) if final_page_url != page.url: page.note_redirect(final_page_url) + update_page_metrics(page, outlinks) return outlinks def _fetch_url(self, site, url=None, page=None): diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 9ae5ec6..7ea8b3a 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -29,6 +29,8 @@ import datetime from cassandra import ReadTimeout from cassandra.cluster import Cluster +from . import metrics + import threading import traceback import doublethink @@ -350,11 +352,12 @@ def _remember_videos(page, pushed_videos=None): def _try_youtube_dl(worker, ydl, site, page): ytdlp_url = page.redirect_url if page.redirect_url else page.url + ytdlp_host = ytdlp_url.split("//", 1)[-1].split("/", 1)[0].split("?", 1)[0] attempt = 0 while attempt < MAX_YTDLP_ATTEMPTS: try: logging.info("trying yt-dlp on %s", ytdlp_url) - + metrics.brozzler_ydl_download_attempts.labels(ytdlp_host).inc(1) with brozzler.thread_accept_exceptions(): # we do whatwg canonicalization here to avoid "" resulting in ProxyError @@ -402,7 +405,7 @@ def _try_youtube_dl(worker, ydl, site, page): "Proxyrack proxy attempt(s) failed for unknown reason(s)" ) logging.info("ytdlp completed successfully") - + metrics.brozzler_ydl_download_successes.labels(ytdlp_host).inc(1) _remember_videos(page, ydl.pushed_videos) if worker._using_warcprox(site): info_json = json.dumps(ie_result, sort_keys=True, indent=4) diff --git a/setup.py b/setup.py index 4b277ee..c836ff1 100644 --- a/setup.py +++ b/setup.py @@ -78,9 +78,10 @@ setuptools.setup( "jinja2>=2.10", 
"cryptography>=2.3", "python-magic>=0.4.15", + "prometheus-client>=0.20.0", ], extras_require={ - "yt-dlp": ["yt-dlp==2024.7.25"], + "yt-dlp": ["yt-dlp>=2024.7.25"], "dashboard": ["flask>=1.0", "gunicorn>=19.8.1"], "easy": [ "warcprox>=2.4.31",