diff --git a/brozzler/metrics.py b/brozzler/metrics.py index 8f9d87a..36ece6e 100644 --- a/brozzler/metrics.py +++ b/brozzler/metrics.py @@ -15,9 +15,9 @@ except ImportError: from prometheus_client import Counter, Gauge, Histogram, start_http_server # fmt: off -brozzler_pages_crawled = Counter("brozzler_pages_crawled", "number of pages visited by brozzler", labelnames=["host"]) -brozzler_page_processing_duration_seconds = Histogram("brozzler_page_processing_duration_seconds", "time spent processing a page in brozzler", labelnames=["host"]) -brozzler_outlinks_found = Counter("brozzler_outlinks_found", "number of outlinks found by brozzler", labelnames=["host"]) +brozzler_pages_crawled = Counter("brozzler_pages_crawled", "number of pages visited by brozzler") +brozzler_page_processing_duration_seconds = Histogram("brozzler_page_processing_duration_seconds", "time spent processing a page in brozzler") +brozzler_outlinks_found = Counter("brozzler_outlinks_found", "number of outlinks found by brozzler") brozzler_last_page_crawled_time = Gauge("brozzler_last_page_crawled_time", "time of last page visit") brozzler_in_progress_pages = Gauge("brozzler_in_progress_pages", "number of pages currently processing with brozzler") brozzler_resources_requested = Counter("brozzler_resources_requested", "number of resources requested", labelnames=["resource_type"]) @@ -25,8 +25,8 @@ brozzler_resources_fetched = Counter("brozzler_resources_fetched", "number of re brozzler_resources_size_total = Counter("brozzler_resources_size_total", "total size of resources fetched", labelnames=["resource_type"]) brozzler_resources_fetch_time = Counter("brozzler_resources_fetch_time", "time spent fetching resources", labelnames=["resource_type"]) brozzler_ydl_urls_checked = Counter("brozzler_ydl_urls_checked", "count of urls checked by brozzler yt-dlp") -brozzler_ydl_download_attempts= Counter("brozzler_ydl_download_attempts", "count of download attempted by brozzler yt-dlp", labelnames=["host"]) -brozzler_ydl_download_successes= Counter("brozzler_ydl_download_successes", "count of downloads completed by brozzler yt-dlp", labelnames=["host"]) +brozzler_ydl_download_attempts = Counter("brozzler_ydl_download_attempts", "count of download attempted by brozzler yt-dlp", labelnames=["host"]) +brozzler_ydl_download_successes = Counter("brozzler_ydl_download_successes", "count of downloads completed by brozzler yt-dlp", labelnames=["host"]) # fmt: on if http_sd_registry: diff --git a/brozzler/worker.py b/brozzler/worker.py index 6cf5836..11f0efb 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -267,6 +267,7 @@ class BrozzlerWorker: ): try: ydl_outlinks = ydl.do_youtube_dl(self, site, page) + metrics.brozzler_ydl_urls_checked.inc(1) outlinks.update(ydl_outlinks) except brozzler.ReachedLimit as e: raise @@ -312,6 +313,7 @@ class BrozzlerWorker: return False return True + @metrics.brozzler_page_processing_duration_seconds.time() def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None): def update_page_metrics(page, outlinks): """Update page-level Prometheus metrics.""" diff --git a/brozzler/ydl.py b/brozzler/ydl.py index b292129..2f07e9d 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -27,6 +27,7 @@ import os import json import doublethink import datetime +from . import metrics import threading thread_local = threading.local() @@ -296,9 +297,10 @@ def _remember_videos(page, pushed_videos=None): def _try_youtube_dl(worker, ydl, site, page): ytdlp_url = page.redirect_url if page.redirect_url else page.url + ytdlp_host = ytdlp_url.split("//")[-1].split("/")[0].split('?')[0] try: logging.info("trying yt-dlp on %s", ytdlp_url) - + metrics.brozzler_ydl_download_attempts.labels(ytdlp_host).inc(1) with brozzler.thread_accept_exceptions(): # we do whatwg canonicalization here to avoid "" resulting in ProxyError @@ -307,6 +309,7 @@ def _try_youtube_dl(worker, ydl, site, page): ie_result = ydl.sanitize_info( ydl.extract_info(str(urlcanon.whatwg(ytdlp_url))) ) + metrics.brozzler_ydl_download_successes.labels(ytdlp_host).inc(1) _remember_videos(page, ydl.pushed_videos) if worker._using_warcprox(site): info_json = json.dumps(ie_result, sort_keys=True, indent=4)