diff --git a/brozzler/__init__.py b/brozzler/__init__.py
index 5040e69..7dd284d 100644
--- a/brozzler/__init__.py
+++ b/brozzler/__init__.py
@@ -39,6 +39,10 @@ class PageInterstitialShown(Exception):
     pass
 
 
+class VideoExtractorError(Exception):
+    pass
+
+
 class ProxyError(Exception):
     pass
 
diff --git a/brozzler/cli.py b/brozzler/cli.py
index 85eb53e..653e16a 100755
--- a/brozzler/cli.py
+++ b/brozzler/cli.py
@@ -238,7 +238,7 @@ def brozzle_page(argv=None):
         "--metrics_port",
         type=int,
         dest="metrics_port",
-        default=8888,
+        default=0,
         help="Port for brozzler's Prometheus scrape endpoint",
     )
     arg_parser.add_argument(
@@ -251,7 +251,7 @@ def brozzle_page(argv=None):
         "--env",
         dest="env",
         default=None,
-        help="env for Prometheus target registry",
+        help="deployment environment for this brozzler instance, e.g., prod or qa",
     )
     arg_parser.add_argument(
         "--screenshot-full-page", dest="screenshot_full_page", action="store_true"
     )
@@ -298,7 +298,7 @@ def brozzle_page(argv=None):
         window_height=args.window_height,
         window_width=args.window_width,
         stealth=args.stealth,
-        metrics_port=int(args.metrics_port),
+        metrics_port=args.metrics_port,
         registry_url=args.registry_url,
         env=args.env,
     )
@@ -543,7 +543,7 @@ def brozzler_worker(argv=None):
         "--metrics_port",
         type=int,
         dest="metrics_port",
-        default=8888,
+        default=0,
         help="Port for brozzler's Prometheus scrape endpoint",
     )
     arg_parser.add_argument(
@@ -556,7 +556,7 @@ def brozzler_worker(argv=None):
         "--env",
         dest="env",
         default=None,
-        help="env for Prometheus target registry",
+        help="deployment environment for this brozzler instance, e.g., prod or qa",
     )
 
     add_common_options(arg_parser, argv)
@@ -614,7 +614,7 @@ def brozzler_worker(argv=None):
         skip_visit_hashtags=args.skip_visit_hashtags,
         skip_youtube_dl=args.skip_youtube_dl,
         stealth=args.stealth,
-        metrics_port=int(args.metrics_port),
+        metrics_port=args.metrics_port,
         registry_url=args.registry_url,
         env=args.env,
     )
diff --git a/brozzler/metrics.py b/brozzler/metrics.py
index ad5737a..81239bc 100644
--- a/brozzler/metrics.py
+++ b/brozzler/metrics.py
@@ -22,10 +22,6 @@ brozzler_page_processing_duration_seconds = Histogram("brozzler_page_processing_
 brozzler_outlinks_found = Counter("brozzler_outlinks_found", "number of outlinks found by brozzler")
 brozzler_last_page_crawled_time = Gauge("brozzler_last_page_crawled_time", "time of last page visit, in seconds since UNIX epoch")
 brozzler_in_progress_pages = Gauge("brozzler_in_progress_pages", "number of pages currently processing with brozzler")
-brozzler_resources_requested = Counter("brozzler_resources_requested", "number of resources requested", labelnames=["resource_type"])
-brozzler_resources_fetched = Counter("brozzler_resources_fetched", "number of resources fetched", labelnames=["resource_type", "status_code"])
-brozzler_resources_size_total = Counter("brozzler_resources_size_total", "total size of resources fetched", labelnames=["resource_type"])
-brozzler_resources_fetch_time = Counter("brozzler_resources_fetch_time", "time spent fetching resources", labelnames=["resource_type"])
 brozzler_ydl_urls_checked = Counter("brozzler_ydl_urls_checked", "count of urls checked by brozzler yt-dlp")
 brozzler_ydl_extract_attempts = Counter("brozzler_ydl_extract_attempts", "count of extracts attempted by brozzler yt-dlp", labelnames=["youtube_host"])
 brozzler_ydl_extract_successes = Counter("brozzler_ydl_extract_successes", "count of extracts completed by brozzler yt-dlp", labelnames=["youtube_host"])
diff --git a/brozzler/worker.py b/brozzler/worker.py
index 5c49237..075844b 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -72,7 +72,7 @@ class BrozzlerWorker:
         stealth=False,
         window_height=900,
         window_width=1400,
-        metrics_port=None,
+        metrics_port=0,
         registry_url=None,
         env=None,
     ):
@@ -111,8 +111,15 @@ class BrozzlerWorker:
         self._start_stop_lock = threading.Lock()
         self._shutdown = threading.Event()
 
-        # Setup metrics
-        metrics.register_prom_metrics(self._metrics_port, self._registry_url, self._env)
+        # set up metrics
+        if self._metrics_port > 0:
+            metrics.register_prom_metrics(
+                self._metrics_port, self._registry_url, self._env
+            )
+        else:
+            logging.warning(
+                "not starting prometheus scrape endpoint: metrics_port is undefined"
+            )
 
     def _choose_warcprox(self):
         warcproxes = self._service_registry.available_services("warcprox")
@@ -285,6 +292,11 @@ class BrozzlerWorker:
             raise
         except brozzler.ProxyError:
             raise
+        except brozzler.VideoExtractorError as e:
+            logging.error(
+                "error extracting video info: %s",
+                e,
+            )
         except Exception as e:
             if (
                 hasattr(e, "exc_info")
diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 0f6d102..8691351 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -118,6 +118,11 @@ def should_ytdlp(site, page, page_status, skip_av_seeds):
     return True
 
 
+def isyoutubehost(url):
+    # split 1 splits scheme from url, split 2 splits path from hostname, split 3 splits query string on hostname
+    return "youtube.com" in url.split("//")[-1].split("/")[0].split("?")[0]
+
+
 class ExtraHeaderAdder(urllib.request.BaseHandler):
     def __init__(self, extra_headers):
         self.extra_headers = extra_headers
@@ -231,31 +236,29 @@ def _build_youtube_dl(worker, destdir, site, page):
                 worker._proxy_for(site),
                 url,
             )
-            try:
-                with open(info_dict["filepath"], "rb") as f:
-                    # include content-length header to avoid chunked
-                    # transfer, which warcprox currently rejects
-                    extra_headers = dict(site.extra_headers())
-                    extra_headers["content-length"] = size
-                    request, response = worker._warcprox_write_record(
-                        warcprox_address=worker._proxy_for(site),
-                        url=url,
-                        warc_type="resource",
-                        content_type=mimetype,
-                        payload=f,
-                        extra_headers=extra_headers,
-                    )
-                # consulted by _remember_videos()
-                ydl.pushed_videos.append(
-                    {
-                        "url": url,
-                        "response_code": response.code,
-                        "content-type": mimetype,
-                        "content-length": size,
-                    }
+            with open(info_dict["filepath"], "rb") as f:
+                # include content-length header to avoid chunked
+                # transfer, which warcprox currently rejects
+                extra_headers = dict(site.extra_headers())
+                extra_headers["content-length"] = size
+                request, response = worker._warcprox_write_record(
+                    warcprox_address=worker._proxy_for(site),
+                    url=url,
+                    warc_type="resource",
+                    content_type=mimetype,
+                    payload=f,
+                    extra_headers=extra_headers,
                 )
-            except:
-                traceback.print_exc()
+
+            # consulted by _remember_videos()
+            ydl.pushed_videos.append(
+                {
+                    "url": url,
+                    "response_code": response.code,
+                    "content-type": mimetype,
+                    "content-length": size,
+                }
+            )
 
     def maybe_heartbeat_site_last_claimed(*args, **kwargs):
         # in case yt-dlp takes a long time, heartbeat site.last_claimed
@@ -286,14 +289,9 @@ def _build_youtube_dl(worker, destdir, site, page):
             worker.logger.info(
                 "[ydl_postprocess_hook] postprocessor: {}".format(d["postprocessor"])
             )
-            youtube_host = (
-                "youtube.com"
-                in d["info_dict"]["webpage_url"]
-                .split("//")[-1]
-                .split("/")[0]
-                .split("?")[0]
-            )
-            metrics.brozzler_ydl_download_successes.labels(youtube_host).inc(1)
+            is_youtube_host = isyoutubehost(d["info_dict"]["webpage_url"])
+
+            metrics.brozzler_ydl_download_successes.labels(is_youtube_host).inc(1)
             if worker._using_warcprox(site):
                 _YoutubeDL._push_video_to_warcprox(
                     _YoutubeDL, site, d["info_dict"], d["postprocessor"]
@@ -332,15 +330,14 @@ def _build_youtube_dl(worker, destdir, site, page):
     }
 
     ytdlp_url = page.redirect_url if page.redirect_url else page.url
-    youtube_host = (
-        "youtube.com" in ytdlp_url.split("//")[-1].split("/")[0].split("?")[0]
-    )
-    if youtube_host and YTDLP_PROXY:
+    is_youtube_host = isyoutubehost(ytdlp_url)
+    if is_youtube_host and YTDLP_PROXY:
         ydl_opts["proxy"] = YTDLP_PROXY
-        ytdlp_proxy_for_print = (
+        # don't log proxy value secrets
+        ytdlp_proxy_for_logs = (
             YTDLP_PROXY.split("@")[1] if "@" in YTDLP_PROXY else "@@@"
         )
-        logging.info("using yt-dlp proxy ... %s", ytdlp_proxy_for_print)
+        logging.info("using yt-dlp proxy ... %s", ytdlp_proxy_for_logs)
 
     # skip warcprox proxying yt-dlp v.2023.07.06: youtube extractor using ranges
     # if worker._proxy_for(site):
@@ -351,7 +348,7 @@ def _build_youtube_dl(worker, destdir, site, page):
     ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page)))
     ydl.pushed_videos = []
     ydl.url = ytdlp_url
-    ydl.youtube_host = youtube_host
+    ydl.is_youtube_host = is_youtube_host
 
     return ydl
 
@@ -379,12 +376,12 @@ def _try_youtube_dl(worker, ydl, site, page):
     while attempt < MAX_YTDLP_ATTEMPTS:
         try:
             logging.info("trying yt-dlp on %s", ydl.url)
-            # should_download_vid = not ydl.youtube_host
+            # should_download_vid = not ydl.is_youtube_host
             # then
             # ydl.extract_info(str(urlcanon.whatwg(ydl.url)), download=should_download_vid)
-            # if ydl.youtube_host and ie_result:
+            # if ydl.is_youtube_host and ie_result:
             #     download_url = ie_result.get("url")
-            metrics.brozzler_ydl_extract_attempts.labels(ydl.youtube_host).inc(1)
+            metrics.brozzler_ydl_extract_attempts.labels(ydl.is_youtube_host).inc(1)
 
             with brozzler.thread_accept_exceptions():
                 # we do whatwg canonicalization here to avoid "" resulting in ProxyError
@@ -393,7 +390,7 @@ def _try_youtube_dl(worker, ydl, site, page):
                 ie_result = ydl.sanitize_info(
                     ydl.extract_info(str(urlcanon.whatwg(ydl.url)))
                 )
-            metrics.brozzler_ydl_extract_successes.labels(ydl.youtube_host).inc(1)
+            metrics.brozzler_ydl_extract_successes.labels(ydl.is_youtube_host).inc(1)
             break
         except brozzler.ShutdownRequested as e:
             raise
@@ -419,18 +416,22 @@ def _try_youtube_dl(worker, ydl, site, page):
                 logging.warning(
                     "Failed after %s attempts. Error: %s", MAX_YTDLP_ATTEMPTS, e
                 )
-                raise brozzler.ProxyError(
-                    "yt-dlp hit possible external proxy error from %s" % ydl.url
+                raise brozzler.VideoExtractorError(
+                    "yt-dlp hit error extracting info for %s" % ydl.url
                 )
             else:
+                retry_wait = min(60, YTDLP_WAIT * (1.5 ** (attempt - 1)))
                 logging.info(
                     "Attempt %s failed. Retrying in %s seconds...",
                     attempt,
-                    YTDLP_WAIT,
+                    retry_wait,
                 )
-                time.sleep(YTDLP_WAIT)
+                time.sleep(retry_wait)
     else:
-        raise brozzler.ProxyError("Proxy attempt(s) failed for unknown reason(s)")
+        raise brozzler.VideoExtractorError(
+            "yt-dlp hit unknown error extracting info for %s" % ydl.url
+        )
+    logging.info("ytdlp completed successfully")
 
     _remember_videos(page, ydl.pushed_videos)
 
diff --git a/setup.py b/setup.py
index eb3a9c9..ecb592f 100644
--- a/setup.py
+++ b/setup.py
@@ -34,7 +34,7 @@ def find_package_data(package):
 
 setuptools.setup(
     name="brozzler",
-    version="1.5.55a3",
+    version="1.5.55",
     description="Distributed web crawling with browsers",
     url="https://github.com/internetarchive/brozzler",
     author="Noah Levitt",
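
Note (not part of the patch): a minimal sketch of the two ydl.py behaviors this diff introduces, the isyoutubehost() hostname check and the capped exponential retry backoff in _try_youtube_dl. YTDLP_WAIT and MAX_YTDLP_ATTEMPTS are defined elsewhere in brozzler/ydl.py and their values are not shown in this diff; the values below are assumptions for illustration only.

def isyoutubehost(url):
    # same logic as the patch: drop the scheme, path, and query string, then test the hostname
    return "youtube.com" in url.split("//")[-1].split("/")[0].split("?")[0]

assert isyoutubehost("https://www.youtube.com/watch?v=abc")
assert not isyoutubehost("https://example.com/youtube.com.html")

YTDLP_WAIT = 10          # assumed value, not shown in this diff
MAX_YTDLP_ATTEMPTS = 4   # assumed value, not shown in this diff

for attempt in range(1, MAX_YTDLP_ATTEMPTS):
    # backoff formula from the patch: exponential growth, capped at 60 seconds
    retry_wait = min(60, YTDLP_WAIT * (1.5 ** (attempt - 1)))
    print("attempt %s failed, retrying in %s seconds" % (attempt, retry_wait))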