Configure Prometheus metrics from the command line instead of at import time.

cli.py: brozzle_page() and brozzler_worker() gain --registry_url,
  --metrics_port and --env options and pass them on as the keyword arguments
  registry_url, metrics_port and env.
worker.py: the module-level, hostname-based metrics setup is removed;
  BrozzlerWorker.__init__ now accepts registry_url, metrics_port and env and
  calls metrics.register_prom_metrics() itself.
metrics.py, ydl.py: docstring typo fix and quote-style cleanup only.

Review fixes relative to the submitted patch:
  * dest= values are quoted strings. The bare names metrics_port / env were
    undefined, so building either parser raised NameError.
  * --metrics_port uses type=int and --env uses type=Env, so values supplied
    on the command line have the same types as the defaults.

NOTE(review): leading whitespace on context lines was reconstructed from a
whitespace-collapsed copy of this patch. If a hunk fails to match, apply with
`git apply --ignore-whitespace`.

diff --git a/brozzler/cli.py b/brozzler/cli.py
index bea5153..ba6b702 100755
--- a/brozzler/cli.py
+++ b/brozzler/cli.py
@@ -165,6 +165,20 @@ class BetterArgumentDefaultsHelpFormatter(argparse.ArgumentDefaultsHelpFormatter
             return super()._get_help_string(action)
 
 
+import enum
+
+
+class Env(str, enum.Enum):
+    """Values of the Prometheus ``env`` label applied to a
+    :py:class:`.Registration` indicating the deployment environment in which
+    the service being advertised is operating.
+    """
+
+    qa = "qa"
+    prod = "prod"
+    dev = "dev"
+
+
 def brozzle_page(argv=None):
     """
     Command line utility entry point for brozzling a single page. Opens url in
@@ -234,6 +248,26 @@ def brozzle_page(argv=None):
         action="store_true",
         help="Try to avoid web bot detection",
     )
+    arg_parser.add_argument(
+        "--registry_url",
+        dest="registry_url",
+        default=None,
+        help="Prometheus registry url",
+    )
+    arg_parser.add_argument(
+        "--metrics_port",
+        dest="metrics_port",
+        type=int,
+        default=8889,
+        help="Prometheus metrics port",
+    )
+    arg_parser.add_argument(
+        "--env",
+        dest="env",
+        type=Env,
+        default=Env.dev,
+        help="Prometheus env value",
+    )
     arg_parser.add_argument(
         "--screenshot-full-page", dest="screenshot_full_page", action="store_true"
     )
@@ -279,6 +313,9 @@ def brozzle_page(argv=None):
         window_height=args.window_height,
         window_width=args.window_width,
         stealth=args.stealth,
+        registry_url=args.registry_url,
+        metrics_port=args.metrics_port,
+        env=args.env,
     )
 
     def on_screenshot(screenshot_jpeg):
@@ -517,6 +554,26 @@ def brozzler_worker(argv=None):
         action="store_true",
         help="Try to avoid web bot detection",
     )
+    arg_parser.add_argument(
+        "--registry_url",
+        dest="registry_url",
+        default=None,
+        help="Prometheus registry url",
+    )
+    arg_parser.add_argument(
+        "--metrics_port",
+        dest="metrics_port",
+        type=int,
+        default=8888,
+        help="Prometheus metrics port",
+    )
+    arg_parser.add_argument(
+        "--env",
+        dest="env",
+        type=Env,
+        default=Env.qa,
+        help="Prometheus env value",
+    )
     add_common_options(arg_parser, argv)
 
     args = arg_parser.parse_args(args=argv[1:])
@@ -573,6 +630,9 @@ def brozzler_worker(argv=None):
         skip_visit_hashtags=args.skip_visit_hashtags,
         skip_youtube_dl=args.skip_youtube_dl,
         stealth=args.stealth,
+        registry_url=args.registry_url,
+        metrics_port=args.metrics_port,
+        env=args.env,
     )
 
     signal.signal(signal.SIGQUIT, dump_state)
diff --git a/brozzler/metrics.py b/brozzler/metrics.py
index 9fa3321..cbfa768 100644
--- a/brozzler/metrics.py
+++ b/brozzler/metrics.py
@@ -18,7 +18,7 @@ except ImportError:
 class Env(str, enum.Enum):
     """Values of the Prometheus ``env`` label applied to a
     :py:class:`.Registration` indicating the deployment environment in which
-    the the service being advertised is operating.
+    the service being advertised is operating.
     """
 
     qa = "qa"
diff --git a/brozzler/worker.py b/brozzler/worker.py
index 3f3c04a..0d221c6 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -28,7 +28,6 @@ import json
 import PIL.Image
 import io
 import socket
-import platform
 import random
 import requests
 import doublethink
@@ -42,18 +41,6 @@ from . import ydl
 r = rdb.RethinkDB()
 
 
-# Setup metrics
-registry_url = None
-metrics_port = 8090
-env = metrics.Env.dev
-hostname = platform.node()
-if hostname.endswith("archive.org"):
-    registry_url = "http://wbgrp-svc283.us.archive.org:8888"
-    metrics_port = settings.metrics_port
-    env = metrics.Env.qa
-metrics.register_prom_metrics(registry_url, metrics_port, env)
-
-
 class BrozzlerWorker:
     logger = logging.getLogger(__module__ + "." + __qualname__)
 
@@ -85,6 +72,9 @@ class BrozzlerWorker:
         stealth=False,
         window_height=900,
         window_width=1400,
+        registry_url=None,
+        metrics_port=None,
+        env=None,
     ):
         self._frontier = frontier
         self._service_registry = service_registry
@@ -107,6 +97,9 @@ class BrozzlerWorker:
         self._window_height = window_height
         self._window_width = window_width
         self._stealth = stealth
+        self._registry_url = registry_url
+        self._metrics_port = metrics_port
+        self._env = env
 
         self._browser_pool = brozzler.browser.BrowserPool(
             max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True
@@ -118,6 +111,12 @@ class BrozzlerWorker:
         self._start_stop_lock = threading.Lock()
         self._shutdown = threading.Event()
 
+        # Setup metrics
+        registry_url = self._registry_url
+        metrics_port = self._metrics_port
+        env = self._env
+        metrics.register_prom_metrics(registry_url, metrics_port, env)
+
     def _choose_warcprox(self):
         warcproxes = self._service_registry.available_services("warcprox")
         if not warcproxes:
diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 2f07e9d..dcae69b 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -297,7 +297,7 @@ def _remember_videos(page, pushed_videos=None):
 
 def _try_youtube_dl(worker, ydl, site, page):
     ytdlp_url = page.redirect_url if page.redirect_url else page.url
-    ytdlp_host = ytdlp_url.split("//")[-1].split("/")[0].split('?')[0]
+    ytdlp_host = ytdlp_url.split("//")[-1].split("/")[0].split("?")[0]
     try:
         logging.info("trying yt-dlp on %s", ytdlp_url)
         metrics.brozzler_ydl_download_attempts.labels(ytdlp_host).inc(1)