diff --git a/brozzler/cli.py b/brozzler/cli.py
index 32d08b2..dc1f611 100755
--- a/brozzler/cli.py
+++ b/brozzler/cli.py
@@ -2,7 +2,7 @@
 """
 brozzler/cli.py - brozzler command line executables
 
-Copyright (C) 2014-2024 Internet Archive
+Copyright (C) 2014-2025 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -611,14 +611,33 @@ def brozzler_worker(argv=None):
             logging.info("running with empty skip_av_seeds")
         return skip_av_seeds
 
+    def get_proxy_endpoints():
+        """Return proxy endpoints read from file, one per line; [] on error."""
+        PROXY_ENDPOINTS_FILE = "/opt/local/brozzler/proxy_endpoints.txt"
+        try:
+            with open(PROXY_ENDPOINTS_FILE) as endpoints:
+                # strip newlines and skip blank lines so that every entry
+                # can be handed to yt-dlp as a proxy url unchanged
+                proxy_endpoints = [line.strip() for line in endpoints if line.strip()]
+            if proxy_endpoints:
+                logging.info(
+                    "running with proxy endpoints file %s", PROXY_ENDPOINTS_FILE
+                )
+        except Exception as e:
+            # best effort: a missing or unreadable file means "no proxies"
+            proxy_endpoints = []
+            logging.info("running with empty proxy endpoints: %s", e)
+        return proxy_endpoints
+
     rr = rethinker(args)
     frontier = brozzler.RethinkDbFrontier(rr)
     service_registry = doublethink.ServiceRegistry(rr)
     skip_av_seeds_from_file = get_skip_av_seeds()
+    proxy_endpoints_from_file = get_proxy_endpoints()
     worker = brozzler.worker.BrozzlerWorker(
         frontier,
         service_registry,
         skip_av_seeds=skip_av_seeds_from_file,
+        proxy_endpoints=proxy_endpoints_from_file,
         max_browsers=int(args.max_browsers),
         chrome_exe=args.chrome_exe,
         proxy=args.proxy,
diff --git a/brozzler/worker.py b/brozzler/worker.py
index 4abfcd5..7428619 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -57,6 +57,7 @@ class BrozzlerWorker:
         frontier,
         service_registry=None,
         skip_av_seeds=None,
+        proxy_endpoints=None,
         max_browsers=1,
         chrome_exe="chromium-browser",
         warcprox_auto=False,
@@ -81,6 +82,7 @@ class BrozzlerWorker:
         self._frontier = frontier
         self._service_registry = service_registry
         self._skip_av_seeds = skip_av_seeds
+        self._proxy_endpoints = proxy_endpoints
         self._max_browsers = max_browsers
 
         self._warcprox_auto = warcprox_auto
@@ -287,7 +289,7 @@ class BrozzlerWorker:
                 self.logger.info("page interstitial shown (http auth): %s", page)
 
             if enable_youtube_dl and ydl.should_ytdlp(
-                site, page, status_code, self._skip_av_seeds
+                site, page, status_code, self._skip_av_seeds, self._proxy_endpoints
             ):
                 try:
                     ydl_outlinks = ydl.do_youtube_dl(self, site, page)
diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 861b6e6..f2b0696 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -34,12 +34,11 @@ import time
 
 thread_local = threading.local()
 
-YTDLP_PROXY = ""
 PROXY_ATTEMPTS = 4
 YTDLP_WAIT = 10
 
 
-def should_ytdlp(site, page, page_status, skip_av_seeds):
+def should_ytdlp(site, page, page_status, skip_av_seeds, proxy_endpoints=None):
     # called only after we've passed needs_browsing() check
 
     if page_status != 200:
@@ -285,11 +284,15 @@ def _build_youtube_dl(worker, destdir, site, page):
 
     ytdlp_url = page.redirect_url if page.redirect_url else page.url
     is_youtube_host = isyoutubehost(ytdlp_url)
-    if is_youtube_host and YTDLP_PROXY:
-        ydl_opts["proxy"] = YTDLP_PROXY
+    # proxy endpoints are held by the worker; they are not passed in here
+    proxy_endpoints = worker._proxy_endpoints
+    if is_youtube_host and proxy_endpoints:
+        import random  # function-scope import keeps this hunk self-contained
+
+        ydl_opts["proxy"] = random.choice(proxy_endpoints)
         # don't log proxy value secrets
         ytdlp_proxy_for_logs = (
-            YTDLP_PROXY.split("@")[1] if "@" in YTDLP_PROXY else "@@@"
+            ydl_opts["proxy"].split("@")[1] if "@" in ydl_opts["proxy"] else "@@@"
         )
         logging.info("using yt-dlp proxy ... %s", ytdlp_proxy_for_logs)
 