diff --git a/brozzler/cli.py b/brozzler/cli.py index 982becd..afbefdb 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -129,9 +129,10 @@ def configure_logging(args): def suggest_default_chrome_exe(): # mac os x application executable paths for path in [ - '/Applications/Thorium.app/Contents/MacOS/Thorium', - '/Applications/Chromium.app/Contents/MacOS/Chromium', - '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome']: + "/Applications/Thorium.app/Contents/MacOS/Thorium", + "/Applications/Chromium.app/Contents/MacOS/Chromium", + "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", + ]: if os.path.exists(path): return path @@ -598,30 +599,31 @@ def brozzler_worker(argv=None): finally: signal.signal(signal.SIGQUIT, dump_state) - def get_proxy_endpoints(): - PROXY_ENDPOINTS_FILE = "/opt/local/brozzler/proxy_endpoints.txt" + def get_ytdlp_proxy_endpoints(): + YTDLP_PROXY_ENDPOINTS_FILE = "/opt/local/brozzler/ytdlp_proxy_endpoints.txt" try: # make list from file - with open(PROXY_ENDPOINTS_FILE) as endpoints: - proxy_endpoints = [l for l in endpoints.readlines()] - if proxy_endpoints: + with open(YTDLP_PROXY_ENDPOINTS_FILE) as endpoints: + ytdlp_proxy_endpoints = [l for l in endpoints.readlines()] + if ytdlp_proxy_endpoints: logging.info( - "running with proxy endpoints file %s" % PROXY_ENDPOINTS_FILE + "running with ytdlp proxy endpoints file %s" + % YTDLP_PROXY_ENDPOINTS_FILE ) except Exception as e: - proxy_endpoints = [] + ytdlp_proxy_endpoints = [] logging.info("running with empty proxy endpoints file") - return proxy_endpoints + return ytdlp_proxy_endpoints rr = rethinker(args) frontier = brozzler.RethinkDbFrontier(rr) service_registry = doublethink.ServiceRegistry(rr) skip_av_seeds_from_file = get_skip_av_seeds() - proxy_endpoints_from_file = get_proxy_endpoints() + ytdlp_proxy_endpoints_from_file = get_ytdlp_proxy_endpoints() worker = brozzler.worker.BrozzlerWorker( frontier, service_registry, - proxy_endpoints=proxy_endpoints_from_file, + ytdlp_proxy_endpoints=ytdlp_proxy_endpoints_from_file, max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe, proxy=args.proxy, diff --git a/brozzler/worker.py b/brozzler/worker.py index a20fe99..7c3b3b0 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -3,7 +3,7 @@ brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning it runs yt-dlp on them, browses them and runs behaviors if appropriate, scopes and adds outlinks to the frontier -Copyright (C) 2014-2024 Internet Archive +Copyright (C) 2014-2025 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -57,7 +57,7 @@ class BrozzlerWorker: self, frontier, service_registry=None, - proxy_endpoints=None, + ytdlp_proxy_endpoints=None, max_browsers=1, chrome_exe="chromium-browser", warcprox_auto=False, @@ -81,7 +81,7 @@ class BrozzlerWorker: ): self._frontier = frontier self._service_registry = service_registry - self._proxy_endpoints = proxy_endpoints + self._ytdlp_proxy_endpoints = ytdlp_proxy_endpoints self._max_browsers = max_browsers self._warcprox_auto = warcprox_auto @@ -298,11 +298,11 @@ class BrozzlerWorker: except brozzler.PageInterstitialShown: self.logger.info("page interstitial shown (http auth): %s", page) - if enable_youtube_dl and ydl.should_ytdlp( - site, page, status_code, self._proxy_endpoints - ): + if enable_youtube_dl and ydl.should_ytdlp(site, page, status_code): try: - ydl_outlinks = ydl.do_youtube_dl(self, site, page) + ydl_outlinks = ydl.do_youtube_dl( + self, site, page, self._ytdlp_proxy_endpoints + ) metrics.brozzler_ydl_urls_checked.inc(1) outlinks.update(ydl_outlinks) except brozzler.ReachedLimit as e: diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 1378928..ead919e 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -1,7 +1,7 @@ """ brozzler/ydl.py - youtube-dl / yt-dlp support for brozzler -Copyright (C) 2024 Internet Archive +Copyright (C) 2024-2025 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -30,6 +30,7 @@ import datetime from . import metrics +import random import threading import traceback import doublethink @@ -42,7 +43,8 @@ PROXY_ATTEMPTS = 4 YTDLP_WAIT = 10 YTDLP_MAX_REDIRECTS = 5 -def should_ytdlp(site, page, page_status, proxy_endpoints): + +def should_ytdlp(site, page, page_status): # called only after we've passed needs_browsing() check if page_status != 200: @@ -62,6 +64,7 @@ def should_ytdlp(site, page, page_status, proxy_endpoints): return True + def isyoutubehost(url): # split 1 splits scheme from url, split 2 splits path from hostname, split 3 splits query string on hostname return "youtube.com" in url.split("//")[-1].split("/")[0].split("?")[0] @@ -80,7 +83,7 @@ class ExtraHeaderAdder(urllib.request.BaseHandler): return req -def _build_youtube_dl(worker, destdir, site, page): +def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints): """ Builds a yt-dlp `yt_dlp.YoutubeDL` for brozzling `site` with `worker`. @@ -297,8 +300,8 @@ def _build_youtube_dl(worker, destdir, site, page): ytdlp_url = page.redirect_url if page.redirect_url else page.url is_youtube_host = isyoutubehost(ytdlp_url) - if is_youtube_host and proxy_endpoints: - ydl_opts["proxy"] = random.choice(proxy_endpoints) + if is_youtube_host and ytdlp_proxy_endpoints: + ydl_opts["proxy"] = random.choice(ytdlp_proxy_endpoints) # don't log proxy value secrets ytdlp_proxy_for_logs = ( ydl_opts["proxy"].split("@")[1] if "@" in ydl_opts["proxy"] else "@@@" @@ -425,7 +428,7 @@ def _try_youtube_dl(worker, ydl, site, page): @metrics.brozzler_ytdlp_duration_seconds.time() @metrics.brozzler_in_progress_ytdlps.track_inprogress() -def do_youtube_dl(worker, site, page): +def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints): """ Runs yt-dlp configured for `worker` and `site` to download videos from `page`. @@ -442,7 +445,7 @@ def do_youtube_dl(worker, site, page): prefix="brzl-ydl-", dir=worker._ytdlp_tmpdir ) as tempdir: logging.info("tempdir for yt-dlp: %s", tempdir) - ydl = _build_youtube_dl(worker, tempdir, site, page) + ydl = _build_youtube_dl(worker, tempdir, site, page, ytdlp_proxy_endpoints) ie_result = _try_youtube_dl(worker, ydl, site, page) outlinks = set() if ie_result and (