From a49b978c60a438ef65a29fb09ee79cbd7aab17be Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 9 Dec 2024 17:21:21 -0800 Subject: [PATCH 1/3] add ytdlp_tmp, and more... --- brozzler/ydl.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 7ce2686..2aa6459 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -33,9 +33,10 @@ import time thread_local = threading.local() -YTDLP_PROXY = "" -MAX_YTDLP_ATTEMPTS = 4 -YTDLP_WAIT = 10 +ytdlp_proxy = "" +ytdlp_tmp = "/tmp" +ytdlp_wait = 10 +max_ytdlp_attempts = 4 def should_ytdlp(site, page, page_status, skip_av_seeds): @@ -284,11 +285,11 @@ def _build_youtube_dl(worker, destdir, site, page): ytdlp_url = page.redirect_url if page.redirect_url else page.url is_youtube_host = isyoutubehost(ytdlp_url) - if is_youtube_host and YTDLP_PROXY: - ydl_opts["proxy"] = YTDLP_PROXY + if is_youtube_host and ytdlp_proxy: + ydl_opts["proxy"] = ytdlp_proxy # don't log proxy value secrets ytdlp_proxy_for_logs = ( - YTDLP_PROXY.split("@")[1] if "@" in YTDLP_PROXY else "@@@" + ytdlp_proxy.split("@")[1] if "@" in ytdlp_proxy else "@@@" ) logging.info("using yt-dlp proxy ... %s", ytdlp_proxy_for_logs) @@ -326,7 +327,7 @@ def _remember_videos(page, pushed_videos=None): def _try_youtube_dl(worker, ydl, site, page): attempt = 0 - while attempt < MAX_YTDLP_ATTEMPTS: + while attempt < max_ytdlp_attempts: try: logging.info("trying yt-dlp on %s", ydl.url) # should_download_vid = not ydl.is_youtube_host @@ -364,15 +365,15 @@ def _try_youtube_dl(worker, ydl, site, page): # OSError('Tunnel connection failed: 464 Host Not Allowed') (caused by ProxyError...) # and others... attempt += 1 - if attempt == MAX_YTDLP_ATTEMPTS: + if attempt == max_ytdlp_attempts: logging.warning( - "Failed after %s attempts. Error: %s", MAX_YTDLP_ATTEMPTS, e + "Failed after %s attempts. 
Error: %s", max_ytdlp_attempts, e ) raise brozzler.VideoExtractorError( "yt-dlp hit error extracting info for %s" % ydl.url ) else: - retry_wait = min(60, YTDLP_WAIT * (1.5 ** (attempt - 1))) + retry_wait = min(60, ytdlp_wait * (1.5 ** (attempt - 1))) logging.info( "Attempt %s failed. Retrying in %s seconds...", attempt, @@ -420,7 +421,7 @@ def do_youtube_dl(worker, site, page): Returns: `list` of `str`: outlink urls """ - with tempfile.TemporaryDirectory(prefix="brzl-ydl-", dir="/tmp") as tempdir: + with tempfile.TemporaryDirectory(prefix="brzl-ydl-", dir=ytdlp_tmp) as tempdir: ydl = _build_youtube_dl(worker, tempdir, site, page) ie_result = _try_youtube_dl(worker, ydl, site, page) outlinks = set() From a86962e886c0bb39fb55c20899f2c377e916bb30 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 9 Dec 2024 19:29:06 -0800 Subject: [PATCH 2/3] cli param for ytdlp_tmpdir --- brozzler/browser.py | 1 + brozzler/cli.py | 14 ++++++++++++++ brozzler/worker.py | 3 +++ brozzler/ydl.py | 4 ++-- 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index 4b602f1..aa8893e 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -483,6 +483,7 @@ class Browser: skip_extract_outlinks=False, skip_visit_hashtags=False, skip_youtube_dl=False, + ytdlp_tmpdir = '/tmp', simpler404=False, page_timeout=300, behavior_timeout=900, diff --git a/brozzler/cli.py b/brozzler/cli.py index 3cb7c9a..c0b7f5a 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -265,6 +265,12 @@ def brozzle_page(argv=None): arg_parser.add_argument( "--skip-youtube-dl", dest="skip_youtube_dl", action="store_true" ) + arg_parser.add_argument( + "--ytdlp_tmpdir", + dest="ytdlp_tmpdir", + default="/tmp", + help="specify a temp dir for ytdlp; defaults to /tmp", + ) arg_parser.add_argument("--simpler404", dest="simpler404", action="store_true") add_common_options(arg_parser, argv) @@ -292,6 +298,7 @@ def brozzle_page(argv=None): 
skip_extract_outlinks=args.skip_extract_outlinks, skip_visit_hashtags=args.skip_visit_hashtags, skip_youtube_dl=args.skip_youtube_dl, + ytdlp_tmpdir = args.ytdlp_tmpdir, simpler404=args.simpler404, screenshot_full_page=args.screenshot_full_page, download_throughput=args.download_throughput, @@ -533,6 +540,12 @@ def brozzler_worker(argv=None): action="store_true", help=argparse.SUPPRESS, ) + arg_parser.add_argument( + "--ytdlp_tmpdir", + dest="ytdlp_tmpdir", + default="/tmp", + help="argparse.SUPPRESS", + ) arg_parser.add_argument( "--stealth", dest="stealth", @@ -613,6 +626,7 @@ def brozzler_worker(argv=None): skip_extract_outlinks=args.skip_extract_outlinks, skip_visit_hashtags=args.skip_visit_hashtags, skip_youtube_dl=args.skip_youtube_dl, + ytdlp_tmpdir=args.ytdlp_tmpdir, stealth=args.stealth, metrics_port=args.metrics_port, registry_url=args.registry_url, diff --git a/brozzler/worker.py b/brozzler/worker.py index e71939d..490837f 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -64,6 +64,7 @@ class BrozzlerWorker: skip_extract_outlinks=False, skip_visit_hashtags=False, skip_youtube_dl=False, + ytdlp_tmpdir='/tmp', simpler404=False, screenshot_full_page=False, page_timeout=300, @@ -89,6 +90,7 @@ class BrozzlerWorker: self._skip_extract_outlinks = skip_extract_outlinks self._skip_visit_hashtags = skip_visit_hashtags self._skip_youtube_dl = skip_youtube_dl + self._ytdlp_tmpdir = ytdlp_tmpdir self._simpler404 = simpler404 self._screenshot_full_page = screenshot_full_page self._page_timeout = page_timeout @@ -445,6 +447,7 @@ class BrozzlerWorker: skip_extract_outlinks=self._skip_extract_outlinks, skip_visit_hashtags=self._skip_visit_hashtags, skip_youtube_dl=self._skip_youtube_dl, + ytdlp_tmpdir = self._ytdlp_tmpdir, simpler404=self._simpler404, screenshot_full_page=self._screenshot_full_page, page_timeout=self._page_timeout, diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 2aa6459..0bc6b9d 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -34,7 
+34,6 @@ import time thread_local = threading.local() ytdlp_proxy = "" -ytdlp_tmp = "/tmp" ytdlp_wait = 10 max_ytdlp_attempts = 4 @@ -421,7 +420,8 @@ def do_youtube_dl(worker, site, page): Returns: `list` of `str`: outlink urls """ - with tempfile.TemporaryDirectory(prefix="brzl-ydl-", dir=ytdlp_tmp) as tempdir: + with tempfile.TemporaryDirectory(prefix="brzl-ydl-", dir=worker._ytdlp_tmpdir) as tempdir: + logging.info("using temporary directory: %s", tempdir) ydl = _build_youtube_dl(worker, tempdir, site, page) ie_result = _try_youtube_dl(worker, ydl, site, page) outlinks = set() From 789fe8116c43f2a504dbb8dd9b7bb19b8ecd6962 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 12 Dec 2024 11:48:26 -0800 Subject: [PATCH 3/3] skip separate edits mostly --- brozzler/ydl.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 0bc6b9d..1bd9e00 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -33,9 +33,9 @@ import time thread_local = threading.local() -ytdlp_proxy = "" -ytdlp_wait = 10 -max_ytdlp_attempts = 4 +YTDLP_PROXY = "" +MAX_YTDLP_ATTEMPTS = 4 +YTDLP_WAIT = 10 def should_ytdlp(site, page, page_status, skip_av_seeds): @@ -284,11 +284,11 @@ def _build_youtube_dl(worker, destdir, site, page): ytdlp_url = page.redirect_url if page.redirect_url else page.url is_youtube_host = isyoutubehost(ytdlp_url) - if is_youtube_host and ytdlp_proxy: - ydl_opts["proxy"] = ytdlp_proxy + if is_youtube_host and YTDLP_PROXY: + ydl_opts["proxy"] = YTDLP_PROXY # don't log proxy value secrets ytdlp_proxy_for_logs = ( - ytdlp_proxy.split("@")[1] if "@" in ytdlp_proxy else "@@@" + YTDLP_PROXY.split("@")[1] if "@" in YTDLP_PROXY else "@@@" ) logging.info("using yt-dlp proxy ... 
%s", ytdlp_proxy_for_logs) @@ -326,7 +326,7 @@ def _remember_videos(page, pushed_videos=None): def _try_youtube_dl(worker, ydl, site, page): attempt = 0 - while attempt < max_ytdlp_attempts: + while attempt < MAX_YTDLP_ATTEMPTS: try: logging.info("trying yt-dlp on %s", ydl.url) # should_download_vid = not ydl.is_youtube_host @@ -364,15 +364,15 @@ def _try_youtube_dl(worker, ydl, site, page): # OSError('Tunnel connection failed: 464 Host Not Allowed') (caused by ProxyError...) # and others... attempt += 1 - if attempt == max_ytdlp_attempts: + if attempt == MAX_YTDLP_ATTEMPTS: logging.warning( - "Failed after %s attempts. Error: %s", max_ytdlp_attempts, e + "Failed after %s attempts. Error: %s", MAX_YTDLP_ATTEMPTS, e ) raise brozzler.VideoExtractorError( "yt-dlp hit error extracting info for %s" % ydl.url ) else: - retry_wait = min(60, ytdlp_wait * (1.5 ** (attempt - 1))) + retry_wait = min(60, YTDLP_WAIT * (1.5 ** (attempt - 1))) logging.info( "Attempt %s failed. Retrying in %s seconds...", attempt, @@ -421,7 +421,7 @@ def do_youtube_dl(worker, site, page): `list` of `str`: outlink urls """ with tempfile.TemporaryDirectory(prefix="brzl-ydl-", dir=worker._ytdlp_tmpdir) as tempdir: - logging.info("using temporary directory: %s", tempdir) + logging.info("tempdir for yt-dlp: %s", tempdir) ydl = _build_youtube_dl(worker, tempdir, site, page) ie_result = _try_youtube_dl(worker, ydl, site, page) outlinks = set()