From a86962e886c0bb39fb55c20899f2c377e916bb30 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 9 Dec 2024 19:29:06 -0800 Subject: [PATCH] cli param for ytdlp_tmpdir --- brozzler/browser.py | 1 + brozzler/cli.py | 14 ++++++++++++++ brozzler/worker.py | 3 +++ brozzler/ydl.py | 4 ++-- 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index 4b602f1..aa8893e 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -483,6 +483,7 @@ class Browser: skip_extract_outlinks=False, skip_visit_hashtags=False, skip_youtube_dl=False, + ytdlp_tmpdir="/tmp", simpler404=False, page_timeout=300, behavior_timeout=900, diff --git a/brozzler/cli.py b/brozzler/cli.py index 3cb7c9a..c0b7f5a 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -265,6 +265,12 @@ def brozzle_page(argv=None): arg_parser.add_argument( "--skip-youtube-dl", dest="skip_youtube_dl", action="store_true" ) + arg_parser.add_argument( + "--ytdlp_tmpdir", + dest="ytdlp_tmpdir", + default="/tmp", + help="specify a temp dir for ytdlp; defaults to /tmp", + ) arg_parser.add_argument("--simpler404", dest="simpler404", action="store_true") add_common_options(arg_parser, argv) @@ -292,6 +298,7 @@ def brozzle_page(argv=None): skip_extract_outlinks=args.skip_extract_outlinks, skip_visit_hashtags=args.skip_visit_hashtags, skip_youtube_dl=args.skip_youtube_dl, + ytdlp_tmpdir=args.ytdlp_tmpdir, simpler404=args.simpler404, screenshot_full_page=args.screenshot_full_page, download_throughput=args.download_throughput, @@ -533,6 +540,12 @@ def brozzler_worker(argv=None): action="store_true", help=argparse.SUPPRESS, ) + arg_parser.add_argument( + "--ytdlp_tmpdir", + dest="ytdlp_tmpdir", + default="/tmp", + help=argparse.SUPPRESS, + ) arg_parser.add_argument( "--stealth", dest="stealth", @@ -613,6 +626,7 @@ def brozzler_worker(argv=None): skip_extract_outlinks=args.skip_extract_outlinks, skip_visit_hashtags=args.skip_visit_hashtags,
skip_youtube_dl=args.skip_youtube_dl, + ytdlp_tmpdir=args.ytdlp_tmpdir, stealth=args.stealth, metrics_port=args.metrics_port, registry_url=args.registry_url, diff --git a/brozzler/worker.py b/brozzler/worker.py index e71939d..490837f 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -64,6 +64,7 @@ class BrozzlerWorker: skip_extract_outlinks=False, skip_visit_hashtags=False, skip_youtube_dl=False, + ytdlp_tmpdir="/tmp", simpler404=False, screenshot_full_page=False, page_timeout=300, @@ -89,6 +90,7 @@ class BrozzlerWorker: self._skip_extract_outlinks = skip_extract_outlinks self._skip_visit_hashtags = skip_visit_hashtags self._skip_youtube_dl = skip_youtube_dl + self._ytdlp_tmpdir = ytdlp_tmpdir self._simpler404 = simpler404 self._screenshot_full_page = screenshot_full_page self._page_timeout = page_timeout @@ -445,6 +447,7 @@ class BrozzlerWorker: skip_extract_outlinks=self._skip_extract_outlinks, skip_visit_hashtags=self._skip_visit_hashtags, skip_youtube_dl=self._skip_youtube_dl, + ytdlp_tmpdir=self._ytdlp_tmpdir, simpler404=self._simpler404, screenshot_full_page=self._screenshot_full_page, page_timeout=self._page_timeout, diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 2aa6459..0bc6b9d 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -34,7 +34,6 @@ import time thread_local = threading.local() ytdlp_proxy = "" -ytdlp_tmp = "/tmp" ytdlp_wait = 10 max_ytdlp_attempts = 4 @@ -421,7 +420,8 @@ def do_youtube_dl(worker, site, page): Returns: `list` of `str`: outlink urls """ - with tempfile.TemporaryDirectory(prefix="brzl-ydl-", dir=ytdlp_tmp) as tempdir: + with tempfile.TemporaryDirectory(prefix="brzl-ydl-", dir=worker._ytdlp_tmpdir) as tempdir: + logging.info("using temporary directory: %s", tempdir) ydl = _build_youtube_dl(worker, tempdir, site, page) ie_result = _try_youtube_dl(worker, ydl, site, page) outlinks = set()