diff --git a/brozzler/cli.py b/brozzler/cli.py index 60aee88..f3bfe5e 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -545,10 +545,12 @@ def brozzler_worker(argv=None): signal.signal(signal.SIGQUIT, dump_state) def get_skip_av_seeds(): + # TODO: develop UI and refactor SKIP_AV_SEEDS_FILE = "/opt/local/brozzler/skip_av_seeds.txt" try: - with open(skip_av_seeds_file) as skips: - skip_av_seeds = set(skips.readlines()) + # make set from seed IDs in SKIP_AV_SEEDS_FILE + with open(SKIP_AV_SEEDS_FILE) as skips: + skip_av_seeds = {int(l) for l in skips.readlines()} logging.info("running with skip_av_seeds file %s" % SKIP_AV_SEEDS_FILE) except Exception as e: skip_av_seeds = set() @@ -562,7 +564,7 @@ def brozzler_worker(argv=None): worker = brozzler.worker.BrozzlerWorker( frontier, service_registry, - skip_av_seeds, + skip_av_seeds=skip_av_seeds, max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe, proxy=args.proxy, diff --git a/brozzler/worker.py b/brozzler/worker.py index 738107a..2b7f9fa 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -262,7 +262,7 @@ class BrozzlerWorker: except brozzler.PageInterstitialShown: self.logger.info("page interstitial shown (http auth): %s", page) - if enable_youtube_dl and ydl.should_ytdlp(self, site, page): + if enable_youtube_dl and ydl.should_ytdlp(site, page, self.skip_av_seeds): try: ydl_outlinks = ydl.do_youtube_dl(self, site, page) outlinks.update(ydl_outlinks) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index a58fa98..4d53cb1 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -32,7 +32,7 @@ import threading thread_local = threading.local() -def should_ytdlp(worker, site, page): +def should_ytdlp(site, page, skip_av_seeds): # called only after we've passed needs_browsing() check if page.status_code != 200: logging.info("skipping ytdlp: non-200 page status") @@ -41,22 +41,23 @@ def should_ytdlp(worker, site, page): logging.info("skipping ytdlp: site marked skip_ytdlp") return False + ytdlp_url = page.redirect_url if page.redirect_url else page.url + + if "chrome-error:" in ytdlp_url: + return False + ytdlp_seed = ( site["metadata"]["ait_seed_id"] if "metadata" in site and "ait_seed_id" in site["metadata"] else None ) - if ytdlp_seed and ytdlp_seed in worker.skip_av_seeds: + # TODO: develop UI and refactor + if ytdlp_seed and ytdlp_seed in skip_av_seeds: logging.info("skipping ytdlp: site in skip_av_seeds") site.skip_ytdlp = True return False - ytdlp_url = page.redirect_url if page.redirect_url else page.url - - if "chrome-error:" in ytdlp_url: - return False - return True