def get_skip_av_seeds(
    skip_av_seeds_file="/opt/local/brozzler/skip_av_seeds.txt",
):
    """Load the set of seed ids for which audio/video capture is skipped.

    The file is expected to hold one integer seed id per line. Blank lines
    are ignored, and a line that is not a valid integer is logged and
    skipped instead of invalidating the whole file.

    Args:
        skip_av_seeds_file: path of the skip list to read. Defaults to the
            location used by the brozzler-worker command line.

    Returns:
        A set of ints. The set is empty when the file is missing, cannot be
        read, or contains no valid ids; this function never raises for
        those cases.
    """
    # TODO: develop UI and refactor
    skip_av_seeds = set()
    try:
        with open(skip_av_seeds_file) as skips:
            for line_number, line in enumerate(skips, start=1):
                entry = line.strip()
                if not entry:
                    # tolerate blank lines, e.g. a trailing newline
                    continue
                try:
                    skip_av_seeds.add(int(entry))
                except ValueError:
                    logging.warning(
                        "ignoring invalid seed id %r at %s:%d",
                        entry,
                        skip_av_seeds_file,
                        line_number,
                    )
    except (OSError, UnicodeDecodeError) as e:
        # Best effort: an absent or unreadable skip list means nothing is
        # skipped, but say why so a misconfiguration is visible in the log.
        logging.info(
            "running with empty skip_av_seeds (%s: %s)", skip_av_seeds_file, e
        )
        return set()

    logging.info("running with skip_av_seeds file %s", skip_av_seeds_file)
    return skip_av_seeds
def should_ytdlp(site, page, skip_av_seeds=None):
    """Decide whether yt-dlp should be run for ``page``.

    Called only after the page has passed the needs_browsing() check.

    Args:
        site: the site being crawled. ``site.skip_ytdlp`` and
            ``site["metadata"]["ait_seed_id"]`` are read, and
            ``site.skip_ytdlp`` is assigned once the seed id has been
            checked against ``skip_av_seeds``.
        page: the page that was just browsed; ``status_code``,
            ``redirect_url`` and ``url`` are read.
        skip_av_seeds: collection of seed ids for which yt-dlp is skipped.
            ``None`` (the default used by ``BrozzlerWorker`` when no list
            is supplied) is treated as an empty collection.

    Returns:
        True if yt-dlp should be run for the page, False otherwise.
    """
    if page.status_code != 200:
        logging.info("skipping ytdlp: non-200 page status")
        return False
    if site.skip_ytdlp:
        logging.info("skipping ytdlp: site marked skip_ytdlp")
        return False

    # prefer the final url when the page was redirected
    ytdlp_url = page.redirect_url or page.url
    if "chrome-error:" in ytdlp_url:
        return False

    ytdlp_seed = (
        site["metadata"]["ait_seed_id"]
        if "metadata" in site and "ait_seed_id" in site["metadata"]
        else None
    )

    # TODO: develop UI and refactor
    if ytdlp_seed:
        # NOTE(review): membership is type-sensitive; the command line
        # loader builds the skip set with int(), so ait_seed_id presumably
        # needs to be an int as well -- confirm against the job metadata.
        # `or ()` guards against skip_av_seeds being None, which would
        # otherwise raise TypeError on the `in` test.
        if site.skip_ytdlp is None and ytdlp_seed in (skip_av_seeds or ()):
            logging.info("skipping ytdlp: site in skip_av_seeds")
            site.skip_ytdlp = True
            return False
        # the seed has been checked; record the decision on the site
        site.skip_ytdlp = False

    return True