diff --git a/brozzler/cli.py b/brozzler/cli.py index ca166bf..e44d9f4 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -545,10 +545,12 @@ def brozzler_worker(argv=None): signal.signal(signal.SIGQUIT, dump_state) def get_skip_av_seeds(): + # TODO: develop UI and refactor SKIP_AV_SEEDS_FILE = "/opt/local/brozzler/skip_av_seeds.txt" try: - with open(skip_av_seeds_file) as skips: - skip_av_seeds = set(skips.readlines()) + # make set from seed IDs in SKIP_AV_SEEDS_FILE + with open(SKIP_AV_SEEDS_FILE) as skips: + skip_av_seeds = {int(l) for l in skips.readlines()} logging.info("running with skip_av_seeds file %s" % SKIP_AV_SEEDS_FILE) except Exception as e: skip_av_seeds = set() @@ -562,7 +564,7 @@ def brozzler_worker(argv=None): worker = brozzler.worker.BrozzlerWorker( frontier, service_registry, - skip_av_seeds, + skip_av_seeds=skip_av_seeds, max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe, proxy=args.proxy, diff --git a/brozzler/worker.py b/brozzler/worker.py index 3ff4609..da046bc 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -263,7 +263,7 @@ class BrozzlerWorker: except brozzler.PageInterstitialShown: self.logger.info("page interstitial shown (http auth): %s", page) - if enable_youtube_dl and ydl.should_ytdlp(self, site, page): + if enable_youtube_dl and ydl.should_ytdlp(site, page, self.skip_av_seeds): try: ydl_outlinks = ydl.do_youtube_dl(self, site, page) outlinks.update(ydl_outlinks) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 9d175f9..3d217c0 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -51,7 +51,7 @@ def _timestamp4datetime(timestamp): int(timestamp[-2:]) ) -def should_ytdlp(worker, site, page): +def should_ytdlp(site, page, skip_av_seeds): # called only after we've passed needs_browsing() check if page.status_code != 200: logging.info("skipping ytdlp: non-200 page status") @@ -71,7 +71,8 @@ def should_ytdlp(worker, site, page): else None ) - if ytdlp_seed and ytdlp_seed in worker.skip_av_seeds: + # TODO: develop UI and refactor + if ytdlp_seed and ytdlp_seed in skip_av_seeds: logging.info("skipping ytdlp: site in skip_av_seeds") site.skip_ytdlp = True return False