WT-2950 remove skip_av_seeds

This commit is contained in:
Gretchen Miller 2024-09-12 16:45:52 -07:00
parent 0d8721a4d3
commit d9ed5c434a
3 changed files with 2 additions and 34 deletions

View File

@ -544,27 +544,12 @@ def brozzler_worker(argv=None):
finally:
signal.signal(signal.SIGQUIT, dump_state)
def get_skip_av_seeds():
    """Load the skip_av_seeds set from the on-disk seeds file.

    The file is expected to hold one integer seed ID per line. Blank or
    whitespace-only lines are ignored, so a trailing empty line no longer
    discards the whole list.

    Returns:
        set[int]: the seed IDs read from the file, or an empty set when
        the file cannot be read or contains a non-integer entry.
    """
    # TODO: develop UI and refactor
    SKIP_AV_SEEDS_FILE = "/opt/local/brozzler/skip_av_seeds.txt"
    try:
        # make set from seed IDs in SKIP_AV_SEEDS_FILE
        with open(SKIP_AV_SEEDS_FILE) as skips:
            skip_av_seeds = {int(line) for line in skips if line.strip()}
    except OSError:
        # file missing or unreadable: run without any skips
        logging.info("running with empty skip_av_seeds")
        return set()
    except ValueError:
        # a non-integer entry (or undecodable bytes) invalidates the file;
        # keep the best-effort empty-set fallback but make the cause visible
        logging.warning(
            "ignoring malformed skip_av_seeds file %s",
            SKIP_AV_SEEDS_FILE,
            exc_info=True,
        )
        logging.info("running with empty skip_av_seeds")
        return set()
    logging.info("running with skip_av_seeds file %s", SKIP_AV_SEEDS_FILE)
    return skip_av_seeds
rr = rethinker(args)
frontier = brozzler.RethinkDbFrontier(rr)
service_registry = doublethink.ServiceRegistry(rr)
skip_av_seeds_from_file = get_skip_av_seeds()
worker = brozzler.worker.BrozzlerWorker(
frontier,
service_registry,
skip_av_seeds=skip_av_seeds_from_file,
max_browsers=int(args.max_browsers),
chrome_exe=args.chrome_exe,
proxy=args.proxy,

View File

@ -54,7 +54,6 @@ class BrozzlerWorker:
self,
frontier,
service_registry=None,
skip_av_seeds=None,
max_browsers=1,
chrome_exe="chromium-browser",
warcprox_auto=False,
@ -74,7 +73,6 @@ class BrozzlerWorker:
):
self._frontier = frontier
self._service_registry = service_registry
self._skip_av_seeds = skip_av_seeds
self._max_browsers = max_browsers
self._warcprox_auto = warcprox_auto
@ -262,7 +260,7 @@ class BrozzlerWorker:
self.logger.info("page interstitial shown (http auth): %s", page)
if enable_youtube_dl and ydl.should_ytdlp(
site, page, browser.websock_thread.page_status, self._skip_av_seeds
site, page, browser.websock_thread.page_status
):
try:
ydl_outlinks = ydl.do_youtube_dl(self, site, page)

View File

@ -32,7 +32,7 @@ import threading
thread_local = threading.local()
def should_ytdlp(site, page, page_status, skip_av_seeds):
def should_ytdlp(site, page, page_status):
# called only after we've passed needs_browsing() check
if page_status != 200:
@ -47,21 +47,6 @@ def should_ytdlp(site, page, page_status, skip_av_seeds):
if "chrome-error:" in ytdlp_url:
return False
ytdlp_seed = (
site["metadata"]["ait_seed_id"]
if "metadata" in site and "ait_seed_id" in site["metadata"]
else None
)
# TODO: develop UI and refactor
if ytdlp_seed:
if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds:
logging.info("skipping ytdlp: site in skip_av_seeds")
site.skip_ytdlp = True
return False
else:
site.skip_ytdlp = False
return True