mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-19 23:35:54 -04:00
Merge pull request #277 from galgeek/rotary_skip_ytdlp
skip ytdlp for selected seeds
This commit is contained in:
commit
42c5e6f559
@ -2,7 +2,7 @@
|
||||
"""
|
||||
brozzler/cli.py - brozzler command line executables
|
||||
|
||||
Copyright (C) 2014-2023 Internet Archive
|
||||
Copyright (C) 2014-2024 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@ -544,12 +544,27 @@ def brozzler_worker(argv=None):
|
||||
finally:
|
||||
signal.signal(signal.SIGQUIT, dump_state)
|
||||
|
||||
def get_skip_av_seeds(skip_av_seeds_file="/opt/local/brozzler/skip_av_seeds.txt"):
    """Load the set of seed ids for which yt-dlp should be skipped.

    The file is expected to hold one integer seed id per line. Blank lines
    are ignored, and lines that are not valid integers are skipped with a
    warning, so one stray line (e.g. a trailing empty line) no longer
    discards the whole list.

    Args:
        skip_av_seeds_file: path of the file to read; defaults to the
            location the original hard-coded.

    Returns:
        A set of int seed ids. The set is empty when the file cannot be
        opened or decoded.
    """
    # TODO: develop UI and refactor
    skip_av_seeds = set()
    try:
        with open(skip_av_seeds_file) as skips:
            # iterate the file lazily instead of materializing readlines()
            for line_number, line in enumerate(skips, start=1):
                seed_id = line.strip()
                if not seed_id:
                    continue
                try:
                    skip_av_seeds.add(int(seed_id))
                except ValueError:
                    logging.warning(
                        "ignoring invalid seed id %r at %s line %d",
                        seed_id,
                        skip_av_seeds_file,
                        line_number,
                    )
    except (OSError, UnicodeDecodeError) as e:
        # best effort: a missing or unreadable file means "skip nothing",
        # but say why so the fallback is visible in the logs
        logging.info("running with empty skip_av_seeds: %s", e)
        return set()
    logging.info("running with skip_av_seeds file %s", skip_av_seeds_file)
    return skip_av_seeds
|
||||
|
||||
rr = rethinker(args)
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
service_registry = doublethink.ServiceRegistry(rr)
|
||||
skip_av_seeds_from_file = get_skip_av_seeds()
|
||||
worker = brozzler.worker.BrozzlerWorker(
|
||||
frontier,
|
||||
service_registry,
|
||||
skip_av_seeds=skip_av_seeds_from_file,
|
||||
max_browsers=int(args.max_browsers),
|
||||
chrome_exe=args.chrome_exe,
|
||||
proxy=args.proxy,
|
||||
|
@ -2,7 +2,7 @@
|
||||
brozzler/models.py - model classes representing jobs, sites, and pages, with
|
||||
related logic
|
||||
|
||||
Copyright (C) 2014-2022 Internet Archive
|
||||
Copyright (C) 2014-2024 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@ -235,6 +235,8 @@ class Site(doublethink.Document, ElapsedMixIn):
|
||||
self.last_claimed = brozzler.EPOCH_UTC
|
||||
if not "scope" in self:
|
||||
self.scope = {}
|
||||
if not "skip_ytdlp" in self:
|
||||
self.skip_ytdlp = None
|
||||
|
||||
# backward compatibility
|
||||
if "surt" in self.scope:
|
||||
|
@ -54,6 +54,7 @@ class BrozzlerWorker:
|
||||
self,
|
||||
frontier,
|
||||
service_registry=None,
|
||||
skip_av_seeds=None,
|
||||
max_browsers=1,
|
||||
chrome_exe="chromium-browser",
|
||||
warcprox_auto=False,
|
||||
@ -73,6 +74,7 @@ class BrozzlerWorker:
|
||||
):
|
||||
self._frontier = frontier
|
||||
self._service_registry = service_registry
|
||||
self._skip_av_seeds = skip_av_seeds
|
||||
self._max_browsers = max_browsers
|
||||
|
||||
self._warcprox_auto = warcprox_auto
|
||||
@ -261,7 +263,7 @@ class BrozzlerWorker:
|
||||
except brozzler.PageInterstitialShown:
|
||||
self.logger.info("page interstitial shown (http auth): %s", page)
|
||||
|
||||
if enable_youtube_dl and ydl.should_ytdlp(page, site):
|
||||
if enable_youtube_dl and ydl.should_ytdlp(site, page, self._skip_av_seeds):
|
||||
try:
|
||||
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
|
||||
outlinks.update(ydl_outlinks)
|
||||
|
@ -32,9 +32,14 @@ import threading
|
||||
thread_local = threading.local()
|
||||
|
||||
|
||||
def should_ytdlp(page, site):
|
||||
def should_ytdlp(site, page, skip_av_seeds):
|
||||
# called only after we've passed needs_browsing() check
|
||||
|
||||
if page.status_code != 200:
|
||||
logging.info("skipping ytdlp: non-200 page status")
|
||||
return False
|
||||
if site.skip_ytdlp:
|
||||
logging.info("skipping ytdlp: site marked skip_ytdlp")
|
||||
return False
|
||||
|
||||
ytdlp_url = page.redirect_url if page.redirect_url else page.url
|
||||
@ -42,6 +47,21 @@ def should_ytdlp(page, site):
|
||||
if "chrome-error:" in ytdlp_url:
|
||||
return False
|
||||
|
||||
ytdlp_seed = (
|
||||
site["metadata"]["ait_seed_id"]
|
||||
if "metadata" in site and "ait_seed_id" in site["metadata"]
|
||||
else None
|
||||
)
|
||||
|
||||
# TODO: develop UI and refactor
|
||||
if ytdlp_seed:
|
||||
if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds:
|
||||
logging.info("skipping ytdlp: site in skip_av_seeds")
|
||||
site.skip_ytdlp = True
|
||||
return False
|
||||
else:
|
||||
site.skip_ytdlp = False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user