mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
skip_av_seeds
This commit is contained in:
parent
c66fec4c6d
commit
2e560d3c18
@ -2,7 +2,7 @@
|
||||
"""
|
||||
brozzler/cli.py - brozzler command line executables
|
||||
|
||||
Copyright (C) 2014-2023 Internet Archive
|
||||
Copyright (C) 2014-2024 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@ -544,12 +544,25 @@ def brozzler_worker(argv=None):
|
||||
finally:
|
||||
signal.signal(signal.SIGQUIT, dump_state)
|
||||
|
||||
def get_skip_av_seeds(skip_av_seeds_file="/opt/local/brozzler/skip_av_seeds.txt"):
    """Load the set of seed ids for which audio/video capture is skipped.

    The file is expected to hold one integer seed id per line. Surrounding
    whitespace and blank lines are ignored, and lines that are not valid
    integers are logged and skipped. Ids are returned as ``int`` so that a
    membership test against a numeric seed id can succeed; raw lines
    (strings with a trailing newline) would never match.

    Args:
        skip_av_seeds_file: path of the seed list to read. Defaults to the
            deployment location used by the original hard-coded path.

    Returns:
        A set of integer seed ids. The set is empty when the file cannot be
        opened or decoded; loading is best-effort and never raises for
        those conditions.
    """
    skip_av_seeds = set()
    try:
        with open(skip_av_seeds_file) as skips:
            for line in skips:
                seed = line.strip()
                if not seed:
                    continue
                try:
                    skip_av_seeds.add(int(seed))
                except ValueError:
                    # One bad line should not discard the whole list.
                    logging.warning(
                        "ignoring malformed seed id %r in %s",
                        seed,
                        skip_av_seeds_file,
                    )
    except (OSError, UnicodeDecodeError) as e:
        # A missing or unreadable file is a normal configuration: run
        # without any skip list instead of failing worker start-up.
        logging.debug("could not read %s: %s", skip_av_seeds_file, e)
        logging.info("running with empty skip_av_seeds")
        return set()
    logging.info("running with skip_av_seeds file %s", skip_av_seeds_file)
    return skip_av_seeds
|
||||
|
||||
rr = rethinker(args)
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
service_registry = doublethink.ServiceRegistry(rr)
|
||||
skip_av_seeds = get_skip_av_seeds()
|
||||
worker = brozzler.worker.BrozzlerWorker(
|
||||
frontier,
|
||||
service_registry,
|
||||
skip_av_seeds,
|
||||
max_browsers=int(args.max_browsers),
|
||||
chrome_exe=args.chrome_exe,
|
||||
proxy=args.proxy,
|
||||
|
@ -54,6 +54,7 @@ class BrozzlerWorker:
|
||||
self,
|
||||
frontier,
|
||||
service_registry=None,
|
||||
skip_av_seeds=None,
|
||||
max_browsers=1,
|
||||
chrome_exe="chromium-browser",
|
||||
warcprox_auto=False,
|
||||
@ -261,7 +262,7 @@ class BrozzlerWorker:
|
||||
except brozzler.PageInterstitialShown:
|
||||
self.logger.info("page interstitial shown (http auth): %s", page)
|
||||
|
||||
if enable_youtube_dl and ydl.should_ytdlp(page, site):
|
||||
if enable_youtube_dl and ydl.should_ytdlp(self, site, page):
|
||||
try:
|
||||
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
|
||||
outlinks.update(ydl_outlinks)
|
||||
|
127
brozzler/ydl.py
127
brozzler/ydl.py
@ -31,132 +31,25 @@ import threading
|
||||
|
||||
thread_local = threading.local()
|
||||
|
||||
skip_ytdlp_seeds = {
|
||||
166569,
|
||||
166570,
|
||||
166571,
|
||||
166572,
|
||||
166573,
|
||||
171054,
|
||||
577504,
|
||||
577505,
|
||||
577506,
|
||||
577507,
|
||||
577508,
|
||||
579556,
|
||||
588597,
|
||||
588599,
|
||||
588604,
|
||||
657452,
|
||||
680067,
|
||||
931642,
|
||||
1020763,
|
||||
1020845,
|
||||
1102795,
|
||||
1126155,
|
||||
1355999,
|
||||
1356000,
|
||||
1356001,
|
||||
1356002,
|
||||
1356003,
|
||||
1356004,
|
||||
1381601,
|
||||
1400183,
|
||||
1407124,
|
||||
1430611,
|
||||
1561452,
|
||||
2181615,
|
||||
2277187,
|
||||
2287692,
|
||||
2293805,
|
||||
2315198,
|
||||
2320887,
|
||||
2320889,
|
||||
2320890,
|
||||
2320891,
|
||||
2320892,
|
||||
2451964,
|
||||
2451965,
|
||||
2517850,
|
||||
2517851,
|
||||
2517852,
|
||||
2518225,
|
||||
2518226,
|
||||
2518227,
|
||||
2518228,
|
||||
2528222,
|
||||
2528223,
|
||||
2528224,
|
||||
2528225,
|
||||
2528227,
|
||||
2528800,
|
||||
2528801,
|
||||
2528802,
|
||||
2528803,
|
||||
2528847,
|
||||
2528848,
|
||||
2528849,
|
||||
2528850,
|
||||
2528851,
|
||||
2528852,
|
||||
2528853,
|
||||
2528854,
|
||||
2530393,
|
||||
2530394,
|
||||
2530395,
|
||||
2530396,
|
||||
2530397,
|
||||
2530398,
|
||||
2530399,
|
||||
2530400,
|
||||
2530401,
|
||||
2530402,
|
||||
2530403,
|
||||
2530404,
|
||||
2530408,
|
||||
2530409,
|
||||
2530410,
|
||||
2530411,
|
||||
2530412,
|
||||
2530413,
|
||||
2530414,
|
||||
2530415,
|
||||
2530416,
|
||||
2530417,
|
||||
2530418,
|
||||
2530419,
|
||||
2553200,
|
||||
2553201,
|
||||
2553202,
|
||||
2553203,
|
||||
2553204,
|
||||
2634329,
|
||||
2826641,
|
||||
2894571,
|
||||
2895333,
|
||||
3062930,
|
||||
3084847,
|
||||
3085989,
|
||||
3223637,
|
||||
3223656,
|
||||
}
|
||||
|
||||
|
||||
def should_ytdlp(page, site):
|
||||
def should_ytdlp(worker, site, page):
|
||||
# called only after we've passed needs_browsing() check
|
||||
if page.status_code != 200:
|
||||
logging.info("skipping ytdlp: non-200 page status")
|
||||
return False
|
||||
if site.skip_ytdlp:
|
||||
logging.info("skipping ytdlp: site marked skip_ytdp")
|
||||
logging.info("skipping ytdlp: site marked skip_ytdlp")
|
||||
return False
|
||||
|
||||
ytdlp_seed = site["metadata"]["ait_seed_id"]
|
||||
ytdlp_seed = site["metadata"]["ait_seed_id"] if "metadata" in site and "ait_seed_id" in site["metadata"] else None
|
||||
|
||||
if ytdlp_seed in skip_ytdlp_seeds:
|
||||
logging.info("skipping ytdlp: site in skip_ytdlp_seeds")
|
||||
site.skip_ytdlp = True
|
||||
return False
|
||||
if ytdlp_seed and not site.skip_ytdlp:
|
||||
if ytdlp_seed in worker.skip_av_seeds:
|
||||
logging.info("skipping ytdlp: site in skip_av_seeds")
|
||||
site.skip_ytdlp = True
|
||||
return False
|
||||
else:
|
||||
site.skip_ytdlp = False
|
||||
|
||||
ytdlp_url = page.redirect_url if page.redirect_url else page.url
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user