mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
Merge branch 'rotary_skip_ytdlp' into qa
This commit is contained in:
commit
f9ad480fae
@ -52,6 +52,7 @@
|
||||
-
|
||||
url_regex: '^https?://www.opengov\.nsw\.gov\.au/publications.*$'
|
||||
behavior_js_template: umbraBehavior.js.j2
|
||||
request_idle_timeout_sec: 10
|
||||
default_parameters:
|
||||
interval: 1000
|
||||
actions:
|
||||
|
@ -2,7 +2,7 @@
|
||||
"""
|
||||
brozzler/cli.py - brozzler command line executables
|
||||
|
||||
Copyright (C) 2014-2023 Internet Archive
|
||||
Copyright (C) 2014-2024 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@ -544,12 +544,25 @@ def brozzler_worker(argv=None):
|
||||
finally:
|
||||
signal.signal(signal.SIGQUIT, dump_state)
|
||||
|
||||
def get_skip_av_seeds(skip_av_seeds_file="/opt/local/brozzler/skip_av_seeds.txt"):
    """Load the set of seed ids for which audio/video capture is skipped.

    The file is expected to hold one integer seed id per line; blank lines
    are ignored. Ids are converted to ``int`` so that membership tests
    against numeric seed ids succeed (raw lines would be strings with a
    trailing newline and never match).

    Args:
        skip_av_seeds_file: path of the skip list. Defaults to the
            deployment location used by the worker.

    Returns:
        A set of int seed ids, or an empty set when the file is missing,
        unreadable, or contains a line that is not an integer.
    """
    try:
        with open(skip_av_seeds_file) as skips:
            # iterate the file lazily; strip the newline before parsing
            skip_av_seeds = {int(line) for line in skips if line.strip()}
        logging.info("running with skip_av_seeds file %s", skip_av_seeds_file)
    except (OSError, ValueError) as e:
        # Best effort: a missing or malformed list must not stop the worker,
        # but record why it was not used so the failure is not silent.
        logging.debug(
            "could not load skip_av_seeds file %s: %s", skip_av_seeds_file, e
        )
        skip_av_seeds = set()
        logging.info("running with empty skip_av_seeds")
    return skip_av_seeds
|
||||
|
||||
rr = rethinker(args)
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
service_registry = doublethink.ServiceRegistry(rr)
|
||||
skip_av_seeds = get_skip_av_seeds()
|
||||
worker = brozzler.worker.BrozzlerWorker(
|
||||
frontier,
|
||||
service_registry,
|
||||
skip_av_seeds,
|
||||
max_browsers=int(args.max_browsers),
|
||||
chrome_exe=args.chrome_exe,
|
||||
proxy=args.proxy,
|
||||
|
@ -2,7 +2,7 @@
|
||||
brozzler/models.py - model classes representing jobs, sites, and pages, with
|
||||
related logic
|
||||
|
||||
Copyright (C) 2014-2022 Internet Archive
|
||||
Copyright (C) 2014-2024 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@ -235,6 +235,8 @@ class Site(doublethink.Document, ElapsedMixIn):
|
||||
self.last_claimed = brozzler.EPOCH_UTC
|
||||
if not "scope" in self:
|
||||
self.scope = {}
|
||||
if not "skip_ytdlp" in self:
|
||||
self.skip_ytdlp = False
|
||||
|
||||
# backward compatibility
|
||||
if "surt" in self.scope:
|
||||
|
@ -54,6 +54,7 @@ class BrozzlerWorker:
|
||||
self,
|
||||
frontier,
|
||||
service_registry=None,
|
||||
skip_av_seeds=None,
|
||||
max_browsers=1,
|
||||
chrome_exe="chromium-browser",
|
||||
warcprox_auto=False,
|
||||
@ -262,7 +263,7 @@ class BrozzlerWorker:
|
||||
except brozzler.PageInterstitialShown:
|
||||
self.logger.info("page interstitial shown (http auth): %s", page)
|
||||
|
||||
if enable_youtube_dl and ydl.should_ytdlp(page, site):
|
||||
if enable_youtube_dl and ydl.should_ytdlp(self, site, page):
|
||||
try:
|
||||
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
|
||||
outlinks.update(ydl_outlinks)
|
||||
|
122
brozzler/ydl.py
122
brozzler/ydl.py
@ -35,116 +35,6 @@ import doublethink
|
||||
|
||||
thread_local = threading.local()
|
||||
|
||||
skip_ytdlp_seeds = {
|
||||
166569,
|
||||
166570,
|
||||
166571,
|
||||
166572,
|
||||
166573,
|
||||
171054,
|
||||
577504,
|
||||
577505,
|
||||
577506,
|
||||
577507,
|
||||
577508,
|
||||
579556,
|
||||
588597,
|
||||
588599,
|
||||
588604,
|
||||
657452,
|
||||
680067,
|
||||
931642,
|
||||
1020763,
|
||||
1020845,
|
||||
1102795,
|
||||
1126155,
|
||||
1355999,
|
||||
1356000,
|
||||
1356001,
|
||||
1356002,
|
||||
1356003,
|
||||
1356004,
|
||||
1381601,
|
||||
1400183,
|
||||
1407124,
|
||||
1430611,
|
||||
1561452,
|
||||
2181615,
|
||||
2277187,
|
||||
2287692,
|
||||
2293805,
|
||||
2315198,
|
||||
2320887,
|
||||
2320889,
|
||||
2320890,
|
||||
2320891,
|
||||
2320892,
|
||||
2451964,
|
||||
2451965,
|
||||
2517850,
|
||||
2517851,
|
||||
2517852,
|
||||
2518225,
|
||||
2518226,
|
||||
2518227,
|
||||
2518228,
|
||||
2528222,
|
||||
2528223,
|
||||
2528224,
|
||||
2528225,
|
||||
2528227,
|
||||
2528800,
|
||||
2528801,
|
||||
2528802,
|
||||
2528803,
|
||||
2528847,
|
||||
2528848,
|
||||
2528849,
|
||||
2528850,
|
||||
2528851,
|
||||
2528852,
|
||||
2528853,
|
||||
2528854,
|
||||
2530393,
|
||||
2530394,
|
||||
2530395,
|
||||
2530396,
|
||||
2530397,
|
||||
2530398,
|
||||
2530399,
|
||||
2530400,
|
||||
2530401,
|
||||
2530402,
|
||||
2530403,
|
||||
2530404,
|
||||
2530408,
|
||||
2530409,
|
||||
2530410,
|
||||
2530411,
|
||||
2530412,
|
||||
2530413,
|
||||
2530414,
|
||||
2530415,
|
||||
2530416,
|
||||
2530417,
|
||||
2530418,
|
||||
2530419,
|
||||
2553200,
|
||||
2553201,
|
||||
2553202,
|
||||
2553203,
|
||||
2553204,
|
||||
2634329,
|
||||
2826641,
|
||||
2894571,
|
||||
2895333,
|
||||
3062930,
|
||||
3084847,
|
||||
3085989,
|
||||
3223637,
|
||||
3223656,
|
||||
}
|
||||
|
||||
|
||||
def _timestamp4datetime(timestamp):
|
||||
"""split `timestamp` into a tuple of 6 integers.
|
||||
@ -167,13 +57,17 @@ def should_ytdlp(page, site):
|
||||
logging.info("skipping ytdlp: non-200 page status")
|
||||
return False
|
||||
if site.skip_ytdlp:
|
||||
logging.info("skipping ytdlp: site marked skip_ytdp")
|
||||
logging.info("skipping ytdlp: site marked skip_ytdlp")
|
||||
return False
|
||||
|
||||
ytdlp_seed = site["metadata"]["ait_seed_id"]
|
||||
ytdlp_seed = (
|
||||
site["metadata"]["ait_seed_id"]
|
||||
if "metadata" in site and "ait_seed_id" in site["metadata"]
|
||||
else None
|
||||
)
|
||||
|
||||
if ytdlp_seed in skip_ytdlp_seeds:
|
||||
logging.info("skipping ytdlp: site in skip_ytdlp_seeds")
|
||||
if ytdlp_seed and ytdlp_seed in worker.skip_av_seeds:
|
||||
logging.info("skipping ytdlp: site in skip_av_seeds")
|
||||
site.skip_ytdlp = True
|
||||
return False
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user