Merge branch 'rotary_skip_ytdlp' into qa

This commit is contained in:
Barbara Miller 2024-05-29 17:30:07 -07:00
commit f9ad480fae
6 changed files with 29 additions and 118 deletions

View File

@ -52,6 +52,7 @@
-
url_regex: '^https?://www.opengov\.nsw\.gov\.au/publications.*$'
behavior_js_template: umbraBehavior.js.j2
request_idle_timeout_sec: 10
default_parameters:
interval: 1000
actions:

View File

@ -2,7 +2,7 @@
"""
brozzler/cli.py - brozzler command line executables
Copyright (C) 2014-2023 Internet Archive
Copyright (C) 2014-2024 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -544,12 +544,25 @@ def brozzler_worker(argv=None):
finally:
signal.signal(signal.SIGQUIT, dump_state)
def get_skip_av_seeds(skip_av_seeds_file="/opt/local/brozzler/skip_av_seeds.txt"):
    """
    Load the set of seed ids for which audio/video capture is skipped.

    The file is read as text with one seed id per line. Surrounding
    whitespace is ignored and blank lines are skipped. Lines that are not
    integers are logged and ignored, so one bad entry does not discard the
    whole list.

    Args:
        skip_av_seeds_file: path of the skip list. Defaults to the
            location this function previously had hard-coded.

    Returns:
        A set of int seed ids. The set is empty when the file is missing
        or cannot be read; running without a skip list is treated as a
        normal, best-effort condition rather than an error.
    """
    skip_av_seeds = set()
    try:
        with open(skip_av_seeds_file) as skips:
            for line in skips:
                entry = line.strip()
                if not entry:
                    continue
                try:
                    # Parsed to int because the skip list this file replaces
                    # held integer seed ids; presumably the ids it is
                    # compared against are ints too -- TODO confirm.
                    skip_av_seeds.add(int(entry))
                except ValueError:
                    logging.warning(
                        "ignoring malformed seed id %r in %s",
                        entry,
                        skip_av_seeds_file,
                    )
    except (OSError, ValueError):
        # OSError: file missing or unreadable. ValueError here can only
        # come from decoding the file (UnicodeDecodeError), since per-line
        # parse errors are handled above.
        logging.info("running with empty skip_av_seeds")
        return set()
    logging.info("running with skip_av_seeds file %s", skip_av_seeds_file)
    return skip_av_seeds
rr = rethinker(args)
frontier = brozzler.RethinkDbFrontier(rr)
service_registry = doublethink.ServiceRegistry(rr)
skip_av_seeds = get_skip_av_seeds()
worker = brozzler.worker.BrozzlerWorker(
frontier,
service_registry,
skip_av_seeds,
max_browsers=int(args.max_browsers),
chrome_exe=args.chrome_exe,
proxy=args.proxy,

View File

@ -2,7 +2,7 @@
brozzler/models.py - model classes representing jobs, sites, and pages, with
related logic
Copyright (C) 2014-2022 Internet Archive
Copyright (C) 2014-2024 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -235,6 +235,8 @@ class Site(doublethink.Document, ElapsedMixIn):
self.last_claimed = brozzler.EPOCH_UTC
if not "scope" in self:
self.scope = {}
if not "skip_ytdlp" in self:
self.skip_ytdlp = False
# backward compatibility
if "surt" in self.scope:

View File

@ -54,6 +54,7 @@ class BrozzlerWorker:
self,
frontier,
service_registry=None,
skip_av_seeds=None,
max_browsers=1,
chrome_exe="chromium-browser",
warcprox_auto=False,
@ -262,7 +263,7 @@ class BrozzlerWorker:
except brozzler.PageInterstitialShown:
self.logger.info("page interstitial shown (http auth): %s", page)
if enable_youtube_dl and ydl.should_ytdlp(page, site):
if enable_youtube_dl and ydl.should_ytdlp(self, site, page):
try:
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
outlinks.update(ydl_outlinks)

View File

@ -35,116 +35,6 @@ import doublethink
thread_local = threading.local()
# Hard-coded set of integer seed ids for which yt-dlp is skipped.
# should_ytdlp looks up the site's metadata "ait_seed_id" in this set and,
# on a match, logs "skipping ytdlp: site in skip_ytdlp_seeds".
skip_ytdlp_seeds = {
166569,
166570,
166571,
166572,
166573,
171054,
577504,
577505,
577506,
577507,
577508,
579556,
588597,
588599,
588604,
657452,
680067,
931642,
1020763,
1020845,
1102795,
1126155,
1355999,
1356000,
1356001,
1356002,
1356003,
1356004,
1381601,
1400183,
1407124,
1430611,
1561452,
2181615,
2277187,
2287692,
2293805,
2315198,
2320887,
2320889,
2320890,
2320891,
2320892,
2451964,
2451965,
2517850,
2517851,
2517852,
2518225,
2518226,
2518227,
2518228,
2528222,
2528223,
2528224,
2528225,
2528227,
2528800,
2528801,
2528802,
2528803,
2528847,
2528848,
2528849,
2528850,
2528851,
2528852,
2528853,
2528854,
2530393,
2530394,
2530395,
2530396,
2530397,
2530398,
2530399,
2530400,
2530401,
2530402,
2530403,
2530404,
2530408,
2530409,
2530410,
2530411,
2530412,
2530413,
2530414,
2530415,
2530416,
2530417,
2530418,
2530419,
2553200,
2553201,
2553202,
2553203,
2553204,
2634329,
2826641,
2894571,
2895333,
3062930,
3084847,
3085989,
3223637,
3223656,
}
def _timestamp4datetime(timestamp):
"""split `timestamp` into a tuple of 6 integers.
@ -167,13 +57,17 @@ def should_ytdlp(page, site):
logging.info("skipping ytdlp: non-200 page status")
return False
if site.skip_ytdlp:
logging.info("skipping ytdlp: site marked skip_ytdp")
logging.info("skipping ytdlp: site marked skip_ytdlp")
return False
ytdlp_seed = site["metadata"]["ait_seed_id"]
ytdlp_seed = (
site["metadata"]["ait_seed_id"]
if "metadata" in site and "ait_seed_id" in site["metadata"]
else None
)
if ytdlp_seed in skip_ytdlp_seeds:
logging.info("skipping ytdlp: site in skip_ytdlp_seeds")
if ytdlp_seed and ytdlp_seed in worker.skip_av_seeds:
logging.info("skipping ytdlp: site in skip_av_seeds")
site.skip_ytdlp = True
return False

View File

@ -34,7 +34,7 @@ def find_package_data(package):
setuptools.setup(
name="brozzler",
version="1.5.48a0",
version="1.5.49a0",
description="Distributed web crawling with browsers",
url="https://github.com/internetarchive/brozzler",
author="Noah Levitt",