mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-08-08 14:32:23 -04:00
yt-dlp proxy handling update
This commit is contained in:
parent
eb922f5155
commit
170377fe89
3 changed files with 23 additions and 7 deletions
|
@ -2,7 +2,7 @@
|
||||||
"""
|
"""
|
||||||
brozzler/cli.py - brozzler command line executables
|
brozzler/cli.py - brozzler command line executables
|
||||||
|
|
||||||
Copyright (C) 2014-2024 Internet Archive
|
Copyright (C) 2014-2025 Internet Archive
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
|
@ -611,14 +611,29 @@ def brozzler_worker(argv=None):
|
||||||
logging.info("running with empty skip_av_seeds")
|
logging.info("running with empty skip_av_seeds")
|
||||||
return skip_av_seeds
|
return skip_av_seeds
|
||||||
|
|
||||||
|
def get_proxy_endpoints(
    proxy_endpoints_file="/opt/local/brozzler/proxy_endpoints.txt",
):
    """Load the list of proxy endpoints from a text file.

    The file is read one endpoint per line. Surrounding whitespace
    (including the trailing newline) is stripped from every entry and blank
    lines are skipped, so each returned value can be used directly as a
    proxy URL.

    Loading is best-effort: if the file is missing or unreadable, an empty
    list is returned instead of raising.

    Args:
        proxy_endpoints_file: path of the endpoints file; defaults to the
            location this function has always read from.

    Returns:
        A list of non-empty endpoint strings, possibly empty.
    """
    try:
        with open(proxy_endpoints_file) as endpoints:
            # Strip each line: readlines() keeps the trailing "\n", which
            # would otherwise end up inside the proxy URL.
            stripped = (line.strip() for line in endpoints)
            proxy_endpoints = [endpoint for endpoint in stripped if endpoint]
    except (OSError, UnicodeError):
        # Missing/unreadable file is expected on hosts without proxies.
        proxy_endpoints = []

    if proxy_endpoints:
        logging.info(
            "running with proxy endpoints file %s", proxy_endpoints_file
        )
    else:
        logging.info("running with empty proxy endpoints file")
    return proxy_endpoints
|
||||||
|
|
||||||
rr = rethinker(args)
|
rr = rethinker(args)
|
||||||
frontier = brozzler.RethinkDbFrontier(rr)
|
frontier = brozzler.RethinkDbFrontier(rr)
|
||||||
service_registry = doublethink.ServiceRegistry(rr)
|
service_registry = doublethink.ServiceRegistry(rr)
|
||||||
skip_av_seeds_from_file = get_skip_av_seeds()
|
skip_av_seeds_from_file = get_skip_av_seeds()
|
||||||
|
proxy_endpoints_from_file = get_proxy_endpoints()
|
||||||
worker = brozzler.worker.BrozzlerWorker(
|
worker = brozzler.worker.BrozzlerWorker(
|
||||||
frontier,
|
frontier,
|
||||||
service_registry,
|
service_registry,
|
||||||
skip_av_seeds=skip_av_seeds_from_file,
|
skip_av_seeds=skip_av_seeds_from_file,
|
||||||
|
proxy_endpoints=proxy_endpoints_from_file,
|
||||||
max_browsers=int(args.max_browsers),
|
max_browsers=int(args.max_browsers),
|
||||||
chrome_exe=args.chrome_exe,
|
chrome_exe=args.chrome_exe,
|
||||||
proxy=args.proxy,
|
proxy=args.proxy,
|
||||||
|
|
|
@ -57,6 +57,7 @@ class BrozzlerWorker:
|
||||||
frontier,
|
frontier,
|
||||||
service_registry=None,
|
service_registry=None,
|
||||||
skip_av_seeds=None,
|
skip_av_seeds=None,
|
||||||
|
proxy_endpoints=None,
|
||||||
max_browsers=1,
|
max_browsers=1,
|
||||||
chrome_exe="chromium-browser",
|
chrome_exe="chromium-browser",
|
||||||
warcprox_auto=False,
|
warcprox_auto=False,
|
||||||
|
@ -81,6 +82,7 @@ class BrozzlerWorker:
|
||||||
self._frontier = frontier
|
self._frontier = frontier
|
||||||
self._service_registry = service_registry
|
self._service_registry = service_registry
|
||||||
self._skip_av_seeds = skip_av_seeds
|
self._skip_av_seeds = skip_av_seeds
|
||||||
|
self._proxy_endpoints = proxy_endpoints
|
||||||
self._max_browsers = max_browsers
|
self._max_browsers = max_browsers
|
||||||
|
|
||||||
self._warcprox_auto = warcprox_auto
|
self._warcprox_auto = warcprox_auto
|
||||||
|
@ -287,7 +289,7 @@ class BrozzlerWorker:
|
||||||
self.logger.info("page interstitial shown (http auth): %s", page)
|
self.logger.info("page interstitial shown (http auth): %s", page)
|
||||||
|
|
||||||
if enable_youtube_dl and ydl.should_ytdlp(
|
if enable_youtube_dl and ydl.should_ytdlp(
|
||||||
site, page, status_code, self._skip_av_seeds
|
site, page, status_code, self._skip_av_seeds, self._proxy_endpoints
|
||||||
):
|
):
|
||||||
try:
|
try:
|
||||||
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
|
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
|
||||||
|
|
|
@ -34,12 +34,11 @@ import time
|
||||||
thread_local = threading.local()
|
thread_local = threading.local()
|
||||||
|
|
||||||
|
|
||||||
YTDLP_PROXY = ""
|
|
||||||
PROXY_ATTEMPTS = 4
|
PROXY_ATTEMPTS = 4
|
||||||
YTDLP_WAIT = 10
|
YTDLP_WAIT = 10
|
||||||
|
|
||||||
|
|
||||||
def should_ytdlp(site, page, page_status, skip_av_seeds):
|
def should_ytdlp(site, page, page_status, skip_av_seeds, proxy_endpoints):
|
||||||
# called only after we've passed needs_browsing() check
|
# called only after we've passed needs_browsing() check
|
||||||
|
|
||||||
if page_status != 200:
|
if page_status != 200:
|
||||||
|
@ -285,11 +284,11 @@ def _build_youtube_dl(worker, destdir, site, page):
|
||||||
|
|
||||||
ytdlp_url = page.redirect_url if page.redirect_url else page.url
|
ytdlp_url = page.redirect_url if page.redirect_url else page.url
|
||||||
is_youtube_host = isyoutubehost(ytdlp_url)
|
is_youtube_host = isyoutubehost(ytdlp_url)
|
||||||
if is_youtube_host and YTDLP_PROXY:
|
if is_youtube_host and proxy_endpoints:
|
||||||
ydl_opts["proxy"] = YTDLP_PROXY
|
ydl_opts["proxy"] = random.choice(proxy_endpoints)
|
||||||
# don't log proxy value secrets
|
# don't log proxy value secrets
|
||||||
ytdlp_proxy_for_logs = (
|
ytdlp_proxy_for_logs = (
|
||||||
YTDLP_PROXY.split("@")[1] if "@" in YTDLP_PROXY else "@@@"
|
ydl_opts["proxy"].split("@")[1] if "@" in ydl_opts["proxy"] else "@@@"
|
||||||
)
|
)
|
||||||
logging.info("using yt-dlp proxy ... %s", ytdlp_proxy_for_logs)
|
logging.info("using yt-dlp proxy ... %s", ytdlp_proxy_for_logs)
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue