Mirror of https://github.com/internetarchive/brozzler.git (synced 2025-04-20 23:56:34 -04:00)
Merge branch 'bmiller/proxy_select' into qa
This commit is contained in:
commit a65c3a369a
@@ -129,9 +129,10 @@ def configure_logging(args):
|
||||
def suggest_default_chrome_exe():
|
||||
# mac os x application executable paths
|
||||
for path in [
|
||||
'/Applications/Thorium.app/Contents/MacOS/Thorium',
|
||||
'/Applications/Chromium.app/Contents/MacOS/Chromium',
|
||||
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome']:
|
||||
"/Applications/Thorium.app/Contents/MacOS/Thorium",
|
||||
"/Applications/Chromium.app/Contents/MacOS/Chromium",
|
||||
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
|
||||
]:
|
||||
if os.path.exists(path):
|
||||
return path
|
||||
|
||||
@@ -598,30 +599,31 @@ def brozzler_worker(argv=None):
|
||||
finally:
|
||||
signal.signal(signal.SIGQUIT, dump_state)
|
||||
|
||||
def get_proxy_endpoints():
|
||||
PROXY_ENDPOINTS_FILE = "/opt/local/brozzler/proxy_endpoints.txt"
|
||||
def get_ytdlp_proxy_endpoints():
|
||||
YTDLP_PROXY_ENDPOINTS_FILE = "/opt/local/brozzler/ytdlp_proxy_endpoints.txt"
|
||||
try:
|
||||
# make list from file
|
||||
with open(PROXY_ENDPOINTS_FILE) as endpoints:
|
||||
proxy_endpoints = [l for l in endpoints.readlines()]
|
||||
if proxy_endpoints:
|
||||
with open(YTDLP_PROXY_ENDPOINTS_FILE) as endpoints:
|
||||
ytdlp_proxy_endpoints = [l for l in endpoints.readlines()]
|
||||
if ytdlp_proxy_endpoints:
|
||||
logging.info(
|
||||
"running with proxy endpoints file %s" % PROXY_ENDPOINTS_FILE
|
||||
"running with ytdlp proxy endpoints file %s"
|
||||
% YTDLP_PROXY_ENDPOINTS_FILE
|
||||
)
|
||||
except Exception as e:
|
||||
proxy_endpoints = []
|
||||
ytdlp_proxy_endpoints = []
|
||||
logging.info("running with empty proxy endpoints file")
|
||||
return proxy_endpoints
|
||||
return ytdlp_proxy_endpoints
|
||||
|
||||
rr = rethinker(args)
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
service_registry = doublethink.ServiceRegistry(rr)
|
||||
skip_av_seeds_from_file = get_skip_av_seeds()
|
||||
proxy_endpoints_from_file = get_proxy_endpoints()
|
||||
ytdlp_proxy_endpoints_from_file = get_ytdlp_proxy_endpoints()
|
||||
worker = brozzler.worker.BrozzlerWorker(
|
||||
frontier,
|
||||
service_registry,
|
||||
proxy_endpoints=proxy_endpoints_from_file,
|
||||
ytdlp_proxy_endpoints=ytdlp_proxy_endpoints_from_file,
|
||||
max_browsers=int(args.max_browsers),
|
||||
chrome_exe=args.chrome_exe,
|
||||
proxy=args.proxy,
|
||||
|
@@ -3,7 +3,7 @@ brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning
|
||||
it runs yt-dlp on them, browses them and runs behaviors if appropriate,
|
||||
scopes and adds outlinks to the frontier
|
||||
|
||||
Copyright (C) 2014-2024 Internet Archive
|
||||
Copyright (C) 2014-2025 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@@ -57,7 +57,7 @@ class BrozzlerWorker:
|
||||
self,
|
||||
frontier,
|
||||
service_registry=None,
|
||||
proxy_endpoints=None,
|
||||
ytdlp_proxy_endpoints=None,
|
||||
max_browsers=1,
|
||||
chrome_exe="chromium-browser",
|
||||
warcprox_auto=False,
|
||||
@@ -81,7 +81,7 @@ class BrozzlerWorker:
|
||||
):
|
||||
self._frontier = frontier
|
||||
self._service_registry = service_registry
|
||||
self._proxy_endpoints = proxy_endpoints
|
||||
self._ytdlp_proxy_endpoints = ytdlp_proxy_endpoints
|
||||
self._max_browsers = max_browsers
|
||||
|
||||
self._warcprox_auto = warcprox_auto
|
||||
@@ -298,11 +298,11 @@ class BrozzlerWorker:
|
||||
except brozzler.PageInterstitialShown:
|
||||
self.logger.info("page interstitial shown (http auth): %s", page)
|
||||
|
||||
if enable_youtube_dl and ydl.should_ytdlp(
|
||||
site, page, status_code, self._proxy_endpoints
|
||||
):
|
||||
if enable_youtube_dl and ydl.should_ytdlp(site, page, status_code):
|
||||
try:
|
||||
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
|
||||
ydl_outlinks = ydl.do_youtube_dl(
|
||||
self, site, page, self._ytdlp_proxy_endpoints
|
||||
)
|
||||
metrics.brozzler_ydl_urls_checked.inc(1)
|
||||
outlinks.update(ydl_outlinks)
|
||||
except brozzler.ReachedLimit as e:
|
||||
|
@@ -1,7 +1,7 @@
|
||||
"""
|
||||
brozzler/ydl.py - youtube-dl / yt-dlp support for brozzler
|
||||
|
||||
Copyright (C) 2024 Internet Archive
|
||||
Copyright (C) 2024-2025 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@@ -30,6 +30,7 @@ import datetime
|
||||
|
||||
from . import metrics
|
||||
|
||||
import random
|
||||
import threading
|
||||
import traceback
|
||||
import doublethink
|
||||
@@ -42,7 +43,8 @@ PROXY_ATTEMPTS = 4
|
||||
YTDLP_WAIT = 10
|
||||
YTDLP_MAX_REDIRECTS = 5
|
||||
|
||||
def should_ytdlp(site, page, page_status, proxy_endpoints):
|
||||
|
||||
def should_ytdlp(site, page, page_status):
|
||||
# called only after we've passed needs_browsing() check
|
||||
|
||||
if page_status != 200:
|
||||
@@ -62,6 +64,7 @@ def should_ytdlp(site, page, page_status, proxy_endpoints):
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def isyoutubehost(url):
|
||||
# split 1 splits scheme from url, split 2 splits path from hostname, split 3 splits query string on hostname
|
||||
return "youtube.com" in url.split("//")[-1].split("/")[0].split("?")[0]
|
||||
@@ -80,7 +83,7 @@ class ExtraHeaderAdder(urllib.request.BaseHandler):
|
||||
return req
|
||||
|
||||
|
||||
def _build_youtube_dl(worker, destdir, site, page):
|
||||
def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
|
||||
"""
|
||||
Builds a yt-dlp `yt_dlp.YoutubeDL` for brozzling `site` with `worker`.
|
||||
|
||||
@@ -297,8 +300,8 @@ def _build_youtube_dl(worker, destdir, site, page):
|
||||
|
||||
ytdlp_url = page.redirect_url if page.redirect_url else page.url
|
||||
is_youtube_host = isyoutubehost(ytdlp_url)
|
||||
if is_youtube_host and proxy_endpoints:
|
||||
ydl_opts["proxy"] = random.choice(proxy_endpoints)
|
||||
if is_youtube_host and ytdlp_proxy_endpoints:
|
||||
ydl_opts["proxy"] = random.choice(ytdlp_proxy_endpoints)
|
||||
# don't log proxy value secrets
|
||||
ytdlp_proxy_for_logs = (
|
||||
ydl_opts["proxy"].split("@")[1] if "@" in ydl_opts["proxy"] else "@@@"
|
||||
@@ -425,7 +428,7 @@ def _try_youtube_dl(worker, ydl, site, page):
|
||||
|
||||
@metrics.brozzler_ytdlp_duration_seconds.time()
|
||||
@metrics.brozzler_in_progress_ytdlps.track_inprogress()
|
||||
def do_youtube_dl(worker, site, page):
|
||||
def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints):
|
||||
"""
|
||||
Runs yt-dlp configured for `worker` and `site` to download videos from
|
||||
`page`.
|
||||
@@ -442,7 +445,7 @@ def do_youtube_dl(worker, site, page):
|
||||
prefix="brzl-ydl-", dir=worker._ytdlp_tmpdir
|
||||
) as tempdir:
|
||||
logging.info("tempdir for yt-dlp: %s", tempdir)
|
||||
ydl = _build_youtube_dl(worker, tempdir, site, page)
|
||||
ydl = _build_youtube_dl(worker, tempdir, site, page, ytdlp_proxy_endpoints)
|
||||
ie_result = _try_youtube_dl(worker, ydl, site, page)
|
||||
outlinks = set()
|
||||
if ie_result and (
|
||||
|
Loading…
x
Reference in New Issue
Block a user