Merge branch 'proxyrack' into qa

This commit is contained in:
Barbara Miller 2024-09-11 09:54:17 -07:00
commit 39c677007e

View File

@ -32,9 +32,14 @@ from cassandra.cluster import Cluster
import threading
import traceback
import doublethink
import time
thread_local = threading.local()
PROXYRACK_PROXY = "@@@"
MAX_YTDLP_ATTEMPTS = 4
YTDLP_WAIT = 10
def _timestamp4datetime(timestamp):
"""split `timestamp` into a tuple of 6 integers.
@ -311,6 +316,7 @@ def _build_youtube_dl(worker, destdir, site, page):
"logger": logging.getLogger("yt_dlp"),
"verbose": False,
"quiet": False,
"proxy": PROXYRACK_PROXY,
}
# skip proxying yt-dlp v.2023.07.06
@ -344,6 +350,8 @@ def _remember_videos(page, pushed_videos=None):
def _try_youtube_dl(worker, ydl, site, page):
ytdlp_url = page.redirect_url if page.redirect_url else page.url
attempt = 0
while attempt < MAX_YTDLP_ATTEMPTS:
try:
logging.info("trying yt-dlp on %s", ytdlp_url)
@ -355,27 +363,14 @@ def _try_youtube_dl(worker, ydl, site, page):
ie_result = ydl.sanitize_info(
ydl.extract_info(str(urlcanon.whatwg(ytdlp_url)))
)
_remember_videos(page, ydl.pushed_videos)
if worker._using_warcprox(site):
info_json = json.dumps(ie_result, sort_keys=True, indent=4)
logging.info(
"sending WARCPROX_WRITE_RECORD request to warcprox "
"with yt-dlp json for %s",
ytdlp_url,
)
worker._warcprox_write_record(
warcprox_address=worker._proxy_for(site),
url="youtube-dl:%s" % str(urlcanon.semantic(ytdlp_url)),
warc_type="metadata",
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
payload=info_json.encode("utf-8"),
extra_headers=site.extra_headers(page),
)
return ie_result
break
except brozzler.ShutdownRequested as e:
raise
except Exception as e:
if hasattr(e, "exc_info") and e.exc_info[0] == yt_dlp.utils.UnsupportedError:
if (
hasattr(e, "exc_info")
and e.exc_info[0] == yt_dlp.utils.UnsupportedError
):
return None
elif (
hasattr(e, "exc_info")
@ -384,17 +379,81 @@ def _try_youtube_dl(worker, ydl, site, page):
and e.exc_info[1].code == 420
):
raise brozzler.ReachedLimit(e.exc_info[1])
elif (
else:
# OSError('Tunnel connection failed: 464 Host Not Allowed') (caused by ProxyError...)
# and others...
attempt += 1
if attempt == MAX_YTDLP_ATTEMPTS:
logging.warning(
"Failed after %s attempts. Error: %s", MAX_YTDLP_ATTEMPTS, e
)
raise brozzler.ProxyError(
"yt-dlp hit proxyrack proxy error from %s" % ytdlp_url
)
else:
logging.info(
"Attempt %s failed. Retrying in %s seconds...",
attempt,
YTDLP_WAIT,
)
time.sleep(YTDLP_WAIT)
else:
raise brozzler.ProxyError(
"Proxyrack proxy attempt(s) failed for unknown reason(s)"
)
logging.info("ytdlp completed successfully")
_remember_videos(page, ydl.pushed_videos)
if worker._using_warcprox(site):
info_json = json.dumps(ie_result, sort_keys=True, indent=4)
logging.info(
"sending WARCPROX_WRITE_RECORD request to warcprox "
"with yt-dlp json for %s",
ytdlp_url,
)
attempt = 0
while attempt < MAX_YTDLP_ATTEMPTS:
try:
worker._warcprox_write_record(
warcprox_address=worker._proxy_for(site),
url="youtube-dl:%s" % str(urlcanon.semantic(ytdlp_url)),
warc_type="metadata",
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
payload=info_json.encode("utf-8"),
extra_headers=site.extra_headers(page),
)
break
except Exception as e:
# connection problem when using a proxy == proxy error
if (
hasattr(e, "exc_info")
and e.exc_info[0] == urllib.error.URLError
and worker._proxy_for(site)
):
# connection problem when using a proxy == proxy error (XXX?)
attempt += 1
if attempt == MAX_YTDLP_ATTEMPTS:
logging.warning(
"Failed after %s attempts. Error: %s", MAX_YTDLP_ATTEMPTS, e
)
raise brozzler.ProxyError(
"yt-dlp hit apparent proxy error from " "%s" % ytdlp_url
) from e
"yt-dlp hit proxy error storing media from %s with "
% ytdlp_url
)
else:
logging.info(
"Attempt %s failed. Retrying in %s seconds...",
attempt,
YTDLP_WAIT,
)
time.sleep(YTDLP_WAIT)
else:
raise
else:
raise brozzler.ProxyError(
"Proxy attempt(s) storing media failed for unknown reason(s)"
)
return ie_result
def do_youtube_dl(worker, site, page):