From edf2f9e51a001fdcabe95a94f6a1e15cc096232f Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 10 Sep 2024 21:58:47 -0700 Subject: [PATCH] retries for proxyrack and push --- brozzler/ydl.py | 157 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 108 insertions(+), 49 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index b292129..092dab7 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -28,9 +28,14 @@ import json import doublethink import datetime import threading +import time thread_local = threading.local() +PROXYRACK_PROXY = "@@@" +MAX_YTDLP_ATTEMPTS = 4 +YTDLP_WAIT = 10 + def should_ytdlp(site, page, page_status, skip_av_seeds): # called only after we've passed needs_browsing() check @@ -262,6 +267,7 @@ def _build_youtube_dl(worker, destdir, site, page): "logger": logging.getLogger("yt_dlp"), "verbose": False, "quiet": False, + "proxy": PROXYRACK_PROXY, } # skip proxying yt-dlp v.2023.07.06 @@ -296,57 +302,110 @@ def _remember_videos(page, pushed_videos=None): def _try_youtube_dl(worker, ydl, site, page): ytdlp_url = page.redirect_url if page.redirect_url else page.url - try: - logging.info("trying yt-dlp on %s", ytdlp_url) + attempt = 0 + while attempt < MAX_YTDLP_ATTEMPTS: + try: + logging.info("trying yt-dlp on %s", ytdlp_url) - with brozzler.thread_accept_exceptions(): - # we do whatwg canonicalization here to avoid "" resulting in ProxyError - # needs automated test - # and yt-dlp needs sanitize_info for extract_info - ie_result = ydl.sanitize_info( - ydl.extract_info(str(urlcanon.whatwg(ytdlp_url))) - ) - _remember_videos(page, ydl.pushed_videos) - if worker._using_warcprox(site): - info_json = json.dumps(ie_result, sort_keys=True, indent=4) - logging.info( - "sending WARCPROX_WRITE_RECORD request to warcprox " - "with yt-dlp json for %s", - ytdlp_url, - ) - worker._warcprox_write_record( - warcprox_address=worker._proxy_for(site), - url="youtube-dl:%s" % str(urlcanon.semantic(ytdlp_url)), - warc_type="metadata", - content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", - payload=info_json.encode("utf-8"), - extra_headers=site.extra_headers(page), - ) - return ie_result - except brozzler.ShutdownRequested as e: - raise - except Exception as e: - if hasattr(e, "exc_info") and e.exc_info[0] == yt_dlp.utils.UnsupportedError: - return None - elif ( - hasattr(e, "exc_info") - and e.exc_info[0] == urllib.error.HTTPError - and hasattr(e.exc_info[1], "code") - and e.exc_info[1].code == 420 - ): - raise brozzler.ReachedLimit(e.exc_info[1]) - elif ( - hasattr(e, "exc_info") - and e.exc_info[0] == urllib.error.URLError - and worker._proxy_for(site) - ): - # connection problem when using a proxy == proxy error (XXX?) - raise brozzler.ProxyError( - "yt-dlp hit apparent proxy error from " "%s" % ytdlp_url - ) from e - else: + with brozzler.thread_accept_exceptions(): + # we do whatwg canonicalization here to avoid "" resulting in ProxyError + # needs automated test + # and yt-dlp needs sanitize_info for extract_info + ie_result = ydl.sanitize_info( + ydl.extract_info(str(urlcanon.whatwg(ytdlp_url))) + ) + break + except brozzler.ShutdownRequested as e: raise + except Exception as e: + if ( + hasattr(e, "exc_info") + and e.exc_info[0] == yt_dlp.utils.UnsupportedError + ): + return None + elif ( + hasattr(e, "exc_info") + and e.exc_info[0] == urllib.error.HTTPError + and hasattr(e.exc_info[1], "code") + and e.exc_info[1].code == 420 + ): + raise brozzler.ReachedLimit(e.exc_info[1]) + else: + # OSError('Tunnel connection failed: 464 Host Not Allowed') (caused by ProxyError...) + # and others... + attempt += 1 + if attempt == MAX_YTDLP_ATTEMPTS: + logging.warning( + "Failed after %s attempts. Error: %s", MAX_YTDLP_ATTEMPTS, e + ) + raise brozzler.ProxyError( + "yt-dlp hit proxyrack proxy error from %s" % ytdlp_url + ) + else: + logging.info( + "Attempt %s failed. Retrying in %s seconds...", + attempt, + YTDLP_WAIT, + ) + time.sleep(YTDLP_WAIT) + else: + raise brozzler.ProxyError( + "Proxyrack proxy attempt(s) failed for unknown reason(s)" + ) + logging.info("ytdlp completed successfully") + + _remember_videos(page, ydl.pushed_videos) + if worker._using_warcprox(site): + info_json = json.dumps(ie_result, sort_keys=True, indent=4) + logging.info( + "sending WARCPROX_WRITE_RECORD request to warcprox " + "with yt-dlp json for %s", + ytdlp_url, + ) + + attempt = 0 + while attempt < MAX_YTDLP_ATTEMPTS: + try: + worker._warcprox_write_record( + warcprox_address=worker._proxy_for(site), + url="youtube-dl:%s" % str(urlcanon.semantic(ytdlp_url)), + warc_type="metadata", + content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", + payload=info_json.encode("utf-8"), + extra_headers=site.extra_headers(page), + ) + break + except Exception as e: + # connection problem when using a proxy == proxy error + if ( + hasattr(e, "exc_info") + and e.exc_info[0] == urllib.error.URLError + and worker._proxy_for(site) + ): + attempt += 1 + if attempt == MAX_YTDLP_ATTEMPTS: + logging.warning( + "Failed after %s attempts. Error: %s", MAX_YTDLP_ATTEMPTS, e + ) + raise brozzler.ProxyError( + "yt-dlp hit proxy error storing media from %s with " + % ytdlp_url + ) + else: + logging.info( + "Attempt %s failed. Retrying in %s seconds...", + attempt, + YTDLP_WAIT, + ) + time.sleep(YTDLP_WAIT) + else: + raise + else: + raise brozzler.ProxyError( + "Proxy attempt(s) storing media failed for unknown reason(s)" + ) + return ie_result def do_youtube_dl(worker, site, page):