From 62b12434d0a0d626a756e9fa70a14d5a5f7cce52 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 18 Sep 2024 14:51:11 -0700 Subject: [PATCH] mostly ydl.py updates for new proxyrack testing --- brozzler/ydl.py | 82 +++++++++++++++++-------------------------------- 1 file changed, 28 insertions(+), 54 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 392bb2d..7368534 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -34,7 +34,7 @@ import time thread_local = threading.local() PROXYRACK_PROXY = "@@@" -MAX_YTDLP_ATTEMPTS = 4 +MAX_YTDLP_ATTEMPTS = 3 YTDLP_WAIT = 10 @@ -197,15 +197,15 @@ def _build_youtube_dl(worker, destdir, site, page): payload=f, extra_headers=extra_headers, ) - # consulted by _remember_videos() - ydl.pushed_videos.append( - { - "url": url, - "response_code": response.code, - "content-type": mimetype, - "content-length": size, - } - ) + # consulted by _remember_videos() + ydl.pushed_videos.append( + { + "url": url, + "response_code": response.code, + "content-type": mimetype, + "content-length": size, + } + ) def maybe_heartbeat_site_last_claimed(*args, **kwargs): # in case yt-dlp takes a long time, heartbeat site.last_claimed @@ -268,6 +268,9 @@ def _build_youtube_dl(worker, destdir, site, page): "logger": logging.getLogger("yt_dlp"), "verbose": False, "quiet": False, + # does this make sense when we're generally downloading one at a time? + "sleep_interval": 25, + "max_sleep_interval": 90, "proxy": PROXYRACK_PROXY, } @@ -308,7 +311,12 @@ def _try_youtube_dl(worker, ydl, site, page): while attempt < MAX_YTDLP_ATTEMPTS: try: logging.info("trying yt-dlp on %s", ytdlp_url) - metrics.brozzler_ydl_download_attempts.labels(youtube_host).inc(1) + # should_download_vid = not youtube_host + # then + # ydl.extract_info(str(urlcanon.whatwg(ytdlp_url)), download=should_download_vid) + # if youtube_host and ie_result: + # download_url = ie_result.get("url") + metrics.brozzler_ydl_extract_attempts.labels(youtube_host).inc(1) with brozzler.thread_accept_exceptions(): # we do whatwg canonicalization here to avoid "" resulting in ProxyError @@ -317,7 +325,7 @@ def _try_youtube_dl(worker, ydl, site, page): ie_result = ydl.sanitize_info( ydl.extract_info(str(urlcanon.whatwg(ytdlp_url))) ) - metrics.brozzler_ydl_download_successes.labels(youtube_host).inc(1) + metrics.brozzler_ydl_extract_successes.labels(youtube_host).inc(1) break except brozzler.ShutdownRequested as e: raise @@ -366,48 +374,14 @@ def _try_youtube_dl(worker, ydl, site, page): "with yt-dlp json for %s", ytdlp_url, ) - - attempt = 0 - while attempt < MAX_YTDLP_ATTEMPTS: - try: - worker._warcprox_write_record( - warcprox_address=worker._proxy_for(site), - url="youtube-dl:%s" % str(urlcanon.semantic(ytdlp_url)), - warc_type="metadata", - content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", - payload=info_json.encode("utf-8"), - extra_headers=site.extra_headers(page), - ) - break - except Exception as e: - # connection problem when using a proxy == proxy error - if ( - hasattr(e, "exc_info") - and e.exc_info[0] == urllib.error.URLError - and worker._proxy_for(site) - ): - attempt += 1 - if attempt == MAX_YTDLP_ATTEMPTS: - logging.warning( - "Failed after %s attempts. Error: %s", MAX_YTDLP_ATTEMPTS, e - ) - raise brozzler.ProxyError( - "yt-dlp hit proxy error storing media from %s with " - % ytdlp_url - ) - else: - logging.info( - "Attempt %s failed. Retrying in %s seconds...", - attempt, - YTDLP_WAIT, - ) - time.sleep(YTDLP_WAIT) - else: - raise - else: - raise brozzler.ProxyError( - "Proxy attempt(s) storing media failed for unknown reason(s)" - ) + worker._warcprox_write_record( + warcprox_address=worker._proxy_for(site), + url="youtube-dl:%s" % str(urlcanon.semantic(ytdlp_url)), + warc_type="metadata", + content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", + payload=info_json.encode("utf-8"), + extra_headers=site.extra_headers(page), + ) return ie_result