diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 49e9863..767933a 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -38,7 +38,7 @@ import time thread_local = threading.local() -PROXYRACK_PROXY = "@@@" +PROXYRACK_PROXY = "" MAX_YTDLP_ATTEMPTS = 4 YTDLP_WAIT = 10 @@ -318,13 +318,19 @@ def _build_youtube_dl(worker, destdir, site, page): "logger": logging.getLogger("yt_dlp"), "verbose": False, "quiet": False, - # does this make sense when we're generally downloading one at a time? + # recommended to avoid bot detection "sleep_interval": 25, "max_sleep_interval": 90, - "proxy": PROXYRACK_PROXY, } - # skip proxying yt-dlp v.2023.07.06 + ytdlp_url = page.redirect_url if page.redirect_url else page.url + youtube_host = ( + "youtube.com" in ytdlp_url.split("//")[-1].split("/")[0].split("?")[0] + ) + if youtube_host: + ydl_opts["proxy"] = PROXYRACK_PROXY + + # skip warcprox proxying yt-dlp v.2023.07.06 # if worker._proxy_for(site): # ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site)) @@ -332,6 +338,9 @@ def _build_youtube_dl(worker, destdir, site, page): if site.extra_headers(): ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page))) ydl.pushed_videos = [] + ydl.url = ytdlp_url + ydl.youtube_host = youtube_host + return ydl @@ -354,29 +363,25 @@ def _remember_videos(page, pushed_videos=None): def _try_youtube_dl(worker, ydl, site, page): - ytdlp_url = page.redirect_url if page.redirect_url else page.url - youtube_host = ( - "youtube.com" in ytdlp_url.split("//")[-1].split("/")[0].split("?")[0] - ) attempt = 0 while attempt < MAX_YTDLP_ATTEMPTS: try: - logging.info("trying yt-dlp on %s", ytdlp_url) - # should_download_vid = not youtube_host + logging.info("trying yt-dlp on %s", ydl.url) + # should_download_vid = not ydl.youtube_host # then - # ydl.extract_info(str(urlcanon.whatwg(ytdlp_url)), download=should_download_vid) - # if youtube_host and ie_result: + # ydl.extract_info(str(urlcanon.whatwg(ydl.url)), download=should_download_vid) + # if ydl.youtube_host and ie_result: # download_url = ie_result.get("url") - metrics.brozzler_ydl_extract_attempts.labels(youtube_host).inc(1) + metrics.brozzler_ydl_extract_attempts.labels(ydl.youtube_host).inc(1) with brozzler.thread_accept_exceptions(): # we do whatwg canonicalization here to avoid "" resulting in ProxyError # needs automated test # and yt-dlp needs sanitize_info for extract_info ie_result = ydl.sanitize_info( - ydl.extract_info(str(urlcanon.whatwg(ytdlp_url))) + ydl.extract_info(str(urlcanon.whatwg(ydl.url))) ) - metrics.brozzler_ydl_extract_successes.labels(youtube_host).inc(1) + metrics.brozzler_ydl_extract_successes.labels(ydl.youtube_host).inc(1) break except brozzler.ShutdownRequested as e: raise @@ -394,6 +399,7 @@ def _try_youtube_dl(worker, ydl, site, page): ): raise brozzler.ReachedLimit(e.exc_info[1]) else: + # todo: other errors to handle separately? # OSError('Tunnel connection failed: 464 Host Not Allowed') (caused by ProxyError...) # and others... attempt += 1 @@ -402,7 +408,7 @@ def _try_youtube_dl(worker, ydl, site, page): "Failed after %s attempts. Error: %s", MAX_YTDLP_ATTEMPTS, e ) raise brozzler.ProxyError( - "yt-dlp hit proxyrack proxy error from %s" % ytdlp_url + "yt-dlp hit possible proxyrack proxy error from %s" % ydl.url ) else: logging.info( @@ -423,11 +429,11 @@ def _try_youtube_dl(worker, ydl, site, page): logging.info( "sending WARCPROX_WRITE_RECORD request to warcprox " "with yt-dlp json for %s", - ytdlp_url, + ydl.url, ) worker._warcprox_write_record( warcprox_address=worker._proxy_for(site), - url="youtube-dl:%s" % str(urlcanon.semantic(ytdlp_url)), + url="youtube-dl:%s" % str(urlcanon.semantic(ydl.url)), warc_type="metadata", content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", payload=info_json.encode("utf-8"), diff --git a/setup.py b/setup.py index c836ff1..eb3a9c9 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def find_package_data(package): setuptools.setup( name="brozzler", - version="1.5.55a2", + version="1.5.55a3", description="Distributed web crawling with browsers", url="https://github.com/internetarchive/brozzler", author="Noah Levitt",