mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
Merge branch 'metrics_plus_proxy_retries' into qa
This commit is contained in:
commit
04aaec0ac5
@ -27,8 +27,8 @@ brozzler_resources_fetched = Counter("brozzler_resources_fetched", "number of re
|
||||
brozzler_resources_size_total = Counter("brozzler_resources_size_total", "total size of resources fetched", labelnames=["resource_type"])
|
||||
brozzler_resources_fetch_time = Counter("brozzler_resources_fetch_time", "time spent fetching resources", labelnames=["resource_type"])
|
||||
brozzler_ydl_urls_checked = Counter("brozzler_ydl_urls_checked", "count of urls checked by brozzler yt-dlp")
|
||||
brozzler_ydl_extract_attempts = Counter("brozzler_ydl_download_attempts", "count of download attempted by brozzler yt-dlp", labelnames=["youtube_host"])
|
||||
brozzler_ydl_extract_successes = Counter("brozzler_ydl_download_successes", "count of downloads completed by brozzler yt-dlp", labelnames=["youtube_host"])
|
||||
brozzler_ydl_extract_attempts = Counter("brozzler_ydl_extract_attempts", "count of extracts attempted by brozzler yt-dlp", labelnames=["youtube_host"])
|
||||
brozzler_ydl_extract_successes = Counter("brozzler_ydl_extract_successes", "count of extracts completed by brozzler yt-dlp", labelnames=["youtube_host"])
|
||||
# fmt: on
|
||||
|
||||
|
||||
|
@ -39,7 +39,7 @@ import time
|
||||
thread_local = threading.local()
|
||||
|
||||
PROXYRACK_PROXY = "@@@"
|
||||
MAX_YTDLP_ATTEMPTS = 3
|
||||
MAX_YTDLP_ATTEMPTS = 4
|
||||
YTDLP_WAIT = 10
|
||||
|
||||
|
||||
@ -231,12 +231,12 @@ def _build_youtube_dl(worker, destdir, site, page):
|
||||
worker._proxy_for(site),
|
||||
url,
|
||||
)
|
||||
with open(info_dict["filepath"], "rb") as f:
|
||||
# include content-length header to avoid chunked
|
||||
# transfer, which warcprox currently rejects
|
||||
extra_headers = dict(site.extra_headers())
|
||||
extra_headers["content-length"] = size
|
||||
try:
|
||||
try:
|
||||
with open(info_dict["filepath"], "rb") as f:
|
||||
# include content-length header to avoid chunked
|
||||
# transfer, which warcprox currently rejects
|
||||
extra_headers = dict(site.extra_headers())
|
||||
extra_headers["content-length"] = size
|
||||
request, response = worker._warcprox_write_record(
|
||||
warcprox_address=worker._proxy_for(site),
|
||||
url=url,
|
||||
@ -245,17 +245,17 @@ def _build_youtube_dl(worker, destdir, site, page):
|
||||
payload=f,
|
||||
extra_headers=extra_headers,
|
||||
)
|
||||
# consulted by _remember_videos()
|
||||
ydl.pushed_videos.append(
|
||||
{
|
||||
"url": url,
|
||||
"response_code": response.code,
|
||||
"content-type": mimetype,
|
||||
"content-length": size,
|
||||
}
|
||||
)
|
||||
except:
|
||||
traceback.print_exc()
|
||||
# consulted by _remember_videos()
|
||||
ydl.pushed_videos.append(
|
||||
{
|
||||
"url": url,
|
||||
"response_code": response.code,
|
||||
"content-type": mimetype,
|
||||
"content-length": size,
|
||||
}
|
||||
)
|
||||
except:
|
||||
traceback.print_exc()
|
||||
|
||||
def maybe_heartbeat_site_last_claimed(*args, **kwargs):
|
||||
# in case yt-dlp takes a long time, heartbeat site.last_claimed
|
||||
@ -355,7 +355,9 @@ def _remember_videos(page, pushed_videos=None):
|
||||
|
||||
def _try_youtube_dl(worker, ydl, site, page):
|
||||
ytdlp_url = page.redirect_url if page.redirect_url else page.url
|
||||
youtube_host = "youtube.com" in ytdlp_url.split("//")[-1].split("/")[0].split("?")[0]
|
||||
youtube_host = (
|
||||
"youtube.com" in ytdlp_url.split("//")[-1].split("/")[0].split("?")[0]
|
||||
)
|
||||
attempt = 0
|
||||
while attempt < MAX_YTDLP_ATTEMPTS:
|
||||
try:
|
||||
|
Loading…
x
Reference in New Issue
Block a user