convert ydl module logger

This commit is contained in:
Misty De Méo 2025-02-14 10:59:29 -08:00
parent 715a1471c0
commit 9bf9cf1382

View File

@ -16,7 +16,6 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
import logging
import yt_dlp
from yt_dlp.utils import match_filter_func, ExtractorError
import brozzler
@ -29,6 +28,7 @@ import doublethink
import datetime
from . import metrics
import random
import structlog
import threading
import time
@ -40,14 +40,16 @@ YTDLP_WAIT = 10
YTDLP_MAX_REDIRECTS = 5
logger = structlog.get_logger()
def should_ytdlp(site, page, page_status, skip_av_seeds):
# called only after we've passed needs_browsing() check
if page_status != 200:
logging.info("skipping ytdlp: non-200 page status %s", page_status)
logger.info("skipping ytdlp: non-200 page status", page_status=page_status)
return False
if site.skip_ytdlp:
logging.info("skipping ytdlp: site marked skip_ytdlp")
logger.info("skipping ytdlp: site marked skip_ytdlp")
return False
ytdlp_url = page.redirect_url if page.redirect_url else page.url
@ -64,7 +66,7 @@ def should_ytdlp(site, page, page_status, skip_av_seeds):
# TODO: develop UI and refactor
if ytdlp_seed:
if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds:
logging.info("skipping ytdlp: site in skip_av_seeds")
logger.info("skipping ytdlp: site in skip_av_seeds")
site.skip_ytdlp = True
return False
else:
@ -112,7 +114,13 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
"""
class _YoutubeDL(yt_dlp.YoutubeDL):
logger = logging.getLogger(__module__ + "." + __qualname__)
logger = structlog.get_logger(__module__ + "." + __qualname__)
def __init__(self, url, params=None, auto_init=True):
super().__init__(params, auto_init)
self.url = url
self.logger = self.logger.bind(url=url)
def process_ie_result(self, ie_result, download=True, extra_info=None):
if extra_info is None:
@ -122,7 +130,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
if result_type in ("url", "url_transparent"):
if "extraction_depth" in extra_info:
self.logger.info(
f"Following redirect URL: {ie_result['url']} extraction_depth: {extra_info['extraction_depth']}"
f"Following redirect", redirect_url=ie_result['url'], extraction_depth=extra_info['extraction_depth']
)
extra_info["extraction_depth"] = 1 + extra_info.get(
"extraction_depth", 0
@ -139,8 +147,9 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
def add_default_extra_info(self, ie_result, ie, url):
# hook in some logging
super().add_default_extra_info(ie_result, ie, url)
extract_context = self.logger.bind(extractor=ie.IE_NAME)
if ie_result.get("_type") == "playlist":
self.logger.info("extractor %r found playlist in %s", ie.IE_NAME, url)
extract_context.info("found playlist")
if ie.IE_NAME in {
"youtube:playlist",
"youtube:tab",
@ -155,22 +164,20 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
try:
ie_result["entries_no_dl"] = list(ie_result["entries"])
except Exception as e:
self.logger.warning(
"failed to unroll ie_result['entries']? for %s, %s; exception %s",
ie.IE_NAME,
url,
e,
extract_context.warning(
"failed to unroll entries ie_result['entries']?",
exc_info=True
)
ie_result["entries_no_dl"] = []
ie_result["entries"] = []
self.logger.info(
"not downloading %s media files from this "
"not downloading media files from this "
"playlist because we expect to capture them from "
"individual watch/track/detail pages",
len(ie_result["entries_no_dl"]),
media_file_count=len(ie_result["entries_no_dl"]),
)
else:
self.logger.info("extractor %r found a download in %s", ie.IE_NAME, url)
extract_context.info("found a download")
def _push_video_to_warcprox(self, site, info_dict, postprocessor):
# 220211 update: does yt-dlp supply content-type? no, not as such
@ -188,7 +195,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
mimetype = magic.from_file(info_dict["filepath"], mime=True)
except ImportError as e:
mimetype = "video/%s" % info_dict["ext"]
self.logger.warning("guessing mimetype %s because %r", mimetype, e)
self.logger.warning("guessing mimetype due to error", mimetype=mimetype, exc_info=True)
# youtube watch page postprocessor is MoveFiles
@ -206,12 +213,11 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
size = os.path.getsize(info_dict["filepath"])
self.logger.info(
"pushing %r video as %s (%s bytes) to " "warcprox at %s with url %s",
info_dict["format"],
mimetype,
size,
worker._proxy_for(site),
url,
"pushing video to warcprox",
format=info_dict["format"],
mimetype=mimetype,
size=size,
warcprox=worker._proxy_for(site)
)
with open(info_dict["filepath"], "rb") as f:
# include content-length header to avoid chunked
@ -248,24 +254,21 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
):
worker.logger.debug(
"heartbeating site.last_claimed to prevent another "
"brozzler-worker claiming this site id=%r",
site.id,
"brozzler-worker claiming this site",
id=site.id,
)
site.last_claimed = doublethink.utcnow()
site.save()
except:
worker.logger.debug(
"problem heartbeating site.last_claimed site id=%r",
site.id,
"problem heartbeating site.last_claimed site",
id=site.id,
exc_info=True,
)
def ydl_postprocess_hook(d):
if d["status"] == "finished":
worker.logger.info("[ydl_postprocess_hook] Finished postprocessing")
worker.logger.info(
"[ydl_postprocess_hook] postprocessor: {}".format(d["postprocessor"])
)
worker.logger.info("[ydl_postprocess_hook] Finished postprocessing", postprocessor=d["postprocessor"])
is_youtube_host = isyoutubehost(d["info_dict"]["webpage_url"])
metrics.brozzler_ydl_download_successes.labels(is_youtube_host).inc(1)
@ -298,7 +301,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
# --cache-dir local or..
# this looked like a problem with nsf-mounted homedir, maybe not a problem for brozzler on focal?
"cache_dir": "/home/archiveit",
"logger": logging.getLogger("yt_dlp"),
"logger": logger,
"verbose": False,
"quiet": False,
# recommended to avoid bot detection
@ -314,17 +317,16 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
ytdlp_proxy_for_logs = (
ydl_opts["proxy"].split("@")[1] if "@" in ydl_opts["proxy"] else "@@@"
)
logging.info("using yt-dlp proxy ... %s", ytdlp_proxy_for_logs)
logger.info("using yt-dlp proxy ...", proxy=ytdlp_proxy_for_logs)
# skip warcprox proxying yt-dlp v.2023.07.06: youtube extractor using ranges
# if worker._proxy_for(site):
# ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site))
ydl = _YoutubeDL(ydl_opts)
ydl = _YoutubeDL(ytdlp_url, params=ydl_opts)
if site.extra_headers():
ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page)))
ydl.pushed_videos = []
ydl.url = ytdlp_url
ydl.is_youtube_host = is_youtube_host
return ydl
@ -344,7 +346,7 @@ def _remember_videos(page, pushed_videos=None):
"content-type": pushed_video["content-type"],
"content-length": pushed_video["content-length"],
}
logging.debug("embedded video %s", video)
logger.debug("embedded video", video=video)
page.videos.append(video)
@ -353,7 +355,7 @@ def _try_youtube_dl(worker, ydl, site, page):
attempt = 0
while attempt < max_attempts:
try:
logging.info("trying yt-dlp on %s", ydl.url)
logger.info("trying yt-dlp", url=ydl.url)
# should_download_vid = not ydl.is_youtube_host
# then
# ydl.extract_info(str(urlcanon.whatwg(ydl.url)), download=should_download_vid)
@ -394,15 +396,15 @@ def _try_youtube_dl(worker, ydl, site, page):
# and others...
attempt += 1
if attempt == max_attempts:
logging.warning(
"Failed after %s attempt(s). Error: %s", max_attempts, e
logger.warning(
"Failed after %s attempt(s)", max_attempts, attempts=max_attempts, exc_info=True
)
raise brozzler.VideoExtractorError(
"yt-dlp hit error extracting info for %s" % ydl.url
)
else:
retry_wait = min(60, YTDLP_WAIT * (1.5 ** (attempt - 1)))
logging.info(
logger.info(
"Attempt %s failed. Retrying in %s seconds...",
attempt,
retry_wait,
@ -413,15 +415,15 @@ def _try_youtube_dl(worker, ydl, site, page):
"yt-dlp hit unknown error extracting info for %s" % ydl.url
)
logging.info("ytdlp completed successfully")
logger.info("ytdlp completed successfully")
_remember_videos(page, ydl.pushed_videos)
if worker._using_warcprox(site):
info_json = json.dumps(ie_result, sort_keys=True, indent=4)
logging.info(
logger.info(
"sending WARCPROX_WRITE_RECORD request to warcprox "
"with yt-dlp json for %s",
ydl.url,
"with yt-dlp json",
url=ydl.url,
)
worker._warcprox_write_record(
warcprox_address=worker._proxy_for(site),
@ -452,7 +454,7 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints):
with tempfile.TemporaryDirectory(
prefix="brzl-ydl-", dir=worker._ytdlp_tmpdir
) as tempdir:
logging.info("tempdir for yt-dlp: %s", tempdir)
logger.info("tempdir for yt-dlp", tempdir=tempdir)
ydl = _build_youtube_dl(worker, tempdir, site, page, ytdlp_proxy_endpoints)
ie_result = _try_youtube_dl(worker, ydl, site, page)
outlinks = set()