From c74b1123bb84300b2a8af2652228b226837d73be Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 31 Aug 2023 18:02:01 -0700 Subject: [PATCH 1/5] update for mp4s like they used to be --- brozzler/ydl.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 906c653..30ef982 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -250,7 +250,9 @@ def _build_youtube_dl(worker, destdir, site, page): # "aext: Audio Extension (m4a > aac > mp3 > ogg > opus > webm > other)." # "If --prefer-free-formats is used, the order changes to opus > ogg > webm > m4a > mp3 > aac." # "ext: Equivalent to vext,aext" - "format_sort": ["ext"], + # pre-v.2023.07.06: "format_sort": ["ext"], + # v.2023.07.06 https://www.reddit.com/r/youtubedl/wiki/h264/?rdt=63577 + "format_sort": ["vcodec:h264","res","acodec:m4a"], "format": "b/bv+ba", # skip live streams "match_filter": match_filter_func("!is_live"), From c5c918bc87d398aa2b83f90a6c7672ec77c3e77e Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 5 Sep 2023 15:40:23 -0700 Subject: [PATCH 2/5] running well enough maybe --- brozzler/ydl.py | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 30ef982..07bed91 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -31,17 +31,6 @@ import threading thread_local = threading.local() -_orig_webpage_read_content = youtube_dl.extractor.GenericIE._webpage_read_content -def _webpage_read_content(self, *args, **kwargs): - content = _orig_webpage_read_content(self, *args, **kwargs) - if len(content) > 20000000: - logging.warning( - 'bypassing yt-dlp extraction because content is ' - 'too large (%s characters)', len(content)) - return '' - return content -youtube_dl.extractor.GenericIE._webpage_read_content = _webpage_read_content - class ExtraHeaderAdder(urllib.request.BaseHandler): def __init__(self, extra_headers): self.extra_headers = extra_headers @@ -226,7 
+215,8 @@ def _build_youtube_dl(worker, destdir, site, page): if d['status'] == 'finished': worker.logger.info('[ydl_postprocess_hook] Finished postprocessing') worker.logger.info('[ydl_postprocess_hook] postprocessor: {}'.format(d['postprocessor'])) - if d['postprocessor'] == 'FixupM3u8' and worker._using_warcprox(site): + # if d['postprocessor'] == 'FixupM3u8' and worker._using_warcprox(site): + if worker._using_warcprox(site): _YoutubeDL._push_stitched_up_vid_to_warcprox(_YoutubeDL, site, d['info_dict']) # default socket_timeout is 20 -- we hit it often when cluster is busy @@ -251,21 +241,24 @@ def _build_youtube_dl(worker, destdir, site, page): # "If --prefer-free-formats is used, the order changes to opus > ogg > webm > m4a > mp3 > aac." # "ext: Equivalent to vext,aext" # pre-v.2023.07.06: "format_sort": ["ext"], + # pre-v.2023.07.06: "format": "b/bv+ba" # v.2023.07.06 https://www.reddit.com/r/youtubedl/wiki/h264/?rdt=63577 - "format_sort": ["vcodec:h264","res","acodec:m4a"], - "format": "b/bv+ba", + "format_sort": ["codec:h264"], # skip live streams "match_filter": match_filter_func("!is_live"), - # --cache-dir local or... - "cache_dir": False, + "extractor_args": {'youtube': {'skip': ['dash', 'hls']}}, + + # --cache-dir local or.. + # this looked like a problem with nfs-mounted homedir, shouldn't be a problem for brozzler on focal? 
+ "cache_dir": "/home/archiveit", "logger": logging.getLogger("youtube_dl"), "verbose": True, "quiet": False, } - if worker._proxy_for(site): - ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site)) + #if worker._proxy_for(site): + # ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site)) ydl = _YoutubeDL(ydl_opts) if site.extra_headers(): ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page))) From 7a3c6d6abe26d4fc4efde340ae0005605191fce8 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 6 Sep 2023 17:30:48 -0700 Subject: [PATCH 3/5] set url per postprocessor --- brozzler/ydl.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 07bed91..669c0ba 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -153,8 +153,8 @@ def _build_youtube_dl(worker, destdir, site, page): self.logger.info( 'extractor %r found a download in %s', ie.IE_NAME, url) - def _push_stitched_up_vid_to_warcprox(self, site, info_dict): - # 220211 update: does yt-dlp supply content-type? + def _push_stitched_up_vid_to_warcprox(self, site, info_dict, postprocessor): + # 220211 update: does yt-dlp supply content-type? no, not as such # XXX Don't know how to get the right content-type. Youtube-dl # doesn't supply it. 
Sometimes (with --hls-prefer-native) # youtube-dl produces a stitched-up video that /usr/bin/file fails @@ -171,9 +171,14 @@ def _build_youtube_dl(worker, destdir, site, page): self.logger.warning( 'guessing mimetype %s because %r', mimetype, e) - url = 'youtube-dl:%05d:%s' % ( - info_dict.get('playlist_index') or 1, - info_dict['webpage_url']) + # watch page postprocessor is MoveFiles + if postprocessor == 'FixupM3u8': + url = 'youtube-dl:%05d:%s' % ( + info_dict.get('playlist_index') or 1, + info_dict['webpage_url']) + else: + url = info_dict.get('url') + size = os.path.getsize(info_dict['filepath']) self.logger.info( 'pushing %r video stitched-up as %s (%s bytes) to ' @@ -215,9 +220,10 @@ def _build_youtube_dl(worker, destdir, site, page): if d['status'] == 'finished': worker.logger.info('[ydl_postprocess_hook] Finished postprocessing') worker.logger.info('[ydl_postprocess_hook] postprocessor: {}'.format(d['postprocessor'])) + #worker.logger.info('[ydl_postprocess_hook] passed params: {}'.format(d)) # if d['postprocessor'] == 'FixupM3u8' and worker._using_warcprox(site): if worker._using_warcprox(site): - _YoutubeDL._push_stitched_up_vid_to_warcprox(_YoutubeDL, site, d['info_dict']) + _YoutubeDL._push_stitched_up_vid_to_warcprox(_YoutubeDL, site, d['info_dict'], d['postprocessor']) # default socket_timeout is 20 -- we hit it often when cluster is busy ydl_opts = { From 9cf12039c9d6e86c65b8bfad698735fd642ee99e Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 7 Sep 2023 12:01:16 -0700 Subject: [PATCH 4/5] skip remembering youtube video chunks --- brozzler/ydl.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 669c0ba..05eb876 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -297,6 +297,9 @@ def _remember_videos(page, fetches, stitch_ups=None): video['content-length'] = int( fetch['response_headers']['content-length']) if 'content-range' in fetch['response_headers']: + # skip chunked youtube video + if 
'googlevideo.com/videoplayback' in fetch['url']: + continue video['content-range'] = fetch[ 'response_headers']['content-range'] logging.debug('embedded video %s', video) From f868ce146bd97f91ab483940b18db19e097e9d0c Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 7 Sep 2023 12:39:43 -0700 Subject: [PATCH 5/5] tidying --- brozzler/ydl.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 05eb876..422b72f 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -1,7 +1,7 @@ ''' brozzler/ydl.py - youtube-dl / yt-dlp support for brozzler -Copyright (C) 2022 Internet Archive +Copyright (C) 2023 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -171,7 +171,7 @@ def _build_youtube_dl(worker, destdir, site, page): self.logger.warning( 'guessing mimetype %s because %r', mimetype, e) - # watch page postprocessor is MoveFiles + # youtube watch page postprocessor is MoveFiles if postprocessor == 'FixupM3u8': url = 'youtube-dl:%05d:%s' % ( info_dict.get('playlist_index') or 1, @@ -220,8 +220,6 @@ def _build_youtube_dl(worker, destdir, site, page): if d['status'] == 'finished': worker.logger.info('[ydl_postprocess_hook] Finished postprocessing') worker.logger.info('[ydl_postprocess_hook] postprocessor: {}'.format(d['postprocessor'])) - #worker.logger.info('[ydl_postprocess_hook] passed params: {}'.format(d)) - # if d['postprocessor'] == 'FixupM3u8' and worker._using_warcprox(site): if worker._using_warcprox(site): _YoutubeDL._push_stitched_up_vid_to_warcprox(_YoutubeDL, site, d['info_dict'], d['postprocessor']) @@ -263,8 +261,11 @@ def _build_youtube_dl(worker, destdir, site, page): "verbose": True, "quiet": False, } - #if worker._proxy_for(site): + + # skip proxying yt-dlp v.2023.07.06 + # if worker._proxy_for(site): # ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site)) + ydl = 
_YoutubeDL(ydl_opts) if site.extra_headers(): ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page)))