update for m3u8s, better naming

Barbara Miller 2023-09-18 14:51:14 -07:00
parent 75e0555d43
commit eef7173d72


@@ -99,9 +99,7 @@ def _build_youtube_dl(worker, destdir, site, page):
- keeps track of urls fetched using a `YoutubeDLSpy`
- periodically updates `site.last_claimed` in rethinkdb
- if brozzling through warcprox and downloading segmented videos (e.g.
HLS), pushes the stitched-up video created by yt-dlp/ffmpeg to warcprox
using a WARCPROX_WRITE_RECORD request
- pushes captured video to warcprox using a WARCPROX_WRITE_RECORD request
- some logging
Args:
@@ -153,7 +151,7 @@ def _build_youtube_dl(worker, destdir, site, page):
self.logger.info(
'extractor %r found a download in %s', ie.IE_NAME, url)
def _push_stitched_up_vid_to_warcprox(self, site, info_dict, postprocessor):
def _push_video_to_warcprox(self, site, info_dict, postprocessor):
# 220211 update: does yt-dlp supply content-type? no, not as such
# XXX Don't know how to get the right content-type. Youtube-dl
# doesn't supply it. Sometimes (with --hls-prefer-native)
@@ -172,20 +170,21 @@ def _build_youtube_dl(worker, destdir, site, page):
'guessing mimetype %s because %r', mimetype, e)
# youtube watch page postprocessor is MoveFiles
# but current yt-dlp duplicates capture if we handle FixupM3u8!!!
# we'll ignore postprocessor for now...
'''
if postprocessor == 'FixupM3u8':
url = 'youtube-dl:%05d:%s' % (
info_dict.get('playlist_index') or 1,
info_dict['webpage_url'])
else:
'''
url = info_dict.get('url')
url = info_dict.get('url')
# skip urls ending .m3u8, to avoid duplicates handled by FixupM3u*
if url.endswith('.m3u8'):
return
size = os.path.getsize(info_dict['filepath'])
self.logger.info(
'pushing %r video stitched-up as %s (%s bytes) to '
'pushing %r video as %s (%s bytes) to '
'warcprox at %s with url %s', info_dict['format'],
mimetype, size, worker._proxy_for(site), url)
with open(info_dict['filepath'], 'rb') as f:
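(Not part of the patch.) For context on the two decisions in this hunk, a minimal standalone sketch: guessing a content-type when yt-dlp does not supply one, and skipping HLS playlist manifests so the FixupM3u* path is not captured twice. The guessing strategy shown (stdlib mimetypes) and the file/URL values are illustrative assumptions, not the code above.

import mimetypes

def guess_mimetype(filepath):
    # yt-dlp does not report a content-type, so guess from the filename;
    # fall back to a generic type if nothing matches.
    mimetype, _ = mimetypes.guess_type(filepath)
    return mimetype or 'application/octet-stream'

def should_push(url):
    # Skip HLS playlist manifests; pushing a .m3u8 would duplicate captures
    # already produced via the FixupM3u* postprocessing path.
    return bool(url) and not url.endswith('.m3u8')

print(guess_mimetype('/tmp/example.mp4'))               # video/mp4
print(should_push('https://example.com/clip.mp4'))      # True
print(should_push('https://example.com/master.m3u8'))   # False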
@@ -198,7 +197,7 @@ def _build_youtube_dl(worker, destdir, site, page):
warc_type='resource', content_type=mimetype, payload=f,
extra_headers=extra_headers)
# consulted by _remember_videos()
ydl.stitch_ups.append({
ydl.pushed_videos.append({
'url': url,
'response_code': response.code,
'content-type': mimetype,
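(Not part of the patch.) A rough sketch of what the WARCPROX_WRITE_RECORD push above amounts to, sent directly to a warcprox instance with http.client. The proxy address, target URL, payload path, and header names are assumptions for illustration; the real code goes through the worker's helper with warc_type='resource' as shown.

import http.client

with open('/tmp/example.mp4', 'rb') as f:
    payload = f.read()

conn = http.client.HTTPConnection('127.0.0.1', 8000)   # assumed warcprox address
conn.request(
    'WARCPROX_WRITE_RECORD',                 # custom method warcprox accepts
    'https://example.com/media/clip.mp4',    # url the resource record is filed under
    body=payload,
    headers={
        'Content-Type': 'video/mp4',
        'WARC-Type': 'resource',             # mirrors warc_type='resource' above
    })
response = conn.getresponse()
print(response.status, response.reason)
conn.close()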
@@ -225,7 +224,7 @@ def _build_youtube_dl(worker, destdir, site, page):
worker.logger.info('[ydl_postprocess_hook] Finished postprocessing')
worker.logger.info('[ydl_postprocess_hook] postprocessor: {}'.format(d['postprocessor']))
if worker._using_warcprox(site):
_YoutubeDL._push_stitched_up_vid_to_warcprox(_YoutubeDL, site, d['info_dict'], d['postprocessor'])
_YoutubeDL._push_video_to_warcprox(_YoutubeDL, site, d['info_dict'], d['postprocessor'])
# default socket_timeout is 20 -- we hit it often when cluster is busy
ydl_opts = {
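(Not part of the patch.) For orientation, a minimal standalone yt-dlp example wiring a postprocessor hook into ydl_opts alongside a larger socket_timeout, without the worker/site plumbing above. The option names and hook-dict keys are yt-dlp's documented ones; the timeout value and URL are made up.

import yt_dlp

def ydl_postprocess_hook(d):
    # yt-dlp calls postprocessor hooks with 'status' ('started'/'finished'),
    # the postprocessor name (e.g. 'MoveFiles', 'FixupM3u8'), and 'info_dict'.
    if d['status'] == 'finished':
        print('finished %s for %s' % (
            d['postprocessor'], d['info_dict'].get('webpage_url')))

ydl_opts = {
    'postprocessor_hooks': [ydl_postprocess_hook],
    'socket_timeout': 40,   # bumped past the default of 20 for a busy cluster
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://example.com/watch?v=abc'])   # made-up URL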
@@ -274,11 +273,11 @@ def _build_youtube_dl(worker, destdir, site, page):
if site.extra_headers():
ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page)))
ydl.fetch_spy = YoutubeDLSpy()
ydl.stitch_ups = []
ydl.pushed_videos = []
ydl._opener.add_handler(ydl.fetch_spy)
return ydl
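(Not part of the patch.) The fetch spy registered above is a urllib opener handler; roughly, a minimal stand-in for YoutubeDLSpy looks like this (field names are illustrative, not necessarily brozzler's):

import urllib.request

class FetchSpy(urllib.request.BaseHandler):
    # Records every http/https response that passes through the opener.
    def __init__(self):
        self.fetches = []

    def http_response(self, request, response):
        self.fetches.append({
            'url': request.full_url,
            'status': response.status,
            'response_headers': dict(response.headers),
        })
        return response   # pass the response through unchanged

    https_response = http_response

opener = urllib.request.build_opener()
spy = FetchSpy()
opener.add_handler(spy)
opener.open('http://example.com/')   # illustrative fetch
print(spy.fetches)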
def _remember_videos(page, fetches, stitch_ups=None):
def _remember_videos(page, fetches, pushed_videos=None):
'''
Saves info about videos captured by yt-dlp in `page.videos`.
'''
@@ -309,14 +308,14 @@ def _remember_videos(page, fetches, stitch_ups=None):
'response_headers']['content-range']
logging.debug('embedded video %s', video)
page.videos.append(video)
for stitch_up in stitch_ups or []:
if stitch_up['content-type'].startswith('video/'):
for pushed_video in pushed_videos or []:
if pushed_video['content-type'].startswith('video/'):
video = {
'blame': 'youtube-dl',
'url': stitch_up['url'],
'response_code': stitch_up['response_code'],
'content-type': stitch_up['content-type'],
'content-length': stitch_up['content-length'],
'url': pushed_video['url'],
'response_code': pushed_video['response_code'],
'content-type': pushed_video['content-type'],
'content-length': pushed_video['content-length'],
}
logging.debug('embedded video %s', video)
page.videos.append(video)
@@ -331,7 +330,7 @@ def _try_youtube_dl(worker, ydl, site, page):
# needs automated test
# and yt-dlp needs sanitize_info for extract_info
ie_result = ydl.sanitize_info(ydl.extract_info(str(urlcanon.whatwg(page.url))))
_remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
_remember_videos(page, ydl.fetch_spy.fetches, ydl.pushed_videos)
if worker._using_warcprox(site):
info_json = json.dumps(ie_result, sort_keys=True, indent=4)
logging.info(
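(Not part of the patch.) Closing out, a rough standalone illustration of the canonicalize / extract / sanitize / dump chain in this last hunk, outside the worker context; the URL is made up and only metadata is fetched, no download.

import json
import urlcanon
import yt_dlp

url = str(urlcanon.whatwg('HTTP://Example.COM:80/watch?v=abc'))
with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
    # sanitize_info makes the extract_info result json-serializable
    ie_result = ydl.sanitize_info(ydl.extract_info(url, download=False))
info_json = json.dumps(ie_result, sort_keys=True, indent=4)
print(info_json[:500])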