Skip .m3u8 URLs when pushing videos to warcprox; rename stitch_ups to pushed_videos

This commit is contained in:
Barbara Miller 2023-09-18 14:51:14 -07:00
parent 75e0555d43
commit eef7173d72

View File

@ -99,9 +99,7 @@ def _build_youtube_dl(worker, destdir, site, page):
- keeps track of urls fetched using a `YoutubeDLSpy` - keeps track of urls fetched using a `YoutubeDLSpy`
- periodically updates `site.last_claimed` in rethinkdb - periodically updates `site.last_claimed` in rethinkdb
- if brozzling through warcprox and downloading segmented videos (e.g. - pushes captured video to warcprox using a WARCPROX_WRITE_RECORD request
HLS), pushes the stitched-up video created by yt-dlp/ffmpeg to warcprox
using a WARCPROX_WRITE_RECORD request
- some logging - some logging
Args: Args:
@ -153,7 +151,7 @@ def _build_youtube_dl(worker, destdir, site, page):
self.logger.info( self.logger.info(
'extractor %r found a download in %s', ie.IE_NAME, url) 'extractor %r found a download in %s', ie.IE_NAME, url)
def _push_stitched_up_vid_to_warcprox(self, site, info_dict, postprocessor): def _push_video_to_warcprox(self, site, info_dict, postprocessor):
# 220211 update: does yt-dlp supply content-type? no, not as such # 220211 update: does yt-dlp supply content-type? no, not as such
# XXX Don't know how to get the right content-type. Youtube-dl # XXX Don't know how to get the right content-type. Youtube-dl
# doesn't supply it. Sometimes (with --hls-prefer-native) # doesn't supply it. Sometimes (with --hls-prefer-native)
@ -172,20 +170,21 @@ def _build_youtube_dl(worker, destdir, site, page):
'guessing mimetype %s because %r', mimetype, e) 'guessing mimetype %s because %r', mimetype, e)
# youtube watch page postprocessor is MoveFiles # youtube watch page postprocessor is MoveFiles
# but current yt-dlp duplicates capture if we handle FixupM3u8!!!
# we'll ignore postprocessor for now...
'''
if postprocessor == 'FixupM3u8': if postprocessor == 'FixupM3u8':
url = 'youtube-dl:%05d:%s' % ( url = 'youtube-dl:%05d:%s' % (
info_dict.get('playlist_index') or 1, info_dict.get('playlist_index') or 1,
info_dict['webpage_url']) info_dict['webpage_url'])
else: else:
''' url = info_dict.get('url')
url = info_dict.get('url')
# skip urls ending .m3u8, to avoid duplicates handled by FixupM3u*
if url.endswith('.m3u8'):
return
size = os.path.getsize(info_dict['filepath']) size = os.path.getsize(info_dict['filepath'])
self.logger.info( self.logger.info(
'pushing %r video stitched-up as %s (%s bytes) to ' 'pushing %r video as %s (%s bytes) to '
'warcprox at %s with url %s', info_dict['format'], 'warcprox at %s with url %s', info_dict['format'],
mimetype, size, worker._proxy_for(site), url) mimetype, size, worker._proxy_for(site), url)
with open(info_dict['filepath'], 'rb') as f: with open(info_dict['filepath'], 'rb') as f:
@ -198,7 +197,7 @@ def _build_youtube_dl(worker, destdir, site, page):
warc_type='resource', content_type=mimetype, payload=f, warc_type='resource', content_type=mimetype, payload=f,
extra_headers=extra_headers) extra_headers=extra_headers)
# consulted by _remember_videos() # consulted by _remember_videos()
ydl.stitch_ups.append({ ydl.pushed_videos.append({
'url': url, 'url': url,
'response_code': response.code, 'response_code': response.code,
'content-type': mimetype, 'content-type': mimetype,
@ -225,7 +224,7 @@ def _build_youtube_dl(worker, destdir, site, page):
worker.logger.info('[ydl_postprocess_hook] Finished postprocessing') worker.logger.info('[ydl_postprocess_hook] Finished postprocessing')
worker.logger.info('[ydl_postprocess_hook] postprocessor: {}'.format(d['postprocessor'])) worker.logger.info('[ydl_postprocess_hook] postprocessor: {}'.format(d['postprocessor']))
if worker._using_warcprox(site): if worker._using_warcprox(site):
_YoutubeDL._push_stitched_up_vid_to_warcprox(_YoutubeDL, site, d['info_dict'], d['postprocessor']) _YoutubeDL._push_video_to_warcprox(_YoutubeDL, site, d['info_dict'], d['postprocessor'])
# default socket_timeout is 20 -- we hit it often when cluster is busy # default socket_timeout is 20 -- we hit it often when cluster is busy
ydl_opts = { ydl_opts = {
@ -274,11 +273,11 @@ def _build_youtube_dl(worker, destdir, site, page):
if site.extra_headers(): if site.extra_headers():
ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page))) ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page)))
ydl.fetch_spy = YoutubeDLSpy() ydl.fetch_spy = YoutubeDLSpy()
ydl.stitch_ups = [] ydl.pushed_videos = []
ydl._opener.add_handler(ydl.fetch_spy) ydl._opener.add_handler(ydl.fetch_spy)
return ydl return ydl
def _remember_videos(page, fetches, stitch_ups=None): def _remember_videos(page, fetches, pushed_videos=None):
''' '''
Saves info about videos captured by yt-dlp in `page.videos`. Saves info about videos captured by yt-dlp in `page.videos`.
''' '''
@ -309,14 +308,14 @@ def _remember_videos(page, fetches, stitch_ups=None):
'response_headers']['content-range'] 'response_headers']['content-range']
logging.debug('embedded video %s', video) logging.debug('embedded video %s', video)
page.videos.append(video) page.videos.append(video)
for stitch_up in stitch_ups or []: for pushed_video in pushed_videos or []:
if stitch_up['content-type'].startswith('video/'): if pushed_video['content-type'].startswith('video/'):
video = { video = {
'blame': 'youtube-dl', 'blame': 'youtube-dl',
'url': stitch_up['url'], 'url': pushed_video['url'],
'response_code': stitch_up['response_code'], 'response_code': pushed_video['response_code'],
'content-type': stitch_up['content-type'], 'content-type': pushed_video['content-type'],
'content-length': stitch_up['content-length'], 'content-length': pushed_video['content-length'],
} }
logging.debug('embedded video %s', video) logging.debug('embedded video %s', video)
page.videos.append(video) page.videos.append(video)
@ -331,7 +330,7 @@ def _try_youtube_dl(worker, ydl, site, page):
# needs automated test # needs automated test
# and yt-dlp needs sanitize_info for extract_info # and yt-dlp needs sanitize_info for extract_info
ie_result = ydl.sanitize_info(ydl.extract_info(str(urlcanon.whatwg(page.url)))) ie_result = ydl.sanitize_info(ydl.extract_info(str(urlcanon.whatwg(page.url))))
_remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups) _remember_videos(page, ydl.fetch_spy.fetches, ydl.pushed_videos)
if worker._using_warcprox(site): if worker._using_warcprox(site):
info_json = json.dumps(ie_result, sort_keys=True, indent=4) info_json = json.dumps(ie_result, sort_keys=True, indent=4)
logging.info( logging.info(