mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-23 16:19:49 -05:00
update for m3u8s, better naming
This commit is contained in:
parent
75e0555d43
commit
eef7173d72
@ -99,9 +99,7 @@ def _build_youtube_dl(worker, destdir, site, page):
|
||||
|
||||
- keeps track of urls fetched using a `YoutubeDLSpy`
|
||||
- periodically updates `site.last_claimed` in rethinkdb
|
||||
- if brozzling through warcprox and downloading segmented videos (e.g.
|
||||
HLS), pushes the stitched-up video created by yt-dlp/ffmpeg to warcprox
|
||||
using a WARCPROX_WRITE_RECORD request
|
||||
- pushes captured video to warcprox using a WARCPROX_WRITE_RECORD request
|
||||
- some logging
|
||||
|
||||
Args:
|
||||
@ -153,7 +151,7 @@ def _build_youtube_dl(worker, destdir, site, page):
|
||||
self.logger.info(
|
||||
'extractor %r found a download in %s', ie.IE_NAME, url)
|
||||
|
||||
def _push_stitched_up_vid_to_warcprox(self, site, info_dict, postprocessor):
|
||||
def _push_video_to_warcprox(self, site, info_dict, postprocessor):
|
||||
# 220211 update: does yt-dlp supply content-type? no, not as such
|
||||
# XXX Don't know how to get the right content-type. Youtube-dl
|
||||
# doesn't supply it. Sometimes (with --hls-prefer-native)
|
||||
@ -172,20 +170,21 @@ def _build_youtube_dl(worker, destdir, site, page):
|
||||
'guessing mimetype %s because %r', mimetype, e)
|
||||
|
||||
# youtube watch page postprocessor is MoveFiles
|
||||
# but current yt-dlp duplicates capture if we handle FixupM3u8!!!
|
||||
# we'll ignore postprocessor for now...
|
||||
'''
|
||||
|
||||
if postprocessor == 'FixupM3u8':
|
||||
url = 'youtube-dl:%05d:%s' % (
|
||||
info_dict.get('playlist_index') or 1,
|
||||
info_dict['webpage_url'])
|
||||
else:
|
||||
'''
|
||||
url = info_dict.get('url')
|
||||
url = info_dict.get('url')
|
||||
|
||||
# skip urls ending .m3u8, to avoid duplicates handled by FixupM3u*
|
||||
if url.endswith('.m3u8'):
|
||||
return
|
||||
|
||||
size = os.path.getsize(info_dict['filepath'])
|
||||
self.logger.info(
|
||||
'pushing %r video stitched-up as %s (%s bytes) to '
|
||||
'pushing %r video as %s (%s bytes) to '
|
||||
'warcprox at %s with url %s', info_dict['format'],
|
||||
mimetype, size, worker._proxy_for(site), url)
|
||||
with open(info_dict['filepath'], 'rb') as f:
|
||||
@ -198,7 +197,7 @@ def _build_youtube_dl(worker, destdir, site, page):
|
||||
warc_type='resource', content_type=mimetype, payload=f,
|
||||
extra_headers=extra_headers)
|
||||
# consulted by _remember_videos()
|
||||
ydl.stitch_ups.append({
|
||||
ydl.pushed_videos.append({
|
||||
'url': url,
|
||||
'response_code': response.code,
|
||||
'content-type': mimetype,
|
||||
@ -225,7 +224,7 @@ def _build_youtube_dl(worker, destdir, site, page):
|
||||
worker.logger.info('[ydl_postprocess_hook] Finished postprocessing')
|
||||
worker.logger.info('[ydl_postprocess_hook] postprocessor: {}'.format(d['postprocessor']))
|
||||
if worker._using_warcprox(site):
|
||||
_YoutubeDL._push_stitched_up_vid_to_warcprox(_YoutubeDL, site, d['info_dict'], d['postprocessor'])
|
||||
_YoutubeDL._push_video_to_warcprox(_YoutubeDL, site, d['info_dict'], d['postprocessor'])
|
||||
|
||||
# default socket_timeout is 20 -- we hit it often when cluster is busy
|
||||
ydl_opts = {
|
||||
@ -274,11 +273,11 @@ def _build_youtube_dl(worker, destdir, site, page):
|
||||
if site.extra_headers():
|
||||
ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page)))
|
||||
ydl.fetch_spy = YoutubeDLSpy()
|
||||
ydl.stitch_ups = []
|
||||
ydl.pushed_videos = []
|
||||
ydl._opener.add_handler(ydl.fetch_spy)
|
||||
return ydl
|
||||
|
||||
def _remember_videos(page, fetches, stitch_ups=None):
|
||||
def _remember_videos(page, fetches, pushed_videos=None):
|
||||
'''
|
||||
Saves info about videos captured by yt-dlp in `page.videos`.
|
||||
'''
|
||||
@ -309,14 +308,14 @@ def _remember_videos(page, fetches, stitch_ups=None):
|
||||
'response_headers']['content-range']
|
||||
logging.debug('embedded video %s', video)
|
||||
page.videos.append(video)
|
||||
for stitch_up in stitch_ups or []:
|
||||
if stitch_up['content-type'].startswith('video/'):
|
||||
for pushed_video in pushed_videos or []:
|
||||
if pushed_video['content-type'].startswith('video/'):
|
||||
video = {
|
||||
'blame': 'youtube-dl',
|
||||
'url': stitch_up['url'],
|
||||
'response_code': stitch_up['response_code'],
|
||||
'content-type': stitch_up['content-type'],
|
||||
'content-length': stitch_up['content-length'],
|
||||
'url': pushed_video['url'],
|
||||
'response_code': pushed_video['response_code'],
|
||||
'content-type': pushed_video['content-type'],
|
||||
'content-length': pushed_video['content-length'],
|
||||
}
|
||||
logging.debug('embedded video %s', video)
|
||||
page.videos.append(video)
|
||||
@ -331,7 +330,7 @@ def _try_youtube_dl(worker, ydl, site, page):
|
||||
# needs automated test
|
||||
# and yt-dlp needs sanitize_info for extract_info
|
||||
ie_result = ydl.sanitize_info(ydl.extract_info(str(urlcanon.whatwg(page.url))))
|
||||
_remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
|
||||
_remember_videos(page, ydl.fetch_spy.fetches, ydl.pushed_videos)
|
||||
if worker._using_warcprox(site):
|
||||
info_json = json.dumps(ie_result, sort_keys=True, indent=4)
|
||||
logging.info(
|
||||
|
Loading…
x
Reference in New Issue
Block a user