mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
update for m3u8s, better naming
This commit is contained in:
parent
75e0555d43
commit
eef7173d72
@ -99,9 +99,7 @@ def _build_youtube_dl(worker, destdir, site, page):
|
|||||||
|
|
||||||
- keeps track of urls fetched using a `YoutubeDLSpy`
|
- keeps track of urls fetched using a `YoutubeDLSpy`
|
||||||
- periodically updates `site.last_claimed` in rethinkdb
|
- periodically updates `site.last_claimed` in rethinkdb
|
||||||
- if brozzling through warcprox and downloading segmented videos (e.g.
|
- pushes captured video to warcprox using a WARCPROX_WRITE_RECORD request
|
||||||
HLS), pushes the stitched-up video created by yt-dlp/ffmpeg to warcprox
|
|
||||||
using a WARCPROX_WRITE_RECORD request
|
|
||||||
- some logging
|
- some logging
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -153,7 +151,7 @@ def _build_youtube_dl(worker, destdir, site, page):
|
|||||||
self.logger.info(
|
self.logger.info(
|
||||||
'extractor %r found a download in %s', ie.IE_NAME, url)
|
'extractor %r found a download in %s', ie.IE_NAME, url)
|
||||||
|
|
||||||
def _push_stitched_up_vid_to_warcprox(self, site, info_dict, postprocessor):
|
def _push_video_to_warcprox(self, site, info_dict, postprocessor):
|
||||||
# 220211 update: does yt-dlp supply content-type? no, not as such
|
# 220211 update: does yt-dlp supply content-type? no, not as such
|
||||||
# XXX Don't know how to get the right content-type. Youtube-dl
|
# XXX Don't know how to get the right content-type. Youtube-dl
|
||||||
# doesn't supply it. Sometimes (with --hls-prefer-native)
|
# doesn't supply it. Sometimes (with --hls-prefer-native)
|
||||||
@ -172,20 +170,21 @@ def _build_youtube_dl(worker, destdir, site, page):
|
|||||||
'guessing mimetype %s because %r', mimetype, e)
|
'guessing mimetype %s because %r', mimetype, e)
|
||||||
|
|
||||||
# youtube watch page postprocessor is MoveFiles
|
# youtube watch page postprocessor is MoveFiles
|
||||||
# but current yt-dlp duplicates capture if we handle FixupM3u8!!!
|
|
||||||
# we'll ignore postprocessor for now...
|
|
||||||
'''
|
|
||||||
if postprocessor == 'FixupM3u8':
|
if postprocessor == 'FixupM3u8':
|
||||||
url = 'youtube-dl:%05d:%s' % (
|
url = 'youtube-dl:%05d:%s' % (
|
||||||
info_dict.get('playlist_index') or 1,
|
info_dict.get('playlist_index') or 1,
|
||||||
info_dict['webpage_url'])
|
info_dict['webpage_url'])
|
||||||
else:
|
else:
|
||||||
'''
|
url = info_dict.get('url')
|
||||||
url = info_dict.get('url')
|
|
||||||
|
# skip urls ending .m3u8, to avoid duplicates handled by FixupM3u*
|
||||||
|
if url.endswith('.m3u8'):
|
||||||
|
return
|
||||||
|
|
||||||
size = os.path.getsize(info_dict['filepath'])
|
size = os.path.getsize(info_dict['filepath'])
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
'pushing %r video stitched-up as %s (%s bytes) to '
|
'pushing %r video as %s (%s bytes) to '
|
||||||
'warcprox at %s with url %s', info_dict['format'],
|
'warcprox at %s with url %s', info_dict['format'],
|
||||||
mimetype, size, worker._proxy_for(site), url)
|
mimetype, size, worker._proxy_for(site), url)
|
||||||
with open(info_dict['filepath'], 'rb') as f:
|
with open(info_dict['filepath'], 'rb') as f:
|
||||||
@ -198,7 +197,7 @@ def _build_youtube_dl(worker, destdir, site, page):
|
|||||||
warc_type='resource', content_type=mimetype, payload=f,
|
warc_type='resource', content_type=mimetype, payload=f,
|
||||||
extra_headers=extra_headers)
|
extra_headers=extra_headers)
|
||||||
# consulted by _remember_videos()
|
# consulted by _remember_videos()
|
||||||
ydl.stitch_ups.append({
|
ydl.pushed_videos.append({
|
||||||
'url': url,
|
'url': url,
|
||||||
'response_code': response.code,
|
'response_code': response.code,
|
||||||
'content-type': mimetype,
|
'content-type': mimetype,
|
||||||
@ -225,7 +224,7 @@ def _build_youtube_dl(worker, destdir, site, page):
|
|||||||
worker.logger.info('[ydl_postprocess_hook] Finished postprocessing')
|
worker.logger.info('[ydl_postprocess_hook] Finished postprocessing')
|
||||||
worker.logger.info('[ydl_postprocess_hook] postprocessor: {}'.format(d['postprocessor']))
|
worker.logger.info('[ydl_postprocess_hook] postprocessor: {}'.format(d['postprocessor']))
|
||||||
if worker._using_warcprox(site):
|
if worker._using_warcprox(site):
|
||||||
_YoutubeDL._push_stitched_up_vid_to_warcprox(_YoutubeDL, site, d['info_dict'], d['postprocessor'])
|
_YoutubeDL._push_video_to_warcprox(_YoutubeDL, site, d['info_dict'], d['postprocessor'])
|
||||||
|
|
||||||
# default socket_timeout is 20 -- we hit it often when cluster is busy
|
# default socket_timeout is 20 -- we hit it often when cluster is busy
|
||||||
ydl_opts = {
|
ydl_opts = {
|
||||||
@ -274,11 +273,11 @@ def _build_youtube_dl(worker, destdir, site, page):
|
|||||||
if site.extra_headers():
|
if site.extra_headers():
|
||||||
ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page)))
|
ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page)))
|
||||||
ydl.fetch_spy = YoutubeDLSpy()
|
ydl.fetch_spy = YoutubeDLSpy()
|
||||||
ydl.stitch_ups = []
|
ydl.pushed_videos = []
|
||||||
ydl._opener.add_handler(ydl.fetch_spy)
|
ydl._opener.add_handler(ydl.fetch_spy)
|
||||||
return ydl
|
return ydl
|
||||||
|
|
||||||
def _remember_videos(page, fetches, stitch_ups=None):
|
def _remember_videos(page, fetches, pushed_videos=None):
|
||||||
'''
|
'''
|
||||||
Saves info about videos captured by yt-dlp in `page.videos`.
|
Saves info about videos captured by yt-dlp in `page.videos`.
|
||||||
'''
|
'''
|
||||||
@ -309,14 +308,14 @@ def _remember_videos(page, fetches, stitch_ups=None):
|
|||||||
'response_headers']['content-range']
|
'response_headers']['content-range']
|
||||||
logging.debug('embedded video %s', video)
|
logging.debug('embedded video %s', video)
|
||||||
page.videos.append(video)
|
page.videos.append(video)
|
||||||
for stitch_up in stitch_ups or []:
|
for pushed_video in pushed_videos or []:
|
||||||
if stitch_up['content-type'].startswith('video/'):
|
if pushed_video['content-type'].startswith('video/'):
|
||||||
video = {
|
video = {
|
||||||
'blame': 'youtube-dl',
|
'blame': 'youtube-dl',
|
||||||
'url': stitch_up['url'],
|
'url': pushed_video['url'],
|
||||||
'response_code': stitch_up['response_code'],
|
'response_code': pushed_video['response_code'],
|
||||||
'content-type': stitch_up['content-type'],
|
'content-type': pushed_video['content-type'],
|
||||||
'content-length': stitch_up['content-length'],
|
'content-length': pushed_video['content-length'],
|
||||||
}
|
}
|
||||||
logging.debug('embedded video %s', video)
|
logging.debug('embedded video %s', video)
|
||||||
page.videos.append(video)
|
page.videos.append(video)
|
||||||
@ -331,7 +330,7 @@ def _try_youtube_dl(worker, ydl, site, page):
|
|||||||
# needs automated test
|
# needs automated test
|
||||||
# and yt-dlp needs sanitize_info for extract_info
|
# and yt-dlp needs sanitize_info for extract_info
|
||||||
ie_result = ydl.sanitize_info(ydl.extract_info(str(urlcanon.whatwg(page.url))))
|
ie_result = ydl.sanitize_info(ydl.extract_info(str(urlcanon.whatwg(page.url))))
|
||||||
_remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
|
_remember_videos(page, ydl.fetch_spy.fetches, ydl.pushed_videos)
|
||||||
if worker._using_warcprox(site):
|
if worker._using_warcprox(site):
|
||||||
info_json = json.dumps(ie_result, sort_keys=True, indent=4)
|
info_json = json.dumps(ie_result, sort_keys=True, indent=4)
|
||||||
logging.info(
|
logging.info(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user