mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
brozzler/ydl.py updates
This commit is contained in:
parent
0305db5e69
commit
25bb65a635
104
brozzler/ydl.py
104
brozzler/ydl.py
@ -1,7 +1,7 @@
|
||||
'''
|
||||
brozzler/ydl.py - youtube-dl support for brozzler
|
||||
brozzler/ydl.py - youtube-dl / yt-dlp support for brozzler
|
||||
|
||||
Copyright (C) 2018 Internet Archive
|
||||
Copyright (C) 2022 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@ -17,7 +17,7 @@ limitations under the License.
|
||||
'''
|
||||
|
||||
import logging
|
||||
import youtube_dl
|
||||
import yt_dlp as youtube_dl
|
||||
import brozzler
|
||||
import urllib.request
|
||||
import tempfile
|
||||
@ -29,31 +29,17 @@ import datetime
|
||||
import threading
|
||||
|
||||
thread_local = threading.local()
|
||||
_orig__finish_frag_download = youtube_dl.downloader.fragment.FragmentFD._finish_frag_download
|
||||
def _finish_frag_download(ffd_self, ctx):
|
||||
'''
|
||||
We monkey-patch this youtube-dl internal method `_finish_frag_download()`
|
||||
because it gets called after downloading the last segment of a segmented
|
||||
video, which is a good time to upload the stitched-up video that youtube-dl
|
||||
creates for us to warcprox. We have it call a thread-local callback
|
||||
since different threads may be youtube-dl'ing at the same time.
|
||||
'''
|
||||
result = _orig__finish_frag_download(ffd_self, ctx)
|
||||
if hasattr(thread_local, 'finish_frag_download_callback'):
|
||||
thread_local.finish_frag_download_callback(ffd_self, ctx)
|
||||
return result
|
||||
youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _finish_frag_download
|
||||
|
||||
_orig_webpage_read_content = youtube_dl.extractor.generic.GenericIE._webpage_read_content
|
||||
_orig_webpage_read_content = youtube_dl.extractor.GenericIE._webpage_read_content
|
||||
def _webpage_read_content(self, *args, **kwargs):
|
||||
content = _orig_webpage_read_content(self, *args, **kwargs)
|
||||
if len(content) > 20000000:
|
||||
logging.warning(
|
||||
'bypassing youtube-dl extraction because content is '
|
||||
'bypassing yt-dlp extraction because content is '
|
||||
'too large (%s characters)', len(content))
|
||||
return ''
|
||||
return content
|
||||
youtube_dl.extractor.generic.GenericIE._webpage_read_content = _webpage_read_content
|
||||
youtube_dl.extractor.GenericIE._webpage_read_content = _webpage_read_content
|
||||
|
||||
class ExtraHeaderAdder(urllib.request.BaseHandler):
|
||||
def __init__(self, extra_headers):
|
||||
@ -117,14 +103,14 @@ def final_bounces(fetches, url):
|
||||
|
||||
def _build_youtube_dl(worker, destdir, site):
|
||||
'''
|
||||
Builds a `youtube_dl.YoutubeDL` for brozzling `site` with `worker`.
|
||||
Builds a yt-dlp `youtube_dl.YoutubeDL` for brozzling `site` with `worker`.
|
||||
|
||||
The `YoutubeDL` instance does a few special brozzler-specific things:
|
||||
|
||||
- keeps track of urls fetched using a `YoutubeDLSpy`
|
||||
- periodically updates `site.last_claimed` in rethinkdb
|
||||
- if brozzling through warcprox and downloading segmented videos (e.g.
|
||||
HLS), pushes the stitched-up video created by youtube-dl to warcprox
|
||||
HLS), pushes the stitched-up video created by yt-dlp/ffmpeg to warcprox
|
||||
using a WARCPROX_WRITE_RECORD request
|
||||
- some logging
|
||||
|
||||
@ -134,7 +120,7 @@ def _build_youtube_dl(worker, destdir, site):
|
||||
site (brozzler.Site): the site we are brozzling
|
||||
|
||||
Returns:
|
||||
a `youtube_dl.YoutubeDL` instance
|
||||
a yt-dlp `youtube_dl.YoutubeDL` instance
|
||||
'''
|
||||
|
||||
class _YoutubeDL(youtube_dl.YoutubeDL):
|
||||
@ -160,7 +146,13 @@ def _build_youtube_dl(worker, destdir, site):
|
||||
# videos. We unroll that iterator here partly because
|
||||
# otherwise `process_ie_result()` will clobber it, and we
|
||||
# use it later to extract the watch pages as outlinks.
|
||||
ie_result['entries_no_dl'] = list(ie_result['entries'])
|
||||
try:
|
||||
ie_result['entries_no_dl'] = list(ie_result['entries'])
|
||||
except Exception as e:
|
||||
self.logger.warning(
|
||||
"failed to unroll ie_result['entries']? for %s, %s; exception %s",
|
||||
ie.IE_NAME, url, e)
|
||||
ie_result['entries_no_dl'] =[]
|
||||
ie_result['entries'] = []
|
||||
self.logger.info(
|
||||
'not downloading %s media files from this '
|
||||
@ -171,7 +163,8 @@ def _build_youtube_dl(worker, destdir, site):
|
||||
self.logger.info(
|
||||
'extractor %r found a download in %s', ie.IE_NAME, url)
|
||||
|
||||
def _push_stitched_up_vid_to_warcprox(self, site, info_dict, ctx):
|
||||
def _push_stitched_up_vid_to_warcprox(self, site, info_dict):
|
||||
# 220211 update: does yt-dlp supply content-type?
|
||||
# XXX Don't know how to get the right content-type. Youtube-dl
|
||||
# doesn't supply it. Sometimes (with --hls-prefer-native)
|
||||
# youtube-dl produces a stitched-up video that /usr/bin/file fails
|
||||
@ -182,7 +175,7 @@ def _build_youtube_dl(worker, destdir, site):
|
||||
else:
|
||||
try:
|
||||
import magic
|
||||
mimetype = magic.from_file(ctx['filename'], mime=True)
|
||||
mimetype = magic.from_file(info_dict['filepath'], mime=True)
|
||||
except ImportError as e:
|
||||
mimetype = 'video/%s' % info_dict['ext']
|
||||
self.logger.warning(
|
||||
@ -191,12 +184,12 @@ def _build_youtube_dl(worker, destdir, site):
|
||||
url = 'youtube-dl:%05d:%s' % (
|
||||
info_dict.get('playlist_index') or 1,
|
||||
info_dict['webpage_url'])
|
||||
size = os.path.getsize(ctx['filename'])
|
||||
size = os.path.getsize(info_dict['filepath'])
|
||||
self.logger.info(
|
||||
'pushing %r video stitched-up as %s (%s bytes) to '
|
||||
'warcprox at %s with url %s', info_dict['format'],
|
||||
mimetype, size, worker._proxy_for(site), url)
|
||||
with open(ctx['filename'], 'rb') as f:
|
||||
with open(info_dict['filepath'], 'rb') as f:
|
||||
# include content-length header to avoid chunked
|
||||
# transfer, which warcprox currently rejects
|
||||
extra_headers = dict(site.extra_headers())
|
||||
@ -206,28 +199,15 @@ def _build_youtube_dl(worker, destdir, site):
|
||||
warc_type='resource', content_type=mimetype, payload=f,
|
||||
extra_headers=extra_headers)
|
||||
# consulted by _remember_videos()
|
||||
self.stitch_ups.append({
|
||||
ydl.stitch_ups.append({
|
||||
'url': url,
|
||||
'response_code': response.code,
|
||||
'content-type': mimetype,
|
||||
'content-length': size,
|
||||
})
|
||||
|
||||
def process_info(self, info_dict):
|
||||
'''
|
||||
See comment above on `_finish_frag_download()`
|
||||
'''
|
||||
def ffd_callback(ffd_self, ctx):
|
||||
if worker._using_warcprox(site):
|
||||
self._push_stitched_up_vid_to_warcprox(site, info_dict, ctx)
|
||||
try:
|
||||
thread_local.finish_frag_download_callback = ffd_callback
|
||||
return super().process_info(info_dict)
|
||||
finally:
|
||||
delattr(thread_local, 'finish_frag_download_callback')
|
||||
|
||||
def maybe_heartbeat_site_last_claimed(*args, **kwargs):
|
||||
# in case youtube-dl takes a long time, heartbeat site.last_claimed
|
||||
# in case yt-dlp takes a long time, heartbeat site.last_claimed
|
||||
# to prevent another brozzler-worker from claiming the site
|
||||
try:
|
||||
if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=worker.SITE_SESSION_MINUTES):
|
||||
@ -241,29 +221,40 @@ def _build_youtube_dl(worker, destdir, site):
|
||||
'problem heartbeating site.last_claimed site id=%r',
|
||||
site.id, exc_info=True)
|
||||
|
||||
def ydl_postprocess_hook(d):
|
||||
if d['status'] == 'finished':
|
||||
print('[ydl_postprocess_hook] Done postprocessing')
|
||||
if worker._using_warcprox(site):
|
||||
_YoutubeDL._push_stitched_up_vid_to_warcprox(_YoutubeDL, site, d['info_dict'])
|
||||
|
||||
ydl_opts = {
|
||||
"outtmpl": "{}/ydl%(autonumber)s.out".format(destdir),
|
||||
"retries": 1,
|
||||
"nocheckcertificate": True,
|
||||
"hls_prefer_native": True,
|
||||
"noplaylist": True,
|
||||
"noprogress": True,
|
||||
"nopart": True,
|
||||
"no_color": True,
|
||||
"progress_hooks": [maybe_heartbeat_site_last_claimed],
|
||||
"postprocessor_hooks": [ydl_postprocess_hook],
|
||||
|
||||
# https://github.com/rg3/youtube-dl/blob/master/README.md#format-selection
|
||||
# "best: Select the best quality format represented by a single
|
||||
# file with video and audio."
|
||||
"format": "best/bestvideo+bestaudio",
|
||||
# https://github.com/yt-dlp/yt-dlp#format-selection
|
||||
# "By default, yt-dlp tries to download the best available quality..."
|
||||
# https://github.com/yt-dlp/yt-dlp#sorting-formats
|
||||
# "You can change the criteria for being considered the best by using -S (--format-sort)...."
|
||||
# "vext: Video Extension (mp4 > webm > flv > other). If --prefer-free-formats is used, webm is preferred."
|
||||
# "aext: Audio Extension (m4a > aac > mp3 > ogg > opus > webm > other)."
|
||||
# "If --prefer-free-formats is used, the order changes to opus > ogg > webm > m4a > mp3 > aac."
|
||||
# "ext: Equivalent to vext,aext"
|
||||
"format_sort": ["ext"],
|
||||
|
||||
# --cache-dir local or...
|
||||
"cache_dir": False,
|
||||
|
||||
### we do our own logging
|
||||
# "logger": logging.getLogger("youtube_dl"),
|
||||
"verbose": False,
|
||||
"quiet": True,
|
||||
"verbose": True,
|
||||
"quiet": False,
|
||||
}
|
||||
if worker._proxy_for(site):
|
||||
ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site))
|
||||
@ -277,7 +268,7 @@ def _build_youtube_dl(worker, destdir, site):
|
||||
|
||||
def _remember_videos(page, fetches, stitch_ups=None):
|
||||
'''
|
||||
Saves info about videos captured by youtube-dl in `page.videos`.
|
||||
Saves info about videos captured by yt-dlp in `page.videos`.
|
||||
'''
|
||||
if not 'videos' in page:
|
||||
page.videos = []
|
||||
@ -317,19 +308,20 @@ def _remember_videos(page, fetches, stitch_ups=None):
|
||||
|
||||
def _try_youtube_dl(worker, ydl, site, page):
|
||||
try:
|
||||
logging.info("trying youtube-dl on %s", page)
|
||||
logging.info("trying yt-dlp on %s", page)
|
||||
|
||||
with brozzler.thread_accept_exceptions():
|
||||
# we do whatwg canonicalization here to avoid "<urlopen error
|
||||
# no host given>" resulting in ProxyError
|
||||
# needs automated test
|
||||
ie_result = ydl.extract_info(str(urlcanon.whatwg(page.url)))
|
||||
# and yt-dlp needs sanitize_info for extract_info
|
||||
ie_result = ydl.sanitize_info(ydl.extract_info(str(urlcanon.whatwg(page.url))))
|
||||
_remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
|
||||
if worker._using_warcprox(site):
|
||||
info_json = json.dumps(ie_result, sort_keys=True, indent=4)
|
||||
logging.info(
|
||||
"sending WARCPROX_WRITE_RECORD request to warcprox "
|
||||
"with youtube-dl json for %s", page)
|
||||
"with yt-dlp json for %s", page)
|
||||
worker._warcprox_write_record(
|
||||
warcprox_address=worker._proxy_for(site),
|
||||
url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
|
||||
@ -353,14 +345,14 @@ def _try_youtube_dl(worker, ydl, site, page):
|
||||
and worker._proxy_for(site)):
|
||||
# connection problem when using a proxy == proxy error (XXX?)
|
||||
raise brozzler.ProxyError(
|
||||
'youtube-dl hit apparent proxy error from '
|
||||
'yt-dlp hit apparent proxy error from '
|
||||
'%s' % page.url) from e
|
||||
else:
|
||||
raise
|
||||
|
||||
def do_youtube_dl(worker, site, page):
|
||||
'''
|
||||
Runs youtube-dl configured for `worker` and `site` to download videos from
|
||||
Runs yt-dlp configured for `worker` and `site` to download videos from
|
||||
`page`.
|
||||
|
||||
Args:
|
||||
|
Loading…
x
Reference in New Issue
Block a user