mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-19 12:24:20 -04:00
push youtube-dl's stitched up videos to warcprox
(no tests yet)
This commit is contained in:
parent
4e398e1da2
commit
39155ebcc5
2 changed files with 65 additions and 9 deletions
|
@ -199,10 +199,64 @@ class BrozzlerWorker:
|
||||||
'IS' if self._proxy_is_warcprox else 'IS NOT')
|
'IS' if self._proxy_is_warcprox else 'IS NOT')
|
||||||
return self._proxy_is_warcprox
|
return self._proxy_is_warcprox
|
||||||
else:
|
else:
|
||||||
|
# I should have commented when I originally wrote this code, but I
|
||||||
|
# think this works because `site.proxy` is only set when the proxy
|
||||||
|
# is warcprox
|
||||||
return bool(site.proxy or self._warcprox_auto)
|
return bool(site.proxy or self._warcprox_auto)
|
||||||
|
|
||||||
|
|
||||||
def _youtube_dl(self, destdir, site):
|
def _youtube_dl(self, destdir, site):
|
||||||
|
class _YoutubeDL(youtube_dl.YoutubeDL):
|
||||||
|
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||||
|
|
||||||
|
def get_info_extractor(self, ie_key):
|
||||||
|
ie = super().get_info_extractor(ie_key)
|
||||||
|
self.logger.info('youtube-dl using extractor %s', ie)
|
||||||
|
return ie
|
||||||
|
|
||||||
|
# def process_ie_result(
|
||||||
|
# ydl_self, ie_result, download=True, extra_info={}):
|
||||||
|
# ie_result = super().process_ie_result(
|
||||||
|
# ie_result, download, extra_info)
|
||||||
|
# return ie_result
|
||||||
|
|
||||||
|
def process_info(ydl_self, info_dict):
|
||||||
|
_orig__finish_frag_download = youtube_dl.downloader.fragment.FragmentFD._finish_frag_download
|
||||||
|
def _finish_frag_download(ffd_self, ctx):
|
||||||
|
_orig__finish_frag_download(ffd_self, ctx)
|
||||||
|
if self._using_warcprox(site):
|
||||||
|
try:
|
||||||
|
import magic
|
||||||
|
mimetype = magic.from_file(
|
||||||
|
ctx['filename'], mime=True)
|
||||||
|
except ImportError as e:
|
||||||
|
mimetype = 'video/%s' % info_dict['ext']
|
||||||
|
ydl_self.logger.warn(
|
||||||
|
'guessing mimetype %s because %r',
|
||||||
|
mimetype, e)
|
||||||
|
url = 'youtube-dl:%05d:%s' % (
|
||||||
|
info_dict['playlist_index'],
|
||||||
|
info_dict['webpage_url'])
|
||||||
|
ydl_self.logger.info(
|
||||||
|
'pushing %r video stitched-up as %s (%s '
|
||||||
|
'bytes) to warcprox at %s with url %s',
|
||||||
|
info_dict['format'], mimetype,
|
||||||
|
ctx['complete_frags_downloaded_bytes'],
|
||||||
|
self._proxy_for(site), url)
|
||||||
|
with open(ctx['filename'], 'rb') as f:
|
||||||
|
# include content-length header to avoid chunked
|
||||||
|
# transfer, which warcprox currently does not
|
||||||
|
# accept
|
||||||
|
# XXX is `ctx['complete_frags_downloaded_bytes']`
|
||||||
|
# always == `os.path.getsize(ctx['filename'])`?
|
||||||
|
self._warcprox_write_record(
|
||||||
|
warcprox_address=self._proxy_for(site),
|
||||||
|
url=url, warc_type='resource',
|
||||||
|
content_type=mimetype, payload=f,
|
||||||
|
extra_headers={'content-length': ctx['complete_frags_downloaded_bytes']})
|
||||||
|
|
||||||
|
youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _finish_frag_download
|
||||||
|
return super().process_info(info_dict)
|
||||||
|
|
||||||
def ydl_progress(*args, **kwargs):
|
def ydl_progress(*args, **kwargs):
|
||||||
# in case youtube-dl takes a long time, heartbeat site.last_claimed
|
# in case youtube-dl takes a long time, heartbeat site.last_claimed
|
||||||
# to prevent another brozzler-worker from claiming the site
|
# to prevent another brozzler-worker from claiming the site
|
||||||
|
@ -236,11 +290,7 @@ class BrozzlerWorker:
|
||||||
}
|
}
|
||||||
if self._proxy_for(site):
|
if self._proxy_for(site):
|
||||||
ydl_opts["proxy"] = "http://{}".format(self._proxy_for(site))
|
ydl_opts["proxy"] = "http://{}".format(self._proxy_for(site))
|
||||||
## XXX (sometimes?) causes chrome debug websocket to go through
|
ydl = _YoutubeDL(ydl_opts)
|
||||||
## proxy. Maybe not needed thanks to hls_prefer_native.
|
|
||||||
## # see https://github.com/rg3/youtube-dl/issues/6087
|
|
||||||
## os.environ["http_proxy"] = "http://{}".format(self._proxy_for(site))
|
|
||||||
ydl = youtube_dl.YoutubeDL(ydl_opts)
|
|
||||||
if site.extra_headers():
|
if site.extra_headers():
|
||||||
ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers()))
|
ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers()))
|
||||||
ydl.brozzler_spy = YoutubeDLSpy()
|
ydl.brozzler_spy = YoutubeDLSpy()
|
||||||
|
@ -314,9 +364,14 @@ class BrozzlerWorker:
|
||||||
# we do whatwg canonicalization here to avoid "<urlopen error
|
# we do whatwg canonicalization here to avoid "<urlopen error
|
||||||
# no host given>" resulting in ProxyError
|
# no host given>" resulting in ProxyError
|
||||||
# needs automated test
|
# needs automated test
|
||||||
info = ydl.extract_info(str(urlcanon.whatwg(page.url)))
|
ie_result = ydl.extract_info(str(urlcanon.whatwg(page.url)))
|
||||||
|
# ie_result = ydl.extract_info(
|
||||||
|
# str(urlcanon.whatwg(page.url)), download=False)
|
||||||
|
# if ie_result.get('_type') in ('playlist', 'multi_video'):
|
||||||
|
# ie_result = self._ydl_playlist(ie_result)
|
||||||
|
# else:
|
||||||
|
# ie_result = process_ie_result(ie_result, download=True)
|
||||||
self._remember_videos(page, ydl.brozzler_spy)
|
self._remember_videos(page, ydl.brozzler_spy)
|
||||||
# logging.info('XXX %s', json.dumps(info))
|
|
||||||
if self._using_warcprox(site):
|
if self._using_warcprox(site):
|
||||||
info_json = json.dumps(info, sort_keys=True, indent=4)
|
info_json = json.dumps(info, sort_keys=True, indent=4)
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
|
@ -576,7 +631,7 @@ class BrozzlerWorker:
|
||||||
# using brozzler-worker --proxy, nothing to do but try the
|
# using brozzler-worker --proxy, nothing to do but try the
|
||||||
# same proxy again next time
|
# same proxy again next time
|
||||||
logging.error(
|
logging.error(
|
||||||
'proxy error (site.proxy=%r): %r', site.proxy, e)
|
'proxy error (self._proxy=%r)', self._proxy, exc_info=1)
|
||||||
except:
|
except:
|
||||||
self.logger.critical("unexpected exception", exc_info=True)
|
self.logger.critical("unexpected exception", exc_info=True)
|
||||||
finally:
|
finally:
|
||||||
|
|
1
setup.py
1
setup.py
|
@ -75,6 +75,7 @@ setuptools.setup(
|
||||||
'cerberus==1.0.1',
|
'cerberus==1.0.1',
|
||||||
'jinja2',
|
'jinja2',
|
||||||
'cryptography!=2.1.1', # 2.1.1 installation is failing on ubuntu
|
'cryptography!=2.1.1', # 2.1.1 installation is failing on ubuntu
|
||||||
|
'python-magic',
|
||||||
],
|
],
|
||||||
extras_require={
|
extras_require={
|
||||||
'dashboard': ['flask>=0.11', 'gunicorn'],
|
'dashboard': ['flask>=0.11', 'gunicorn'],
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue