Merge pull request #115 from nlevitt/ydl-stitched

Ydl stitched
This commit is contained in:
jkafader 2018-09-06 16:15:52 -07:00 committed by GitHub
commit 8368cd2bcb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 441 additions and 216 deletions

View File

@ -23,92 +23,18 @@ import brozzler
import brozzler.browser
import threading
import time
import youtube_dl
import urllib.request
import json
import PIL.Image
import io
import socket
import collections
import requests
import doublethink
import tempfile
import urlcanon
from requests.structures import CaseInsensitiveDict
import rethinkdb as r
import datetime
import urllib.parse
_orig_webpage_read_content = youtube_dl.extractor.generic.GenericIE._webpage_read_content
def _webpage_read_content(self, *args, **kwargs):
content = _orig_webpage_read_content(self, *args, **kwargs)
if len(content) > 20000000:
logging.warn(
'bypassing youtube-dl extraction because content is '
'too large (%s characters)', len(content))
return ''
return content
youtube_dl.extractor.generic.GenericIE._webpage_read_content = _webpage_read_content
class ExtraHeaderAdder(urllib.request.BaseHandler):
def __init__(self, extra_headers):
self.extra_headers = extra_headers
self.http_request = self._http_request
self.https_request = self._http_request
def _http_request(self, req):
for h, v in self.extra_headers.items():
if h.capitalize() not in req.headers:
req.add_header(h, v)
return req
class YoutubeDLSpy(urllib.request.BaseHandler):
logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self):
self.reset()
def _http_response(self, request, response):
txn = {
'url': request.full_url,
'method': request.get_method(),
'status_code': response.code,
'response_headers': response.headers,
}
self.transactions.append(txn)
return response
http_response = https_response = _http_response
def reset(self):
self.transactions = []
def final_bounces(self, url):
"""
Resolves redirect chains in self.transactions, returns a list of
Transaction representing the final redirect destinations of the given
url. There could be more than one if for example youtube-dl hit the
same url with HEAD and then GET requests.
"""
redirects = {}
for txn in self.transactions:
# XXX check http status 301,302,303,307? check for "uri" header
# as well as "location"? see urllib.request.HTTPRedirectHandler
if 'location' in txn['response_headers']:
redirects[txn['url']] = txn
final_url = url
while final_url in redirects:
txn = redirects.pop(final_url)
final_url = urllib.parse.urljoin(
txn['url'], txn['response_headers']['location'])
final_bounces = []
for txn in self.transactions:
if txn['url'] == final_url:
final_bounces.append(txn)
return final_bounces
from . import ydl
class BrozzlerWorker:
logger = logging.getLogger(__module__ + "." + __qualname__)
@ -199,54 +125,11 @@ class BrozzlerWorker:
'IS' if self._proxy_is_warcprox else 'IS NOT')
return self._proxy_is_warcprox
else:
# I should have commented when I originally wrote this code, but I
# think this works because `site.proxy` is only set when the proxy
# is warcprox
return bool(site.proxy or self._warcprox_auto)
def _youtube_dl(self, destdir, site):
def ydl_progress(*args, **kwargs):
# in case youtube-dl takes a long time, heartbeat site.last_claimed
# to prevent another brozzler-worker from claiming the site
try:
if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=self.SITE_SESSION_MINUTES):
self.logger.debug(
'heartbeating site.last_claimed to prevent another '
'brozzler-worker claiming this site id=%r', site.id)
site.last_claimed = doublethink.utcnow()
site.save()
except:
self.logger.debug(
'problem heartbeating site.last_claimed site id=%r',
site.id, exc_info=True)
ydl_opts = {
"outtmpl": "{}/ydl%(autonumber)s.out".format(destdir),
"verbose": False,
"retries": 1,
"logger": logging.getLogger("youtube_dl"),
"nocheckcertificate": True,
"hls_prefer_native": True,
"noprogress": True,
"nopart": True,
"no_color": True,
"progress_hooks": [ydl_progress],
# https://github.com/rg3/youtube-dl/blob/master/README.md#format-selection
# "best: Select the best quality format represented by a single
# file with video and audio."
"format": "best/bestvideo+bestaudio",
}
if self._proxy_for(site):
ydl_opts["proxy"] = "http://{}".format(self._proxy_for(site))
## XXX (sometimes?) causes chrome debug websocket to go through
## proxy. Maybe not needed thanks to hls_prefer_native.
## # see https://github.com/rg3/youtube-dl/issues/6087
## os.environ["http_proxy"] = "http://{}".format(self._proxy_for(site))
ydl = youtube_dl.YoutubeDL(ydl_opts)
if site.extra_headers():
ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers()))
ydl.brozzler_spy = YoutubeDLSpy()
ydl._opener.add_handler(ydl.brozzler_spy)
return ydl
def _warcprox_write_record(
self, warcprox_address, url, warc_type, content_type,
payload, extra_headers=None):
@ -268,11 +151,13 @@ class BrozzlerWorker:
'got "%s %s" response on warcprox '
'WARCPROX_WRITE_RECORD request (expected 204)',
response.getcode(), response.reason)
return request, response
except urllib.error.HTTPError as e:
self.logger.warn(
'got "%s %s" response on warcprox '
'WARCPROX_WRITE_RECORD request (expected 204)',
e.getcode(), e.info())
return request, None
except urllib.error.URLError as e:
raise brozzler.ProxyError(
'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
@ -280,75 +165,6 @@ class BrozzlerWorker:
raise brozzler.ProxyError(
'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
def _remember_videos(self, page, ydl_spy):
if not 'videos' in page:
page.videos = []
for txn in ydl_spy.transactions:
content_type = txn['response_headers'].get_content_type()
if (content_type.startswith('video/')
# skip manifests of DASH segmented video -
# see https://github.com/internetarchive/brozzler/pull/70
and content_type != 'video/vnd.mpeg.dash.mpd'
and txn['method'] == 'GET'
and txn['status_code'] in (200, 206)):
video = {
'blame': 'youtube-dl',
'url': txn['url'],
'response_code': txn['status_code'],
'content-type': content_type,
}
if 'content-length' in txn['response_headers']:
video['content-length'] = int(
txn['response_headers']['content-length'])
if 'content-range' in txn['response_headers']:
video['content-range'] = txn[
'response_headers']['content-range']
logging.debug('embedded video %s', video)
page.videos.append(video)
def _try_youtube_dl(self, ydl, site, page):
try:
self.logger.info("trying youtube-dl on {}".format(page))
with brozzler.thread_accept_exceptions():
# we do whatwg canonicalization here to avoid "<urlopen error
# no host given>" resulting in ProxyError
# needs automated test
info = ydl.extract_info(str(urlcanon.whatwg(page.url)))
self._remember_videos(page, ydl.brozzler_spy)
# logging.info('XXX %s', json.dumps(info))
if self._using_warcprox(site):
info_json = json.dumps(info, sort_keys=True, indent=4)
self.logger.info(
"sending WARCPROX_WRITE_RECORD request to warcprox "
"with youtube-dl json for %s", page)
self._warcprox_write_record(
warcprox_address=self._proxy_for(site),
url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
warc_type="metadata",
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
payload=info_json.encode("utf-8"),
extra_headers=site.extra_headers())
except brozzler.ShutdownRequested as e:
raise
except BaseException as e:
if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
pass
elif (hasattr(e, "exc_info")
and e.exc_info[0] == urllib.error.HTTPError
and hasattr(e.exc_info[1], "code")
and e.exc_info[1].code == 420):
raise brozzler.ReachedLimit(e.exc_info[1])
elif (hasattr(e, 'exc_info')
and e.exc_info[0] == urllib.error.URLError
and self._proxy_for(site)):
# connection problem when using a proxy == proxy error (XXX?)
raise brozzler.ProxyError(
'youtube-dl hit apparent proxy error from '
'%s' % page.url) from e
else:
raise
def full_and_thumb_jpegs(self, large_png):
# these screenshots never have any alpha (right?)
img = PIL.Image.open(io.BytesIO(large_png)).convert('RGB')
@ -369,12 +185,10 @@ class BrozzlerWorker:
def brozzle_page(self, browser, site, page, on_screenshot=None,
on_request=None, enable_youtube_dl=True):
self.logger.info("brozzling {}".format(page))
ydl_fetches = None
if enable_youtube_dl:
try:
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
ydl = self._youtube_dl(tempdir, site)
ydl_spy = ydl.brozzler_spy # remember for later
self._try_youtube_dl(ydl, site, page)
ydl_fetches = ydl.do_youtube_dl(self, site, page)
except brozzler.ReachedLimit as e:
raise
except brozzler.ShutdownRequested:
@ -392,16 +206,14 @@ class BrozzlerWorker:
self.logger.error(
'youtube_dl raised exception on %s', page,
exc_info=True)
else:
ydl_spy = False
if self._needs_browsing(page, ydl_spy):
if self._needs_browsing(page, ydl_fetches):
self.logger.info('needs browsing: %s', page)
outlinks = self._browse_page(browser, site, page, on_screenshot,
on_request)
return outlinks
else:
if not self._already_fetched(page, ydl_spy):
if not self._already_fetched(page, ydl_fetches):
self.logger.info('needs fetch: %s', page)
self._fetch_url(site, page)
else:
@ -495,9 +307,9 @@ class BrozzlerWorker:
raise brozzler.ProxyError(
'proxy error fetching %s' % page.url) from e
def _needs_browsing(self, page, brozzler_spy):
if brozzler_spy:
final_bounces = brozzler_spy.final_bounces(page.url)
def _needs_browsing(self, page, ydl_fetches):
if ydl_fetches:
final_bounces = ydl.final_bounces(ydl_fetches, page.url)
if not final_bounces:
return True
for txn in final_bounces:
@ -508,10 +320,10 @@ class BrozzlerWorker:
else:
return True
def _already_fetched(self, page, brozzler_spy):
if brozzler_spy:
for txn in brozzler_spy.final_bounces(page.url):
if (txn['method'] == 'GET' and txn['status_code'] == 200):
def _already_fetched(self, page, ydl_fetches):
if ydl_fetches:
for fetch in ydl.final_bounces(ydl_fetches, page.url):
if (fetch['method'] == 'GET' and fetch['response_code'] == 200):
return True
return False
@ -527,7 +339,7 @@ class BrozzlerWorker:
# _proxy_for() call in log statement can raise brozzler.ProxyError
# which is why we honor time limit and stop request first☝🏻
self.logger.info(
"brozzling site (proxy=%r) %r",
"brozzling site (proxy=%r) %s",
self._proxy_for(site), site)
while time.time() - start < self.SITE_SESSION_MINUTES * 60:
site.refresh()
@ -576,7 +388,7 @@ class BrozzlerWorker:
# using brozzler-worker --proxy, nothing to do but try the
# same proxy again next time
logging.error(
'proxy error (site.proxy=%r): %r', site.proxy, e)
'proxy error (self._proxy=%r)', self._proxy, exc_info=1)
except:
self.logger.critical("unexpected exception", exc_info=True)
finally:

334
brozzler/ydl.py Normal file
View File

@ -0,0 +1,334 @@
'''
brozzler/ydl.py - youtube-dl support for brozzler
This code was extracted from worker.py.
Copyright (C) 2018 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
import datetime
import json
import logging
import os
import tempfile
import threading
import urllib.error
import urllib.parse
import urllib.request

import doublethink
import urlcanon
import youtube_dl

import brozzler
_orig_webpage_read_content = youtube_dl.extractor.generic.GenericIE._webpage_read_content
def _webpage_read_content(self, *args, **kwargs):
    '''
    Replacement for `GenericIE._webpage_read_content` that refuses to hand
    very large pages to youtube-dl's generic extractor.

    Delegates to the original implementation (saved in
    `_orig_webpage_read_content`) and returns its result unchanged, unless
    the content is longer than 20,000,000 characters, in which case a
    warning is logged and an empty string is returned instead.
    '''
    content = _orig_webpage_read_content(self, *args, **kwargs)
    if len(content) > 20000000:
        # `logging.warn` is a deprecated alias; `logging.warning` is the
        # supported spelling and emits the same record
        logging.warning(
                'bypassing youtube-dl extraction because content is '
                'too large (%s characters)', len(content))
        return ''
    return content
youtube_dl.extractor.generic.GenericIE._webpage_read_content = _webpage_read_content
class ExtraHeaderAdder(urllib.request.BaseHandler):
    '''
    urllib handler that adds a fixed set of extra headers to every outgoing
    http and https request, leaving alone any header the request already
    carries.
    '''

    def __init__(self, extra_headers):
        self.extra_headers = extra_headers
        # same pre-processing hook for both schemes
        self.http_request = self._http_request
        self.https_request = self._http_request

    def _http_request(self, req):
        '''
        Adds each configured header to `req` unless it is already present,
        then returns `req`.
        '''
        for name, value in self.extra_headers.items():
            # `Request.add_header` stores names capitalized, so compare in
            # that form
            if name.capitalize() in req.headers:
                continue
            req.add_header(name, value)
        return req
class YoutubeDLSpy(urllib.request.BaseHandler):
    '''
    urllib handler that records every http and https response passing
    through it. Each record is a dict with keys `url`, `method`,
    `response_code` and `response_headers`, appended to `self.fetches`.
    '''
    logger = logging.getLogger(__module__ + "." + __qualname__)

    def __init__(self):
        self.reset()

    def reset(self):
        '''Forgets all fetches recorded so far.'''
        self.fetches = []

    def _http_response(self, request, response):
        '''Records the request/response pair, returns `response` as is.'''
        self.fetches.append(dict(
            url=request.full_url,
            method=request.get_method(),
            response_code=response.code,
            response_headers=response.headers))
        return response

    http_response = https_response = _http_response
def final_bounces(fetches, url):
    """
    Follows the redirect chain that starts at `url` through `fetches` and
    returns every fetch whose url is the end of that chain.

    More than one fetch can be returned, for example if youtube-dl hit the
    final url with HEAD and then with GET. If `url` was never redirected,
    the fetches of `url` itself are returned.
    """
    # XXX check http status 301,302,303,307? check for "uri" header
    # as well as "location"? see urllib.request.HTTPRedirectHandler
    redirects = {
        fetch['url']: fetch for fetch in fetches
        if 'location' in fetch['response_headers']}

    destination = url
    while destination in redirects:
        # popping each hop as it is followed guarantees termination even
        # if the redirects form a loop
        hop = redirects.pop(destination)
        destination = urllib.parse.urljoin(
                hop['url'], hop['response_headers']['location'])

    return [fetch for fetch in fetches if fetch['url'] == destination]
# Guards the temporary monkey-patch of
# `youtube_dl.downloader.fragment.FragmentFD._finish_frag_download` made in
# `_YoutubeDL.process_info()`. The patched attribute lives on a class shared
# by every `YoutubeDL` instance in the process, so installing and restoring
# it must not interleave between threads. Reentrant so that a nested
# `process_info()` call on the same thread cannot deadlock.
_finish_frag_download_lock = threading.RLock()


def _build_youtube_dl(worker, destdir, site):
    '''
    Builds a `youtube_dl.YoutubeDL` for brozzling `site` with `worker`.

    The `YoutubeDL` instance does a few special brozzler-specific things:

    - keeps track of urls fetched using a `YoutubeDLSpy`
    - periodically updates `site.last_claimed` in rethinkdb
    - if brozzling through warcprox and downloading fragmented (DASH) videos,
      pushes the stitched together video to warcprox using a
      WARCPROX_WRITE_RECORD request

    Args:
        worker (brozzler.BrozzlerWorker): the calling brozzler worker
        destdir (str): where to save downloaded videos
        site (brozzler.Site): the site we are brozzling

    Returns:
        a `youtube_dl.YoutubeDL` instance, with two extra attributes:
        `fetch_spy` (a `YoutubeDLSpy`) and `stitch_ups` (list of dicts
        describing stitched-up videos pushed to warcprox)
    '''

    class _YoutubeDL(youtube_dl.YoutubeDL):
        logger = logging.getLogger(__module__ + "." + __qualname__)

        def add_default_extra_info(self, ie_result, ie, url):
            # hook in some logging
            super().add_default_extra_info(ie_result, ie, url)
            if ie_result.get('_type') == 'playlist':
                self.logger.info(
                        'extractor %r found playlist in %s', ie.IE_NAME, url)
            else:
                self.logger.info(
                        'extractor %r found a video in %s', ie.IE_NAME, url)

        def _push_stitched_up_vid_to_warcprox(self, site, info_dict, ctx):
            '''
            Sends the stitched-up video file `ctx['filename']` to warcprox
            as a "resource" record and, if warcprox answered, remembers it
            in `self.stitch_ups`.
            '''
            # XXX Don't know how to get the right content-type. Youtube-dl
            # doesn't supply it. Sometimes (with --hls-prefer-native)
            # youtube-dl produces a stitched-up video that /usr/bin/file fails
            # to identify (says "application/octet-stream"). `ffprobe` doesn't
            # give us a mimetype.
            if info_dict.get('ext') == 'mp4':
                mimetype = 'video/mp4'
            else:
                try:
                    import magic
                    mimetype = magic.from_file(ctx['filename'], mime=True)
                except ImportError as e:
                    mimetype = 'video/%s' % info_dict['ext']
                    self.logger.warning(
                            'guessing mimetype %s because %r', mimetype, e)

            url = 'youtube-dl:%05d:%s' % (
                    info_dict.get('playlist_index') or 1,
                    info_dict['webpage_url'])
            size = os.path.getsize(ctx['filename'])
            warcprox_address = worker._proxy_for(site)
            self.logger.info(
                    'pushing %r video stitched-up as %s (%s bytes) to '
                    'warcprox at %s with url %s', info_dict['format'],
                    mimetype, size, warcprox_address, url)
            with open(ctx['filename'], 'rb') as f:
                # include content-length header to avoid chunked
                # transfer, which warcprox currently rejects
                _request, response = worker._warcprox_write_record(
                        warcprox_address=warcprox_address, url=url,
                        warc_type='resource', content_type=mimetype,
                        payload=f, extra_headers={'content-length': size})

            if response is None:
                # _warcprox_write_record() returns (request, None) when
                # warcprox answers with an http error; it has already
                # logged the details. There is no response code to record.
                self.logger.warning(
                        'warcprox did not accept stitched-up video %s, '
                        'not remembering it', url)
                return

            # consulted by _remember_videos()
            self.stitch_ups.append({
                'url': url,
                'response_code': response.code,
                'content-type': mimetype,
                'content-length': size,
            })

        def process_info(self, info_dict):
            fragment_fd = youtube_dl.downloader.fragment.FragmentFD
            # NOTE: holding the lock for the whole call serializes
            # process_info() across threads; that is the price of patching
            # a process-wide class attribute.
            with _finish_frag_download_lock:
                _orig__finish_frag_download = fragment_fd._finish_frag_download

                def _finish_frag_download(ffd_self, ctx):
                    _orig__finish_frag_download(ffd_self, ctx)
                    if worker._using_warcprox(site):
                        self._push_stitched_up_vid_to_warcprox(
                                site, info_dict, ctx)

                fragment_fd._finish_frag_download = _finish_frag_download
                try:
                    return super().process_info(info_dict)
                finally:
                    # always undo the patch, otherwise wrappers from
                    # earlier calls stack up and fire again for later,
                    # unrelated downloads
                    fragment_fd._finish_frag_download = (
                            _orig__finish_frag_download)

    def maybe_heartbeat_site_last_claimed(*args, **kwargs):
        # in case youtube-dl takes a long time, heartbeat site.last_claimed
        # to prevent another brozzler-worker from claiming the site
        try:
            claim_ttl = datetime.timedelta(
                    minutes=worker.SITE_SESSION_MINUTES)
            if (site.rr
                    and doublethink.utcnow() - site.last_claimed > claim_ttl):
                worker.logger.debug(
                        'heartbeating site.last_claimed to prevent another '
                        'brozzler-worker claiming this site id=%r', site.id)
                site.last_claimed = doublethink.utcnow()
                site.save()
        except Exception:
            # best effort only: a failed heartbeat must not abort the
            # download
            worker.logger.debug(
                    'problem heartbeating site.last_claimed site id=%r',
                    site.id, exc_info=True)

    ydl_opts = {
        "outtmpl": "{}/ydl%(autonumber)s.out".format(destdir),
        "verbose": False,
        "retries": 1,
        "logger": logging.getLogger("youtube_dl"),
        "nocheckcertificate": True,
        "hls_prefer_native": True,
        "noprogress": True,
        "nopart": True,
        "no_color": True,
        "progress_hooks": [maybe_heartbeat_site_last_claimed],
        # https://github.com/rg3/youtube-dl/blob/master/README.md#format-selection
        # "best: Select the best quality format represented by a single
        # file with video and audio."
        "format": "best/bestvideo+bestaudio",
    }
    proxy = worker._proxy_for(site)
    if proxy:
        ydl_opts["proxy"] = "http://{}".format(proxy)

    ydl = _YoutubeDL(ydl_opts)
    extra_headers = site.extra_headers()
    if extra_headers:
        ydl._opener.add_handler(ExtraHeaderAdder(extra_headers))
    ydl.fetch_spy = YoutubeDLSpy()
    ydl.stitch_ups = []
    ydl._opener.add_handler(ydl.fetch_spy)
    return ydl
def _remember_videos(page, fetches, stitch_ups=None):
    '''
    Saves info about videos captured by youtube-dl in `page.videos`.

    Args:
        page: the page being brozzled; `page.videos` is created as an empty
            list if the page has no 'videos' entry yet
        fetches: fetch dicts as recorded by `YoutubeDLSpy` (may be None)
        stitch_ups: dicts describing stitched-up videos pushed to warcprox
            (may be None)
    '''
    if 'videos' not in page:
        page.videos = []

    for fetch in fetches or []:
        headers = fetch['response_headers']
        content_type = headers.get_content_type()
        if not content_type.startswith('video/'):
            continue
        # skip manifests of DASH segmented video -
        # see https://github.com/internetarchive/brozzler/pull/70
        if content_type == 'video/vnd.mpeg.dash.mpd':
            continue
        if fetch['method'] != 'GET':
            continue
        if fetch['response_code'] not in (200, 206):
            continue

        video = {
            'blame': 'youtube-dl',
            'url': fetch['url'],
            'response_code': fetch['response_code'],
            'content-type': content_type,
        }
        if 'content-length' in headers:
            video['content-length'] = int(headers['content-length'])
        if 'content-range' in headers:
            video['content-range'] = headers['content-range']
        logging.debug('embedded video %s', video)
        page.videos.append(video)

    for stitch_up in stitch_ups or []:
        if not stitch_up['content-type'].startswith('video/'):
            continue
        video = {
            'blame': 'youtube-dl',
            'url': stitch_up['url'],
            'response_code': stitch_up['response_code'],
            'content-type': stitch_up['content-type'],
            'content-length': stitch_up['content-length'],
        }
        logging.debug('embedded video %s', video)
        page.videos.append(video)
def _try_youtube_dl(worker, ydl, site, page):
    '''
    Runs `ydl` on `page.url`, records the videos it fetched in
    `page.videos` and, when brozzling through warcprox, writes youtube-dl's
    extraction result to warcprox as a "metadata" record.

    Args:
        worker (brozzler.BrozzlerWorker): the calling brozzler worker
        ydl: `YoutubeDL` instance built by `_build_youtube_dl()`
        site (brozzler.Site): the site we are brozzling
        page (brozzler.Page): the page we are brozzling

    Raises:
        brozzler.ShutdownRequested: passed through untouched
        brozzler.ReachedLimit: if youtube-dl's failure wraps an http 420
        brozzler.ProxyError: if youtube-dl's failure wraps a
            `urllib.error.URLError` while a proxy is in use
        BaseException: anything else is re-raised as is, except a wrapped
            `youtube_dl.utils.UnsupportedError`, which is swallowed
    '''
    try:
        logging.info("trying youtube-dl on %s", page)

        with brozzler.thread_accept_exceptions():
            # we do whatwg canonicalization here to avoid "<urlopen error
            # no host given>" resulting in ProxyError
            # needs automated test
            ie_result = ydl.extract_info(str(urlcanon.whatwg(page.url)))
        _remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
        if worker._using_warcprox(site):
            info_json = json.dumps(ie_result, sort_keys=True, indent=4)
            logging.info(
                    "sending WARCPROX_WRITE_RECORD request to warcprox "
                    "with youtube-dl json for %s", page)
            worker._warcprox_write_record(
                    warcprox_address=worker._proxy_for(site),
                    url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="metadata",
                    content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                    payload=info_json.encode("utf-8"),
                    extra_headers=site.extra_headers())
    except brozzler.ShutdownRequested:
        raise
    except BaseException as e:
        # The wrapped cause, if any, is carried in `e.exc_info`. The
        # attribute may be missing, and may exist but be None (TODO confirm
        # which youtube-dl errors do that); subscripting None here would
        # raise TypeError and hide the real exception, so normalize first.
        exc_info = getattr(e, "exc_info", None) or (None, None, None)
        cause_type, cause = exc_info[0], exc_info[1]

        if cause_type is youtube_dl.utils.UnsupportedError:
            # youtube-dl has no extractor for this url, nothing to do
            pass
        elif (cause_type is urllib.error.HTTPError
                and getattr(cause, "code", None) == 420):
            raise brozzler.ReachedLimit(cause)
        elif cause_type is urllib.error.URLError and worker._proxy_for(site):
            # connection problem when using a proxy == proxy error (XXX?)
            raise brozzler.ProxyError(
                    'youtube-dl hit apparent proxy error from '
                    '%s' % page.url) from e
        else:
            raise
def do_youtube_dl(worker, site, page):
    '''
    Downloads videos from `page` with a youtube-dl instance configured for
    `worker` and `site`. Downloads go to a temporary directory that is
    removed before returning.

    Args:
        worker (brozzler.BrozzlerWorker): the calling brozzler worker
        site (brozzler.Site): the site we are brozzling
        page (brozzler.Page): the page we are brozzling

    Returns:
        `list` of `dict`: with info about urls fetched:

            [{
                'url': ...,
                'method': ...,
                'response_code': ...,
                'response_headers': ...,
            }, ...]
    '''
    with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as download_dir:
        downloader = _build_youtube_dl(worker, download_dir, site)
        _try_youtube_dl(worker, downloader, site, page)
        fetches = downloader.fetch_spy.fetches
    return fetches

View File

@ -75,6 +75,7 @@ setuptools.setup(
'cerberus>=1.0.1',
'jinja2>=2.10',
'cryptography>=2.3',
'python-magic>=0.4.15',
],
extras_require={
'dashboard': [

View File

@ -0,0 +1,34 @@
<html>
<head>
<title>segmented (hls) video test</title>
</head>
<body>
<!--
hls segments and manifest generated like so:
ffmpeg -i ../site6/small.mp4 -c:v h264 -flags +cgop -g 30 -hls_time 1 small.m3u8
-->
<!--
hls doesn't work in chrome with plain video tag without js, but we
don't care because we're testing youtube-dl functionality
-->
<video id="video" controls muted>
<source src="small.m3u8" type="application/x-mpegURL">
</video>
<!-- to make this work in chrome you need this -->
<!--
<script src="hls.js"></script>
<script>
if(Hls.isSupported()) {
var video = document.getElementById('video');
var hls = new Hls();
hls.loadSource('small.m3u8');
hls.attachMedia(video);
hls.on(Hls.Events.MANIFEST_PARSED,function() {
video.play();
});
}
</script>
-->
</body>
</html>

View File

@ -0,0 +1,15 @@
#EXTM3U
#EXT-X-VERSION:3
#EXT-X-TARGETDURATION:1
#EXT-X-MEDIA-SEQUENCE:1
#EXTINF:1.000000,
small1.ts
#EXTINF:1.000000,
small2.ts
#EXTINF:1.000000,
small3.ts
#EXTINF:1.000000,
small4.ts
#EXTINF:0.533333,
small5.ts
#EXT-X-ENDLIST

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -3,7 +3,7 @@
test_cluster.py - integration tests for a brozzler cluster, expects brozzler,
warcprox, pywb, rethinkdb and other dependencies to be running already
Copyright (C) 2016-2017 Internet Archive
Copyright (C) 2016-2018 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -796,3 +796,31 @@ def test_time_limit(httpd):
job.refresh()
assert job.status == 'FINISHED'
def test_ydl_stitching(httpd):
    '''
    Brozzles the segmented (hls) video test site "site10" and checks that
    the video youtube-dl stitched together was recorded in `page.videos`.
    '''
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/site10/' % httpd.server_port})
    brozzler.new_site(frontier, site)

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED', (
        'site not finished after 300 seconds, status=%r' % site.status)

    # check page.videos
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    page = pages[0]
    assert len(page.videos) == 6
    assert {
        'blame': 'youtube-dl',
        'content-length': 267900,
        'content-type': 'video/mp4',
        'response_code': 204,
        'url': 'youtube-dl:00001:http://localhost:%s/site10/' % httpd.server_port,
    } in page.videos

View File

@ -23,6 +23,7 @@ import threading
import os
import brozzler
import brozzler.chrome
import brozzler.ydl
import logging
import yaml
import datetime
@ -227,9 +228,8 @@ def test_proxy_down():
# youtube-dl fetch
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
ydl = worker._youtube_dl(tempdir, site)
with pytest.raises(brozzler.ProxyError):
worker._try_youtube_dl(ydl, site, page)
brozzler.ydl.do_youtube_dl(worker, site, page)
# raw fetch
with pytest.raises(brozzler.ProxyError):
@ -404,18 +404,19 @@ def test_needs_browsing():
page = brozzler.Page(None, {
'url':'http://example.com/a'})
spy = brozzler.worker.YoutubeDLSpy()
spy.transactions.append({
spy = brozzler.ydl.YoutubeDLSpy()
spy.fetches.append({
'url': 'http://example.com/a',
'method': 'HEAD',
'status_code': 301,
'response_code': 301,
'response_headers': ConvenientHeaders({'Location': '/b'})})
spy.transactions.append({
spy.fetches.append({
'url': 'http://example.com/b',
'method': 'GET',
'status_code': 200,
'response_code': 200,
'response_headers': ConvenientHeaders({
'Content-Type': 'application/pdf'})})
assert not brozzler.worker.BrozzlerWorker._needs_browsing(None, page, spy)
assert not brozzler.worker.BrozzlerWorker._needs_browsing(
None, page, spy.fetches)