Merge pull request #115 from nlevitt/ydl-stitched

Ydl stitched
2025-08-12 16:25:34 -04:00 · 2018-09-06 16:15:52 -07:00 · 2018-09-06 16:15:52 -07:00 · 8368cd2bcb
commit 8368cd2bcb
parent a4eacb5b8f 8cdc3dee21
13 changed files with 441 additions and 216 deletions
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@ -23,92 +23,18 @@ import brozzler
 import brozzler.browser
 import threading
 import time
-import youtube_dl
 import urllib.request
 import json
 import PIL.Image
 import io
 import socket
-import collections
 import requests
 import doublethink
 import tempfile
 import urlcanon
 from requests.structures import CaseInsensitiveDict
 import rethinkdb as r
-import datetime
-import urllib.parse
-
-_orig_webpage_read_content = youtube_dl.extractor.generic.GenericIE._webpage_read_content
-def _webpage_read_content(self, *args, **kwargs):
-    content = _orig_webpage_read_content(self, *args, **kwargs)
-    if len(content) > 20000000:
-        logging.warn(
-                'bypassing youtube-dl extraction because content is '
-                'too large (%s characters)', len(content))
-        return ''
-    return content
-youtube_dl.extractor.generic.GenericIE._webpage_read_content = _webpage_read_content
-
-class ExtraHeaderAdder(urllib.request.BaseHandler):
-    def __init__(self, extra_headers):
-        self.extra_headers = extra_headers
-        self.http_request = self._http_request
-        self.https_request = self._http_request
-
-    def _http_request(self, req):
-        for h, v in self.extra_headers.items():
-            if h.capitalize() not in req.headers:
-                req.add_header(h, v)
-        return req
-
-class YoutubeDLSpy(urllib.request.BaseHandler):
-    logger = logging.getLogger(__module__ + "." + __qualname__)
-
-    def __init__(self):
-        self.reset()
-
-    def _http_response(self, request, response):
-        txn = {
-            'url': request.full_url,
-            'method': request.get_method(),
-            'status_code': response.code,
-            'response_headers': response.headers,
-        }
-        self.transactions.append(txn)
-        return response
-
-    http_response = https_response = _http_response
-
-    def reset(self):
-        self.transactions = []
-
-    def final_bounces(self, url):
-        """
-        Resolves redirect chains in self.transactions, returns a list of
-        Transaction representing the final redirect destinations of the given
-        url. There could be more than one if for example youtube-dl hit the
-        same url with HEAD and then GET requests.
-        """
-        redirects = {}
-        for txn in self.transactions:
-             # XXX check http status 301,302,303,307? check for "uri" header
-             # as well as "location"? see urllib.request.HTTPRedirectHandler
-             if 'location' in txn['response_headers']:
-                 redirects[txn['url']] = txn
-
-        final_url = url
-        while final_url in redirects:
-            txn = redirects.pop(final_url)
-            final_url = urllib.parse.urljoin(
-                    txn['url'], txn['response_headers']['location'])
-
-        final_bounces = []
-        for txn in self.transactions:
-            if txn['url'] == final_url:
-                final_bounces.append(txn)
-
-        return final_bounces
+from . import ydl

 class BrozzlerWorker:
    logger = logging.getLogger(__module__ + "." + __qualname__)
@ -199,54 +125,11 @@ class BrozzlerWorker:
                        'IS' if self._proxy_is_warcprox else 'IS NOT')
            return self._proxy_is_warcprox
        else:
+            # I should have commented when I originally wrote this code, but I
+            # think this works because `site.proxy` is only set when the proxy
+            # is warcprox
            return bool(site.proxy or self._warcprox_auto)

-
-    def _youtube_dl(self, destdir, site):
-        def ydl_progress(*args, **kwargs):
-            # in case youtube-dl takes a long time, heartbeat site.last_claimed
-            # to prevent another brozzler-worker from claiming the site
-            try:
-                if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=self.SITE_SESSION_MINUTES):
-                    self.logger.debug(
-                            'heartbeating site.last_claimed to prevent another '
-                            'brozzler-worker claiming this site id=%r', site.id)
-                    site.last_claimed = doublethink.utcnow()
-                    site.save()
-            except:
-                self.logger.debug(
-                        'problem heartbeating site.last_claimed site id=%r',
-                        site.id, exc_info=True)
-
-        ydl_opts = {
-            "outtmpl": "{}/ydl%(autonumber)s.out".format(destdir),
-            "verbose": False,
-            "retries": 1,
-            "logger": logging.getLogger("youtube_dl"),
-            "nocheckcertificate": True,
-            "hls_prefer_native": True,
-            "noprogress": True,
-            "nopart": True,
-            "no_color": True,
-            "progress_hooks": [ydl_progress],
-             # https://github.com/rg3/youtube-dl/blob/master/README.md#format-selection
-             # "best: Select the best quality format represented by a single
-             # file with video and audio."
-            "format": "best/bestvideo+bestaudio",
-        }
-        if self._proxy_for(site):
-            ydl_opts["proxy"] = "http://{}".format(self._proxy_for(site))
-            ## XXX (sometimes?) causes chrome debug websocket to go through
-            ## proxy. Maybe not needed thanks to hls_prefer_native.
-            ## # see https://github.com/rg3/youtube-dl/issues/6087
-            ## os.environ["http_proxy"] = "http://{}".format(self._proxy_for(site))
-        ydl = youtube_dl.YoutubeDL(ydl_opts)
-        if site.extra_headers():
-            ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers()))
-        ydl.brozzler_spy = YoutubeDLSpy()
-        ydl._opener.add_handler(ydl.brozzler_spy)
-        return ydl
-
    def _warcprox_write_record(
            self, warcprox_address, url, warc_type, content_type,
            payload, extra_headers=None):
@ -268,11 +151,13 @@ class BrozzlerWorker:
                            'got "%s %s" response on warcprox '
                            'WARCPROX_WRITE_RECORD request (expected 204)',
                            response.getcode(), response.reason)
+                return request, response
        except urllib.error.HTTPError as e:
            self.logger.warn(
                    'got "%s %s" response on warcprox '
                    'WARCPROX_WRITE_RECORD request (expected 204)',
                    e.getcode(), e.info())
+            return request, None
        except urllib.error.URLError as e:
            raise brozzler.ProxyError(
                    'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
@ -280,75 +165,6 @@ class BrozzlerWorker:
            raise brozzler.ProxyError(
                    'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e

-    def _remember_videos(self, page, ydl_spy):
-        if not 'videos' in page:
-            page.videos = []
-        for txn in ydl_spy.transactions:
-            content_type = txn['response_headers'].get_content_type()
-            if (content_type.startswith('video/')
-                    # skip manifests of DASH segmented video -
-                    # see https://github.com/internetarchive/brozzler/pull/70
-                    and content_type != 'video/vnd.mpeg.dash.mpd'
-                    and txn['method'] == 'GET'
-                    and txn['status_code'] in (200, 206)):
-                video = {
-                    'blame': 'youtube-dl',
-                    'url': txn['url'],
-                    'response_code': txn['status_code'],
-                    'content-type': content_type,
-                }
-                if 'content-length' in txn['response_headers']:
-                    video['content-length'] = int(
-                            txn['response_headers']['content-length'])
-                if 'content-range' in txn['response_headers']:
-                    video['content-range'] = txn[
-                            'response_headers']['content-range']
-                logging.debug('embedded video %s', video)
-                page.videos.append(video)
-
-    def _try_youtube_dl(self, ydl, site, page):
-        try:
-            self.logger.info("trying youtube-dl on {}".format(page))
-
-            with brozzler.thread_accept_exceptions():
-                # we do whatwg canonicalization here to avoid "<urlopen error
-                # no host given>" resulting in ProxyError
-                # needs automated test
-                info = ydl.extract_info(str(urlcanon.whatwg(page.url)))
-            self._remember_videos(page, ydl.brozzler_spy)
-            # logging.info('XXX %s', json.dumps(info))
-            if self._using_warcprox(site):
-                info_json = json.dumps(info, sort_keys=True, indent=4)
-                self.logger.info(
-                        "sending WARCPROX_WRITE_RECORD request to warcprox "
-                        "with youtube-dl json for %s", page)
-                self._warcprox_write_record(
-                        warcprox_address=self._proxy_for(site),
-                        url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
-                        warc_type="metadata",
-                        content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
-                        payload=info_json.encode("utf-8"),
-                        extra_headers=site.extra_headers())
-        except brozzler.ShutdownRequested as e:
-            raise
-        except BaseException as e:
-            if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
-                pass
-            elif (hasattr(e, "exc_info")
-                    and e.exc_info[0] == urllib.error.HTTPError
-                    and hasattr(e.exc_info[1], "code")
-                    and e.exc_info[1].code == 420):
-                raise brozzler.ReachedLimit(e.exc_info[1])
-            elif (hasattr(e, 'exc_info')
-                    and e.exc_info[0] == urllib.error.URLError
-                    and self._proxy_for(site)):
-                # connection problem when using a proxy == proxy error (XXX?)
-                raise brozzler.ProxyError(
-                        'youtube-dl hit apparent proxy error from '
-                        '%s' % page.url) from e
-            else:
-                raise
-
    def full_and_thumb_jpegs(self, large_png):
        # these screenshots never have any alpha (right?)
        img = PIL.Image.open(io.BytesIO(large_png)).convert('RGB')
@ -369,12 +185,10 @@ class BrozzlerWorker:
    def brozzle_page(self, browser, site, page, on_screenshot=None,
                     on_request=None, enable_youtube_dl=True):
        self.logger.info("brozzling {}".format(page))
+        ydl_fetches = None
        if enable_youtube_dl:
            try:
-                with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
-                    ydl = self._youtube_dl(tempdir, site)
-                    ydl_spy = ydl.brozzler_spy # remember for later
-                    self._try_youtube_dl(ydl, site, page)
+                ydl_fetches = ydl.do_youtube_dl(self, site, page)
            except brozzler.ReachedLimit as e:
                raise
            except brozzler.ShutdownRequested:
@ -392,16 +206,14 @@ class BrozzlerWorker:
                    self.logger.error(
                            'youtube_dl raised exception on %s', page,
                            exc_info=True)
-        else:
-            ydl_spy = False

-        if self._needs_browsing(page, ydl_spy):
+        if self._needs_browsing(page, ydl_fetches):
            self.logger.info('needs browsing: %s', page)
            outlinks = self._browse_page(browser, site, page, on_screenshot,
                                         on_request)
            return outlinks
        else:
-            if not self._already_fetched(page, ydl_spy):
+            if not self._already_fetched(page, ydl_fetches):
                self.logger.info('needs fetch: %s', page)
                self._fetch_url(site, page)
            else:
@ -495,9 +307,9 @@ class BrozzlerWorker:
            raise brozzler.ProxyError(
                    'proxy error fetching %s' % page.url) from e

-    def _needs_browsing(self, page, brozzler_spy):
-        if brozzler_spy:
-            final_bounces = brozzler_spy.final_bounces(page.url)
+    def _needs_browsing(self, page, ydl_fetches):
+        if ydl_fetches:
+            final_bounces = ydl.final_bounces(ydl_fetches, page.url)
            if not final_bounces:
                return True
            for txn in final_bounces:
@ -508,10 +320,10 @@ class BrozzlerWorker:
        else:
            return True

-    def _already_fetched(self, page, brozzler_spy):
-        if brozzler_spy:
-            for txn in brozzler_spy.final_bounces(page.url):
-                if (txn['method'] == 'GET' and txn['status_code'] == 200):
+    def _already_fetched(self, page, ydl_fetches):
+        if ydl_fetches:
+            for fetch in ydl.final_bounces(ydl_fetches, page.url):
+                if (fetch['method'] == 'GET' and fetch['response_code'] == 200):
                    return True
        return False

@ -527,7 +339,7 @@ class BrozzlerWorker:
            # _proxy_for() call in log statement can raise brozzler.ProxyError
            # which is why we honor time limit and stop request first☝🏻
            self.logger.info(
-                    "brozzling site (proxy=%r) %r",
+                    "brozzling site (proxy=%r) %s",
                    self._proxy_for(site), site)
            while time.time() - start < self.SITE_SESSION_MINUTES * 60:
                site.refresh()
@ -576,7 +388,7 @@ class BrozzlerWorker:
                # using brozzler-worker --proxy, nothing to do but try the
                # same proxy again next time
                logging.error(
-                        'proxy error (site.proxy=%r): %r', site.proxy, e)
+                        'proxy error (self._proxy=%r)', self._proxy, exc_info=1)
        except:
            self.logger.critical("unexpected exception", exc_info=True)
        finally:
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@ -0,0 +1,334 @@
+'''
+brozzler/ydl.py - youtube-dl support for brozzler
+
+This code was extracted from worker.py and extended to push stitched-up
+fragmented videos to warcprox.
+
+Copyright (C) 2018 Internet Archive
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+import datetime
+import json
+import logging
+import os
+import tempfile
+import urllib.error
+import urllib.parse
+import urllib.request
+
+import doublethink
+import urlcanon
+import youtube_dl
+
+import brozzler
+
+_orig_webpage_read_content = youtube_dl.extractor.generic.GenericIE._webpage_read_content
+def _webpage_read_content(self, *args, **kwargs):
+    '''
+    Replacement for `GenericIE._webpage_read_content` that refuses to hand
+    very large pages to youtube-dl's generic extractor.
+
+    Delegates to the original method, then returns an empty string instead of
+    the content if the content is longer than 20,000,000 characters.
+    '''
+    content = _orig_webpage_read_content(self, *args, **kwargs)
+    if len(content) > 20000000:
+        # logging.warn() is a deprecated alias of logging.warning()
+        logging.warning(
+                'bypassing youtube-dl extraction because content is '
+                'too large (%s characters)', len(content))
+        return ''
+    return content
+youtube_dl.extractor.generic.GenericIE._webpage_read_content = _webpage_read_content
+
+class ExtraHeaderAdder(urllib.request.BaseHandler):
+    '''
+    urllib handler that adds `extra_headers` to http and https requests,
+    leaving alone any header the request already carries.
+    '''
+    def __init__(self, extra_headers):
+        self.extra_headers = extra_headers
+        # http and https requests get exactly the same treatment
+        self.http_request = self._http_request
+        self.https_request = self._http_request
+
+    def _http_request(self, req):
+        '''
+        Adds each extra header to `req` unless it is already set, and returns
+        `req`.
+        '''
+        for name, value in self.extra_headers.items():
+            # `req.headers` keys are stored capitalized by add_header()
+            if name.capitalize() in req.headers:
+                continue
+            req.add_header(name, value)
+        return req
+
+class YoutubeDLSpy(urllib.request.BaseHandler):
+    '''
+    urllib handler that keeps a record, in `self.fetches`, of every http and
+    https response passed through it.
+    '''
+    logger = logging.getLogger(__module__ + "." + __qualname__)
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        '''Discards everything recorded so far.'''
+        self.fetches = []
+
+    def _http_response(self, request, response):
+        '''
+        Appends a summary of the request/response pair to `self.fetches` and
+        returns `response` untouched.
+        '''
+        self.fetches.append({
+            'url': request.full_url,
+            'method': request.get_method(),
+            'response_code': response.code,
+            'response_headers': response.headers,
+        })
+        return response
+
+    http_response = https_response = _http_response
+
+def final_bounces(fetches, url):
+    """
+    Resolves redirect chains in `fetches` and returns a list of fetches
+    representing the final redirect destinations of the given url. There could
+    be more than one if for example youtube-dl hit the same url with HEAD and
+    then GET requests.
+
+    Args:
+        fetches (list of dict): fetch records, each with at least 'url' and
+            'response_headers' keys
+        url (str): the url at the start of the redirect chain
+
+    Returns:
+        list of dict: the entries of `fetches` whose 'url' is the end of the
+        redirect chain; empty if that url was never fetched
+    """
+    import urllib.parse
+
+    # XXX check http status 301,302,303,307? check for "uri" header
+    # as well as "location"? see urllib.request.HTTPRedirectHandler
+    redirects = {
+        fetch['url']: fetch for fetch in fetches
+        if 'location' in fetch['response_headers']}
+
+    final_url = url
+    while final_url in redirects:
+        # pop() means each redirect is followed at most once, so a redirect
+        # loop cannot keep us here forever
+        fetch = redirects.pop(final_url)
+        final_url = urllib.parse.urljoin(
+                fetch['url'], fetch['response_headers']['location'])
+
+    return [fetch for fetch in fetches if fetch['url'] == final_url]
+
+def _build_youtube_dl(worker, destdir, site):
+    '''
+    Builds a `youtube_dl.YoutubeDL` for brozzling `site` with `worker`.
+
+    The `YoutubeDL` instance does a few special brozzler-specific things:
+
+    - keeps track of urls fetched using a `YoutubeDLSpy`
+    - periodically updates `site.last_claimed` in rethinkdb
+    - if brozzling through warcprox and downloading fragmented (DASH) videos,
+      pushes the stitched together video to warcprox using a
+      WARCPROX_WRITE_RECORD request
+
+    Args:
+        worker (brozzler.BrozzlerWorker): the calling brozzler worker
+        destdir (str): where to save downloaded videos
+        site (brozzler.Site): the site we are brozzling
+
+    Returns:
+        a `youtube_dl.YoutubeDL` instance
+    '''
+
+    class _YoutubeDL(youtube_dl.YoutubeDL):
+        logger = logging.getLogger(__module__ + "." + __qualname__)
+
+        def add_default_extra_info(self, ie_result, ie, url):
+            # hook in some logging
+            super().add_default_extra_info(ie_result, ie, url)
+            if ie_result.get('_type') == 'playlist':
+                self.logger.info(
+                        'extractor %r found playlist in %s', ie.IE_NAME, url)
+            else:
+                self.logger.info(
+                        'extractor %r found a video in %s', ie.IE_NAME, url)
+
+        def _push_stitched_up_vid_to_warcprox(self, site, info_dict, ctx):
+            '''
+            Sends the stitched-up video file `ctx['filename']` to warcprox as
+            a warc "resource" record and, if warcprox returned a response,
+            remembers it in `self.stitch_ups`.
+            '''
+            # XXX Don't know how to get the right content-type. Youtube-dl
+            # doesn't supply it. Sometimes (with --hls-prefer-native)
+            # youtube-dl produces a stitched-up video that /usr/bin/file fails
+            # to identify (says "application/octet-stream"). `ffprobe` doesn't
+            # give us a mimetype.
+            if info_dict.get('ext') == 'mp4':
+                mimetype = 'video/mp4'
+            else:
+                try:
+                    import magic
+                    mimetype = magic.from_file(ctx['filename'], mime=True)
+                except ImportError as e:
+                    mimetype = 'video/%s' % info_dict['ext']
+                    self.logger.warning(
+                            'guessing mimetype %s because %r', mimetype, e)
+
+            url = 'youtube-dl:%05d:%s' % (
+                    info_dict.get('playlist_index') or 1,
+                    info_dict['webpage_url'])
+            size = os.path.getsize(ctx['filename'])
+            self.logger.info(
+                    'pushing %r video stitched-up as %s (%s bytes) to '
+                    'warcprox at %s with url %s', info_dict['format'],
+                    mimetype, size, worker._proxy_for(site), url)
+            with open(ctx['filename'], 'rb') as f:
+                # include content-length header to avoid chunked
+                # transfer, which warcprox currently rejects
+                _request, response = worker._warcprox_write_record(
+                        warcprox_address=worker._proxy_for(site), url=url,
+                        warc_type='resource', content_type=mimetype, payload=f,
+                        extra_headers={'content-length': size})
+
+            if response is None:
+                # _warcprox_write_record() returns (request, None) when
+                # warcprox answers with an http error; there is no response
+                # code to report, so don't claim the video was captured
+                self.logger.warning(
+                        'no response from warcprox for stitched-up video '
+                        '%s, not remembering it', url)
+                return
+
+            # consulted by _remember_videos()
+            self.stitch_ups.append({
+                'url': url,
+                'response_code': response.code,
+                'content-type': mimetype,
+                'content-length': size,
+            })
+
+        def process_info(self, info_dict):
+            '''
+            Wraps `FragmentFD._finish_frag_download` for the duration of this
+            call so that a finished fragmented download is pushed to warcprox,
+            then delegates to `YoutubeDL.process_info`.
+            '''
+            fragment_fd = youtube_dl.downloader.fragment.FragmentFD
+            _orig__finish_frag_download = fragment_fd._finish_frag_download
+
+            def _finish_frag_download(ffd_self, ctx):
+                _orig__finish_frag_download(ffd_self, ctx)
+                if worker._using_warcprox(site):
+                    self._push_stitched_up_vid_to_warcprox(site, info_dict, ctx)
+
+            fragment_fd._finish_frag_download = _finish_frag_download
+            try:
+                return super().process_info(info_dict)
+            finally:
+                # undo the monkeypatch, otherwise every call stacks another
+                # wrapper (holding on to its own info_dict, site and worker)
+                # on top of the previous ones
+                # NOTE(review): the patch is class-wide, so concurrent
+                # process_info() calls in other threads can still interfere
+                # with each other - not addressed here
+                fragment_fd._finish_frag_download = _orig__finish_frag_download
+
+    def maybe_heartbeat_site_last_claimed(*args, **kwargs):
+        # in case youtube-dl takes a long time, heartbeat site.last_claimed
+        # to prevent another brozzler-worker from claiming the site
+        try:
+            if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=worker.SITE_SESSION_MINUTES):
+                worker.logger.debug(
+                        'heartbeating site.last_claimed to prevent another '
+                        'brozzler-worker claiming this site id=%r', site.id)
+                site.last_claimed = doublethink.utcnow()
+                site.save()
+        except:
+            # best effort: a failed heartbeat must not abort the download
+            worker.logger.debug(
+                    'problem heartbeating site.last_claimed site id=%r',
+                    site.id, exc_info=True)
+
+    ydl_opts = {
+        "outtmpl": "{}/ydl%(autonumber)s.out".format(destdir),
+        "verbose": False,
+        "retries": 1,
+        "logger": logging.getLogger("youtube_dl"),
+        "nocheckcertificate": True,
+        "hls_prefer_native": True,
+        "noprogress": True,
+        "nopart": True,
+        "no_color": True,
+        "progress_hooks": [maybe_heartbeat_site_last_claimed],
+        # https://github.com/rg3/youtube-dl/blob/master/README.md#format-selection
+        # "best: Select the best quality format represented by a single
+        # file with video and audio."
+        "format": "best/bestvideo+bestaudio",
+    }
+    if worker._proxy_for(site):
+        ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site))
+    ydl = _YoutubeDL(ydl_opts)
+    if site.extra_headers():
+        ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers()))
+    ydl.fetch_spy = YoutubeDLSpy()
+    ydl.stitch_ups = []
+    ydl._opener.add_handler(ydl.fetch_spy)
+    return ydl
+
+def _remember_videos(page, fetches, stitch_ups=None):
+    '''
+    Records the videos youtube-dl captured in `page.videos`.
+
+    Args:
+        page: the page being brozzled; `page.videos` is created if missing
+        fetches: fetch records from `YoutubeDLSpy`, may be None or empty
+        stitch_ups: records of stitched-up videos pushed to warcprox, may be
+            None or empty
+    '''
+    if 'videos' not in page:
+        page.videos = []
+
+    for fetch in fetches or []:
+        headers = fetch['response_headers']
+        content_type = headers.get_content_type()
+        if not content_type.startswith('video/'):
+            continue
+        # skip manifests of DASH segmented video -
+        # see https://github.com/internetarchive/brozzler/pull/70
+        if content_type == 'video/vnd.mpeg.dash.mpd':
+            continue
+        if fetch['method'] != 'GET':
+            continue
+        if fetch['response_code'] not in (200, 206):
+            continue
+
+        video = {
+            'blame': 'youtube-dl',
+            'url': fetch['url'],
+            'response_code': fetch['response_code'],
+            'content-type': content_type,
+        }
+        if 'content-length' in headers:
+            video['content-length'] = int(headers['content-length'])
+        if 'content-range' in headers:
+            video['content-range'] = headers['content-range']
+        logging.debug('embedded video %s', video)
+        page.videos.append(video)
+
+    for stitch_up in stitch_ups or []:
+        if not stitch_up['content-type'].startswith('video/'):
+            continue
+        video = {
+            'blame': 'youtube-dl',
+            'url': stitch_up['url'],
+            'response_code': stitch_up['response_code'],
+            'content-type': stitch_up['content-type'],
+            'content-length': stitch_up['content-length'],
+        }
+        logging.debug('embedded video %s', video)
+        page.videos.append(video)
+
+def _try_youtube_dl(worker, ydl, site, page):
+    '''
+    Runs `ydl` on `page.url`, remembers captured videos in `page.videos` and,
+    when brozzling through warcprox, writes youtube-dl's json output to
+    warcprox as a warc "metadata" record.
+
+    Raises:
+        brozzler.ShutdownRequested: passed through untouched
+        brozzler.ReachedLimit: if youtube-dl hit an http 420 response
+        brozzler.ProxyError: if youtube-dl hit a `urllib.error.URLError`
+            while a proxy is in use
+
+    youtube-dl's `UnsupportedError` is swallowed; any other exception is
+    re-raised as is.
+    '''
+    try:
+        logging.info("trying youtube-dl on %s", page)
+
+        with brozzler.thread_accept_exceptions():
+            # we do whatwg canonicalization here to avoid "<urlopen error
+            # no host given>" resulting in ProxyError
+            # needs automated test
+            ie_result = ydl.extract_info(str(urlcanon.whatwg(page.url)))
+        _remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
+        if not worker._using_warcprox(site):
+            return
+
+        info_json = json.dumps(ie_result, sort_keys=True, indent=4)
+        logging.info(
+                "sending WARCPROX_WRITE_RECORD request to warcprox "
+                "with youtube-dl json for %s", page)
+        worker._warcprox_write_record(
+                warcprox_address=worker._proxy_for(site),
+                url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
+                warc_type="metadata",
+                content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
+                payload=info_json.encode("utf-8"),
+                extra_headers=site.extra_headers())
+    except brozzler.ShutdownRequested:
+        raise
+    except BaseException as e:
+        # only exceptions carrying `exc_info` (the underlying error) get
+        # special treatment
+        if not hasattr(e, "exc_info"):
+            raise
+        if e.exc_info[0] == youtube_dl.utils.UnsupportedError:
+            return
+        if (e.exc_info[0] == urllib.error.HTTPError
+                and hasattr(e.exc_info[1], "code")
+                and e.exc_info[1].code == 420):
+            raise brozzler.ReachedLimit(e.exc_info[1])
+        if (e.exc_info[0] == urllib.error.URLError
+                and worker._proxy_for(site)):
+            # connection problem when using a proxy == proxy error (XXX?)
+            raise brozzler.ProxyError(
+                    'youtube-dl hit apparent proxy error from '
+                    '%s' % page.url) from e
+        raise
+
+def do_youtube_dl(worker, site, page):
+    '''
+    Downloads videos from `page` with a youtube-dl instance set up for
+    `worker` and `site`, using a temporary directory that is removed
+    afterwards.
+
+    Args:
+        worker (brozzler.BrozzlerWorker): the calling brozzler worker
+        site (brozzler.Site): the site we are brozzling
+        page (brozzler.Page): the page we are brozzling
+
+    Returns:
+        `list` of `dict`: with info about urls fetched:
+
+            [{
+                'url': ...,
+                'method': ...,
+                'response_code': ...,
+                'response_headers': ...,
+            }, ...]
+    '''
+    with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as destdir:
+        downloader = _build_youtube_dl(worker, destdir, site)
+        _try_youtube_dl(worker, downloader, site, page)
+        fetches = downloader.fetch_spy.fetches
+    return fetches
--- a/setup.py
+++ b/setup.py
@ -75,6 +75,7 @@ setuptools.setup(
            'cerberus>=1.0.1',
            'jinja2>=2.10',
            'cryptography>=2.3',
+            'python-magic>=0.4.15',
        ],
        extras_require={
            'dashboard': [
--- a/tests/htdocs/site10/index.html
+++ b/tests/htdocs/site10/index.html
@ -0,0 +1,34 @@
+<html>
+    <head>
+        <title>segmented (hls) video test</title>
+    </head>
+    <body>
+        <!--
+            hls segments and manifest generated like so:
+            ffmpeg -i ../site6/small.mp4 -c:v h264 -flags +cgop -g 30 -hls_time 1 small.m3u8
+        -->
+        <!--
+            hls doesn't work in chrome with plain video tag without js, but we
+            don't care because we're testing youtube-dl functionality
+        -->
+        <video id="video" controls muted>
+            <source src="small.m3u8" type="application/x-mpegURL">
+        </video>
+
+        <!-- to make this work in chrome you need this -->
+        <!--
+        <script src="hls.js"></script>
+        <script>
+            if(Hls.isSupported()) {
+                var video = document.getElementById('video');
+                var hls = new Hls();
+                hls.loadSource('small.m3u8');
+                hls.attachMedia(video);
+                hls.on(Hls.Events.MANIFEST_PARSED,function() {
+                    video.play();
+                });
+            }
+        </script>
+        -->
+    </body>
+</html>
--- a/tests/htdocs/site10/small.m3u8
+++ b/tests/htdocs/site10/small.m3u8
@ -0,0 +1,15 @@
+#EXTM3U
+#EXT-X-VERSION:3
+#EXT-X-TARGETDURATION:1
+#EXT-X-MEDIA-SEQUENCE:1
+#EXTINF:1.000000,
+small1.ts
+#EXTINF:1.000000,
+small2.ts
+#EXTINF:1.000000,
+small3.ts
+#EXTINF:1.000000,
+small4.ts
+#EXTINF:0.533333,
+small5.ts
+#EXT-X-ENDLIST
--- a/tests/htdocs/site10/small0.ts
+++ b/tests/htdocs/site10/small0.ts
--- a/tests/htdocs/site10/small1.ts
+++ b/tests/htdocs/site10/small1.ts
--- a/tests/htdocs/site10/small2.ts
+++ b/tests/htdocs/site10/small2.ts
--- a/tests/htdocs/site10/small3.ts
+++ b/tests/htdocs/site10/small3.ts
--- a/tests/htdocs/site10/small4.ts
+++ b/tests/htdocs/site10/small4.ts
--- a/tests/htdocs/site10/small5.ts
+++ b/tests/htdocs/site10/small5.ts
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@ -3,7 +3,7 @@
 test_cluster.py - integration tests for a brozzler cluster, expects brozzler,
 warcprox, pywb, rethinkdb and other dependencies to be running already

-Copyright (C) 2016-2017 Internet Archive
+Copyright (C) 2016-2018 Internet Archive

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@ -796,3 +796,31 @@ def test_time_limit(httpd):
        job.refresh()
    assert job.status == 'FINISHED'

+def test_ydl_stitching(httpd):
+    '''
+    Brozzles the segmented (hls) video test site and checks that the video
+    youtube-dl stitched together from the segments shows up in `page.videos`.
+    '''
+    rr = doublethink.Rethinker('localhost', db='brozzler')
+    frontier = brozzler.RethinkDbFrontier(rr)
+    site = brozzler.Site(rr, {
+        'seed': 'http://localhost:%s/site10/' % httpd.server_port})
+    brozzler.new_site(frontier, site)
+
+    # the site should be brozzled fairly quickly
+    start = time.time()
+    while site.status != 'FINISHED' and time.time() - start < 300:
+        time.sleep(0.5)
+        site.refresh()
+    assert site.status == 'FINISHED'
+
+    # check page.videos
+    pages = list(frontier.site_pages(site.id))
+    assert len(pages) == 1
+    page = pages[0]
+    # presumably the five segments listed in small.m3u8 plus the stitched-up
+    # video - TODO confirm
+    assert len(page.videos) == 6
+    assert {
+        'blame': 'youtube-dl',
+        'content-length': 267900,
+        'content-type': 'video/mp4',
+        'response_code': 204,
+        'url': 'youtube-dl:00001:http://localhost:%s/site10/' % httpd.server_port,
+    } in page.videos
+
--- a/tests/test_units.py
+++ b/tests/test_units.py
@ -23,6 +23,7 @@ import threading
 import os
 import brozzler
 import brozzler.chrome
+import brozzler.ydl
 import logging
 import yaml
 import datetime
@ -227,9 +228,8 @@ def test_proxy_down():

        # youtube-dl fetch
        with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
-            ydl = worker._youtube_dl(tempdir, site)
            with pytest.raises(brozzler.ProxyError):
-                worker._try_youtube_dl(ydl, site, page)
+                brozzler.ydl.do_youtube_dl(worker, site, page)

        # raw fetch
        with pytest.raises(brozzler.ProxyError):
@ -404,18 +404,19 @@ def test_needs_browsing():
    page = brozzler.Page(None, {
        'url':'http://example.com/a'})

-    spy = brozzler.worker.YoutubeDLSpy()
-    spy.transactions.append({
+    spy = brozzler.ydl.YoutubeDLSpy()
+    spy.fetches.append({
        'url': 'http://example.com/a',
        'method': 'HEAD',
-        'status_code': 301,
+        'response_code': 301,
        'response_headers': ConvenientHeaders({'Location': '/b'})})
-    spy.transactions.append({
+    spy.fetches.append({
        'url': 'http://example.com/b',
        'method': 'GET',
-        'status_code': 200,
+        'response_code': 200,
        'response_headers': ConvenientHeaders({
            'Content-Type': 'application/pdf'})})

-    assert not brozzler.worker.BrozzlerWorker._needs_browsing(None, page, spy)
+    assert not brozzler.worker.BrozzlerWorker._needs_browsing(
+            None, page, spy.fetches)