move youtube-dl code into separate file

2025-09-23 06:04:47 -04:00 · 2018-08-14 15:10:48 -07:00 · 2018-08-14 15:10:48 -07:00 · 33520da8f9
commit 33520da8f9
parent 39155ebcc5
2 changed files with 336 additions and 257 deletions
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@ -23,92 +23,18 @@ import brozzler
 import brozzler.browser
 import threading
 import time
 import youtube_dl
 import urllib.request
 import json
 import PIL.Image
 import io
 import socket
 import collections
 import requests
 import doublethink
 import tempfile
 import urlcanon
 from requests.structures import CaseInsensitiveDict
 import rethinkdb as r
-import datetime
+from . import ydl
 import urllib.parse
 # keep a handle on youtube-dl's own implementation so the wrapper can delegate
 _orig_webpage_read_content = youtube_dl.extractor.generic.GenericIE._webpage_read_content
 def _webpage_read_content(self, *args, **kwargs):
    '''
    Wrapper around youtube-dl's `GenericIE._webpage_read_content` that keeps
    oversized pages away from the extractor.
    Delegates to the original method with the same arguments, then returns
    an empty string instead of the content if the content is longer than
    20,000,000 characters.
    '''
    content = _orig_webpage_read_content(self, *args, **kwargs)
    if len(content) > 20000000:
        # logging.warn() is a deprecated alias of logging.warning()
        logging.warning(
                'bypassing youtube-dl extraction because content is '
                'too large (%s characters)', len(content))
        return ''
    return content
 # monkey-patch youtube-dl so the generic extractor uses the wrapper
 youtube_dl.extractor.generic.GenericIE._webpage_read_content = _webpage_read_content
 class ExtraHeaderAdder(urllib.request.BaseHandler):
    '''
    urllib handler that adds the configured extra headers to outgoing
    requests, leaving alone any header the request already has.
    '''
    def __init__(self, extra_headers):
        self.extra_headers = extra_headers
        # the same hook serves both http and https requests
        self.http_request = self.https_request = self._http_request
    def _http_request(self, req):
        for name, value in self.extra_headers.items():
            if name.capitalize() in req.headers:
                # the request already carries this header, keep its value
                continue
            req.add_header(name, value)
        return req
 class YoutubeDLSpy(urllib.request.BaseHandler):
    '''
    urllib handler that records every http(s) response passing through it
    in `self.transactions` and can resolve redirect chains among them.
    '''
    logger = logging.getLogger(__module__ + "." + __qualname__)
    def __init__(self):
        self.reset()
    def reset(self):
        '''Forgets all transactions recorded so far.'''
        self.transactions = []
    def _http_response(self, request, response):
        # note the transaction, then hand the response back untouched
        self.transactions.append({
            'url': request.full_url,
            'method': request.get_method(),
            'status_code': response.code,
            'response_headers': response.headers,
        })
        return response
    http_response = _http_response
    https_response = _http_response
    def final_bounces(self, url):
        """
        Follows the redirects recorded in self.transactions starting at
        `url` and returns the list of transactions whose url is the final
        destination. There may be several, for example when youtube-dl hit
        the same url with a HEAD and then a GET request.
        """
        # XXX check http status 301,302,303,307? check for "uri" header
        # as well as "location"? see urllib.request.HTTPRedirectHandler
        hops = {
            txn['url']: txn for txn in self.transactions
            if 'location' in txn['response_headers']}
        target = url
        while target in hops:
            # pop each hop so a redirect loop cannot spin forever
            hop = hops.pop(target)
            target = urllib.parse.urljoin(
                    hop['url'], hop['response_headers']['location'])
        return [txn for txn in self.transactions if txn['url'] == target]
 class BrozzlerWorker:
    logger = logging.getLogger(__module__ + "." + __qualname__)
@ -204,99 +130,6 @@ class BrozzlerWorker:
            # is warcprox
            return bool(site.proxy or self._warcprox_auto)
    def _youtube_dl(self, destdir, site):
        '''
        Builds a `youtube_dl.YoutubeDL` configured for brozzling `site`.
        The returned instance logs the extractor it uses, records the urls
        it fetches with a `YoutubeDLSpy` (available as `ydl.brozzler_spy`),
        heartbeats `site.last_claimed` from its progress hook and, when
        `self._using_warcprox(site)`, pushes stitched-up fragmented
        downloads to warcprox with `self._warcprox_write_record()`.
        Args:
            destdir (str): directory where youtube-dl saves its downloads
            site: the site being brozzled
        Returns:
            the configured `youtube_dl.YoutubeDL` instance
        '''
        class _YoutubeDL(youtube_dl.YoutubeDL):
            logger = logging.getLogger(__module__ + "." + __qualname__)
            def get_info_extractor(self, ie_key):
                ie = super().get_info_extractor(ie_key)
                self.logger.info('youtube-dl using extractor %s', ie)
                return ie
            def process_info(ydl_self, info_dict):
                # `ydl_self` is the _YoutubeDL instance; `self` below still
                # refers to the enclosing BrozzlerWorker
                # NOTE(review): each call wraps whatever is currently
                # installed on FragmentFD and never restores it, so wrappers
                # appear to accumulate across calls - confirm intended
                _orig__finish_frag_download = youtube_dl.downloader.fragment.FragmentFD._finish_frag_download
                def _finish_frag_download(ffd_self, ctx):
                    _orig__finish_frag_download(ffd_self, ctx)
                    if self._using_warcprox(site):
                        try:
                            import magic
                            mimetype = magic.from_file(
                                    ctx['filename'], mime=True)
                        except ImportError as e:
                            mimetype = 'video/%s' % info_dict['ext']
                            ydl_self.logger.warning(
                                    'guessing mimetype %s because %r',
                                    mimetype, e)
                        # 'playlist_index' may be missing or None; fall back
                        # to 1 so the %05d format does not raise
                        url = 'youtube-dl:%05d:%s' % (
                                info_dict.get('playlist_index') or 1,
                                info_dict['webpage_url'])
                        ydl_self.logger.info(
                                'pushing %r video stitched-up as %s (%s '
                                'bytes) to warcprox at %s with url %s',
                                info_dict['format'], mimetype,
                                ctx['complete_frags_downloaded_bytes'],
                                self._proxy_for(site), url)
                        with open(ctx['filename'], 'rb') as f:
                            # include content-length header to avoid chunked
                            # transfer, which warcprox currently does not
                            # accept
                            # XXX is `ctx['complete_frags_downloaded_bytes']`
                            # always == `os.path.getsize(ctx['filename'])`?
                            self._warcprox_write_record(
                                    warcprox_address=self._proxy_for(site),
                                    url=url, warc_type='resource',
                                    content_type=mimetype, payload=f,
                                    extra_headers={'content-length': ctx['complete_frags_downloaded_bytes']})
                youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _finish_frag_download
                return super().process_info(info_dict)
        def ydl_progress(*args, **kwargs):
            # in case youtube-dl takes a long time, heartbeat site.last_claimed
            # to prevent another brozzler-worker from claiming the site
            try:
                if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=self.SITE_SESSION_MINUTES):
                    self.logger.debug(
                            'heartbeating site.last_claimed to prevent another '
                            'brozzler-worker claiming this site id=%r', site.id)
                    site.last_claimed = doublethink.utcnow()
                    site.save()
            except:
                # best effort: a failed heartbeat must not abort the download
                self.logger.debug(
                        'problem heartbeating site.last_claimed site id=%r',
                        site.id, exc_info=True)
        ydl_opts = {
            "outtmpl": "{}/ydl%(autonumber)s.out".format(destdir),
            "verbose": False,
            "retries": 1,
            "logger": logging.getLogger("youtube_dl"),
            "nocheckcertificate": True,
            "hls_prefer_native": True,
            "noprogress": True,
            "nopart": True,
            "no_color": True,
            "progress_hooks": [ydl_progress],
            # https://github.com/rg3/youtube-dl/blob/master/README.md#format-selection
            # "best: Select the best quality format represented by a single
            # file with video and audio."
            "format": "best/bestvideo+bestaudio",
        }
        if self._proxy_for(site):
            ydl_opts["proxy"] = "http://{}".format(self._proxy_for(site))
        ydl = _YoutubeDL(ydl_opts)
        if site.extra_headers():
            ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers()))
        # the spy records every response youtube-dl receives
        ydl.brozzler_spy = YoutubeDLSpy()
        ydl._opener.add_handler(ydl.brozzler_spy)
        return ydl
    def _warcprox_write_record(
            self, warcprox_address, url, warc_type, content_type,
            payload, extra_headers=None):
@ -318,11 +151,13 @@ class BrozzlerWorker:
                            'got "%s %s" response on warcprox '
                            'WARCPROX_WRITE_RECORD request (expected 204)',
                            response.getcode(), response.reason)
                return request, response
        except urllib.error.HTTPError as e:
            self.logger.warn(
                    'got "%s %s" response on warcprox '
                    'WARCPROX_WRITE_RECORD request (expected 204)',
                    e.getcode(), e.info())
            return request, None
        except urllib.error.URLError as e:
            raise brozzler.ProxyError(
                    'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
@ -330,80 +165,6 @@ class BrozzlerWorker:
            raise brozzler.ProxyError(
                    'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
    def _remember_videos(self, page, ydl_spy):
        if not 'videos' in page:
            page.videos = []
        for txn in ydl_spy.transactions:
            content_type = txn['response_headers'].get_content_type()
            if (content_type.startswith('video/')
                    # skip manifests of DASH segmented video -
                    # see https://github.com/internetarchive/brozzler/pull/70
                    and content_type != 'video/vnd.mpeg.dash.mpd'
                    and txn['method'] == 'GET'
                    and txn['status_code'] in (200, 206)):
                video = {
                    'blame': 'youtube-dl',
                    'url': txn['url'],
                    'response_code': txn['status_code'],
                    'content-type': content_type,
                }
                if 'content-length' in txn['response_headers']:
                    video['content-length'] = int(
                            txn['response_headers']['content-length'])
                if 'content-range' in txn['response_headers']:
                    video['content-range'] = txn[
                            'response_headers']['content-range']
                logging.debug('embedded video %s', video)
                page.videos.append(video)
    def _try_youtube_dl(self, ydl, site, page):
        '''
        Runs `ydl` on `page.url`, remembers the captured videos in
        `page.videos` and, when `self._using_warcprox(site)`, writes
        youtube-dl's json result to warcprox as a metadata record.
        Exceptions that carry an `exc_info` attribute are translated:
        an underlying `youtube_dl.utils.UnsupportedError` is swallowed,
        an underlying http 420 error becomes `brozzler.ReachedLimit`, and
        an underlying `urllib.error.URLError` while a proxy is in use
        becomes `brozzler.ProxyError`. Everything else is re-raised.
        '''
        try:
            self.logger.info("trying youtube-dl on {}".format(page))
            with brozzler.thread_accept_exceptions():
                # we do whatwg canonicalization here to avoid "<urlopen error
                # no host given>" resulting in ProxyError
                # needs automated test
                ie_result = ydl.extract_info(str(urlcanon.whatwg(page.url)))
            self._remember_videos(page, ydl.brozzler_spy)
            if self._using_warcprox(site):
                # serialize the extraction result; this previously referenced
                # an undefined name `info` and raised NameError
                info_json = json.dumps(ie_result, sort_keys=True, indent=4)
                self.logger.info(
                        "sending WARCPROX_WRITE_RECORD request to warcprox "
                        "with youtube-dl json for %s", page)
                self._warcprox_write_record(
                        warcprox_address=self._proxy_for(site),
                        url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                        warc_type="metadata",
                        content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                        payload=info_json.encode("utf-8"),
                        extra_headers=site.extra_headers())
        except brozzler.ShutdownRequested as e:
            raise
        except BaseException as e:
            if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
                # youtube-dl has no extractor for this url, nothing to do
                pass
            elif (hasattr(e, "exc_info")
                    and e.exc_info[0] == urllib.error.HTTPError
                    and hasattr(e.exc_info[1], "code")
                    and e.exc_info[1].code == 420):
                raise brozzler.ReachedLimit(e.exc_info[1])
            elif (hasattr(e, 'exc_info')
                    and e.exc_info[0] == urllib.error.URLError
                    and self._proxy_for(site)):
                # connection problem when using a proxy == proxy error (XXX?)
                raise brozzler.ProxyError(
                        'youtube-dl hit apparent proxy error from '
                        '%s' % page.url) from e
            else:
                raise
    def full_and_thumb_jpegs(self, large_png):
        # these screenshots never have any alpha (right?)
        img = PIL.Image.open(io.BytesIO(large_png)).convert('RGB')
@ -424,12 +185,10 @@ class BrozzlerWorker:
    def brozzle_page(self, browser, site, page, on_screenshot=None,
                     on_request=None, enable_youtube_dl=True):
        self.logger.info("brozzling {}".format(page))
        ydl_fetches = None
        if enable_youtube_dl:
            try:
-                with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
+                ydl_fetches = ydl.do_youtube_dl(self, site, page)
                    ydl = self._youtube_dl(tempdir, site)
                    ydl_spy = ydl.brozzler_spy # remember for later
                    self._try_youtube_dl(ydl, site, page)
            except brozzler.ReachedLimit as e:
                raise
            except brozzler.ShutdownRequested:
@ -447,16 +206,14 @@ class BrozzlerWorker:
                    self.logger.error(
                            'youtube_dl raised exception on %s', page,
                            exc_info=True)
        else:
            ydl_spy = False
-        if self._needs_browsing(page, ydl_spy):
+        if self._needs_browsing(page, ydl_fetches):
            self.logger.info('needs browsing: %s', page)
            outlinks = self._browse_page(browser, site, page, on_screenshot,
                                         on_request)
            return outlinks
        else:
-            if not self._already_fetched(page, ydl_spy):
+            if not self._already_fetched(page, ydl_fetches):
                self.logger.info('needs fetch: %s', page)
                self._fetch_url(site, page)
            else:
@ -550,9 +307,9 @@ class BrozzlerWorker:
            raise brozzler.ProxyError(
                    'proxy error fetching %s' % page.url) from e
-    def _needs_browsing(self, page, brozzler_spy):
+    def _needs_browsing(self, page, ydl_fetches):
-        if brozzler_spy:
+        if ydl_fetches:
-            final_bounces = brozzler_spy.final_bounces(page.url)
+            final_bounces = ydl.final_bounces(ydl_fetches, page.url)
            if not final_bounces:
                return True
            for txn in final_bounces:
@ -563,9 +320,9 @@ class BrozzlerWorker:
        else:
            return True
-    def _already_fetched(self, page, brozzler_spy):
+    def _already_fetched(self, page, ydl_fetches):
-        if brozzler_spy:
+        if ydl_fetches:
-            for txn in brozzler_spy.final_bounces(page.url):
+            for txn in final_bounces(ydl_fetches, page.url):
                if (txn['method'] == 'GET' and txn['status_code'] == 200):
                    return True
        return False
@ -582,7 +339,7 @@ class BrozzlerWorker:
            # _proxy_for() call in log statement can raise brozzler.ProxyError
            # which is why we honor time limit and stop request first☝🏻
            self.logger.info(
-                    "brozzling site (proxy=%r) %r",
+                    "brozzling site (proxy=%r) %s",
                    self._proxy_for(site), site)
            while time.time() - start < self.SITE_SESSION_MINUTES * 60:
                site.refresh()
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@ -0,0 +1,322 @@
 '''
 brozzler/ydl.py - youtube-dl support for brozzler
 This code was extracted from worker.py.
 Copyright (C) 2018 Internet Archive
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 '''
 import datetime
 import json
 import logging
 import os
 import tempfile
 import urllib.error
 import urllib.parse
 import urllib.request
 import doublethink
 import urlcanon
 import youtube_dl
 import brozzler
 # keep a handle on youtube-dl's own implementation so the wrapper can delegate
 _orig_webpage_read_content = youtube_dl.extractor.generic.GenericIE._webpage_read_content
 def _webpage_read_content(self, *args, **kwargs):
    '''
    Wrapper around youtube-dl's `GenericIE._webpage_read_content` that keeps
    oversized pages away from the extractor.
    Delegates to the original method with the same arguments, then returns
    an empty string instead of the content if the content is longer than
    20,000,000 characters.
    '''
    content = _orig_webpage_read_content(self, *args, **kwargs)
    if len(content) > 20000000:
        # logging.warn() is a deprecated alias of logging.warning()
        logging.warning(
                'bypassing youtube-dl extraction because content is '
                'too large (%s characters)', len(content))
        return ''
    return content
 # monkey-patch youtube-dl so the generic extractor uses the wrapper
 youtube_dl.extractor.generic.GenericIE._webpage_read_content = _webpage_read_content
 class ExtraHeaderAdder(urllib.request.BaseHandler):
    '''
    urllib handler that adds the configured extra headers to outgoing
    requests, leaving alone any header the request already has.
    '''
    def __init__(self, extra_headers):
        self.extra_headers = extra_headers
        # the same hook serves both http and https requests
        self.http_request = self.https_request = self._http_request
    def _http_request(self, req):
        for name, value in self.extra_headers.items():
            if name.capitalize() in req.headers:
                # the request already carries this header, keep its value
                continue
            req.add_header(name, value)
        return req
 class YoutubeDLSpy(urllib.request.BaseHandler):
    '''
    urllib handler that records every http(s) response passing through it
    as a dict in `self.fetches`.
    '''
    logger = logging.getLogger(__module__ + "." + __qualname__)
    def __init__(self):
        self.reset()
    def reset(self):
        '''Forgets all fetches recorded so far.'''
        self.fetches = []
    def _http_response(self, request, response):
        # note the fetch, then hand the response back untouched
        self.fetches.append({
            'url': request.full_url,
            'method': request.get_method(),
            'response_code': response.code,
            'response_headers': response.headers,
        })
        return response
    http_response = _http_response
    https_response = _http_response
 def final_bounces(fetches, url):
    """
    Follows the redirects recorded in `fetches` starting at `url` and
    returns the list of fetches whose url is the final destination. There
    may be several, for example when youtube-dl hit the same url with a
    HEAD and then a GET request.
    """
    # XXX check http status 301,302,303,307? check for "uri" header
    # as well as "location"? see urllib.request.HTTPRedirectHandler
    hops = {
        fetch['url']: fetch for fetch in fetches
        if 'location' in fetch['response_headers']}
    target = url
    while target in hops:
        # pop each hop so a redirect loop cannot spin forever
        hop = hops.pop(target)
        target = urllib.parse.urljoin(
                hop['url'], hop['response_headers']['location'])
    return [fetch for fetch in fetches if fetch['url'] == target]
 def _build_youtube_dl(worker, destdir, site):
    '''
    Builds a `youtube_dl.YoutubeDL` for brozzling `site` with `worker`.
    The `YoutubeDL` instance does a few special brozzler-specific things:
    - keeps track of urls fetched using a `YoutubeDLSpy`
    - periodically updates `site.last_claimed` in rethinkdb
    - if brozzling through warcprox and downloading fragmented (DASH) videos,
      pushes the stitched together video to warcprox using a
      WARCPROX_WRITE_RECORD request
    Args:
        worker (brozzler.BrozzlerWorker): the calling brozzler worker
        destdir (str): where to save downloaded videos
        site (brozzler.Site): the site we are brozzling
    Returns:
        a `youtube_dl.YoutubeDL` instance, with the spy available as
        `ydl.fetch_spy` and pushed stitch-ups listed in `ydl.stitch_ups`
    '''
    class _YoutubeDL(youtube_dl.YoutubeDL):
        logger = logging.getLogger(__module__ + "." + __qualname__)
        def add_default_extra_info(self, ie_result, ie, url):
            # hook in some logging
            super().add_default_extra_info(ie_result, ie, url)
            if ie_result.get('_type') == 'playlist':
                self.logger.info(
                        'extractor %r found playlist in %s', ie.IE_NAME, url)
            else:
                self.logger.info(
                        'extractor %r found a video in %s', ie.IE_NAME, url)
        def _push_stitched_up_vid_to_warcprox(self, site, info_dict, ctx):
            '''
            Sends the file at `ctx['filename']` to warcprox as a `resource`
            record and, if warcprox returned a response, notes the push in
            `self.stitch_ups`.
            '''
            try:
                import magic
                mimetype = magic.from_file(ctx['filename'], mime=True)
            except ImportError as e:
                mimetype = 'video/%s' % info_dict['ext']
                self.logger.warning(
                        'guessing mimetype %s because %r', mimetype, e)
            url = 'youtube-dl:%05d:%s' % (
                    info_dict.get('playlist_index') or 1,
                    info_dict['webpage_url'])
            size = os.path.getsize(ctx['filename'])
            self.logger.info(
                    'pushing %r video stitched-up as %s (%s bytes) to '
                    'warcprox at %s with url %s', info_dict['format'],
                    mimetype, size, worker._proxy_for(site), url)
            with open(ctx['filename'], 'rb') as f:
                # include content-length header to avoid chunked
                # transfer, which warcprox currently rejects
                request, response = worker._warcprox_write_record(
                        warcprox_address=worker._proxy_for(site), url=url,
                        warc_type='resource', content_type=mimetype, payload=f,
                        extra_headers={'content-length': size})
            if response is None:
                # _warcprox_write_record() returns (request, None) after an
                # http error from warcprox; there is no response code to
                # record, so do not list the video as stitched up
                return
            # consulted by _remember_videos()
            self.stitch_ups.append({
                'url': url,
                'response_code': response.code,
                'content-type': mimetype,
                'content-length': size,
            })
        def process_info(self, info_dict):
            # NOTE(review): each call wraps whatever is currently installed
            # on FragmentFD and never restores it, so wrappers appear to
            # accumulate across calls - confirm intended
            _orig__finish_frag_download = youtube_dl.downloader.fragment.FragmentFD._finish_frag_download
            def _finish_frag_download(ffd_self, ctx):
                _orig__finish_frag_download(ffd_self, ctx)
                if worker._using_warcprox(site):
                    self._push_stitched_up_vid_to_warcprox(site, info_dict, ctx)
            youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _finish_frag_download
            return super().process_info(info_dict)
    def maybe_heartbeat_site_last_claimed(*args, **kwargs):
        # in case youtube-dl takes a long time, heartbeat site.last_claimed
        # to prevent another brozzler-worker from claiming the site
        # (relies on the module-level `datetime` and `doublethink` imports)
        try:
            if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=worker.SITE_SESSION_MINUTES):
                worker.logger.debug(
                        'heartbeating site.last_claimed to prevent another '
                        'brozzler-worker claiming this site id=%r', site.id)
                site.last_claimed = doublethink.utcnow()
                site.save()
        except:
            # best effort: a failed heartbeat must not abort the download
            worker.logger.debug(
                    'problem heartbeating site.last_claimed site id=%r',
                    site.id, exc_info=True)
    ydl_opts = {
        "outtmpl": "{}/ydl%(autonumber)s.out".format(destdir),
        "verbose": False,
        "retries": 1,
        "logger": logging.getLogger("youtube_dl"),
        "nocheckcertificate": True,
        "hls_prefer_native": True,
        "noprogress": True,
        "nopart": True,
        "no_color": True,
        "progress_hooks": [maybe_heartbeat_site_last_claimed],
        # https://github.com/rg3/youtube-dl/blob/master/README.md#format-selection
        # "best: Select the best quality format represented by a single
        # file with video and audio."
        "format": "best/bestvideo+bestaudio",
    }
    if worker._proxy_for(site):
        ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site))
    ydl = _YoutubeDL(ydl_opts)
    if site.extra_headers():
        ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers()))
    # the spy records every response youtube-dl receives
    ydl.fetch_spy = YoutubeDLSpy()
    ydl.stitch_ups = []
    ydl._opener.add_handler(ydl.fetch_spy)
    return ydl
 def _remember_videos(page, fetches, stitch_ups=None):
    '''
    Saves info about videos captured by youtube-dl in `page.videos`.
    '''
    if not 'videos' in page:
        page.videos = []
    for fetch in fetches or []:
        content_type = fetch['response_headers'].get_content_type()
        if (content_type.startswith('video/')
                # skip manifests of DASH segmented video -
                # see https://github.com/internetarchive/brozzler/pull/70
                and content_type != 'video/vnd.mpeg.dash.mpd'
                and fetch['method'] == 'GET'
                and fetch['response_code'] in (200, 206)):
            video = {
                'blame': 'youtube-dl',
                'url': fetch['url'],
                'response_code': fetch['response_code'],
                'content-type': content_type,
            }
            if 'content-length' in fetch['response_headers']:
                video['content-length'] = int(
                        fetch['response_headers']['content-length'])
            if 'content-range' in fetch['response_headers']:
                video['content-range'] = fetch[
                        'response_headers']['content-range']
            logging.debug('embedded video %s', video)
            page.videos.append(video)
    for stitch_up in stitch_ups or []:
        if stitch_up['content-type'].startswith('video/'):
            video = {
                'blame': 'youtube-dl',
                'url': stitch_up['url'],
                'response_code': stitch_up['response_code'],
                'content-type': stitch_up['content-type'],
                'content-length': stitch_up['content-length'],
            }
            logging.debug('embedded video %s', video)
            page.videos.append(video)
 def _try_youtube_dl(worker, ydl, site, page):
    '''
    Runs `ydl` on `page.url`, remembers the captured videos in `page.videos`
    and, when `worker._using_warcprox(site)`, writes youtube-dl's json
    result to warcprox as a metadata record.
    Exceptions that carry an `exc_info` attribute are translated: an
    underlying `youtube_dl.utils.UnsupportedError` is swallowed, an
    underlying http 420 error becomes `brozzler.ReachedLimit`, and an
    underlying `urllib.error.URLError` while a proxy is in use becomes
    `brozzler.ProxyError`. Everything else is re-raised.
    '''
    try:
        logging.info("trying youtube-dl on %s", page)
        with brozzler.thread_accept_exceptions():
            # we do whatwg canonicalization here to avoid "<urlopen error
            # no host given>" resulting in ProxyError
            # needs automated test
            canon_url = str(urlcanon.whatwg(page.url))
            ie_result = ydl.extract_info(canon_url)
        _remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
        if not worker._using_warcprox(site):
            return
        info_json = json.dumps(ie_result, sort_keys=True, indent=4)
        logging.info(
                "sending WARCPROX_WRITE_RECORD request to warcprox "
                "with youtube-dl json for %s", page)
        record = dict(
                warcprox_address=worker._proxy_for(site),
                url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                warc_type="metadata",
                content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                payload=info_json.encode("utf-8"),
                extra_headers=site.extra_headers())
        worker._warcprox_write_record(**record)
    except brozzler.ShutdownRequested:
        raise
    except BaseException as e:
        if not hasattr(e, "exc_info"):
            raise
        cause_type = e.exc_info[0]
        if cause_type == youtube_dl.utils.UnsupportedError:
            # youtube-dl has no extractor for this url, nothing to do
            return
        if (cause_type == urllib.error.HTTPError
                and hasattr(e.exc_info[1], "code")
                and e.exc_info[1].code == 420):
            raise brozzler.ReachedLimit(e.exc_info[1])
        if cause_type == urllib.error.URLError and worker._proxy_for(site):
            # connection problem when using a proxy == proxy error (XXX?)
            raise brozzler.ProxyError(
                    'youtube-dl hit apparent proxy error from '
                    '%s' % page.url) from e
        raise
 def do_youtube_dl(worker, site, page):
    '''
    Downloads the videos on `page` with a youtube-dl instance built for
    `worker` and `site`, saving them in a temporary directory (prefix
    `brzl-ydl-`) that is removed before returning.
    Args:
        worker (brozzler.BrozzlerWorker): the calling brozzler worker
        site (brozzler.Site): the site we are brozzling
        page (brozzler.Page): the page we are brozzling
    Returns:
        `list` of `dict`: with info about urls fetched:
            [{
                'url': ...,
                'method': ...,
                'response_code': ...,
                'response_headers': ...,
            }, ...]
    '''
    with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as download_dir:
        downloader = _build_youtube_dl(worker, download_dir, site)
        _try_youtube_dl(worker, downloader, site, page)
        fetches = downloader.fetch_spy.fetches
    return fetches