move youtube-dl code into separate file

This commit is contained in:
Noah Levitt 2018-08-14 15:10:48 -07:00
parent 39155ebcc5
commit 33520da8f9
2 changed files with 336 additions and 257 deletions

View File

@ -23,92 +23,18 @@ import brozzler
import brozzler.browser import brozzler.browser
import threading import threading
import time import time
import youtube_dl
import urllib.request import urllib.request
import json import json
import PIL.Image import PIL.Image
import io import io
import socket import socket
import collections
import requests import requests
import doublethink import doublethink
import tempfile import tempfile
import urlcanon import urlcanon
from requests.structures import CaseInsensitiveDict from requests.structures import CaseInsensitiveDict
import rethinkdb as r import rethinkdb as r
import datetime from . import ydl
import urllib.parse
_orig_webpage_read_content = youtube_dl.extractor.generic.GenericIE._webpage_read_content
def _webpage_read_content(self, *args, **kwargs):
content = _orig_webpage_read_content(self, *args, **kwargs)
if len(content) > 20000000:
logging.warn(
'bypassing youtube-dl extraction because content is '
'too large (%s characters)', len(content))
return ''
return content
youtube_dl.extractor.generic.GenericIE._webpage_read_content = _webpage_read_content
class ExtraHeaderAdder(urllib.request.BaseHandler):
def __init__(self, extra_headers):
self.extra_headers = extra_headers
self.http_request = self._http_request
self.https_request = self._http_request
def _http_request(self, req):
for h, v in self.extra_headers.items():
if h.capitalize() not in req.headers:
req.add_header(h, v)
return req
class YoutubeDLSpy(urllib.request.BaseHandler):
logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self):
self.reset()
def _http_response(self, request, response):
txn = {
'url': request.full_url,
'method': request.get_method(),
'status_code': response.code,
'response_headers': response.headers,
}
self.transactions.append(txn)
return response
http_response = https_response = _http_response
def reset(self):
self.transactions = []
def final_bounces(self, url):
"""
Resolves redirect chains in self.transactions, returns a list of
Transaction representing the final redirect destinations of the given
url. There could be more than one if for example youtube-dl hit the
same url with HEAD and then GET requests.
"""
redirects = {}
for txn in self.transactions:
# XXX check http status 301,302,303,307? check for "uri" header
# as well as "location"? see urllib.request.HTTPRedirectHandler
if 'location' in txn['response_headers']:
redirects[txn['url']] = txn
final_url = url
while final_url in redirects:
txn = redirects.pop(final_url)
final_url = urllib.parse.urljoin(
txn['url'], txn['response_headers']['location'])
final_bounces = []
for txn in self.transactions:
if txn['url'] == final_url:
final_bounces.append(txn)
return final_bounces
class BrozzlerWorker: class BrozzlerWorker:
logger = logging.getLogger(__module__ + "." + __qualname__) logger = logging.getLogger(__module__ + "." + __qualname__)
@ -204,99 +130,6 @@ class BrozzlerWorker:
# is warcprox # is warcprox
return bool(site.proxy or self._warcprox_auto) return bool(site.proxy or self._warcprox_auto)
def _youtube_dl(self, destdir, site):
class _YoutubeDL(youtube_dl.YoutubeDL):
logger = logging.getLogger(__module__ + "." + __qualname__)
def get_info_extractor(self, ie_key):
ie = super().get_info_extractor(ie_key)
self.logger.info('youtube-dl using extractor %s', ie)
return ie
# def process_ie_result(
# ydl_self, ie_result, download=True, extra_info={}):
# ie_result = super().process_ie_result(
# ie_result, download, extra_info)
# return ie_result
def process_info(ydl_self, info_dict):
_orig__finish_frag_download = youtube_dl.downloader.fragment.FragmentFD._finish_frag_download
def _finish_frag_download(ffd_self, ctx):
_orig__finish_frag_download(ffd_self, ctx)
if self._using_warcprox(site):
try:
import magic
mimetype = magic.from_file(
ctx['filename'], mime=True)
except ImportError as e:
mimetype = 'video/%s' % info_dict['ext']
ydl_self.logger.warn(
'guessing mimetype %s because %r',
mimetype, e)
url = 'youtube-dl:%05d:%s' % (
info_dict['playlist_index'],
info_dict['webpage_url'])
ydl_self.logger.info(
'pushing %r video stitched-up as %s (%s '
'bytes) to warcprox at %s with url %s',
info_dict['format'], mimetype,
ctx['complete_frags_downloaded_bytes'],
self._proxy_for(site), url)
with open(ctx['filename'], 'rb') as f:
# include content-length header to avoid chunked
# transfer, which warcprox currently does not
# accept
# XXX is `ctx['complete_frags_downloaded_bytes']`
# always == `os.path.getsize(ctx['filename'])`?
self._warcprox_write_record(
warcprox_address=self._proxy_for(site),
url=url, warc_type='resource',
content_type=mimetype, payload=f,
extra_headers={'content-length': ctx['complete_frags_downloaded_bytes']})
youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _finish_frag_download
return super().process_info(info_dict)
def ydl_progress(*args, **kwargs):
# in case youtube-dl takes a long time, heartbeat site.last_claimed
# to prevent another brozzler-worker from claiming the site
try:
if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=self.SITE_SESSION_MINUTES):
self.logger.debug(
'heartbeating site.last_claimed to prevent another '
'brozzler-worker claiming this site id=%r', site.id)
site.last_claimed = doublethink.utcnow()
site.save()
except:
self.logger.debug(
'problem heartbeating site.last_claimed site id=%r',
site.id, exc_info=True)
ydl_opts = {
"outtmpl": "{}/ydl%(autonumber)s.out".format(destdir),
"verbose": False,
"retries": 1,
"logger": logging.getLogger("youtube_dl"),
"nocheckcertificate": True,
"hls_prefer_native": True,
"noprogress": True,
"nopart": True,
"no_color": True,
"progress_hooks": [ydl_progress],
# https://github.com/rg3/youtube-dl/blob/master/README.md#format-selection
# "best: Select the best quality format represented by a single
# file with video and audio."
"format": "best/bestvideo+bestaudio",
}
if self._proxy_for(site):
ydl_opts["proxy"] = "http://{}".format(self._proxy_for(site))
ydl = _YoutubeDL(ydl_opts)
if site.extra_headers():
ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers()))
ydl.brozzler_spy = YoutubeDLSpy()
ydl._opener.add_handler(ydl.brozzler_spy)
return ydl
def _warcprox_write_record( def _warcprox_write_record(
self, warcprox_address, url, warc_type, content_type, self, warcprox_address, url, warc_type, content_type,
payload, extra_headers=None): payload, extra_headers=None):
@ -318,11 +151,13 @@ class BrozzlerWorker:
'got "%s %s" response on warcprox ' 'got "%s %s" response on warcprox '
'WARCPROX_WRITE_RECORD request (expected 204)', 'WARCPROX_WRITE_RECORD request (expected 204)',
response.getcode(), response.reason) response.getcode(), response.reason)
return request, response
except urllib.error.HTTPError as e: except urllib.error.HTTPError as e:
self.logger.warn( self.logger.warn(
'got "%s %s" response on warcprox ' 'got "%s %s" response on warcprox '
'WARCPROX_WRITE_RECORD request (expected 204)', 'WARCPROX_WRITE_RECORD request (expected 204)',
e.getcode(), e.info()) e.getcode(), e.info())
return request, None
except urllib.error.URLError as e: except urllib.error.URLError as e:
raise brozzler.ProxyError( raise brozzler.ProxyError(
'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e 'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
@ -330,80 +165,6 @@ class BrozzlerWorker:
raise brozzler.ProxyError( raise brozzler.ProxyError(
'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e 'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
def _remember_videos(self, page, ydl_spy):
if not 'videos' in page:
page.videos = []
for txn in ydl_spy.transactions:
content_type = txn['response_headers'].get_content_type()
if (content_type.startswith('video/')
# skip manifests of DASH segmented video -
# see https://github.com/internetarchive/brozzler/pull/70
and content_type != 'video/vnd.mpeg.dash.mpd'
and txn['method'] == 'GET'
and txn['status_code'] in (200, 206)):
video = {
'blame': 'youtube-dl',
'url': txn['url'],
'response_code': txn['status_code'],
'content-type': content_type,
}
if 'content-length' in txn['response_headers']:
video['content-length'] = int(
txn['response_headers']['content-length'])
if 'content-range' in txn['response_headers']:
video['content-range'] = txn[
'response_headers']['content-range']
logging.debug('embedded video %s', video)
page.videos.append(video)
def _try_youtube_dl(self, ydl, site, page):
try:
self.logger.info("trying youtube-dl on {}".format(page))
with brozzler.thread_accept_exceptions():
# we do whatwg canonicalization here to avoid "<urlopen error
# no host given>" resulting in ProxyError
# needs automated test
ie_result = ydl.extract_info(str(urlcanon.whatwg(page.url)))
# ie_result = ydl.extract_info(
# str(urlcanon.whatwg(page.url)), download=False)
# if ie_result.get('_type') in ('playlist', 'multi_video'):
# ie_result = self._ydl_playlist(ie_result)
# else:
# ie_result = process_ie_result(ie_result, download=True)
self._remember_videos(page, ydl.brozzler_spy)
if self._using_warcprox(site):
info_json = json.dumps(info, sort_keys=True, indent=4)
self.logger.info(
"sending WARCPROX_WRITE_RECORD request to warcprox "
"with youtube-dl json for %s", page)
self._warcprox_write_record(
warcprox_address=self._proxy_for(site),
url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
warc_type="metadata",
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
payload=info_json.encode("utf-8"),
extra_headers=site.extra_headers())
except brozzler.ShutdownRequested as e:
raise
except BaseException as e:
if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
pass
elif (hasattr(e, "exc_info")
and e.exc_info[0] == urllib.error.HTTPError
and hasattr(e.exc_info[1], "code")
and e.exc_info[1].code == 420):
raise brozzler.ReachedLimit(e.exc_info[1])
elif (hasattr(e, 'exc_info')
and e.exc_info[0] == urllib.error.URLError
and self._proxy_for(site)):
# connection problem when using a proxy == proxy error (XXX?)
raise brozzler.ProxyError(
'youtube-dl hit apparent proxy error from '
'%s' % page.url) from e
else:
raise
def full_and_thumb_jpegs(self, large_png): def full_and_thumb_jpegs(self, large_png):
# these screenshots never have any alpha (right?) # these screenshots never have any alpha (right?)
img = PIL.Image.open(io.BytesIO(large_png)).convert('RGB') img = PIL.Image.open(io.BytesIO(large_png)).convert('RGB')
@ -424,12 +185,10 @@ class BrozzlerWorker:
def brozzle_page(self, browser, site, page, on_screenshot=None, def brozzle_page(self, browser, site, page, on_screenshot=None,
on_request=None, enable_youtube_dl=True): on_request=None, enable_youtube_dl=True):
self.logger.info("brozzling {}".format(page)) self.logger.info("brozzling {}".format(page))
ydl_fetches = None
if enable_youtube_dl: if enable_youtube_dl:
try: try:
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir: ydl_fetches = ydl.do_youtube_dl(self, site, page)
ydl = self._youtube_dl(tempdir, site)
ydl_spy = ydl.brozzler_spy # remember for later
self._try_youtube_dl(ydl, site, page)
except brozzler.ReachedLimit as e: except brozzler.ReachedLimit as e:
raise raise
except brozzler.ShutdownRequested: except brozzler.ShutdownRequested:
@ -447,16 +206,14 @@ class BrozzlerWorker:
self.logger.error( self.logger.error(
'youtube_dl raised exception on %s', page, 'youtube_dl raised exception on %s', page,
exc_info=True) exc_info=True)
else:
ydl_spy = False
if self._needs_browsing(page, ydl_spy): if self._needs_browsing(page, ydl_fetches):
self.logger.info('needs browsing: %s', page) self.logger.info('needs browsing: %s', page)
outlinks = self._browse_page(browser, site, page, on_screenshot, outlinks = self._browse_page(browser, site, page, on_screenshot,
on_request) on_request)
return outlinks return outlinks
else: else:
if not self._already_fetched(page, ydl_spy): if not self._already_fetched(page, ydl_fetches):
self.logger.info('needs fetch: %s', page) self.logger.info('needs fetch: %s', page)
self._fetch_url(site, page) self._fetch_url(site, page)
else: else:
@ -550,9 +307,9 @@ class BrozzlerWorker:
raise brozzler.ProxyError( raise brozzler.ProxyError(
'proxy error fetching %s' % page.url) from e 'proxy error fetching %s' % page.url) from e
def _needs_browsing(self, page, brozzler_spy): def _needs_browsing(self, page, ydl_fetches):
if brozzler_spy: if ydl_fetches:
final_bounces = brozzler_spy.final_bounces(page.url) final_bounces = ydl.final_bounces(ydl_fetches, page.url)
if not final_bounces: if not final_bounces:
return True return True
for txn in final_bounces: for txn in final_bounces:
@ -563,9 +320,9 @@ class BrozzlerWorker:
else: else:
return True return True
def _already_fetched(self, page, brozzler_spy): def _already_fetched(self, page, ydl_fetches):
if brozzler_spy: if ydl_fetches:
for txn in brozzler_spy.final_bounces(page.url): for txn in final_bounces(ydl_fetches, page.url):
if (txn['method'] == 'GET' and txn['status_code'] == 200): if (txn['method'] == 'GET' and txn['status_code'] == 200):
return True return True
return False return False
@ -582,7 +339,7 @@ class BrozzlerWorker:
# _proxy_for() call in log statement can raise brozzler.ProxyError # _proxy_for() call in log statement can raise brozzler.ProxyError
# which is why we honor time limit and stop request first☝🏻 # which is why we honor time limit and stop request first☝🏻
self.logger.info( self.logger.info(
"brozzling site (proxy=%r) %r", "brozzling site (proxy=%r) %s",
self._proxy_for(site), site) self._proxy_for(site), site)
while time.time() - start < self.SITE_SESSION_MINUTES * 60: while time.time() - start < self.SITE_SESSION_MINUTES * 60:
site.refresh() site.refresh()

322
brozzler/ydl.py Normal file
View File

@ -0,0 +1,322 @@
'''
brozzler/ydl.py - youtube-dl support for brozzler
This code was extracted from brozzler/worker.py.
Copyright (C) 2018 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
import datetime
import json
import logging
import os
import tempfile
import urllib.error
import urllib.parse
import urllib.request

import doublethink
import urlcanon
import youtube_dl

import brozzler
# pages larger than this are not handed to youtube-dl's generic extractor,
# because extraction on huge documents is slow and memory-hungry
YDL_MAX_CONTENT_LENGTH = 20000000

_orig_webpage_read_content = youtube_dl.extractor.generic.GenericIE._webpage_read_content
def _webpage_read_content(self, *args, **kwargs):
    '''
    Wrapper around `GenericIE._webpage_read_content` that returns an empty
    string (so that generic extraction finds nothing) when the fetched page
    exceeds `YDL_MAX_CONTENT_LENGTH` characters.
    '''
    content = _orig_webpage_read_content(self, *args, **kwargs)
    if len(content) > YDL_MAX_CONTENT_LENGTH:
        # logging.warn is deprecated; logging.warning is the supported name
        logging.warning(
                'bypassing youtube-dl extraction because content is '
                'too large (%s characters)', len(content))
        return ''
    return content
# monkey-patch the generic extractor so every GenericIE instance gets the
# size check
youtube_dl.extractor.generic.GenericIE._webpage_read_content = _webpage_read_content
class ExtraHeaderAdder(urllib.request.BaseHandler):
    '''
    `urllib.request` handler that stamps a fixed set of extra headers onto
    every http and https request, without clobbering headers the request
    already carries.
    '''
    def __init__(self, extra_headers):
        self.extra_headers = extra_headers
        # register the same preprocessing hook for both schemes
        self.http_request = self._http_request
        self.https_request = self._http_request

    def _http_request(self, req):
        for name, value in self.extra_headers.items():
            # urllib stores header keys capitalized; only add the header
            # if the request doesn't already have one by that name
            if name.capitalize() not in req.headers:
                req.add_header(name, value)
        return req
class YoutubeDLSpy(urllib.request.BaseHandler):
    '''
    `urllib.request` handler that passively records every http(s) response
    that passes through the opener. Each fetch is remembered as a dict in
    `self.fetches`; responses are passed along unmodified.
    '''
    logger = logging.getLogger(__module__ + "." + __qualname__)

    def __init__(self):
        self.reset()

    def reset(self):
        # forget everything recorded so far
        self.fetches = []

    def _http_response(self, request, response):
        # record the essentials of this transaction, then hand the
        # response back unchanged
        self.fetches.append({
            'url': request.full_url,
            'method': request.get_method(),
            'response_code': response.code,
            'response_headers': response.headers,
        })
        return response

    # urllib looks these names up by protocol; both get the same treatment
    http_response = https_response = _http_response
def final_bounces(fetches, url):
    """
    Resolves redirect chains recorded in `fetches`, starting from `url`,
    and returns the list of fetches whose url is the final destination of
    the chain. There could be more than one, for example if youtube-dl hit
    the same url with HEAD and then GET requests.
    """
    # map each redirecting url to the fetch that issued the redirect
    # XXX check http status 301,302,303,307? check for "uri" header
    # as well as "location"? see urllib.request.HTTPRedirectHandler
    redirects = {}
    for fetch in fetches:
        if 'location' in fetch['response_headers']:
            redirects[fetch['url']] = fetch

    # walk the chain; popping each hop as we follow it guarantees
    # termination even if the recorded redirects form a loop
    final_url = url
    while final_url in redirects:
        hop = redirects.pop(final_url)
        final_url = urllib.parse.urljoin(
                hop['url'], hop['response_headers']['location'])

    return [fetch for fetch in fetches if fetch['url'] == final_url]
def _build_youtube_dl(worker, destdir, site):
    '''
    Builds a `youtube_dl.YoutubeDL` for brozzling `site` with `worker`.

    The `YoutubeDL` instance does a few special brozzler-specific things:

    - keeps track of urls fetched using a `YoutubeDLSpy`
    - periodically updates `site.last_claimed` in rethinkdb
    - if brozzling through warcprox and downloading fragmented (DASH) videos,
      pushes the stitched together video to warcprox using a
      WARCPROX_WRITE_RECORD request

    Args:
        worker (brozzler.BrozzlerWorker): the calling brozzler worker
        destdir (str): where to save downloaded videos
        site (brozzler.Site): the site we are brozzling

    Returns:
        a `youtube_dl.YoutubeDL` instance
    '''
    class _YoutubeDL(youtube_dl.YoutubeDL):
        logger = logging.getLogger(__module__ + "." + __qualname__)

        def add_default_extra_info(self, ie_result, ie, url):
            # hook in some logging: report which extractor matched and
            # whether it found a playlist or a single video
            super().add_default_extra_info(ie_result, ie, url)
            if ie_result.get('_type') == 'playlist':
                self.logger.info(
                        'extractor %r found playlist in %s', ie.IE_NAME, url)
            else:
                self.logger.info(
                        'extractor %r found a video in %s', ie.IE_NAME, url)

        def _push_stitched_up_vid_to_warcprox(self, site, info_dict, ctx):
            # Sends the reassembled (stitched-up) video file at
            # ctx['filename'] to warcprox as a WARCPROX_WRITE_RECORD
            # resource record.
            # `magic` is optional; fall back to guessing the mimetype from
            # youtube-dl's "ext" field when it isn't installed
            try:
                import magic
                mimetype = magic.from_file(ctx['filename'], mime=True)
            except ImportError as e:
                mimetype = 'video/%s' % info_dict['ext']
                self.logger.warn(
                        'guessing mimetype %s because %r', mimetype, e)
            # synthetic record url; playlist_index may be missing/None for
            # a lone video, in which case 1 is used
            url = 'youtube-dl:%05d:%s' % (
                    info_dict.get('playlist_index') or 1,
                    info_dict['webpage_url'])
            size = os.path.getsize(ctx['filename'])
            self.logger.info(
                    'pushing %r video stitched-up as %s (%s bytes) to '
                    'warcprox at %s with url %s', info_dict['format'],
                    mimetype, size, worker._proxy_for(site), url)
            with open(ctx['filename'], 'rb') as f:
                # include content-length header to avoid chunked
                # transfer, which warcprox currently rejects
                request, response = worker._warcprox_write_record(
                        warcprox_address=worker._proxy_for(site), url=url,
                        warc_type='resource', content_type=mimetype, payload=f,
                        extra_headers={'content-length': size})
            # consulted by _remember_videos()
            self.stitch_ups.append({
                'url': url,
                'response_code': response.code,
                'content-type': mimetype,
                'content-length': size,
            })

        def process_info(self, info_dict):
            # NOTE(review): this assigns to a *class* attribute of
            # youtube_dl.downloader.fragment.FragmentFD, so the patch is
            # process-global and re-wraps on every call; looks unsafe if
            # multiple _YoutubeDL instances run concurrently — confirm
            _orig__finish_frag_download = youtube_dl.downloader.fragment.FragmentFD._finish_frag_download
            def _finish_frag_download(ffd_self, ctx):
                # after the fragments are stitched together, optionally
                # push the result to warcprox
                _orig__finish_frag_download(ffd_self, ctx)
                if worker._using_warcprox(site):
                    self._push_stitched_up_vid_to_warcprox(site, info_dict, ctx)
            youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _finish_frag_download
            return super().process_info(info_dict)

    def maybe_heartbeat_site_last_claimed(*args, **kwargs):
        # in case youtube-dl takes a long time, heartbeat site.last_claimed
        # to prevent another brozzler-worker from claiming the site
        try:
            # NOTE(review): relies on `doublethink` and `datetime` being
            # importable at module scope; a NameError here would be
            # silently swallowed by the bare `except` below — verify
            if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=worker.SITE_SESSION_MINUTES):
                worker.logger.debug(
                        'heartbeating site.last_claimed to prevent another '
                        'brozzler-worker claiming this site id=%r', site.id)
                site.last_claimed = doublethink.utcnow()
                site.save()
        except:
            # best-effort: a failed heartbeat is logged, never fatal
            worker.logger.debug(
                    'problem heartbeating site.last_claimed site id=%r',
                    site.id, exc_info=True)

    ydl_opts = {
        "outtmpl": "{}/ydl%(autonumber)s.out".format(destdir),
        "verbose": False,
        "retries": 1,
        "logger": logging.getLogger("youtube_dl"),
        "nocheckcertificate": True,
        "hls_prefer_native": True,
        "noprogress": True,
        "nopart": True,
        "no_color": True,
        # progress hook doubles as the site.last_claimed heartbeat
        "progress_hooks": [maybe_heartbeat_site_last_claimed],
        # https://github.com/rg3/youtube-dl/blob/master/README.md#format-selection
        # "best: Select the best quality format represented by a single
        # file with video and audio."
        "format": "best/bestvideo+bestaudio",
    }
    # route youtube-dl's fetches through the same proxy (e.g. warcprox)
    # used for brozzling this site
    if worker._proxy_for(site):
        ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site))
    ydl = _YoutubeDL(ydl_opts)
    if site.extra_headers():
        ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers()))
    # the spy records every url youtube-dl fetches; consulted later by
    # _remember_videos() and by the brozzler worker
    ydl.fetch_spy = YoutubeDLSpy()
    ydl.stitch_ups = []
    ydl._opener.add_handler(ydl.fetch_spy)
    return ydl
def _remember_videos(page, fetches, stitch_ups=None):
'''
Saves info about videos captured by youtube-dl in `page.videos`.
'''
if not 'videos' in page:
page.videos = []
for fetch in fetches or []:
content_type = fetch['response_headers'].get_content_type()
if (content_type.startswith('video/')
# skip manifests of DASH segmented video -
# see https://github.com/internetarchive/brozzler/pull/70
and content_type != 'video/vnd.mpeg.dash.mpd'
and fetch['method'] == 'GET'
and fetch['response_code'] in (200, 206)):
video = {
'blame': 'youtube-dl',
'url': fetch['url'],
'response_code': fetch['response_code'],
'content-type': content_type,
}
if 'content-length' in fetch['response_headers']:
video['content-length'] = int(
fetch['response_headers']['content-length'])
if 'content-range' in fetch['response_headers']:
video['content-range'] = fetch[
'response_headers']['content-range']
logging.debug('embedded video %s', video)
page.videos.append(video)
for stitch_up in stitch_ups or []:
if stitch_up['content-type'].startswith('video/'):
video = {
'blame': 'youtube-dl',
'url': stitch_up['url'],
'response_code': stitch_up['response_code'],
'content-type': stitch_up['content-type'],
'content-length': stitch_up['content-length'],
}
logging.debug('embedded video %s', video)
page.videos.append(video)
def _try_youtube_dl(worker, ydl, site, page):
    '''
    Runs `ydl.extract_info()` on `page.url`, records info about captured
    videos on `page`, and, when brozzling through warcprox, pushes a
    metadata record with the youtube-dl json to warcprox. Translates
    youtube-dl failures into brozzler exception types where appropriate.

    Raises:
        brozzler.ReachedLimit: if the proxy responded with http 420
        brozzler.ProxyError: on apparent proxy-related connection failure
        brozzler.ShutdownRequested: propagated unchanged
    '''
    try:
        logging.info("trying youtube-dl on %s", page)
        with brozzler.thread_accept_exceptions():
            # we do whatwg canonicalization here to avoid "<urlopen error
            # no host given>" resulting in ProxyError
            # needs automated test
            ie_result = ydl.extract_info(str(urlcanon.whatwg(page.url)))
        _remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
        if worker._using_warcprox(site):
            info_json = json.dumps(ie_result, sort_keys=True, indent=4)
            logging.info(
                    "sending WARCPROX_WRITE_RECORD request to warcprox "
                    "with youtube-dl json for %s", page)
            worker._warcprox_write_record(
                    warcprox_address=worker._proxy_for(site),
                    url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="metadata",
                    content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                    payload=info_json.encode("utf-8"),
                    extra_headers=site.extra_headers())
    except brozzler.ShutdownRequested as e:
        raise
    except BaseException as e:
        # youtube-dl wraps underlying errors; the original exception type
        # is exposed via the wrapper's `exc_info` attribute
        if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
            # no extractor matched this url; not every page has video
            pass
        elif (hasattr(e, "exc_info")
                and e.exc_info[0] == urllib.error.HTTPError
                and hasattr(e.exc_info[1], "code")
                and e.exc_info[1].code == 420):
            # http 420 is taken to mean a crawl limit was reached
            # (presumably signaled by warcprox — confirm)
            raise brozzler.ReachedLimit(e.exc_info[1])
        elif (hasattr(e, 'exc_info')
                and e.exc_info[0] == urllib.error.URLError
                and worker._proxy_for(site)):
            # connection problem when using a proxy == proxy error (XXX?)
            raise brozzler.ProxyError(
                    'youtube-dl hit apparent proxy error from '
                    '%s' % page.url) from e
        else:
            raise
def do_youtube_dl(worker, site, page):
    '''
    Runs youtube-dl configured for `worker` and `site` to download videos
    from `page`.

    Args:
        worker (brozzler.BrozzlerWorker): the calling brozzler worker
        site (brozzler.Site): the site we are brozzling
        page (brozzler.Page): the page we are brozzling

    Returns:
        `list` of `dict`: with info about urls fetched:

            [{
                'url': ...,
                'method': ...,
                'response_code': ...,
                'response_headers': ...,
            }, ...]
    '''
    # downloaded video files land in a throwaway directory; what we keep
    # is the spy's record of the urls that were fetched (plus anything
    # that was pushed to warcprox along the way)
    with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as destdir:
        ydl = _build_youtube_dl(worker, destdir, site)
        _try_youtube_dl(worker, ydl, site, page)
        return ydl.fetch_spy.fetches