mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
move youtube-dl code into separate file
This commit is contained in:
parent
39155ebcc5
commit
33520da8f9
@ -23,92 +23,18 @@ import brozzler
|
|||||||
import brozzler.browser
|
import brozzler.browser
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
import youtube_dl
|
|
||||||
import urllib.request
|
import urllib.request
|
||||||
import json
|
import json
|
||||||
import PIL.Image
|
import PIL.Image
|
||||||
import io
|
import io
|
||||||
import socket
|
import socket
|
||||||
import collections
|
|
||||||
import requests
|
import requests
|
||||||
import doublethink
|
import doublethink
|
||||||
import tempfile
|
import tempfile
|
||||||
import urlcanon
|
import urlcanon
|
||||||
from requests.structures import CaseInsensitiveDict
|
from requests.structures import CaseInsensitiveDict
|
||||||
import rethinkdb as r
|
import rethinkdb as r
|
||||||
import datetime
|
from . import ydl
|
||||||
import urllib.parse
|
|
||||||
|
|
||||||
_orig_webpage_read_content = youtube_dl.extractor.generic.GenericIE._webpage_read_content
|
|
||||||
def _webpage_read_content(self, *args, **kwargs):
|
|
||||||
content = _orig_webpage_read_content(self, *args, **kwargs)
|
|
||||||
if len(content) > 20000000:
|
|
||||||
logging.warn(
|
|
||||||
'bypassing youtube-dl extraction because content is '
|
|
||||||
'too large (%s characters)', len(content))
|
|
||||||
return ''
|
|
||||||
return content
|
|
||||||
youtube_dl.extractor.generic.GenericIE._webpage_read_content = _webpage_read_content
|
|
||||||
|
|
||||||
class ExtraHeaderAdder(urllib.request.BaseHandler):
|
|
||||||
def __init__(self, extra_headers):
|
|
||||||
self.extra_headers = extra_headers
|
|
||||||
self.http_request = self._http_request
|
|
||||||
self.https_request = self._http_request
|
|
||||||
|
|
||||||
def _http_request(self, req):
|
|
||||||
for h, v in self.extra_headers.items():
|
|
||||||
if h.capitalize() not in req.headers:
|
|
||||||
req.add_header(h, v)
|
|
||||||
return req
|
|
||||||
|
|
||||||
class YoutubeDLSpy(urllib.request.BaseHandler):
|
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
self.reset()
|
|
||||||
|
|
||||||
def _http_response(self, request, response):
|
|
||||||
txn = {
|
|
||||||
'url': request.full_url,
|
|
||||||
'method': request.get_method(),
|
|
||||||
'status_code': response.code,
|
|
||||||
'response_headers': response.headers,
|
|
||||||
}
|
|
||||||
self.transactions.append(txn)
|
|
||||||
return response
|
|
||||||
|
|
||||||
http_response = https_response = _http_response
|
|
||||||
|
|
||||||
def reset(self):
|
|
||||||
self.transactions = []
|
|
||||||
|
|
||||||
def final_bounces(self, url):
|
|
||||||
"""
|
|
||||||
Resolves redirect chains in self.transactions, returns a list of
|
|
||||||
Transaction representing the final redirect destinations of the given
|
|
||||||
url. There could be more than one if for example youtube-dl hit the
|
|
||||||
same url with HEAD and then GET requests.
|
|
||||||
"""
|
|
||||||
redirects = {}
|
|
||||||
for txn in self.transactions:
|
|
||||||
# XXX check http status 301,302,303,307? check for "uri" header
|
|
||||||
# as well as "location"? see urllib.request.HTTPRedirectHandler
|
|
||||||
if 'location' in txn['response_headers']:
|
|
||||||
redirects[txn['url']] = txn
|
|
||||||
|
|
||||||
final_url = url
|
|
||||||
while final_url in redirects:
|
|
||||||
txn = redirects.pop(final_url)
|
|
||||||
final_url = urllib.parse.urljoin(
|
|
||||||
txn['url'], txn['response_headers']['location'])
|
|
||||||
|
|
||||||
final_bounces = []
|
|
||||||
for txn in self.transactions:
|
|
||||||
if txn['url'] == final_url:
|
|
||||||
final_bounces.append(txn)
|
|
||||||
|
|
||||||
return final_bounces
|
|
||||||
|
|
||||||
class BrozzlerWorker:
|
class BrozzlerWorker:
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||||
@ -204,99 +130,6 @@ class BrozzlerWorker:
|
|||||||
# is warcprox
|
# is warcprox
|
||||||
return bool(site.proxy or self._warcprox_auto)
|
return bool(site.proxy or self._warcprox_auto)
|
||||||
|
|
||||||
def _youtube_dl(self, destdir, site):
|
|
||||||
class _YoutubeDL(youtube_dl.YoutubeDL):
|
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
|
||||||
|
|
||||||
def get_info_extractor(self, ie_key):
|
|
||||||
ie = super().get_info_extractor(ie_key)
|
|
||||||
self.logger.info('youtube-dl using extractor %s', ie)
|
|
||||||
return ie
|
|
||||||
|
|
||||||
# def process_ie_result(
|
|
||||||
# ydl_self, ie_result, download=True, extra_info={}):
|
|
||||||
# ie_result = super().process_ie_result(
|
|
||||||
# ie_result, download, extra_info)
|
|
||||||
# return ie_result
|
|
||||||
|
|
||||||
def process_info(ydl_self, info_dict):
|
|
||||||
_orig__finish_frag_download = youtube_dl.downloader.fragment.FragmentFD._finish_frag_download
|
|
||||||
def _finish_frag_download(ffd_self, ctx):
|
|
||||||
_orig__finish_frag_download(ffd_self, ctx)
|
|
||||||
if self._using_warcprox(site):
|
|
||||||
try:
|
|
||||||
import magic
|
|
||||||
mimetype = magic.from_file(
|
|
||||||
ctx['filename'], mime=True)
|
|
||||||
except ImportError as e:
|
|
||||||
mimetype = 'video/%s' % info_dict['ext']
|
|
||||||
ydl_self.logger.warn(
|
|
||||||
'guessing mimetype %s because %r',
|
|
||||||
mimetype, e)
|
|
||||||
url = 'youtube-dl:%05d:%s' % (
|
|
||||||
info_dict['playlist_index'],
|
|
||||||
info_dict['webpage_url'])
|
|
||||||
ydl_self.logger.info(
|
|
||||||
'pushing %r video stitched-up as %s (%s '
|
|
||||||
'bytes) to warcprox at %s with url %s',
|
|
||||||
info_dict['format'], mimetype,
|
|
||||||
ctx['complete_frags_downloaded_bytes'],
|
|
||||||
self._proxy_for(site), url)
|
|
||||||
with open(ctx['filename'], 'rb') as f:
|
|
||||||
# include content-length header to avoid chunked
|
|
||||||
# transfer, which warcprox currently does not
|
|
||||||
# accept
|
|
||||||
# XXX is `ctx['complete_frags_downloaded_bytes']`
|
|
||||||
# always == `os.path.getsize(ctx['filename'])`?
|
|
||||||
self._warcprox_write_record(
|
|
||||||
warcprox_address=self._proxy_for(site),
|
|
||||||
url=url, warc_type='resource',
|
|
||||||
content_type=mimetype, payload=f,
|
|
||||||
extra_headers={'content-length': ctx['complete_frags_downloaded_bytes']})
|
|
||||||
|
|
||||||
youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _finish_frag_download
|
|
||||||
return super().process_info(info_dict)
|
|
||||||
|
|
||||||
def ydl_progress(*args, **kwargs):
|
|
||||||
# in case youtube-dl takes a long time, heartbeat site.last_claimed
|
|
||||||
# to prevent another brozzler-worker from claiming the site
|
|
||||||
try:
|
|
||||||
if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=self.SITE_SESSION_MINUTES):
|
|
||||||
self.logger.debug(
|
|
||||||
'heartbeating site.last_claimed to prevent another '
|
|
||||||
'brozzler-worker claiming this site id=%r', site.id)
|
|
||||||
site.last_claimed = doublethink.utcnow()
|
|
||||||
site.save()
|
|
||||||
except:
|
|
||||||
self.logger.debug(
|
|
||||||
'problem heartbeating site.last_claimed site id=%r',
|
|
||||||
site.id, exc_info=True)
|
|
||||||
|
|
||||||
ydl_opts = {
|
|
||||||
"outtmpl": "{}/ydl%(autonumber)s.out".format(destdir),
|
|
||||||
"verbose": False,
|
|
||||||
"retries": 1,
|
|
||||||
"logger": logging.getLogger("youtube_dl"),
|
|
||||||
"nocheckcertificate": True,
|
|
||||||
"hls_prefer_native": True,
|
|
||||||
"noprogress": True,
|
|
||||||
"nopart": True,
|
|
||||||
"no_color": True,
|
|
||||||
"progress_hooks": [ydl_progress],
|
|
||||||
# https://github.com/rg3/youtube-dl/blob/master/README.md#format-selection
|
|
||||||
# "best: Select the best quality format represented by a single
|
|
||||||
# file with video and audio."
|
|
||||||
"format": "best/bestvideo+bestaudio",
|
|
||||||
}
|
|
||||||
if self._proxy_for(site):
|
|
||||||
ydl_opts["proxy"] = "http://{}".format(self._proxy_for(site))
|
|
||||||
ydl = _YoutubeDL(ydl_opts)
|
|
||||||
if site.extra_headers():
|
|
||||||
ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers()))
|
|
||||||
ydl.brozzler_spy = YoutubeDLSpy()
|
|
||||||
ydl._opener.add_handler(ydl.brozzler_spy)
|
|
||||||
return ydl
|
|
||||||
|
|
||||||
def _warcprox_write_record(
|
def _warcprox_write_record(
|
||||||
self, warcprox_address, url, warc_type, content_type,
|
self, warcprox_address, url, warc_type, content_type,
|
||||||
payload, extra_headers=None):
|
payload, extra_headers=None):
|
||||||
@ -318,11 +151,13 @@ class BrozzlerWorker:
|
|||||||
'got "%s %s" response on warcprox '
|
'got "%s %s" response on warcprox '
|
||||||
'WARCPROX_WRITE_RECORD request (expected 204)',
|
'WARCPROX_WRITE_RECORD request (expected 204)',
|
||||||
response.getcode(), response.reason)
|
response.getcode(), response.reason)
|
||||||
|
return request, response
|
||||||
except urllib.error.HTTPError as e:
|
except urllib.error.HTTPError as e:
|
||||||
self.logger.warn(
|
self.logger.warn(
|
||||||
'got "%s %s" response on warcprox '
|
'got "%s %s" response on warcprox '
|
||||||
'WARCPROX_WRITE_RECORD request (expected 204)',
|
'WARCPROX_WRITE_RECORD request (expected 204)',
|
||||||
e.getcode(), e.info())
|
e.getcode(), e.info())
|
||||||
|
return request, None
|
||||||
except urllib.error.URLError as e:
|
except urllib.error.URLError as e:
|
||||||
raise brozzler.ProxyError(
|
raise brozzler.ProxyError(
|
||||||
'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
|
'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
|
||||||
@ -330,80 +165,6 @@ class BrozzlerWorker:
|
|||||||
raise brozzler.ProxyError(
|
raise brozzler.ProxyError(
|
||||||
'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
|
'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
|
||||||
|
|
||||||
def _remember_videos(self, page, ydl_spy):
|
|
||||||
if not 'videos' in page:
|
|
||||||
page.videos = []
|
|
||||||
for txn in ydl_spy.transactions:
|
|
||||||
content_type = txn['response_headers'].get_content_type()
|
|
||||||
if (content_type.startswith('video/')
|
|
||||||
# skip manifests of DASH segmented video -
|
|
||||||
# see https://github.com/internetarchive/brozzler/pull/70
|
|
||||||
and content_type != 'video/vnd.mpeg.dash.mpd'
|
|
||||||
and txn['method'] == 'GET'
|
|
||||||
and txn['status_code'] in (200, 206)):
|
|
||||||
video = {
|
|
||||||
'blame': 'youtube-dl',
|
|
||||||
'url': txn['url'],
|
|
||||||
'response_code': txn['status_code'],
|
|
||||||
'content-type': content_type,
|
|
||||||
}
|
|
||||||
if 'content-length' in txn['response_headers']:
|
|
||||||
video['content-length'] = int(
|
|
||||||
txn['response_headers']['content-length'])
|
|
||||||
if 'content-range' in txn['response_headers']:
|
|
||||||
video['content-range'] = txn[
|
|
||||||
'response_headers']['content-range']
|
|
||||||
logging.debug('embedded video %s', video)
|
|
||||||
page.videos.append(video)
|
|
||||||
|
|
||||||
def _try_youtube_dl(self, ydl, site, page):
|
|
||||||
try:
|
|
||||||
self.logger.info("trying youtube-dl on {}".format(page))
|
|
||||||
|
|
||||||
with brozzler.thread_accept_exceptions():
|
|
||||||
# we do whatwg canonicalization here to avoid "<urlopen error
|
|
||||||
# no host given>" resulting in ProxyError
|
|
||||||
# needs automated test
|
|
||||||
ie_result = ydl.extract_info(str(urlcanon.whatwg(page.url)))
|
|
||||||
# ie_result = ydl.extract_info(
|
|
||||||
# str(urlcanon.whatwg(page.url)), download=False)
|
|
||||||
# if ie_result.get('_type') in ('playlist', 'multi_video'):
|
|
||||||
# ie_result = self._ydl_playlist(ie_result)
|
|
||||||
# else:
|
|
||||||
# ie_result = process_ie_result(ie_result, download=True)
|
|
||||||
self._remember_videos(page, ydl.brozzler_spy)
|
|
||||||
if self._using_warcprox(site):
|
|
||||||
info_json = json.dumps(info, sort_keys=True, indent=4)
|
|
||||||
self.logger.info(
|
|
||||||
"sending WARCPROX_WRITE_RECORD request to warcprox "
|
|
||||||
"with youtube-dl json for %s", page)
|
|
||||||
self._warcprox_write_record(
|
|
||||||
warcprox_address=self._proxy_for(site),
|
|
||||||
url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
|
|
||||||
warc_type="metadata",
|
|
||||||
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
|
|
||||||
payload=info_json.encode("utf-8"),
|
|
||||||
extra_headers=site.extra_headers())
|
|
||||||
except brozzler.ShutdownRequested as e:
|
|
||||||
raise
|
|
||||||
except BaseException as e:
|
|
||||||
if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
|
|
||||||
pass
|
|
||||||
elif (hasattr(e, "exc_info")
|
|
||||||
and e.exc_info[0] == urllib.error.HTTPError
|
|
||||||
and hasattr(e.exc_info[1], "code")
|
|
||||||
and e.exc_info[1].code == 420):
|
|
||||||
raise brozzler.ReachedLimit(e.exc_info[1])
|
|
||||||
elif (hasattr(e, 'exc_info')
|
|
||||||
and e.exc_info[0] == urllib.error.URLError
|
|
||||||
and self._proxy_for(site)):
|
|
||||||
# connection problem when using a proxy == proxy error (XXX?)
|
|
||||||
raise brozzler.ProxyError(
|
|
||||||
'youtube-dl hit apparent proxy error from '
|
|
||||||
'%s' % page.url) from e
|
|
||||||
else:
|
|
||||||
raise
|
|
||||||
|
|
||||||
def full_and_thumb_jpegs(self, large_png):
|
def full_and_thumb_jpegs(self, large_png):
|
||||||
# these screenshots never have any alpha (right?)
|
# these screenshots never have any alpha (right?)
|
||||||
img = PIL.Image.open(io.BytesIO(large_png)).convert('RGB')
|
img = PIL.Image.open(io.BytesIO(large_png)).convert('RGB')
|
||||||
@ -424,12 +185,10 @@ class BrozzlerWorker:
|
|||||||
def brozzle_page(self, browser, site, page, on_screenshot=None,
|
def brozzle_page(self, browser, site, page, on_screenshot=None,
|
||||||
on_request=None, enable_youtube_dl=True):
|
on_request=None, enable_youtube_dl=True):
|
||||||
self.logger.info("brozzling {}".format(page))
|
self.logger.info("brozzling {}".format(page))
|
||||||
|
ydl_fetches = None
|
||||||
if enable_youtube_dl:
|
if enable_youtube_dl:
|
||||||
try:
|
try:
|
||||||
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
|
ydl_fetches = ydl.do_youtube_dl(self, site, page)
|
||||||
ydl = self._youtube_dl(tempdir, site)
|
|
||||||
ydl_spy = ydl.brozzler_spy # remember for later
|
|
||||||
self._try_youtube_dl(ydl, site, page)
|
|
||||||
except brozzler.ReachedLimit as e:
|
except brozzler.ReachedLimit as e:
|
||||||
raise
|
raise
|
||||||
except brozzler.ShutdownRequested:
|
except brozzler.ShutdownRequested:
|
||||||
@ -447,16 +206,14 @@ class BrozzlerWorker:
|
|||||||
self.logger.error(
|
self.logger.error(
|
||||||
'youtube_dl raised exception on %s', page,
|
'youtube_dl raised exception on %s', page,
|
||||||
exc_info=True)
|
exc_info=True)
|
||||||
else:
|
|
||||||
ydl_spy = False
|
|
||||||
|
|
||||||
if self._needs_browsing(page, ydl_spy):
|
if self._needs_browsing(page, ydl_fetches):
|
||||||
self.logger.info('needs browsing: %s', page)
|
self.logger.info('needs browsing: %s', page)
|
||||||
outlinks = self._browse_page(browser, site, page, on_screenshot,
|
outlinks = self._browse_page(browser, site, page, on_screenshot,
|
||||||
on_request)
|
on_request)
|
||||||
return outlinks
|
return outlinks
|
||||||
else:
|
else:
|
||||||
if not self._already_fetched(page, ydl_spy):
|
if not self._already_fetched(page, ydl_fetches):
|
||||||
self.logger.info('needs fetch: %s', page)
|
self.logger.info('needs fetch: %s', page)
|
||||||
self._fetch_url(site, page)
|
self._fetch_url(site, page)
|
||||||
else:
|
else:
|
||||||
@ -550,9 +307,9 @@ class BrozzlerWorker:
|
|||||||
raise brozzler.ProxyError(
|
raise brozzler.ProxyError(
|
||||||
'proxy error fetching %s' % page.url) from e
|
'proxy error fetching %s' % page.url) from e
|
||||||
|
|
||||||
def _needs_browsing(self, page, brozzler_spy):
|
def _needs_browsing(self, page, ydl_fetches):
|
||||||
if brozzler_spy:
|
if ydl_fetches:
|
||||||
final_bounces = brozzler_spy.final_bounces(page.url)
|
final_bounces = ydl.final_bounces(ydl_fetches, page.url)
|
||||||
if not final_bounces:
|
if not final_bounces:
|
||||||
return True
|
return True
|
||||||
for txn in final_bounces:
|
for txn in final_bounces:
|
||||||
@ -563,9 +320,9 @@ class BrozzlerWorker:
|
|||||||
else:
|
else:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _already_fetched(self, page, ydl_fetches):
    '''
    Returns True if youtube-dl already fetched `page.url` successfully.

    Args:
        page (brozzler.Page): the page being brozzled
        ydl_fetches (list of dict): fetches recorded by youtube-dl, as
            returned by `ydl.do_youtube_dl()`; may be None or empty

    Returns:
        bool: True if any final redirect destination of `page.url` was
        fetched with a GET request that got a 200 response
    '''
    if ydl_fetches:
        # final_bounces() is a module-level function of brozzler.ydl, so it
        # has to be reached through the `ydl` module imported by this file
        for fetch in ydl.final_bounces(ydl_fetches, page.url):
            # the fetch dicts built by ydl.YoutubeDLSpy store the http status
            # under 'response_code'
            if fetch['method'] == 'GET' and fetch['response_code'] == 200:
                return True
    return False
|
||||||
@ -582,7 +339,7 @@ class BrozzlerWorker:
|
|||||||
# _proxy_for() call in log statement can raise brozzler.ProxyError
|
# _proxy_for() call in log statement can raise brozzler.ProxyError
|
||||||
# which is why we honor time limit and stop request first☝🏻
|
# which is why we honor time limit and stop request first☝🏻
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
"brozzling site (proxy=%r) %r",
|
"brozzling site (proxy=%r) %s",
|
||||||
self._proxy_for(site), site)
|
self._proxy_for(site), site)
|
||||||
while time.time() - start < self.SITE_SESSION_MINUTES * 60:
|
while time.time() - start < self.SITE_SESSION_MINUTES * 60:
|
||||||
site.refresh()
|
site.refresh()
|
||||||
|
322
brozzler/ydl.py
Normal file
322
brozzler/ydl.py
Normal file
@ -0,0 +1,322 @@
|
|||||||
|
'''
|
||||||
|
brozzler/ydl.py - youtube-dl support for brozzler
|
||||||
|
|
||||||
|
This code was extracted from worker.py.
|
||||||
|
|
||||||
|
Copyright (C) 2018 Internet Archive
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
'''
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import youtube_dl
|
||||||
|
import brozzler
|
||||||
|
import urllib.request
|
||||||
|
import tempfile
|
||||||
|
import urlcanon
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
|
||||||
|
# keep a reference to youtube-dl's own implementation so the wrapper below can
# delegate to it
_orig_webpage_read_content = youtube_dl.extractor.generic.GenericIE._webpage_read_content


def _webpage_read_content(self, *args, **kwargs):
    '''
    Wrapper for `GenericIE._webpage_read_content` that refuses huge pages.

    Calls the original method with the same arguments. If the content it
    returns is longer than 20000000 characters, logs a warning and returns
    an empty string instead, so that youtube-dl extraction is bypassed for
    that page. Otherwise returns the content unchanged.
    '''
    content = _orig_webpage_read_content(self, *args, **kwargs)
    if len(content) > 20000000:
        # logging.warning() rather than the deprecated logging.warn() alias
        logging.warning(
                'bypassing youtube-dl extraction because content is '
                'too large (%s characters)', len(content))
        return ''
    return content


# monkey-patch youtube-dl's generic extractor to use the size-limiting wrapper
youtube_dl.extractor.generic.GenericIE._webpage_read_content = _webpage_read_content
|
||||||
|
|
||||||
|
class ExtraHeaderAdder(urllib.request.BaseHandler):
    '''
    urllib request handler that adds a fixed set of extra headers to every
    http and https request, without overriding headers the request already
    has.
    '''

    def __init__(self, extra_headers):
        '''
        Args:
            extra_headers (dict): header name -> header value to add
        '''
        self.extra_headers = extra_headers
        # expose the same bound method under both protocol-specific names
        self.http_request = self.https_request = self._http_request

    def _http_request(self, req):
        '''
        Adds each configured header to `req` unless already set; returns `req`.
        '''
        for name, value in self.extra_headers.items():
            # req.headers is looked up with the capitalize()d header name
            if name.capitalize() in req.headers:
                continue
            req.add_header(name, value)
        return req
|
||||||
|
|
||||||
|
class YoutubeDLSpy(urllib.request.BaseHandler):
    '''
    urllib response handler that keeps a record of every http and https
    response that passes through it, in `self.fetches`.
    '''
    logger = logging.getLogger(__module__ + "." + __qualname__)

    def __init__(self):
        self.reset()

    def reset(self):
        '''Forgets all fetches recorded so far.'''
        self.fetches = []

    def _http_response(self, request, response):
        '''
        Records url, method, response code and response headers of the
        exchange, then passes `response` through unchanged.
        '''
        self.fetches.append(dict(
            url=request.full_url,
            method=request.get_method(),
            response_code=response.code,
            response_headers=response.headers))
        return response

    # same recorder for both protocols
    http_response = https_response = _http_response
|
||||||
|
|
||||||
|
def final_bounces(fetches, url):
    """
    Follows the redirect chain starting at `url` through `fetches` and
    returns the list of fetches whose url is the end of that chain.

    More than one fetch can be returned, for example when youtube-dl hit the
    final url with a HEAD and then a GET request.

    Args:
        fetches (list of dict): fetches with 'url' and 'response_headers'
        url (str): the url whose final destination(s) we want

    Returns:
        list of dict: the matching entries of `fetches`, in original order
    """
    # XXX check http status 301,302,303,307? check for "uri" header
    # as well as "location"? see urllib.request.HTTPRedirectHandler
    redirects = {
        fetch['url']: fetch for fetch in fetches
        if 'location' in fetch['response_headers']}

    destination = url
    while destination in redirects:
        # pop() so that each redirect is followed at most once, which keeps
        # redirect loops from spinning forever
        hop = redirects.pop(destination)
        destination = urllib.parse.urljoin(
                hop['url'], hop['response_headers']['location'])

    return [fetch for fetch in fetches if fetch['url'] == destination]
|
||||||
|
|
||||||
|
def _build_youtube_dl(worker, destdir, site):
    '''
    Builds a `youtube_dl.YoutubeDL` for brozzling `site` with `worker`.

    The `YoutubeDL` instance does a few special brozzler-specific things:

    - keeps track of urls fetched using a `YoutubeDLSpy`
    - periodically updates `site.last_claimed` in rethinkdb
    - if brozzling through warcprox and downloading fragmented (DASH) videos,
      pushes the stitched together video to warcprox using a
      WARCPROX_WRITE_RECORD request

    Args:
        worker (brozzler.BrozzlerWorker): the calling brozzler worker
        destdir (str): where to save downloaded videos
        site (brozzler.Site): the site we are brozzling

    Returns:
        a `youtube_dl.YoutubeDL` instance
    '''
    # Imported here because this module's import block does not provide them.
    # Without these names the heartbeat hook below raised NameError on every
    # call, and its catch-all handler swallowed the error, so the heartbeat
    # never actually happened.
    import datetime
    import doublethink

    class _YoutubeDL(youtube_dl.YoutubeDL):
        logger = logging.getLogger(__module__ + "." + __qualname__)

        def add_default_extra_info(self, ie_result, ie, url):
            # hook in some logging
            super().add_default_extra_info(ie_result, ie, url)
            if ie_result.get('_type') == 'playlist':
                self.logger.info(
                        'extractor %r found playlist in %s', ie.IE_NAME, url)
            else:
                self.logger.info(
                        'extractor %r found a video in %s', ie.IE_NAME, url)

        def _push_stitched_up_vid_to_warcprox(self, site, info_dict, ctx):
            '''
            Sends the stitched-up video file `ctx['filename']` to warcprox
            as a 'resource' record and remembers it in `self.stitch_ups`.
            '''
            try:
                import magic
                mimetype = magic.from_file(ctx['filename'], mime=True)
            except ImportError as e:
                mimetype = 'video/%s' % info_dict['ext']
                self.logger.warning(
                        'guessing mimetype %s because %r', mimetype, e)

            url = 'youtube-dl:%05d:%s' % (
                    info_dict.get('playlist_index') or 1,
                    info_dict['webpage_url'])
            size = os.path.getsize(ctx['filename'])
            self.logger.info(
                    'pushing %r video stitched-up as %s (%s bytes) to '
                    'warcprox at %s with url %s', info_dict['format'],
                    mimetype, size, worker._proxy_for(site), url)
            with open(ctx['filename'], 'rb') as f:
                # include content-length header to avoid chunked
                # transfer, which warcprox currently rejects
                request, response = worker._warcprox_write_record(
                        warcprox_address=worker._proxy_for(site), url=url,
                        warc_type='resource', content_type=mimetype, payload=f,
                        extra_headers={'content-length': size})

            if response is None:
                # _warcprox_write_record() returns (request, None) when
                # warcprox answers with an http error; there is no response
                # code to record, so do not remember this stitch-up
                return

            # consulted by _remember_videos()
            self.stitch_ups.append({
                'url': url,
                'response_code': response.code,
                'content-type': mimetype,
                'content-length': size,
            })

        def process_info(self, info_dict):
            # NOTE(review): this replaces FragmentFD._finish_frag_download at
            # class level and never restores it, so every call wraps the
            # previous wrapper, and the patch is shared by all threads.
            # Restoring it safely needs a lock shared across the module;
            # left unchanged here.
            _orig__finish_frag_download = youtube_dl.downloader.fragment.FragmentFD._finish_frag_download

            def _finish_frag_download(ffd_self, ctx):
                _orig__finish_frag_download(ffd_self, ctx)
                if worker._using_warcprox(site):
                    self._push_stitched_up_vid_to_warcprox(site, info_dict, ctx)

            youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _finish_frag_download
            return super().process_info(info_dict)

    def maybe_heartbeat_site_last_claimed(*args, **kwargs):
        # in case youtube-dl takes a long time, heartbeat site.last_claimed
        # to prevent another brozzler-worker from claiming the site
        try:
            session_length = datetime.timedelta(
                    minutes=worker.SITE_SESSION_MINUTES)
            if (site.rr and
                    doublethink.utcnow() - site.last_claimed > session_length):
                worker.logger.debug(
                        'heartbeating site.last_claimed to prevent another '
                        'brozzler-worker claiming this site id=%r', site.id)
                site.last_claimed = doublethink.utcnow()
                site.save()
        except:
            # deliberately best-effort: a failed heartbeat must not abort the
            # download, so everything is caught and only logged
            worker.logger.debug(
                    'problem heartbeating site.last_claimed site id=%r',
                    site.id, exc_info=True)

    ydl_opts = {
        "outtmpl": "{}/ydl%(autonumber)s.out".format(destdir),
        "verbose": False,
        "retries": 1,
        "logger": logging.getLogger("youtube_dl"),
        "nocheckcertificate": True,
        "hls_prefer_native": True,
        "noprogress": True,
        "nopart": True,
        "no_color": True,
        "progress_hooks": [maybe_heartbeat_site_last_claimed],
        # https://github.com/rg3/youtube-dl/blob/master/README.md#format-selection
        # "best: Select the best quality format represented by a single
        # file with video and audio."
        "format": "best/bestvideo+bestaudio",
    }
    if worker._proxy_for(site):
        ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site))
    ydl = _YoutubeDL(ydl_opts)
    if site.extra_headers():
        ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers()))
    ydl.fetch_spy = YoutubeDLSpy()
    ydl.stitch_ups = []
    ydl._opener.add_handler(ydl.fetch_spy)
    return ydl
|
||||||
|
|
||||||
|
def _remember_videos(page, fetches, stitch_ups=None):
    '''
    Saves info about videos captured by youtube-dl in `page.videos`.

    Args:
        page: the page being brozzled; `page.videos` is created as an empty
            list if 'videos' is not yet in `page`
        fetches (list of dict): fetches recorded by `YoutubeDLSpy`; may be
            None or empty
        stitch_ups (list of dict): stitched-up videos recorded by
            `_push_stitched_up_vid_to_warcprox()`; may be None or empty
    '''
    if 'videos' not in page:
        page.videos = []

    for fetch in fetches or []:
        headers = fetch['response_headers']
        content_type = headers.get_content_type()
        if (content_type.startswith('video/')
                # skip manifests of DASH segmented video -
                # see https://github.com/internetarchive/brozzler/pull/70
                and content_type != 'video/vnd.mpeg.dash.mpd'
                and fetch['method'] == 'GET'
                and fetch['response_code'] in (200, 206)):
            video = {
                'blame': 'youtube-dl',
                'url': fetch['url'],
                'response_code': fetch['response_code'],
                'content-type': content_type,
            }
            if 'content-length' in headers:
                video['content-length'] = int(headers['content-length'])
            if 'content-range' in headers:
                video['content-range'] = headers['content-range']
            logging.debug('embedded video %s', video)
            page.videos.append(video)

    for stitch_up in stitch_ups or []:
        if stitch_up['content-type'].startswith('video/'):
            video = {
                'blame': 'youtube-dl',
                'url': stitch_up['url'],
                'response_code': stitch_up['response_code'],
                'content-type': stitch_up['content-type'],
                'content-length': stitch_up['content-length'],
            }
            logging.debug('embedded video %s', video)
            page.videos.append(video)
|
||||||
|
|
||||||
|
def _try_youtube_dl(worker, ydl, site, page):
    '''
    Runs youtube-dl on `page.url`, remembers captured videos in
    `page.videos`, and, if brozzling through warcprox, sends the youtube-dl
    json to warcprox as a 'metadata' record.

    Args:
        worker (brozzler.BrozzlerWorker): the calling brozzler worker
        ydl (youtube_dl.YoutubeDL): instance built by `_build_youtube_dl()`
        site (brozzler.Site): the site we are brozzling
        page (brozzler.Page): the page we are brozzling

    Raises:
        brozzler.ShutdownRequested: passed through untouched
        brozzler.ReachedLimit: if youtube-dl hit an http 420 response
        brozzler.ProxyError: if youtube-dl hit a `urllib.error.URLError`
            while a proxy is configured for `site`
        BaseException: anything else youtube-dl raised, except
            `youtube_dl.utils.UnsupportedError`, which is ignored
    '''
    try:
        logging.info("trying youtube-dl on %s", page)

        with brozzler.thread_accept_exceptions():
            # we do whatwg canonicalization here to avoid "<urlopen error
            # no host given>" resulting in ProxyError
            # needs automated test
            ie_result = ydl.extract_info(str(urlcanon.whatwg(page.url)))
        _remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
        if worker._using_warcprox(site):
            info_json = json.dumps(ie_result, sort_keys=True, indent=4)
            logging.info(
                    "sending WARCPROX_WRITE_RECORD request to warcprox "
                    "with youtube-dl json for %s", page)
            worker._warcprox_write_record(
                    warcprox_address=worker._proxy_for(site),
                    url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="metadata",
                    content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                    payload=info_json.encode("utf-8"),
                    extra_headers=site.extra_headers())
    except brozzler.ShutdownRequested:
        raise
    except BaseException as e:
        # youtube-dl exceptions carry the underlying failure in `exc_info`,
        # but the attribute can be missing or None; subscripting None here
        # would raise TypeError and hide the real error, so normalize it
        exc_info = getattr(e, 'exc_info', None) or (None, None, None)
        cause_type, cause = exc_info[0], exc_info[1]

        if cause_type == youtube_dl.utils.UnsupportedError:
            # youtube-dl does not support this url; nothing to do
            pass
        elif (cause_type == urllib.error.HTTPError
                and hasattr(cause, "code")
                and cause.code == 420):
            raise brozzler.ReachedLimit(cause)
        elif (cause_type == urllib.error.URLError
                and worker._proxy_for(site)):
            # connection problem when using a proxy == proxy error (XXX?)
            raise brozzler.ProxyError(
                    'youtube-dl hit apparent proxy error from '
                    '%s' % page.url) from e
        else:
            raise
|
||||||
|
|
||||||
|
def do_youtube_dl(worker, site, page):
    '''
    Downloads videos from `page` with a youtube-dl instance configured for
    `worker` and `site`, saving them in a temporary directory that is
    removed afterwards.

    Args:
        worker (brozzler.BrozzlerWorker): the calling brozzler worker
        site (brozzler.Site): the site we are brozzling
        page (brozzler.Page): the page we are brozzling

    Returns:
        `list` of `dict`: with info about urls fetched:

            [{
                'url': ...,
                'method': ...,
                'response_code': ...,
                'response_headers': ...,
            }, ...]
    '''
    with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as download_dir:
        downloader = _build_youtube_dl(worker, download_dir, site)
        _try_youtube_dl(worker, downloader, site, page)
        recorded_fetches = downloader.fetch_spy.fetches
    return recorded_fetches
|
Loading…
x
Reference in New Issue
Block a user