save info about embedded videos in page document in rethinkdb

This commit is contained in:
Noah Levitt 2017-03-20 11:49:11 -07:00
parent 94ba56dca5
commit 13130bd9d9
3 changed files with 124 additions and 44 deletions

View File

@ -34,6 +34,7 @@ import requests
import doublethink
import tempfile
import urlcanon
from requests.structures import CaseInsensitiveDict
class ExtraHeaderAdder(urllib.request.BaseHandler):
def __init__(self, extra_headers):
@ -198,10 +199,34 @@ class BrozzlerWorker:
'WARCPROX_WRITE_RECORD request (expected 204)',
e.getcode(), e.info())
def _remember_videos(self, page, ydl_spy):
    """
    Store on ``page.videos`` the video downloads youtube-dl performed.

    Walks ``ydl_spy.transactions`` and keeps every transaction whose
    response content type starts with ``video/``, whose method is GET and
    whose status code is 200 or 206. ``page.videos`` is always replaced,
    with an empty list when nothing matches.

    Args:
        page: object that receives the ``videos`` attribute
        ydl_spy: object with a ``transactions`` iterable of dicts holding
            ``url``, ``method``, ``status_code`` and ``response_headers``
    """
    found = []
    for txn in ydl_spy.transactions:
        headers = txn['response_headers']
        content_type = headers.get_content_type()
        # checks are made in this order so that a non-video transaction
        # is skipped before its other keys are ever looked at
        if not content_type.startswith('video/'):
            continue
        if txn['method'] != 'GET':
            continue
        if txn['status_code'] not in (200, 206):
            continue

        entry = {
            'blame': 'youtube-dl',
            'url': txn['url'],
            'response_code': txn['status_code'],
            'content-type': content_type,
        }
        # both headers are optional and only recorded when present
        if 'content-length' in headers:
            entry['content-length'] = int(headers['content-length'])
        if 'content-range' in headers:
            entry['content-range'] = headers['content-range']
        logging.debug('embedded video %s', entry)
        found.append(entry)
    page.videos = found
def _try_youtube_dl(self, ydl, site, page):
try:
self.logger.info("trying youtube-dl on {}".format(page))
info = ydl.extract_info(page.url)
self._remember_videos(page, ydl.brozzler_spy)
# logging.info('XXX %s', json.dumps(info))
if self._proxy(site) and self._enable_warcprox_features(site):
info_json = json.dumps(info, sort_keys=True, indent=4)
self.logger.info(
@ -243,6 +268,41 @@ class BrozzlerWorker:
return full_jpeg, thumb_jpeg
def brozzle_page(self, browser, site, page, on_screenshot=None):
    """
    Brozzle one page: run youtube-dl on it, then browse or fetch it.

    youtube-dl is tried first inside a temporary directory. Its failures
    are logged and otherwise ignored, so the page is still processed.
    Afterwards the page is either loaded in the browser or, when browsing
    is not needed and it was not already fetched, fetched directly.

    Args:
        browser: browser passed through to ``_browse_page``
        site: site the page belongs to
        page: page to process
        on_screenshot: optional callback passed through to ``_browse_page``

    Returns:
        The outlinks returned by ``_browse_page`` when the page is browsed,
        otherwise an empty list.

    Raises:
        brozzler.ReachedLimit, brozzler.ShutdownRequested: re-raised
            unchanged when they occur during the youtube-dl step.
    """
    self.logger.info("brozzling {}".format(page))

    # Stays None when youtube-dl could not even be set up (temp dir
    # creation or self._youtube_dl() failing). Without this default the
    # code after the handler would hit an UnboundLocalError.
    ydl_spy = None
    try:
        with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
            ydl = self._youtube_dl(tempdir, site)
            ydl_spy = ydl.brozzler_spy  # remember for later
            self._try_youtube_dl(ydl, site, page)
    except (brozzler.ReachedLimit, brozzler.ShutdownRequested):
        raise
    except Exception as e:
        # a wrapped error carrying response code 430 is reported at info
        # level only; anything else is logged as an error with traceback
        if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
                and hasattr(e.exc_info[1], 'code')
                and e.exc_info[1].code == 430):
            self.logger.info(
                    'youtube-dl got %s %s processing %s',
                    e.exc_info[1].code, e.exc_info[1].msg, page.url)
        else:
            self.logger.error(
                    "youtube_dl raised exception on %s", page,
                    exc_info=True)

    # With no spy there is no record of what youtube-dl fetched, so the
    # page is browsed rather than skipped.
    if ydl_spy is None or self._needs_browsing(page, ydl_spy):
        self.logger.info('needs browsing: %s', page)
        return self._browse_page(browser, site, page, on_screenshot)

    if not self._already_fetched(page, ydl_spy):
        self.logger.info('needs fetch: %s', page)
        self._fetch_url(site, page)
    else:
        self.logger.info('already fetched: %s', page)
    return []
def _browse_page(self, browser, site, page, on_screenshot=None):
def _on_screenshot(screenshot_png):
if on_screenshot:
on_screenshot(screenshot_png)
@ -265,50 +325,39 @@ class BrozzlerWorker:
payload=thumbnail_jpeg,
extra_headers=site.extra_headers())
self.logger.info("brozzling {}".format(page))
try:
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
ydl = self._youtube_dl(tempdir, site)
ydl_spy = ydl.brozzler_spy # remember for later
self._try_youtube_dl(ydl, site, page)
except brozzler.ReachedLimit as e:
raise
except brozzler.ShutdownRequested:
raise
except Exception as e:
if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
and hasattr(e.exc_info[1], 'code')
and e.exc_info[1].code == 430):
self.logger.info(
'youtube-dl got %s %s processing %s',
e.exc_info[1].code, e.exc_info[1].msg, page.url)
else:
self.logger.error(
"youtube_dl raised exception on %s", page, exc_info=True)
def _on_response(chrome_msg):
    """
    Record a video response reported by the browser in ``page.videos``.

    Messages without ``params.response``, with a mime type that does not
    start with ``video/``, or with a status other than 200 or 206 are
    ignored.

    Args:
        chrome_msg: dict message passed by the browser to the
            ``on_response`` callback
    """
    if ('params' not in chrome_msg
            or 'response' not in chrome_msg['params']):
        return
    response = chrome_msg['params']['response']

    # `or ''` covers both a missing key and an explicit None value
    mime_type = response.get('mimeType') or ''
    if not mime_type.startswith('video/'):
        return
    if response.get('status') not in (200, 206):
        return

    video = {
        'blame': 'browser',
        'url': response.get('url'),
        'response_code': response['status'],
        'content-type': mime_type,
    }
    # header names are matched case-insensitively; a response without a
    # 'headers' entry is treated as having no headers
    response_headers = CaseInsensitiveDict(response.get('headers') or {})
    if 'content-length' in response_headers:
        video['content-length'] = int(response_headers['content-length'])
    if 'content-range' in response_headers:
        video['content-range'] = response_headers['content-range']
    logging.debug('embedded video %s', video)

    # page.videos is only assigned by _remember_videos after youtube-dl
    # extraction succeeds, so it can be missing (or None) at this point
    if getattr(page, 'videos', None) is None:
        page.videos = []
    page.videos.append(video)
if self._needs_browsing(page, ydl_spy):
self.logger.info('needs browsing: %s', page)
if not browser.is_running():
browser.start(
proxy=self._proxy(site),
cookie_db=site.get('cookie_db'))
proxy=self._proxy(site), cookie_db=site.get('cookie_db'))
final_page_url, outlinks = browser.browse_page(
page.url, extra_headers=site.extra_headers(),
behavior_parameters=site.get('behavior_parameters'),
username=site.get('username'),
password=site.get('password'),
username=site.get('username'), password=site.get('password'),
user_agent=site.get('user_agent'),
on_screenshot=_on_screenshot)
on_screenshot=_on_screenshot, on_response=_on_response)
if final_page_url != page.url:
page.note_redirect(final_page_url)
return outlinks
else:
if not self._already_fetched(page, ydl_spy):
self.logger.info('needs fetch: %s', page)
self._fetch_url(site, page)
else:
self.logger.info('already fetched: %s', page)
return []
def _fetch_url(self, site, page):
proxies = None

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b9.dev208',
version='1.1b9.dev209',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',

View File

@ -142,3 +142,34 @@ def test_js_dialogs(httpd):
# browser.browse_page(
# 'http://localhost:%s/site4/print.html' % httpd.server_port)
def test_page_videos(httpd):
    """
    Brozzle the ``/site6/`` test page and check the videos recorded on it.

    Expects two entries in ``page.videos``: one blamed on youtube-dl
    (small.mp4) and one blamed on the browser (small.webm).
    """
    # test depends on behavior of youtube-dl, could fail and need to be
    # adjusted on youtube-dl updates
    chrome_exe = brozzler.suggest_default_chrome_exe()
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    page = brozzler.Page(None, {
        'url': 'http://localhost:%s/site6/' % httpd.server_port})
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        # no debugger breakpoint here: it would stall or break automated runs
        worker.brozzle_page(browser, site, page)
    assert page.videos
    assert len(page.videos) == 2
    assert page.videos[0] == {
        'blame': 'youtube-dl',
        'response_code': 200,
        'content-length': 383631,
        'content-type': 'video/mp4',
        'url': 'http://localhost:%s/site6/small.mp4' % httpd.server_port,
    }
    assert page.videos[1] == {
        'blame': 'browser',
        # 'response_code': 206,
        # 'content-range': 'bytes 0-229454/229455',
        'response_code': 200,
        'content-length': 229455,
        'content-type': 'video/webm',
        'url': 'http://localhost:%s/site6/small.webm' % httpd.server_port,
    }