mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
save info about embedded videos in page document in rethinkdb
This commit is contained in:
parent
94ba56dca5
commit
13130bd9d9
@ -34,6 +34,7 @@ import requests
|
||||
import doublethink
|
||||
import tempfile
|
||||
import urlcanon
|
||||
from requests.structures import CaseInsensitiveDict
|
||||
|
||||
class ExtraHeaderAdder(urllib.request.BaseHandler):
|
||||
def __init__(self, extra_headers):
|
||||
@ -198,10 +199,34 @@ class BrozzlerWorker:
|
||||
'WARCPROX_WRITE_RECORD request (expected 204)',
|
||||
e.getcode(), e.info())
|
||||
|
||||
def _remember_videos(self, page, ydl_spy):
    '''
    Records on `page` the video responses seen in `ydl_spy.transactions`.

    A transaction is kept when it was a GET answered with status 200 or
    206 and a `video/*` content type. Each kept transaction becomes a dict
    with the keys 'blame' (always 'youtube-dl'), 'url', 'response_code'
    and 'content-type', plus 'content-length' (converted to int) and
    'content-range' when the response carried those headers.

    `page.videos` is always overwritten, with an empty list when no
    transaction qualifies.

    Args:
        page: object whose `videos` attribute is assigned
        ydl_spy: object with a `transactions` iterable of dicts holding
            'url', 'method', 'status_code' and 'response_headers'; the
            headers object must provide `get_content_type()`, membership
            tests and item lookup by header name

    Raises:
        ValueError: if a content-length header is not a valid integer
    '''
    videos = []
    for txn in ydl_spy.transactions:
        headers = txn['response_headers']
        content_type = headers.get_content_type()
        if not (content_type.startswith('video/')
                and txn['method'] == 'GET'
                and txn['status_code'] in (200, 206)):
            continue

        video = {
            'blame': 'youtube-dl',
            'url': txn['url'],
            'response_code': txn['status_code'],
            'content-type': content_type,
        }
        # these two headers are optional, so only record them when present
        if 'content-length' in headers:
            video['content-length'] = int(headers['content-length'])
        if 'content-range' in headers:
            video['content-range'] = headers['content-range']
        logging.debug('embedded video %s', video)
        videos.append(video)
    page.videos = videos
|
||||
|
||||
def _try_youtube_dl(self, ydl, site, page):
|
||||
try:
|
||||
self.logger.info("trying youtube-dl on {}".format(page))
|
||||
info = ydl.extract_info(page.url)
|
||||
self._remember_videos(page, ydl.brozzler_spy)
|
||||
# logging.info('XXX %s', json.dumps(info))
|
||||
if self._proxy(site) and self._enable_warcprox_features(site):
|
||||
info_json = json.dumps(info, sort_keys=True, indent=4)
|
||||
self.logger.info(
|
||||
@ -243,6 +268,41 @@ class BrozzlerWorker:
|
||||
return full_jpeg, thumb_jpeg
|
||||
|
||||
def brozzle_page(self, browser, site, page, on_screenshot=None):
    '''
    Runs youtube-dl on the page, then browses or fetches it as needed.

    youtube-dl failures are logged and otherwise ignored, except for
    `brozzler.ReachedLimit` and `brozzler.ShutdownRequested`, which are
    re-raised. Afterwards the page is either browsed with
    `self._browse_page()`, fetched with `self._fetch_url()`, or left
    alone if `self._already_fetched()` says so.

    Args:
        browser: passed through to `self._browse_page()`
        site: the site the page belongs to
        page: the page to process; must have a `url` attribute
        on_screenshot: optional callback passed through to
            `self._browse_page()`

    Returns:
        the outlinks returned by `self._browse_page()` if the page was
        browsed, otherwise an empty list

    Raises:
        brozzler.ReachedLimit, brozzler.ShutdownRequested: propagated
            unchanged from the youtube-dl step
    '''
    self.logger.info('brozzling %s', page)

    # Stays None if youtube-dl could not even be set up; without this the
    # code after the try block would hit an unbound local variable.
    ydl_spy = None
    try:
        with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
            ydl = self._youtube_dl(tempdir, site)
            ydl_spy = ydl.brozzler_spy  # remember for later
            self._try_youtube_dl(ydl, site, page)
    except (brozzler.ReachedLimit, brozzler.ShutdownRequested):
        raise
    except Exception as e:
        # exc_info may be missing or None on the exception, so normalize
        # it to a tuple before looking at the wrapped exception
        exc_info = getattr(e, 'exc_info', None) or ()
        cause = exc_info[1] if len(exc_info) >= 2 else None
        if getattr(cause, 'code', None) == 430:
            self.logger.info(
                    'youtube-dl got %s %s processing %s',
                    cause.code, cause.msg, page.url)
        else:
            self.logger.error(
                    "youtube_dl raised exception on %s", page,
                    exc_info=True)

    # NOTE(review): with no spy there is no record of what youtube-dl
    # fetched, so the page is browsed; confirm this is the wanted fallback
    if ydl_spy is None or self._needs_browsing(page, ydl_spy):
        self.logger.info('needs browsing: %s', page)
        return self._browse_page(browser, site, page, on_screenshot)

    if not self._already_fetched(page, ydl_spy):
        self.logger.info('needs fetch: %s', page)
        self._fetch_url(site, page)
    else:
        self.logger.info('already fetched: %s', page)
    return []
|
||||
|
||||
def _browse_page(self, browser, site, page, on_screenshot=None):
|
||||
def _on_screenshot(screenshot_png):
|
||||
if on_screenshot:
|
||||
on_screenshot(screenshot_png)
|
||||
@ -265,50 +325,39 @@ class BrozzlerWorker:
|
||||
payload=thumbnail_jpeg,
|
||||
extra_headers=site.extra_headers())
|
||||
|
||||
self.logger.info("brozzling {}".format(page))
|
||||
try:
|
||||
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
|
||||
ydl = self._youtube_dl(tempdir, site)
|
||||
ydl_spy = ydl.brozzler_spy # remember for later
|
||||
self._try_youtube_dl(ydl, site, page)
|
||||
except brozzler.ReachedLimit as e:
|
||||
raise
|
||||
except brozzler.ShutdownRequested:
|
||||
raise
|
||||
except Exception as e:
|
||||
if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
|
||||
and hasattr(e.exc_info[1], 'code')
|
||||
and e.exc_info[1].code == 430):
|
||||
self.logger.info(
|
||||
'youtube-dl got %s %s processing %s',
|
||||
e.exc_info[1].code, e.exc_info[1].msg, page.url)
|
||||
else:
|
||||
self.logger.error(
|
||||
"youtube_dl raised exception on %s", page, exc_info=True)
|
||||
def _on_response(chrome_msg):
    '''
    Records a video response reported by the browser on `page.videos`.

    Ignores messages that have no `params.response`, whose `mimeType`
    does not start with 'video/', or whose `status` is not 200 or 206.
    Otherwise appends a dict with the keys 'blame' (always 'browser'),
    'url', 'response_code' and 'content-type', plus 'content-length' (as
    an int) and 'content-range' when those response headers are present.
    '''
    if ('params' not in chrome_msg
            or 'response' not in chrome_msg['params']):
        return
    response = chrome_msg['params']['response']
    if not response.get('mimeType', '').startswith('video/'):
        return
    if response.get('status') not in (200, 206):
        return

    video = {
        'blame': 'browser',
        'url': response.get('url'),
        'response_code': response['status'],
        'content-type': response['mimeType'],
    }
    # header names are matched case-insensitively
    response_headers = CaseInsensitiveDict(response['headers'])
    if 'content-length' in response_headers:
        video['content-length'] = int(response_headers['content-length'])
    if 'content-range' in response_headers:
        video['content-range'] = response_headers['content-range']
    logging.debug('embedded video %s', video)

    # page.videos is only assigned by _remember_videos, which is skipped
    # when youtube-dl fails, so it may be missing or None at this point
    if getattr(page, 'videos', None) is None:
        page.videos = []
    page.videos.append(video)
|
||||
|
||||
if self._needs_browsing(page, ydl_spy):
|
||||
self.logger.info('needs browsing: %s', page)
|
||||
if not browser.is_running():
|
||||
browser.start(
|
||||
proxy=self._proxy(site),
|
||||
cookie_db=site.get('cookie_db'))
|
||||
proxy=self._proxy(site), cookie_db=site.get('cookie_db'))
|
||||
final_page_url, outlinks = browser.browse_page(
|
||||
page.url, extra_headers=site.extra_headers(),
|
||||
behavior_parameters=site.get('behavior_parameters'),
|
||||
username=site.get('username'),
|
||||
password=site.get('password'),
|
||||
username=site.get('username'), password=site.get('password'),
|
||||
user_agent=site.get('user_agent'),
|
||||
on_screenshot=_on_screenshot)
|
||||
on_screenshot=_on_screenshot, on_response=_on_response)
|
||||
if final_page_url != page.url:
|
||||
page.note_redirect(final_page_url)
|
||||
return outlinks
|
||||
else:
|
||||
if not self._already_fetched(page, ydl_spy):
|
||||
self.logger.info('needs fetch: %s', page)
|
||||
self._fetch_url(site, page)
|
||||
else:
|
||||
self.logger.info('already fetched: %s', page)
|
||||
return []
|
||||
|
||||
def _fetch_url(self, site, page):
|
||||
proxies = None
|
||||
|
2
setup.py
2
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1b9.dev208',
|
||||
version='1.1b9.dev209',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
|
@ -142,3 +142,34 @@ def test_js_dialogs(httpd):
|
||||
# browser.browse_page(
|
||||
# 'http://localhost:%s/site4/print.html' % httpd.server_port)
|
||||
|
||||
def test_page_videos(httpd):
    '''
    Checks that brozzling site6 records both embedded videos on the page.

    Expects one video blamed on youtube-dl and one blamed on the browser,
    in that order.
    '''
    # test depends on behavior of youtube-dl, could fail and need to be
    # adjusted on youtube-dl updates
    chrome_exe = brozzler.suggest_default_chrome_exe()
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    page = brozzler.Page(None, {
        'url':'http://localhost:%s/site6/' % httpd.server_port})
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        worker.brozzle_page(browser, site, page)
    assert page.videos
    assert len(page.videos) == 2
    assert page.videos[0] == {
        'blame': 'youtube-dl',
        'response_code': 200,
        'content-length': 383631,
        'content-type': 'video/mp4',
        'url': 'http://localhost:%s/site6/small.mp4' % httpd.server_port,
    }
    # NOTE(review): the original carried a commented-out alternative for
    # this entry ('response_code': 206 with 'content-range':
    # 'bytes 0-229454/229455'); confirm which response the browser
    # actually produces before relying on the values below
    assert page.videos[1] == {
        'blame': 'browser',
        'response_code': 200,
        'content-length': 229455,
        'content-type': 'video/webm',
        'url': 'http://localhost:%s/site6/small.webm' % httpd.server_port,
    }
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user