mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-20 21:04:24 -04:00
save info about embedded videos in page document in rethinkdb
This commit is contained in:
parent
94ba56dca5
commit
13130bd9d9
3 changed files with 124 additions and 44 deletions
|
@ -34,6 +34,7 @@ import requests
|
||||||
import doublethink
|
import doublethink
|
||||||
import tempfile
|
import tempfile
|
||||||
import urlcanon
|
import urlcanon
|
||||||
|
from requests.structures import CaseInsensitiveDict
|
||||||
|
|
||||||
class ExtraHeaderAdder(urllib.request.BaseHandler):
|
class ExtraHeaderAdder(urllib.request.BaseHandler):
|
||||||
def __init__(self, extra_headers):
|
def __init__(self, extra_headers):
|
||||||
|
@ -198,10 +199,34 @@ class BrozzlerWorker:
|
||||||
'WARCPROX_WRITE_RECORD request (expected 204)',
|
'WARCPROX_WRITE_RECORD request (expected 204)',
|
||||||
e.getcode(), e.info())
|
e.getcode(), e.info())
|
||||||
|
|
||||||
|
def _remember_videos(self, page, ydl_spy):
|
||||||
|
videos = []
|
||||||
|
for txn in ydl_spy.transactions:
|
||||||
|
if (txn['response_headers'].get_content_type().startswith('video/')
|
||||||
|
and txn['method'] == 'GET'
|
||||||
|
and txn['status_code'] in (200, 206)):
|
||||||
|
video = {
|
||||||
|
'blame': 'youtube-dl',
|
||||||
|
'url': txn['url'],
|
||||||
|
'response_code': txn['status_code'],
|
||||||
|
'content-type': txn['response_headers'].get_content_type(),
|
||||||
|
}
|
||||||
|
if 'content-length' in txn['response_headers']:
|
||||||
|
video['content-length'] = int(
|
||||||
|
txn['response_headers']['content-length'])
|
||||||
|
if 'content-range' in txn['response_headers']:
|
||||||
|
video['content-range'] = txn[
|
||||||
|
'response_headers']['content-range']
|
||||||
|
logging.debug('embedded video %s', video)
|
||||||
|
videos.append(video)
|
||||||
|
page.videos = videos
|
||||||
|
|
||||||
def _try_youtube_dl(self, ydl, site, page):
|
def _try_youtube_dl(self, ydl, site, page):
|
||||||
try:
|
try:
|
||||||
self.logger.info("trying youtube-dl on {}".format(page))
|
self.logger.info("trying youtube-dl on {}".format(page))
|
||||||
info = ydl.extract_info(page.url)
|
info = ydl.extract_info(page.url)
|
||||||
|
self._remember_videos(page, ydl.brozzler_spy)
|
||||||
|
# logging.info('XXX %s', json.dumps(info))
|
||||||
if self._proxy(site) and self._enable_warcprox_features(site):
|
if self._proxy(site) and self._enable_warcprox_features(site):
|
||||||
info_json = json.dumps(info, sort_keys=True, indent=4)
|
info_json = json.dumps(info, sort_keys=True, indent=4)
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
|
@ -243,6 +268,41 @@ class BrozzlerWorker:
|
||||||
return full_jpeg, thumb_jpeg
|
return full_jpeg, thumb_jpeg
|
||||||
|
|
||||||
def brozzle_page(self, browser, site, page, on_screenshot=None):
|
def brozzle_page(self, browser, site, page, on_screenshot=None):
|
||||||
|
self.logger.info("brozzling {}".format(page))
|
||||||
|
try:
|
||||||
|
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
|
||||||
|
ydl = self._youtube_dl(tempdir, site)
|
||||||
|
ydl_spy = ydl.brozzler_spy # remember for later
|
||||||
|
self._try_youtube_dl(ydl, site, page)
|
||||||
|
except brozzler.ReachedLimit as e:
|
||||||
|
raise
|
||||||
|
except brozzler.ShutdownRequested:
|
||||||
|
raise
|
||||||
|
except Exception as e:
|
||||||
|
if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
|
||||||
|
and hasattr(e.exc_info[1], 'code')
|
||||||
|
and e.exc_info[1].code == 430):
|
||||||
|
self.logger.info(
|
||||||
|
'youtube-dl got %s %s processing %s',
|
||||||
|
e.exc_info[1].code, e.exc_info[1].msg, page.url)
|
||||||
|
else:
|
||||||
|
self.logger.error(
|
||||||
|
"youtube_dl raised exception on %s", page,
|
||||||
|
exc_info=True)
|
||||||
|
|
||||||
|
if self._needs_browsing(page, ydl_spy):
|
||||||
|
self.logger.info('needs browsing: %s', page)
|
||||||
|
outlinks = self._browse_page(browser, site, page, on_screenshot)
|
||||||
|
return outlinks
|
||||||
|
else:
|
||||||
|
if not self._already_fetched(page, ydl_spy):
|
||||||
|
self.logger.info('needs fetch: %s', page)
|
||||||
|
self._fetch_url(site, page)
|
||||||
|
else:
|
||||||
|
self.logger.info('already fetched: %s', page)
|
||||||
|
return []
|
||||||
|
|
||||||
|
def _browse_page(self, browser, site, page, on_screenshot=None):
|
||||||
def _on_screenshot(screenshot_png):
|
def _on_screenshot(screenshot_png):
|
||||||
if on_screenshot:
|
if on_screenshot:
|
||||||
on_screenshot(screenshot_png)
|
on_screenshot(screenshot_png)
|
||||||
|
@ -265,50 +325,39 @@ class BrozzlerWorker:
|
||||||
payload=thumbnail_jpeg,
|
payload=thumbnail_jpeg,
|
||||||
extra_headers=site.extra_headers())
|
extra_headers=site.extra_headers())
|
||||||
|
|
||||||
self.logger.info("brozzling {}".format(page))
|
def _on_response(chrome_msg):
|
||||||
try:
|
if ('params' in chrome_msg
|
||||||
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
|
and 'response' in chrome_msg['params']
|
||||||
ydl = self._youtube_dl(tempdir, site)
|
and 'mimeType' in chrome_msg['params']['response']
|
||||||
ydl_spy = ydl.brozzler_spy # remember for later
|
and chrome_msg['params']['response'].get('mimeType', '').startswith('video/')
|
||||||
self._try_youtube_dl(ydl, site, page)
|
and chrome_msg['params']['response'].get('status') in (200, 206)):
|
||||||
except brozzler.ReachedLimit as e:
|
video = {
|
||||||
raise
|
'blame': 'browser',
|
||||||
except brozzler.ShutdownRequested:
|
'url': chrome_msg['params']['response'].get('url'),
|
||||||
raise
|
'response_code': chrome_msg['params']['response']['status'],
|
||||||
except Exception as e:
|
'content-type': chrome_msg['params']['response']['mimeType'],
|
||||||
if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
|
}
|
||||||
and hasattr(e.exc_info[1], 'code')
|
response_headers = CaseInsensitiveDict(
|
||||||
and e.exc_info[1].code == 430):
|
chrome_msg['params']['response']['headers'])
|
||||||
self.logger.info(
|
if 'content-length' in response_headers:
|
||||||
'youtube-dl got %s %s processing %s',
|
video['content-length'] = int(response_headers['content-length'])
|
||||||
e.exc_info[1].code, e.exc_info[1].msg, page.url)
|
if 'content-range' in response_headers:
|
||||||
else:
|
video['content-range'] = response_headers['content-range']
|
||||||
self.logger.error(
|
logging.debug('embedded video %s', video)
|
||||||
"youtube_dl raised exception on %s", page, exc_info=True)
|
page.videos.append(video)
|
||||||
|
|
||||||
if self._needs_browsing(page, ydl_spy):
|
if not browser.is_running():
|
||||||
self.logger.info('needs browsing: %s', page)
|
browser.start(
|
||||||
if not browser.is_running():
|
proxy=self._proxy(site), cookie_db=site.get('cookie_db'))
|
||||||
browser.start(
|
final_page_url, outlinks = browser.browse_page(
|
||||||
proxy=self._proxy(site),
|
page.url, extra_headers=site.extra_headers(),
|
||||||
cookie_db=site.get('cookie_db'))
|
behavior_parameters=site.get('behavior_parameters'),
|
||||||
final_page_url, outlinks = browser.browse_page(
|
username=site.get('username'), password=site.get('password'),
|
||||||
page.url, extra_headers=site.extra_headers(),
|
user_agent=site.get('user_agent'),
|
||||||
behavior_parameters=site.get('behavior_parameters'),
|
on_screenshot=_on_screenshot, on_response=_on_response)
|
||||||
username=site.get('username'),
|
if final_page_url != page.url:
|
||||||
password=site.get('password'),
|
page.note_redirect(final_page_url)
|
||||||
user_agent=site.get('user_agent'),
|
return outlinks
|
||||||
on_screenshot=_on_screenshot)
|
|
||||||
if final_page_url != page.url:
|
|
||||||
page.note_redirect(final_page_url)
|
|
||||||
return outlinks
|
|
||||||
else:
|
|
||||||
if not self._already_fetched(page, ydl_spy):
|
|
||||||
self.logger.info('needs fetch: %s', page)
|
|
||||||
self._fetch_url(site, page)
|
|
||||||
else:
|
|
||||||
self.logger.info('already fetched: %s', page)
|
|
||||||
return []
|
|
||||||
|
|
||||||
def _fetch_url(self, site, page):
|
def _fetch_url(self, site, page):
|
||||||
proxies = None
|
proxies = None
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b9.dev208',
|
version='1.1b9.dev209',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
|
@ -142,3 +142,34 @@ def test_js_dialogs(httpd):
|
||||||
# browser.browse_page(
|
# browser.browse_page(
|
||||||
# 'http://localhost:%s/site4/print.html' % httpd.server_port)
|
# 'http://localhost:%s/site4/print.html' % httpd.server_port)
|
||||||
|
|
||||||
|
def test_page_videos(httpd):
|
||||||
|
# test depends on behavior of youtube-dl, could fail and need to be
|
||||||
|
# adjusted on youtube-dl updates
|
||||||
|
chrome_exe = brozzler.suggest_default_chrome_exe()
|
||||||
|
worker = brozzler.BrozzlerWorker(None)
|
||||||
|
chrome_exe = brozzler.suggest_default_chrome_exe()
|
||||||
|
site = brozzler.Site(None, {})
|
||||||
|
page = brozzler.Page(None, {
|
||||||
|
'url':'http://localhost:%s/site6/' % httpd.server_port})
|
||||||
|
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
||||||
|
import pdb; pdb.set_trace()
|
||||||
|
worker.brozzle_page(browser, site, page)
|
||||||
|
assert page.videos
|
||||||
|
assert len(page.videos) == 2
|
||||||
|
assert page.videos[0] == {
|
||||||
|
'blame': 'youtube-dl',
|
||||||
|
'response_code': 200,
|
||||||
|
'content-length': 383631,
|
||||||
|
'content-type': 'video/mp4',
|
||||||
|
'url': 'http://localhost:%s/site6/small.mp4' % httpd.server_port,
|
||||||
|
}
|
||||||
|
assert page.videos[1] == {
|
||||||
|
'blame': 'browser',
|
||||||
|
# 'response_code': 206,
|
||||||
|
# 'content-range': 'bytes 0-229454/229455',
|
||||||
|
'response_code': 200,
|
||||||
|
'content-length': 229455,
|
||||||
|
'content-type': 'video/webm',
|
||||||
|
'url': 'http://localhost:%s/site6/small.webm' % httpd.server_port,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue