Merge branch 'ari-5777' into qa

This commit is contained in:
Neil Minton 2018-09-12 12:07:45 -04:00
commit 3c7fdeae2c
3 changed files with 63 additions and 2 deletions

View File

@ -472,6 +472,7 @@ class Browser:
outlinks = []
else:
outlinks = self.extract_outlinks()
outlinks = outlinks.union(self.extract_tertiary_assets())
if not skip_visit_hashtags:
self.visit_hashtags(self.url(), hashtags, outlinks)
final_page_url = self.url()
@ -567,6 +568,30 @@ class Browser:
'problem extracting outlinks, result message: %s', message)
return frozenset()
def extract_tertiary_assets(self, timeout=60):
    '''
    Runs the `extract-tertiary-assets.js` template in the browser and
    returns the strings it reports.

    The rendered template is sent to chrome as the `expression` of a
    `Runtime.evaluate` command, and this method blocks in `self._wait_for`
    until the matching result has been received.

    Args:
        timeout: passed through unchanged to `self._wait_for`

    Returns:
        frozenset of str, one entry per newline-separated piece of the
        value chrome returned; an empty frozenset if that value is empty,
        or if the response has no `result.result.value` (in which case an
        error is logged)
    '''
    self.logger.info('extracting tertiary assets')
    # register interest in the upcoming command id before sending it
    self.websock_thread.expect_result(self._command_id.peek())
    template = brozzler.jinja2_environment().get_template(
            'extract-tertiary-assets.js')
    expression = template.render()
    msg_id = self.send_to_chrome(
            method='Runtime.evaluate', params={'expression': expression})
    self._wait_for(
            lambda: self.websock_thread.received_result(msg_id),
            timeout=timeout)
    message = self.websock_thread.pop_result(msg_id)

    has_value = (
            'result' in message
            and 'result' in message['result']
            and 'value' in message['result']['result'])
    if not has_value:
        self.logger.error(
                'problem extracting tertiary assets, result message: %s',
                message)
        return frozenset()

    value = message['result']['result']['value']
    if not value:
        # no links found
        return frozenset()
    return frozenset(value.split('\n'))
def screenshot(self, timeout=45):
self.logger.info('taking screenshot')
self.websock_thread.expect_result(self._command_id.peek())

View File

@ -0,0 +1,36 @@
// we have problems if the page has changed the definition of Set or Array
// http://www.polyvore.com/ does this for example
//
// Top-level names are declared with `var` (not `let`/`const`) so that
// evaluating this script again in the same global scope re-initializes them
// instead of raising a redeclaration error.
var __brzl_framesDone = new Set();

/**
 * Collect the URLs named in `srcset` attributes of `frame` and, recursively,
 * of every nested frame that has not been visited yet.
 *
 * Only whitespace-separated tokens that contain a "/" with at least one
 * character on each side are treated as URLs; this is what filters out
 * descriptors such as "2x" or "480w" (and also means a bare "a.jpg" is not
 * picked up). Each URL is resolved against the frame document's `baseURI`.
 *
 * @param {Window} frame - window-like object exposing `document` and `frames`
 * @returns {string[]} absolute URLs in document order, nested frames last;
 *     an empty array when `frame` is falsy or has no document
 */
var __brzl_compileAssets = function(frame) {
    // Record the frame before descending so frames that reference each other
    // cannot cause unbounded recursion.
    __brzl_framesDone.add(frame);

    // Always return an array: callers concat() or join() the result.
    let assets = [];
    if (!frame || !frame.document) {
        return assets;
    }

    const baseURI = frame.document.baseURI;
    const elems = frame.document.querySelectorAll('[srcset]');
    for (let i = 0; i < elems.length; i++) {
        // match() yields null when nothing matches; treat that as "no URLs"
        const tokens =
            (elems[i].srcset || '').match(/[^\s]+\/[^\s]+/g) || [];
        // distinct index from the element loop, which must not be clobbered
        for (let j = 0; j < tokens.length; j++) {
            // "a/b.jpg, c/d.jpg 2x" leaves the separator glued to the URL
            const candidate = tokens[j].replace(/,+$/, '');
            try {
                // Proper resolution handles relative, root-relative and
                // protocol-relative URLs; absolute ones pass through.
                assets.push(new URL(candidate, baseURI).href);
            } catch (e) {
                // Token cannot be parsed as a URL; there is nothing to
                // fetch for it, so leave it out of the result.
            }
        }
    }

    for (let k = 0; k < frame.frames.length; k++) {
        const child = frame.frames[k];
        if (child && !__brzl_framesDone.has(child)) {
            assets = assets.concat(__brzl_compileAssets(child));
        }
    }
    return assets;
};
// Final expression of the script: walk the frame tree starting at the
// top-level window and evaluate to one string with the collected entries
// separated by newlines.
__brzl_compileAssets(window).join('\n');

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.5.dev302',
version='1.5.dev303',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',
@ -66,7 +66,6 @@ setuptools.setup(
'PyYAML>=3.12',
'youtube-dl>=2018.7.21',
'reppy==0.3.4',
'python-magic',
'requests>=2.18.4',
'websocket-client>=0.39.0,<=0.48.0',
'pillow>=5.2.0',
@ -76,6 +75,7 @@ setuptools.setup(
'cerberus>=1.0.1',
'jinja2>=2.10',
'cryptography>=2.3',
'python-magic>=0.4.15',
],
extras_require={
'dashboard': [