mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-08-07 22:12:15 -04:00
Merge branch 'ari-5777' into qa
This commit is contained in:
commit
3c7fdeae2c
3 changed files with 63 additions and 2 deletions
|
@ -472,6 +472,7 @@ class Browser:
|
||||||
outlinks = []
|
outlinks = []
|
||||||
else:
|
else:
|
||||||
outlinks = self.extract_outlinks()
|
outlinks = self.extract_outlinks()
|
||||||
|
outlinks = outlinks.union(self.extract_tertiary_assets())
|
||||||
if not skip_visit_hashtags:
|
if not skip_visit_hashtags:
|
||||||
self.visit_hashtags(self.url(), hashtags, outlinks)
|
self.visit_hashtags(self.url(), hashtags, outlinks)
|
||||||
final_page_url = self.url()
|
final_page_url = self.url()
|
||||||
|
@ -567,6 +568,30 @@ class Browser:
|
||||||
'problem extracting outlinks, result message: %s', message)
|
'problem extracting outlinks, result message: %s', message)
|
||||||
return frozenset()
|
return frozenset()
|
||||||
|
|
||||||
|
def extract_tertiary_assets(self, timeout=60):
    '''
    Evaluates the `extract-tertiary-assets.js` template in the browser
    and returns the urls it reports.

    The script's result value is expected to be a single string with one
    url per line.

    Args:
        timeout: seconds to wait for the browser to answer the
            `Runtime.evaluate` command

    Returns:
        frozenset of url strings; empty if the script reported nothing
        or if the result message did not have the expected shape (the
        latter is also logged as an error)
    '''
    self.logger.info('extracting tertiary assets')
    # register interest in the result before the command is sent
    self.websock_thread.expect_result(self._command_id.peek())
    template = brozzler.jinja2_environment().get_template(
            'extract-tertiary-assets.js')
    msg_id = self.send_to_chrome(
            method='Runtime.evaluate',
            params={'expression': template.render()})
    self._wait_for(
            lambda: self.websock_thread.received_result(msg_id),
            timeout=timeout)
    message = self.websock_thread.pop_result(msg_id)

    has_value = (
            'result' in message
            and 'result' in message['result']
            and 'value' in message['result']['result'])
    if not has_value:
        self.logger.error(
                'problem extracting tertiary assets, result message: %s',
                message)
        return frozenset()

    value = message['result']['result']['value']
    if not value:
        # no links found
        return frozenset()
    return frozenset(value.split('\n'))
|
||||||
|
|
||||||
def screenshot(self, timeout=45):
|
def screenshot(self, timeout=45):
|
||||||
self.logger.info('taking screenshot')
|
self.logger.info('taking screenshot')
|
||||||
self.websock_thread.expect_result(self._command_id.peek())
|
self.websock_thread.expect_result(self._command_id.peek())
|
||||||
|
|
36
brozzler/js-templates/extract-tertiary-assets.js
Normal file
36
brozzler/js-templates/extract-tertiary-assets.js
Normal file
|
@ -0,0 +1,36 @@
|
||||||
|
// we have problems if the page has changed the definition of Set or Array
|
||||||
|
// http://www.polyvore.com/ does this for example
|
||||||
|
// Frames already visited, so that frames which reference each other (or
// themselves) are only walked once.
var __brzl_framesDone = new Set();

/**
 * Extracts the candidate urls from the value of a `srcset` attribute.
 *
 * A srcset is a comma separated list of candidates, each one a url
 * optionally followed by whitespace separated descriptors ("2x", "480w").
 * The first whitespace-delimited token of each candidate is its url; a
 * token ending in a comma closes the candidate, so the next token is a url
 * again. Commas inside a url (no whitespace after them) stay part of it.
 *
 * @param {string} srcset - raw attribute value; anything falsy yields []
 * @returns {string[]} urls exactly as written in the attribute (unresolved)
 */
var __brzl_srcsetUrls = function(srcset) {
    var urls = [];
    var tokens = String(srcset || '').split(/\s+/);
    var expectUrl = true;
    for (var t = 0; t < tokens.length; t++) {
        var token = tokens[t];
        if (token === '') {
            continue;
        }
        if (expectUrl) {
            // trailing commas are separators, not part of the url
            var url = token.replace(/,+$/, '');
            if (url !== '') {
                urls.push(url);
            }
        }
        // a trailing comma ends the current candidate
        expectUrl = token.charAt(token.length - 1) === ',';
    }
    return urls;
};

/**
 * Collects the absolute http(s) urls named in `srcset` attributes of
 * `frame` and, recursively, of every frame nested inside it.
 *
 * @param {Window} frame - frame to inspect; may be null or lack a document
 * @returns {string[]} absolute urls, in document order, parent frame first;
 *     always an array, so callers can safely `concat` or `join` it
 */
var __brzl_compileAssets = function(frame) {
    __brzl_framesDone.add(frame);
    // initialised up front so that a frame without a document yields []
    // rather than undefined
    var assets = [];
    if (frame && frame.document) {
        var elems = frame.document.querySelectorAll('[srcset]');
        var baseURI = frame.document.baseURI;

        // each loop uses its own index: `var` is function scoped, so
        // sharing one name would let the inner loop clobber the outer one
        for (var e = 0; e < elems.length; e++) {
            var candidates = __brzl_srcsetUrls(elems[e].srcset);
            for (var c = 0; c < candidates.length; c++) {
                var resolved;
                try {
                    // resolves relative, root-relative and
                    // protocol-relative urls against the document base
                    resolved = new URL(candidates[c], baseURI);
                } catch (err) {
                    // not parseable as a url, so nothing to fetch
                    continue;
                }
                // data:, blob: etc. are not fetchable assets
                if (resolved.protocol === 'http:'
                        || resolved.protocol === 'https:') {
                    assets.push(resolved.href);
                }
            }
        }

        for (var f = 0; f < frame.frames.length; f++) {
            var child = frame.frames[f];
            if (child && !__brzl_framesDone.has(child)) {
                assets = assets.concat(__brzl_compileAssets(child));
            }
        }
    }

    return assets;
};
|
||||||
|
__brzl_compileAssets(window).join('\n');
|
4
setup.py
4
setup.py
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.5.dev302',
|
version='1.5.dev303',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -66,7 +66,6 @@ setuptools.setup(
|
||||||
'PyYAML>=3.12',
|
'PyYAML>=3.12',
|
||||||
'youtube-dl>=2018.7.21',
|
'youtube-dl>=2018.7.21',
|
||||||
'reppy==0.3.4',
|
'reppy==0.3.4',
|
||||||
'python-magic',
|
|
||||||
'requests>=2.18.4',
|
'requests>=2.18.4',
|
||||||
'websocket-client>=0.39.0,<=0.48.0',
|
'websocket-client>=0.39.0,<=0.48.0',
|
||||||
'pillow>=5.2.0',
|
'pillow>=5.2.0',
|
||||||
|
@ -76,6 +75,7 @@ setuptools.setup(
|
||||||
'cerberus>=1.0.1',
|
'cerberus>=1.0.1',
|
||||||
'jinja2>=2.10',
|
'jinja2>=2.10',
|
||||||
'cryptography>=2.3',
|
'cryptography>=2.3',
|
||||||
|
'python-magic>=0.4.15',
|
||||||
],
|
],
|
||||||
extras_require={
|
extras_require={
|
||||||
'dashboard': [
|
'dashboard': [
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue