Extract srcset values for use in crawling.

This commit is contained in:
Neil Minton 2018-09-12 12:04:47 -04:00
parent efb0696833
commit b5c213cef9
2 changed files with 61 additions and 0 deletions

View File

@ -470,6 +470,7 @@ class Browser:
outlinks = []
else:
outlinks = self.extract_outlinks()
outlinks = outlinks.union(self.extract_tertiary_assets())
if not skip_visit_hashtags:
self.visit_hashtags(self.url(), hashtags, outlinks)
final_page_url = self.url()
@ -565,6 +566,30 @@ class Browser:
'problem extracting outlinks, result message: %s', message)
return frozenset()
def extract_tertiary_assets(self, timeout=60):
    '''
    Evaluates the rendered 'extract-tertiary-assets.js' template through
    chrome's `Runtime.evaluate` and returns what it found.

    Args:
        timeout: handed to `self._wait_for` as the limit on waiting for
            the evaluation result (presumably seconds - confirm against
            `_wait_for`)

    Returns:
        frozenset of the newline-separated entries of the string value
        produced by the script; an empty frozenset when that value is
        empty or when the reply carries no value at all (the latter case
        is also logged as an error)
    '''
    self.logger.info('extracting tertiary assets')
    # register interest in the reply to the command id about to be used
    self.websock_thread.expect_result(self._command_id.peek())
    expression = brozzler.jinja2_environment().get_template(
            'extract-tertiary-assets.js').render()
    msg_id = self.send_to_chrome(
            method='Runtime.evaluate', params={'expression': expression})
    self._wait_for(
            lambda: self.websock_thread.received_result(msg_id),
            timeout=timeout)
    message = self.websock_thread.pop_result(msg_id)

    has_value = (
            'result' in message
            and 'result' in message['result']
            and 'value' in message['result']['result'])
    if not has_value:
        self.logger.error(
                'problem extracting tertiary assets, result message: %s', message)
        return frozenset()

    value = message['result']['result']['value']
    if not value:
        # no links found
        return frozenset()
    return frozenset(value.split('\n'))
def screenshot(self, timeout=45):
self.logger.info('taking screenshot')
self.websock_thread.expect_result(self._command_id.peek())

View File

@ -0,0 +1,36 @@
// we have problems if the page has changed the definition of Set or Array
// http://www.polyvore.com/ does this for example

// Top-level names are declared with `var` rather than `let`/`const` so that
// evaluating this script a second time in the same global scope cannot fail
// with an "already declared" error.
var __brzl_framesDone = new Set();

/**
 * Pulls the URL out of every image candidate of a `srcset` attribute value.
 *
 * A candidate is a URL optionally followed by whitespace and descriptors
 * (`2x`, `480w`, ...); candidates are separated by commas. A URL may itself
 * contain commas but never starts or ends with one.
 *
 * @param {string} srcset - raw `srcset` attribute value
 * @returns {string[]} candidate URLs exactly as written (not yet resolved)
 */
var __brzl_parseSrcsetUrls = function(srcset) {
    // group 1: the URL - a run of non-whitespace that neither starts nor ends
    //          with a comma
    // then either commas glued to the URL (candidate without descriptors) or
    // whitespace-separated descriptors up to the next comma / end of input
    const candidate = /([^\s,](?:\S*[^\s,])?)(?:,+|(?:\s+[^\s,]+)*\s*(?:,|$))/g;
    const urls = [];
    let match;
    while ((match = candidate.exec(srcset)) !== null) {
        urls.push(match[1]);
    }
    return urls;
};

/**
 * Collects the absolute http(s) URLs referenced by `srcset` attributes in
 * `frame` and, recursively, in every nested frame not visited before.
 *
 * Visited frames are recorded in `__brzl_framesDone`, so frames that refer
 * back to each other are only processed once.
 *
 * @param {Window} frame - window/frame to inspect; may be null or lack a
 *     `document`, in which case nothing is collected
 * @returns {string[]} asset URLs, always an array (possibly empty)
 */
var __brzl_compileAssets = function(frame) {
    __brzl_framesDone.add(frame);
    const assets = [];
    if (!frame || !frame.document) {
        return assets;
    }

    const baseURI = frame.document.baseURI;
    const elements = frame.document.querySelectorAll('[srcset]');
    for (let i = 0; i < elements.length; i++) {
        const srcset = elements[i].srcset;
        if (typeof srcset !== 'string') {
            // element carries the attribute but does not expose it as a string
            continue;
        }
        const candidates = __brzl_parseSrcsetUrls(srcset);
        // separate index: sharing `i` with the outer loop skipped elements
        // or never terminated
        for (let j = 0; j < candidates.length; j++) {
            let resolved;
            try {
                // proper URL resolution handles "img.jpg", "/img.jpg",
                // "../img.jpg" and "//host/img.jpg" alike
                resolved = new URL(candidates[j], baseURI);
            } catch (e) {
                // not parseable as a URL, so there is nothing to fetch
                continue;
            }
            // only http(s) resources are useful as crawl targets
            if (resolved.protocol === 'http:' || resolved.protocol === 'https:') {
                assets.push(resolved.href);
            }
        }
    }

    for (let k = 0; k < frame.frames.length; k++) {
        const child = frame.frames[k];
        if (child && !__brzl_framesDone.has(child)) {
            const childAssets = __brzl_compileAssets(child);
            for (let m = 0; m < childAssets.length; m++) {
                assets.push(childAssets[m]);
            }
        }
    }
    return assets;
};
// Final expression of the script: starts the collection at the top-level
// window and joins the collected entries with newlines. Being the last
// statement, its value is the completion value of the whole script.
__brzl_compileAssets(window).join('\n');