diff --git a/brozzler/browser.py b/brozzler/browser.py
index a7d39d1..f2f2631 100644
--- a/brozzler/browser.py
+++ b/brozzler/browser.py
@@ -470,6 +470,7 @@ class Browser:
                     outlinks = []
                 else:
                     outlinks = self.extract_outlinks()
+                    outlinks = outlinks.union(self.extract_tertiary_assets())
                 if not skip_visit_hashtags:
                     self.visit_hashtags(self.url(), hashtags, outlinks)
                 final_page_url = self.url()
@@ -565,6 +566,46 @@ class Browser:
                     'problem extracting outlinks, result message: %s', message)
             return frozenset()
 
+    def extract_tertiary_assets(self, timeout=60):
+        '''
+        Extracts the urls named by `srcset` attributes in the current page
+        and the frames it contains.
+
+        Renders the `extract-tertiary-assets.js` template, evaluates it in
+        the browser with `Runtime.evaluate`, and splits the newline-separated
+        string that comes back.
+
+        Args:
+            timeout: passed to `self._wait_for` as its timeout
+
+        Returns:
+            frozenset of url strings, empty if there are none or if the
+            response does not have the expected shape
+        '''
+        self.logger.info('extracting tertiary assets')
+        self.websock_thread.expect_result(self._command_id.peek())
+        js = brozzler.jinja2_environment().get_template(
+                'extract-tertiary-assets.js').render()
+        msg_id = self.send_to_chrome(
+                method='Runtime.evaluate', params={'expression': js})
+        self._wait_for(
+                lambda: self.websock_thread.received_result(msg_id),
+                timeout=timeout)
+        message = self.websock_thread.pop_result(msg_id)
+        if ('result' in message and 'result' in message['result']
+                and 'value' in message['result']['result']):
+            if message['result']['result']['value']:
+                return frozenset(
+                        message['result']['result']['value'].split('\n'))
+            else:
+                # no assets found
+                return frozenset()
+        else:
+            self.logger.error(
+                    'problem extracting tertiary assets, result message: %s',
+                    message)
+            return frozenset()
+
     def screenshot(self, timeout=45):
         self.logger.info('taking screenshot')
         self.websock_thread.expect_result(self._command_id.peek())
diff --git a/brozzler/js-templates/extract-tertiary-assets.js b/brozzler/js-templates/extract-tertiary-assets.js
new file mode 100644
index 0000000..ccab482
--- /dev/null
+++ b/brozzler/js-templates/extract-tertiary-assets.js
@@ -0,0 +1,59 @@
+// we have problems if the page has changed the definition of Set or Array
+// http://www.polyvore.com/ does this for example
+var __brzl_framesDone = new Set();
+
+// Returns the absolute http(s) urls named by one srcset attribute value,
+// resolved against baseURI.
+var __brzl_srcsetUrls = function(srcset, baseURI) {
+    var urls = [];
+    // One image candidate per match. Group 1 is the url, without the commas
+    // that separate candidates; the width/density descriptor that may follow
+    // it is consumed but not captured.
+    var candidate = /[\s,]*(\S*[^\s,])(?:,|\s+[^,]*|$)/g;
+    var match;
+    while ((match = candidate.exec(srcset)) !== null) {
+        try {
+            var url = new URL(match[1], baseURI);
+            if (url.protocol === 'http:' || url.protocol === 'https:') {
+                urls.push(url.href);
+            }
+        } catch (e) {
+            // not a parseable url, skip this candidate
+        }
+    }
+    return urls;
+};
+
+// Collects the srcset urls of `frame` and, recursively, of each child frame
+// that has not been visited yet.
+var __brzl_compileAssets = function(frame) {
+    // declared before the `if` so that an array is returned even when the
+    // frame has no readable document
+    var assets = [];
+    __brzl_framesDone.add(frame);
+    try {
+        if (frame && frame.document) {
+            var elems = frame.document.querySelectorAll('[srcset]');
+            for (var i = 0; i < elems.length; i++) {
+                // getAttribute() rather than the .srcset property, which not
+                // every element matched by [srcset] defines
+                assets = assets.concat(__brzl_srcsetUrls(
+                        elems[i].getAttribute('srcset'),
+                        frame.document.baseURI));
+            }
+            // `var` is function scoped, so this loop needs its own index
+            for (var j = 0; j < frame.frames.length; j++) {
+                if (frame.frames[j]
+                        && !__brzl_framesDone.has(frame.frames[j])) {
+                    assets = assets.concat(
+                            __brzl_compileAssets(frame.frames[j]));
+                }
+            }
+        }
+    } catch (e) {
+        // e.g. a frame whose document we are not allowed to read; keep
+        // whatever was collected before the error
+    }
+    return assets;
+};
+__brzl_compileAssets(window).join('\n');