handle errors from extract-outlinks.js, which happen on polyvore.com because that site changes the definition of Set 😭

This commit is contained in:
Noah Levitt 2017-02-22 10:57:11 -08:00
parent 0d0da22613
commit 3c4ab834da
3 changed files with 14 additions and 4 deletions

View file

@ -1,7 +1,7 @@
''' '''
brozzler/browser.py - manages the browsers for brozzler brozzler/browser.py - manages the browsers for brozzler
Copyright (C) 2014-2016 Internet Archive Copyright (C) 2014-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
@ -485,9 +485,17 @@ class Browser:
lambda: self.websock_thread.received_result(msg_id), lambda: self.websock_thread.received_result(msg_id),
timeout=timeout) timeout=timeout)
message = self.websock_thread.pop_result(msg_id) message = self.websock_thread.pop_result(msg_id)
if message['result']['result']['value']: if ('result' in message and 'result' in message['result']
return frozenset(message['result']['result']['value'].split('\n')) and 'value' in message['result']['result']):
if message['result']['result']['value']:
return frozenset(
message['result']['result']['value'].split('\n'))
else:
# no links found
return frozenset()
else: else:
self.logger.error(
'problem extracting outlinks, result message: %s', message)
return frozenset() return frozenset()
def screenshot(self, timeout=30): def screenshot(self, timeout=30):

View file

@ -1,3 +1,5 @@
// we have problems if the page has changed the definition of Set or Array
// http://www.polyvore.com/ does this for example
var __brzl_framesDone = new Set(); var __brzl_framesDone = new Set();
var __brzl_compileOutlinks = function(frame) { var __brzl_compileOutlinks = function(frame) {
__brzl_framesDone.add(frame); __brzl_framesDone.add(frame);

View file

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup( setuptools.setup(
name='brozzler', name='brozzler',
version='1.1b9.dev191', version='1.1b9.dev192',
description='Distributed web crawling with browsers', description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler', url='https://github.com/internetarchive/brozzler',
author='Noah Levitt', author='Noah Levitt',