diff --git a/brozzler/browser.py b/brozzler/browser.py index c0cb3ee..4a8f608 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -356,13 +356,12 @@ class Browser: # tell browser to send us messages we're interested in self.send_to_chrome(method='Network.enable') self.send_to_chrome(method='Page.enable') - self.send_to_chrome(method='Console.enable') - self.send_to_chrome(method='Runtime.enable') - # Network.requestIntercepted needs more work... - #self.send_to_chrome( - # method='Network.setRequestInterception', - # params={'patterns': [{'urlPattern': '*'}]}) - + # Enable Console & Runtime output only when debugging. + # After all, we just print these events with debug(), we don't use + # them in Brozzler logic. + if self.logger.isEnabledFor(logging.DEBUG): + self.send_to_chrome(method='Console.enable') + self.send_to_chrome(method='Runtime.enable') self.send_to_chrome(method='ServiceWorker.enable') self.send_to_chrome(method='ServiceWorker.setForceUpdateOnPageLoad') @@ -661,6 +660,7 @@ class Browser: method='Runtime.evaluate', suppress_logging=True, params={'expression': behavior_script}) + check_interval = min(timeout, 7) start = time.time() while True: elapsed = time.time() - start @@ -669,7 +669,7 @@ class Browser: 'behavior reached hard timeout after %.1fs', elapsed) return - brozzler.sleep(7) + brozzler.sleep(check_interval) self.websock_thread.expect_result(self._command_id.peek()) msg_id = self.send_to_chrome( diff --git a/brozzler/chrome.py b/brozzler/chrome.py index c70296f..cbca3e5 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -250,10 +250,16 @@ class Chrome: # XXX select doesn't work on windows def readline_nonblock(f): buf = b'' - while not self._shutdown.is_set() and ( + try: + while not self._shutdown.is_set() and ( len(buf) == 0 or buf[-1] != 0xa) and select.select( [f],[],[],0.5)[0]: - buf += f.read(1) + buf += f.read(1) + except (ValueError, OSError): + # When the chrome process crashes, stdout & stderr are closed + # and trying to read from them raises these exceptions. We just + # stop reading and return current `buf`. + pass return buf try: diff --git a/brozzler/js-templates/umbraBehavior.js.j2 b/brozzler/js-templates/umbraBehavior.js.j2 index 172cdfd..7931a62 100644 --- a/brozzler/js-templates/umbraBehavior.js.j2 +++ b/brozzler/js-templates/umbraBehavior.js.j2 @@ -43,6 +43,7 @@ class UmbraBehavior { var documents = []; documents[0] = document; + var iframes = document.querySelectorAll("iframe"); var iframesLength = iframes.length; for (var i = 0; i < iframesLength; i++) { @@ -54,23 +55,31 @@ class UmbraBehavior { // console.log("exception looking at iframe" + iframes[i] + ": " + e); } } + var documentsLength = documents.length; for (var j = 0; j < documentsLength; j++) { if (closeSelector) { var closeTargets = documents[j].querySelectorAll(closeSelector); for (var i = 0; i < closeTargets.length; i++) { - this.doTarget(closeTargets[i], "click"); + if (this.isVisible(closeTargets[i])) { + closeTargets[i].click(); + didSomething = true; + break; + } } } + if (firstMatchOnly) { var doTargets = [ documents[j].querySelector(selector) ]; } else { var doTargets = documents[j].querySelectorAll(selector); } + var doTargetsLength = doTargets.length; if (!(doTargetsLength > 0)) { continue; } + for ( var i = 0; i < doTargetsLength; i++) { if (!repeatSameElement && this.alreadyDone.indexOf(doTargets[i]) > -1) { continue; diff --git a/setup.py b/setup.py index 27eb4b6..ef5d945 100755 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.5.7', + version='1.5.8', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',