From 9fd78fdbe8c1d974f8f457eaa41d199e01d72f02 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 30 Jun 2016 11:45:19 -0500 Subject: [PATCH] implement timeout to work around issue where sometimes we receive no result message after requesting scroll to top --- brozzler/browser.py | 17 ++++++++++++++++- setup.py | 2 +- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index 474afd7..bde1376 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -204,6 +204,7 @@ class Browser: self.behavior_parameters = behavior_parameters self._waiting_on_scroll_to_top_msg_id = None + self._waiting_on_scroll_to_top_start = None self._waiting_on_screenshot_msg_id = None self._waiting_on_document_url_msg_id = None self._waiting_on_outlinks_msg_id = None @@ -270,7 +271,19 @@ class Browser: self._waiting_on_scroll_to_top_msg_id = self.send_to_chrome( method="Runtime.evaluate", params={"expression":"window.scrollTo(0, 0);"}) + self._waiting_on_scroll_to_top_start = time.time() return False + elif (self._waiting_on_scroll_to_top_msg_id + and time.time() - self._waiting_on_scroll_to_top_start > 30): + # chromium bug? occasionally we get no scroll-to-top result message + self.logger.warn( + "timed out after %.1fs waiting for scroll-to-top result " + "message, requesting screenshot now", + time.time() - self._waiting_on_scroll_to_top_start) + self._waiting_on_scroll_to_top_msg_id = None + self._waiting_on_scroll_to_top_start = None + self._waiting_on_screenshot_msg_id = self.send_to_chrome( + method="Page.captureScreenshot") elif not self._has_screenshot and ( self._waiting_on_scroll_to_top_msg_id or self._waiting_on_screenshot_msg_id): @@ -409,8 +422,10 @@ compileOutlinks(window).join(' '); self._has_screenshot = True self.logger.info("got screenshot, moving on to getting outlinks url={}".format(self.url)) elif message["id"] == self._waiting_on_scroll_to_top_msg_id: - self._waiting_on_screenshot_msg_id = self.send_to_chrome(method="Page.captureScreenshot") self._waiting_on_scroll_to_top_msg_id = None + self._waiting_on_scroll_to_top_start = None + self._waiting_on_screenshot_msg_id = self.send_to_chrome( + method="Page.captureScreenshot") elif message["id"] == self._waiting_on_outlinks_msg_id: self.logger.debug("got outlinks message=%s", message) self._outlinks = frozenset( diff --git a/setup.py b/setup.py index 6bbfb71..93d3cfb 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ import setuptools setuptools.setup( name='brozzler', - version='1.1.dev37', + version='1.1.dev38', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',