mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
Implement a timeout and retries to work around an issue where we sometimes receive no result message after requesting outlinks
This commit is contained in:
parent
be58fb46f7
commit
3bf3c80720
@ -209,6 +209,8 @@ class Browser:
|
|||||||
self._waiting_on_screenshot_msg_id = None
|
self._waiting_on_screenshot_msg_id = None
|
||||||
self._waiting_on_document_url_msg_id = None
|
self._waiting_on_document_url_msg_id = None
|
||||||
self._waiting_on_outlinks_msg_id = None
|
self._waiting_on_outlinks_msg_id = None
|
||||||
|
self._waiting_on_outlinks_start = None
|
||||||
|
self._waiting_on_outlinks_attempt = 0
|
||||||
self._outlinks = None
|
self._outlinks = None
|
||||||
self._reached_limit = None
|
self._reached_limit = None
|
||||||
self._aw_snap_hes_dead_jim = None
|
self._aw_snap_hes_dead_jim = None
|
||||||
@ -255,8 +257,8 @@ class Browser:
|
|||||||
def _post_behavior_interval_func(self):
|
def _post_behavior_interval_func(self):
|
||||||
"""Called periodically after behavior is finished on the page. Returns
|
"""Called periodically after behavior is finished on the page. Returns
|
||||||
true when post-behavior tasks are finished."""
|
true when post-behavior tasks are finished."""
|
||||||
if not self._has_screenshot and (
|
if (not self._has_screenshot
|
||||||
not self._waiting_on_scroll_to_top_msg_id
|
and not self._waiting_on_scroll_to_top_msg_id
|
||||||
and not self._waiting_on_screenshot_msg_id):
|
and not self._waiting_on_screenshot_msg_id):
|
||||||
if time.time() - self._start > Browser.HARD_TIMEOUT_SECONDS:
|
if time.time() - self._start > Browser.HARD_TIMEOUT_SECONDS:
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
@ -275,7 +277,7 @@ class Browser:
|
|||||||
self._waiting_on_scroll_to_top_start = time.time()
|
self._waiting_on_scroll_to_top_start = time.time()
|
||||||
return False
|
return False
|
||||||
elif (self._waiting_on_scroll_to_top_msg_id
|
elif (self._waiting_on_scroll_to_top_msg_id
|
||||||
and time.time() - self._waiting_on_scroll_to_top_start > 30):
|
and time.time() - self._waiting_on_scroll_to_top_start > 30.0):
|
||||||
# chromium bug? occasionally we get no scroll-to-top result message
|
# chromium bug? occasionally we get no scroll-to-top result message
|
||||||
self.logger.warn(
|
self.logger.warn(
|
||||||
"timed out after %.1fs waiting for scroll-to-top result "
|
"timed out after %.1fs waiting for scroll-to-top result "
|
||||||
@ -295,12 +297,33 @@ class Browser:
|
|||||||
return True
|
return True
|
||||||
elif not self._waiting_on_outlinks_msg_id:
|
elif not self._waiting_on_outlinks_msg_id:
|
||||||
self.logger.info("retrieving outlinks for %s", self.url)
|
self.logger.info("retrieving outlinks for %s", self.url)
|
||||||
|
self._request_outlinks()
|
||||||
|
return False
|
||||||
|
else: # self._waiting_on_outlinks_msg_id
|
||||||
|
if time.time() - self._waiting_on_outlinks_start > 30.0:
|
||||||
|
if self._waiting_on_outlinks_attempt < 5:
|
||||||
|
self.logger.warn(
|
||||||
|
"timed out after %.1fs on attempt %s to retrieve "
|
||||||
|
"outlinks, trying again",
|
||||||
|
time.time() - self._waiting_on_outlinks_start,
|
||||||
|
self._waiting_on_outlinks_attempt)
|
||||||
|
self._request_outlinks()
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
raise BrowsingException(
|
||||||
|
"timed out after %.1fs on (final) attempt %s "
|
||||||
|
"to retrieve outlinks" % (
|
||||||
|
time.time() - self._waiting_on_outlinks_start,
|
||||||
|
self._waiting_on_outlinks_attempt))
|
||||||
|
else: # just waiting for outlinks
|
||||||
|
return False
|
||||||
|
|
||||||
|
def _request_outlinks(self):
|
||||||
self._waiting_on_outlinks_msg_id = self.send_to_chrome(
|
self._waiting_on_outlinks_msg_id = self.send_to_chrome(
|
||||||
method="Runtime.evaluate",
|
method="Runtime.evaluate",
|
||||||
params={"expression": self.OUTLINKS_JS})
|
params={"expression": self.OUTLINKS_JS})
|
||||||
return False
|
self._waiting_on_outlinks_attempt += 1
|
||||||
else: # self._waiting_on_outlinks_msg_id
|
self._waiting_on_outlinks_start = time.time()
|
||||||
return False
|
|
||||||
|
|
||||||
OUTLINKS_JS = """
|
OUTLINKS_JS = """
|
||||||
var compileOutlinks = function(frame) {
|
var compileOutlinks = function(frame) {
|
||||||
@ -319,10 +342,14 @@ compileOutlinks(window).join(' ');
|
|||||||
def _browse_interval_func(self):
|
def _browse_interval_func(self):
|
||||||
"""Called periodically while page is being browsed. Returns True when
|
"""Called periodically while page is being browsed. Returns True when
|
||||||
finished browsing."""
|
finished browsing."""
|
||||||
if not self._websock or not self._websock.sock or not self._websock.sock.connected:
|
if (not self._websock or not self._websock.sock
|
||||||
raise BrowsingException("websocket closed, did chrome die? {}".format(self._websocket_url))
|
or not self._websock.sock.connected):
|
||||||
|
raise BrowsingException(
|
||||||
|
"websocket closed, did chrome die? {}".format(
|
||||||
|
self._websocket_url))
|
||||||
elif self._aw_snap_hes_dead_jim:
|
elif self._aw_snap_hes_dead_jim:
|
||||||
raise BrowsingException("""chrome tab went "aw snap" or "he's dead jim"!""")
|
raise BrowsingException(
|
||||||
|
"""chrome tab went "aw snap" or "he's dead jim"!""")
|
||||||
elif (self._behavior != None and self._behavior.is_finished()
|
elif (self._behavior != None and self._behavior.is_finished()
|
||||||
or time.time() - self._start > Browser.HARD_TIMEOUT_SECONDS):
|
or time.time() - self._start > Browser.HARD_TIMEOUT_SECONDS):
|
||||||
return True
|
return True
|
||||||
|
2
setup.py
2
setup.py
@ -21,7 +21,7 @@ import setuptools
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b3.dev48',
|
version='1.1b3.dev49',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
Loading…
x
Reference in New Issue
Block a user