try taking screenshot 3 times, proceed on failure

We've been seeing a lot of screenshot timeouts like this:

2018-02-14 20:06:01,472 13286 CRITICAL BrozzlingThread:44789 brozzler.worker.BrozzlerWorker.brozzle_site(worker.py:559) unexpected exception
Traceback (most recent call last):
  File "/opt/brozzler-ve3/lib/python3.5/site-packages/brozzler/worker.py", line 528, in brozzle_site
    enable_youtube_dl=not self._skip_youtube_dl)
  File "/opt/brozzler-ve3/lib/python3.5/site-packages/brozzler/worker.py", line 385, in brozzle_page
    on_request)
  File "/opt/brozzler-ve3/lib/python3.5/site-packages/brozzler/worker.py", line 459, in _browse_page
    behavior_timeout=self._behavior_timeout)
  File "/opt/brozzler-ve3/lib/python3.5/site-packages/brozzler/browser.py", line 463, in browse_page
    jpeg_bytes = self.screenshot()
  File "/opt/brozzler-ve3/lib/python3.5/site-packages/brozzler/browser.py", line 565, in screenshot
    timeout=timeout)
  File "/opt/brozzler-ve3/lib/python3.5/site-packages/brozzler/browser.py", line 311, in _wait_for
    elapsed, callback))
brozzler.browser.BrowsingTimeout: timed out after 90.5s waiting for: <function Browser.screenshot.<locals>.<lambda> at 0x7f5ab0076a68>

This may be a browser bug. To work around it, reduce the screenshot timeout
from 90 to 45 seconds and try taking the screenshot up to 3 times. If every
attempt times out, proceed anyway instead of queuing the page for recrawling.
This commit is contained in:
Noah Levitt 2018-02-14 12:15:48 -08:00
parent 0faeaab3ac
commit b38fbdcda6
2 changed files with 13 additions and 5 deletions

View File

@ -1,7 +1,7 @@
''' '''
brozzler/browser.py - manages the browsers for brozzler brozzler/browser.py - manages the browsers for brozzler
Copyright (C) 2014-2017 Internet Archive Copyright (C) 2014-2018 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
@ -460,8 +460,7 @@ class Browser:
page_url) page_url)
self.navigate_to_page(page_url, timeout=page_timeout) self.navigate_to_page(page_url, timeout=page_timeout)
if on_screenshot: if on_screenshot:
jpeg_bytes = self.screenshot() self._try_screenshot(on_screenshot)
on_screenshot(jpeg_bytes)
behavior_script = brozzler.behavior_script( behavior_script = brozzler.behavior_script(
page_url, behavior_parameters, page_url, behavior_parameters,
behaviors_dir=behaviors_dir) behaviors_dir=behaviors_dir)
@ -486,6 +485,15 @@ class Browser:
self.websock_thread.on_request = None self.websock_thread.on_request = None
self.websock_thread.on_response = None self.websock_thread.on_response = None
def _try_screenshot(self, on_screenshot):
for i in range(3):
try:
jpeg_bytes = self.screenshot()
on_screenshot(jpeg_bytes)
return
except BrowsingTimeout as e:
logging.error('attempt %s/3: %s', i+1, e)
def visit_hashtags(self, page_url, hashtags, outlinks): def visit_hashtags(self, page_url, hashtags, outlinks):
_hashtags = set(hashtags or []) _hashtags = set(hashtags or [])
for outlink in outlinks: for outlink in outlinks:
@ -556,7 +564,7 @@ class Browser:
'problem extracting outlinks, result message: %s', message) 'problem extracting outlinks, result message: %s', message)
return frozenset() return frozenset()
def screenshot(self, timeout=90): def screenshot(self, timeout=45):
self.logger.info('taking screenshot') self.logger.info('taking screenshot')
self.websock_thread.expect_result(self._command_id.peek()) self.websock_thread.expect_result(self._command_id.peek())
msg_id = self.send_to_chrome(method='Page.captureScreenshot') msg_id = self.send_to_chrome(method='Page.captureScreenshot')

View File

@ -3,7 +3,7 @@ brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning
it runs youtube-dl on them, browses them and runs behaviors if appropriate, it runs youtube-dl on them, browses them and runs behaviors if appropriate,
scopes and adds outlinks to the frontier scopes and adds outlinks to the frontier
Copyright (C) 2014-2017 Internet Archive Copyright (C) 2014-2018 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.