mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
try taking screenshot 3 times, proceed on failure
We've been seeing a lot of this:

    2018-02-14 20:06:01,472 13286 CRITICAL BrozzlingThread:44789 brozzler.worker.BrozzlerWorker.brozzle_site(worker.py:559) unexpected exception
    Traceback (most recent call last):
      File "/opt/brozzler-ve3/lib/python3.5/site-packages/brozzler/worker.py", line 528, in brozzle_site
        enable_youtube_dl=not self._skip_youtube_dl)
      File "/opt/brozzler-ve3/lib/python3.5/site-packages/brozzler/worker.py", line 385, in brozzle_page
        on_request)
      File "/opt/brozzler-ve3/lib/python3.5/site-packages/brozzler/worker.py", line 459, in _browse_page
        behavior_timeout=self._behavior_timeout)
      File "/opt/brozzler-ve3/lib/python3.5/site-packages/brozzler/browser.py", line 463, in browse_page
        jpeg_bytes = self.screenshot()
      File "/opt/brozzler-ve3/lib/python3.5/site-packages/brozzler/browser.py", line 565, in screenshot
        timeout=timeout)
      File "/opt/brozzler-ve3/lib/python3.5/site-packages/brozzler/browser.py", line 311, in _wait_for
        elapsed, callback))
    brozzler.browser.BrowsingTimeout: timed out after 90.5s waiting for: <function Browser.screenshot.<locals>.<lambda> at 0x7f5ab0076a68>

Browser bug, maybe? To work around it, reduce the screenshot timeout to 45 seconds, try getting the screenshot 3 times, and if it still fails, proceed anyway; don't queue the page for recrawling.
This commit is contained in:
parent
0faeaab3ac
commit
b38fbdcda6
@ -1,7 +1,7 @@
|
|||||||
'''
|
'''
|
||||||
brozzler/browser.py - manages the browsers for brozzler
|
brozzler/browser.py - manages the browsers for brozzler
|
||||||
|
|
||||||
Copyright (C) 2014-2017 Internet Archive
|
Copyright (C) 2014-2018 Internet Archive
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
@ -460,8 +460,7 @@ class Browser:
|
|||||||
page_url)
|
page_url)
|
||||||
self.navigate_to_page(page_url, timeout=page_timeout)
|
self.navigate_to_page(page_url, timeout=page_timeout)
|
||||||
if on_screenshot:
|
if on_screenshot:
|
||||||
jpeg_bytes = self.screenshot()
|
self._try_screenshot(on_screenshot)
|
||||||
on_screenshot(jpeg_bytes)
|
|
||||||
behavior_script = brozzler.behavior_script(
|
behavior_script = brozzler.behavior_script(
|
||||||
page_url, behavior_parameters,
|
page_url, behavior_parameters,
|
||||||
behaviors_dir=behaviors_dir)
|
behaviors_dir=behaviors_dir)
|
||||||
@ -486,6 +485,15 @@ class Browser:
|
|||||||
self.websock_thread.on_request = None
|
self.websock_thread.on_request = None
|
||||||
self.websock_thread.on_response = None
|
self.websock_thread.on_response = None
|
||||||
|
|
||||||
|
def _try_screenshot(self, on_screenshot):
|
||||||
|
for i in range(3):
|
||||||
|
try:
|
||||||
|
jpeg_bytes = self.screenshot()
|
||||||
|
on_screenshot(jpeg_bytes)
|
||||||
|
return
|
||||||
|
except BrowsingTimeout as e:
|
||||||
|
logging.error('attempt %s/3: %s', i+1, e)
|
||||||
|
|
||||||
def visit_hashtags(self, page_url, hashtags, outlinks):
|
def visit_hashtags(self, page_url, hashtags, outlinks):
|
||||||
_hashtags = set(hashtags or [])
|
_hashtags = set(hashtags or [])
|
||||||
for outlink in outlinks:
|
for outlink in outlinks:
|
||||||
@ -556,7 +564,7 @@ class Browser:
|
|||||||
'problem extracting outlinks, result message: %s', message)
|
'problem extracting outlinks, result message: %s', message)
|
||||||
return frozenset()
|
return frozenset()
|
||||||
|
|
||||||
def screenshot(self, timeout=90):
|
def screenshot(self, timeout=45):
|
||||||
self.logger.info('taking screenshot')
|
self.logger.info('taking screenshot')
|
||||||
self.websock_thread.expect_result(self._command_id.peek())
|
self.websock_thread.expect_result(self._command_id.peek())
|
||||||
msg_id = self.send_to_chrome(method='Page.captureScreenshot')
|
msg_id = self.send_to_chrome(method='Page.captureScreenshot')
|
||||||
|
@ -3,7 +3,7 @@ brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning
|
|||||||
it runs youtube-dl on them, browses them and runs behaviors if appropriate,
|
it runs youtube-dl on them, browses them and runs behaviors if appropriate,
|
||||||
scopes and adds outlinks to the frontier
|
scopes and adds outlinks to the frontier
|
||||||
|
|
||||||
Copyright (C) 2014-2017 Internet Archive
|
Copyright (C) 2014-2018 Internet Archive
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user