Skip running behaviors when page is 4xx or 5xx

Currently, when we run `Browser.browse_page`, we run JS behaviors after
we navigate to a page regardless of its status.
Maybe the page wasn't found (4xx) or was unreachable for any reason (5xx).
In that case, we could skip running behaviors to save time and
resources.

With this PR, we add a new attribute to store the navigated page's HTTP
status: `WebsockReceiverThread.page_status`. We use this in
`Browser.browse_page` to skip behaviors, outlink and hashtag extraction
when page status is 4xx/5xx.

Note that we don't skip screenshots as it could be useful to have a
picture of an error page in some cases.
This commit is contained in:
Vangelis Banos 2020-03-23 16:21:57 +00:00
parent 3b249333a4
commit 140c27abe8

View File

@ -148,6 +148,7 @@ class WebsockReceiverThread(threading.Thread):
self.is_open = False self.is_open = False
self.got_page_load_event = None self.got_page_load_event = None
self.page_status = None # Loaded page HTTP status code
self.reached_limit = None self.reached_limit = None
self.on_request = None self.on_request = None
@ -202,9 +203,9 @@ class WebsockReceiverThread(threading.Thread):
message, exc_info=True) message, exc_info=True)
def _network_response_received(self, message): def _network_response_received(self, message):
if (message['params']['response']['status'] == 420 status = message['params']['response'].get('status')
and 'Warcprox-Meta' in CaseInsensitiveDict( if (status == 420 and 'Warcprox-Meta' in CaseInsensitiveDict(
message['params']['response']['headers'])): message['params']['response']['headers'])):
if not self.reached_limit: if not self.reached_limit:
warcprox_meta = json.loads(CaseInsensitiveDict( warcprox_meta = json.loads(CaseInsensitiveDict(
message['params']['response']['headers'])['Warcprox-Meta']) message['params']['response']['headers'])['Warcprox-Meta'])
@ -220,6 +221,9 @@ class WebsockReceiverThread(threading.Thread):
if self.on_response: if self.on_response:
self.on_response(message) self.on_response(message)
if status and self.page_status is None:
self.page_status = status
def _javascript_dialog_opening(self, message): def _javascript_dialog_opening(self, message):
self.logger.info('javascript dialog opened: %s', message) self.logger.info('javascript dialog opened: %s', message)
if message['params']['type'] == 'alert': if message['params']['type'] == 'alert':
@ -494,19 +498,29 @@ class Browser:
'login navigated away from %s; returning!', 'login navigated away from %s; returning!',
page_url) page_url)
self.navigate_to_page(page_url, timeout=page_timeout) self.navigate_to_page(page_url, timeout=page_timeout)
behavior_script = brozzler.behavior_script( # If the target page HTTP status is 4xx/5xx, there is no point
page_url, behavior_parameters, # in running behaviors, outlink and hashtag extraction as we
behaviors_dir=behaviors_dir) # didn't get a valid page. Screenshot should run because it
self.run_behavior(behavior_script, timeout=behavior_timeout) # may be useful to have a picture of the error page.
run_behaviors = True
if self.websock_thread.page_status is None or \
self.websock_thread.page_status >= 400:
run_behaviors = False
if run_behaviors:
behavior_script = brozzler.behavior_script(
page_url, behavior_parameters,
behaviors_dir=behaviors_dir)
self.run_behavior(behavior_script, timeout=behavior_timeout)
final_page_url = self.url()
if on_screenshot: if on_screenshot:
self._try_screenshot(on_screenshot, screenshot_full_page) self._try_screenshot(on_screenshot, screenshot_full_page)
if skip_extract_outlinks: if not run_behaviors or skip_extract_outlinks:
outlinks = [] outlinks = []
else: else:
outlinks = self.extract_outlinks() outlinks = self.extract_outlinks()
if not skip_visit_hashtags: if run_behaviors and not skip_visit_hashtags:
self.visit_hashtags(self.url(), hashtags, outlinks) self.visit_hashtags(final_page_url, hashtags, outlinks)
final_page_url = self.url()
return final_page_url, outlinks return final_page_url, outlinks
except brozzler.ReachedLimit: except brozzler.ReachedLimit:
# websock_thread has stashed the ReachedLimit exception with # websock_thread has stashed the ReachedLimit exception with
@ -575,6 +589,7 @@ class Browser:
def navigate_to_page(self, page_url, timeout=300): def navigate_to_page(self, page_url, timeout=300):
self.logger.info('navigating to page %s', page_url) self.logger.info('navigating to page %s', page_url)
self.websock_thread.got_page_load_event = None self.websock_thread.got_page_load_event = None
self.websock_thread.page_status = None
self.send_to_chrome(method='Page.navigate', params={'url': page_url}) self.send_to_chrome(method='Page.navigate', params={'url': page_url})
self._wait_for( self._wait_for(
lambda: self.websock_thread.got_page_load_event, lambda: self.websock_thread.got_page_load_event,