mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
Skip running behaviors when page is 4xx or 5xx
Currently, when we run `Browser.browse_page`, we run JS behaviors after we navigate to a page regardless of its status. Maybe the page wasn't found (4xx) or was unreachable for any reason (5xx). In that case, we could skip running behaviors to save time and resources. With this PR, we add a new var to store the navigated page's HTTP status in `WebsockReceiverThread.page_status`. We use this in `Browser.browse_page` to skip behaviors, outlink and hashtag extraction when the page status is 4xx/5xx. Note that we don't skip screenshots as it could be useful to have a picture of an error page in some cases.
This commit is contained in:
parent
3b249333a4
commit
140c27abe8
@ -148,6 +148,7 @@ class WebsockReceiverThread(threading.Thread):
|
|||||||
|
|
||||||
self.is_open = False
|
self.is_open = False
|
||||||
self.got_page_load_event = None
|
self.got_page_load_event = None
|
||||||
|
self.page_status = None # Loaded page HTTP status code
|
||||||
self.reached_limit = None
|
self.reached_limit = None
|
||||||
|
|
||||||
self.on_request = None
|
self.on_request = None
|
||||||
@ -202,8 +203,8 @@ class WebsockReceiverThread(threading.Thread):
|
|||||||
message, exc_info=True)
|
message, exc_info=True)
|
||||||
|
|
||||||
def _network_response_received(self, message):
|
def _network_response_received(self, message):
|
||||||
if (message['params']['response']['status'] == 420
|
status = message['params']['response'].get('status')
|
||||||
and 'Warcprox-Meta' in CaseInsensitiveDict(
|
if (status == 420 and 'Warcprox-Meta' in CaseInsensitiveDict(
|
||||||
message['params']['response']['headers'])):
|
message['params']['response']['headers'])):
|
||||||
if not self.reached_limit:
|
if not self.reached_limit:
|
||||||
warcprox_meta = json.loads(CaseInsensitiveDict(
|
warcprox_meta = json.loads(CaseInsensitiveDict(
|
||||||
@ -220,6 +221,9 @@ class WebsockReceiverThread(threading.Thread):
|
|||||||
if self.on_response:
|
if self.on_response:
|
||||||
self.on_response(message)
|
self.on_response(message)
|
||||||
|
|
||||||
|
if status and self.page_status is None:
|
||||||
|
self.page_status = status
|
||||||
|
|
||||||
def _javascript_dialog_opening(self, message):
|
def _javascript_dialog_opening(self, message):
|
||||||
self.logger.info('javascript dialog opened: %s', message)
|
self.logger.info('javascript dialog opened: %s', message)
|
||||||
if message['params']['type'] == 'alert':
|
if message['params']['type'] == 'alert':
|
||||||
@ -494,19 +498,29 @@ class Browser:
|
|||||||
'login navigated away from %s; returning!',
|
'login navigated away from %s; returning!',
|
||||||
page_url)
|
page_url)
|
||||||
self.navigate_to_page(page_url, timeout=page_timeout)
|
self.navigate_to_page(page_url, timeout=page_timeout)
|
||||||
|
# If the target page HTTP status is 4xx/5xx, there is no point
|
||||||
|
# in running behaviors, outlink and hashtag extraction as we
|
||||||
|
# didn't get a valid page. Screenshot should run because it
|
||||||
|
# may be useful to have a picture of the error page.
|
||||||
|
run_behaviors = True
|
||||||
|
if self.websock_thread.page_status is None or \
|
||||||
|
self.websock_thread.page_status >= 400:
|
||||||
|
run_behaviors = False
|
||||||
|
|
||||||
|
if run_behaviors:
|
||||||
behavior_script = brozzler.behavior_script(
|
behavior_script = brozzler.behavior_script(
|
||||||
page_url, behavior_parameters,
|
page_url, behavior_parameters,
|
||||||
behaviors_dir=behaviors_dir)
|
behaviors_dir=behaviors_dir)
|
||||||
self.run_behavior(behavior_script, timeout=behavior_timeout)
|
self.run_behavior(behavior_script, timeout=behavior_timeout)
|
||||||
|
final_page_url = self.url()
|
||||||
if on_screenshot:
|
if on_screenshot:
|
||||||
self._try_screenshot(on_screenshot, screenshot_full_page)
|
self._try_screenshot(on_screenshot, screenshot_full_page)
|
||||||
if skip_extract_outlinks:
|
if not run_behaviors or skip_extract_outlinks:
|
||||||
outlinks = []
|
outlinks = []
|
||||||
else:
|
else:
|
||||||
outlinks = self.extract_outlinks()
|
outlinks = self.extract_outlinks()
|
||||||
if not skip_visit_hashtags:
|
if run_behaviors and not skip_visit_hashtags:
|
||||||
self.visit_hashtags(self.url(), hashtags, outlinks)
|
self.visit_hashtags(final_page_url, hashtags, outlinks)
|
||||||
final_page_url = self.url()
|
|
||||||
return final_page_url, outlinks
|
return final_page_url, outlinks
|
||||||
except brozzler.ReachedLimit:
|
except brozzler.ReachedLimit:
|
||||||
# websock_thread has stashed the ReachedLimit exception with
|
# websock_thread has stashed the ReachedLimit exception with
|
||||||
@ -575,6 +589,7 @@ class Browser:
|
|||||||
def navigate_to_page(self, page_url, timeout=300):
|
def navigate_to_page(self, page_url, timeout=300):
|
||||||
self.logger.info('navigating to page %s', page_url)
|
self.logger.info('navigating to page %s', page_url)
|
||||||
self.websock_thread.got_page_load_event = None
|
self.websock_thread.got_page_load_event = None
|
||||||
|
self.websock_thread.page_status = None
|
||||||
self.send_to_chrome(method='Page.navigate', params={'url': page_url})
|
self.send_to_chrome(method='Page.navigate', params={'url': page_url})
|
||||||
self._wait_for(
|
self._wait_for(
|
||||||
lambda: self.websock_thread.got_page_load_event,
|
lambda: self.websock_thread.got_page_load_event,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user