diff --git a/brozzler/browser.py b/brozzler/browser.py index 33c1ef1..6c210ff 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -148,6 +148,7 @@ class WebsockReceiverThread(threading.Thread): self.is_open = False self.got_page_load_event = None + self.page_status = None # Loaded page HTTP status code self.reached_limit = None self.on_request = None @@ -202,9 +203,9 @@ class WebsockReceiverThread(threading.Thread): message, exc_info=True) def _network_response_received(self, message): - if (message['params']['response']['status'] == 420 - and 'Warcprox-Meta' in CaseInsensitiveDict( - message['params']['response']['headers'])): + status = message['params']['response'].get('status') + if (status == 420 and 'Warcprox-Meta' in CaseInsensitiveDict( + message['params']['response']['headers'])): if not self.reached_limit: warcprox_meta = json.loads(CaseInsensitiveDict( message['params']['response']['headers'])['Warcprox-Meta']) @@ -220,6 +221,9 @@ class WebsockReceiverThread(threading.Thread): if self.on_response: self.on_response(message) + if status and self.page_status is None: + self.page_status = status + def _javascript_dialog_opening(self, message): self.logger.info('javascript dialog opened: %s', message) if message['params']['type'] == 'alert': @@ -418,8 +422,8 @@ class Browser: on_service_worker_version_updated=None, on_screenshot=None, username=None, password=None, hashtags=None, screenshot_full_page=False, skip_extract_outlinks=False, - skip_visit_hashtags=False, skip_youtube_dl=False, page_timeout=300, - behavior_timeout=900): + skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False, + page_timeout=300, behavior_timeout=900): ''' Browses page in browser. 
@@ -494,19 +498,30 @@ class Browser: 'login navigated away from %s; returning!', page_url) self.navigate_to_page(page_url, timeout=page_timeout) - behavior_script = brozzler.behavior_script( - page_url, behavior_parameters, - behaviors_dir=behaviors_dir) - self.run_behavior(behavior_script, timeout=behavior_timeout) + # If the target page HTTP status is 4xx/5xx, there is no point + # in running behaviors, outlink and hashtag extraction as we + # didn't get a valid page. Screenshot should run because it + # may be useful to have a picture of the error page. + # This is only enabled with option `simpler404`. + run_behaviors = True + if simpler404 and (self.websock_thread.page_status is None or + self.websock_thread.page_status >= 400): + run_behaviors = False + + if run_behaviors: + behavior_script = brozzler.behavior_script( + page_url, behavior_parameters, + behaviors_dir=behaviors_dir) + self.run_behavior(behavior_script, timeout=behavior_timeout) + final_page_url = self.url() if on_screenshot: self._try_screenshot(on_screenshot, screenshot_full_page) - if skip_extract_outlinks: + if not run_behaviors or skip_extract_outlinks: outlinks = [] else: outlinks = self.extract_outlinks() - if not skip_visit_hashtags: - self.visit_hashtags(self.url(), hashtags, outlinks) - final_page_url = self.url() + if run_behaviors and not skip_visit_hashtags: + self.visit_hashtags(final_page_url, hashtags, outlinks) return final_page_url, outlinks except brozzler.ReachedLimit: # websock_thread has stashed the ReachedLimit exception with @@ -575,6 +590,7 @@ class Browser: def navigate_to_page(self, page_url, timeout=300): self.logger.info('navigating to page %s', page_url) self.websock_thread.got_page_load_event = None + self.websock_thread.page_status = None self.send_to_chrome(method='Page.navigate', params={'url': page_url}) self._wait_for( lambda: self.websock_thread.got_page_load_event, diff --git a/brozzler/cli.py b/brozzler/cli.py index 1cb5912..347487f 100644 --- 
a/brozzler/cli.py +++ b/brozzler/cli.py @@ -164,6 +164,8 @@ def brozzle_page(argv=None): action='store_true') arg_parser.add_argument( '--skip-youtube-dl', dest='skip_youtube_dl', action='store_true') + arg_parser.add_argument( + '--simpler404', dest='simpler404', action='store_true') add_common_options(arg_parser, argv) args = arg_parser.parse_args(args=argv[1:]) @@ -182,6 +184,7 @@ def brozzle_page(argv=None): skip_extract_outlinks=args.skip_extract_outlinks, skip_visit_hashtags=args.skip_visit_hashtags, skip_youtube_dl=args.skip_youtube_dl, + simpler404=args.simpler404, screenshot_full_page=args.screenshot_full_page) def on_screenshot(screenshot_jpeg): diff --git a/brozzler/worker.py b/brozzler/worker.py index 7437927..c8279b2 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -51,7 +51,7 @@ class BrozzlerWorker: self, frontier, service_registry=None, max_browsers=1, chrome_exe="chromium-browser", warcprox_auto=False, proxy=None, skip_extract_outlinks=False, skip_visit_hashtags=False, - skip_youtube_dl=False, screenshot_full_page=False, + skip_youtube_dl=False, simpler404=False, screenshot_full_page=False, page_timeout=300, behavior_timeout=900): self._frontier = frontier self._service_registry = service_registry @@ -64,6 +64,7 @@ class BrozzlerWorker: self._skip_extract_outlinks = skip_extract_outlinks self._skip_visit_hashtags = skip_visit_hashtags self._skip_youtube_dl = skip_youtube_dl + self._simpler404 = simpler404 self._screenshot_full_page = screenshot_full_page self._page_timeout = page_timeout self._behavior_timeout = behavior_timeout @@ -302,6 +303,7 @@ class BrozzlerWorker: skip_extract_outlinks=self._skip_extract_outlinks, skip_visit_hashtags=self._skip_visit_hashtags, skip_youtube_dl=self._skip_youtube_dl, + simpler404=self._simpler404, screenshot_full_page=self._screenshot_full_page, page_timeout=self._page_timeout, behavior_timeout=self._behavior_timeout)