mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-08-01 02:56:10 -04:00
Merge pull request #190 from vbanos/skip-behaviors-on-error
Thank you, @vbanos!
This commit is contained in:
commit
ffea189d15
3 changed files with 35 additions and 14 deletions
|
@ -148,6 +148,7 @@ class WebsockReceiverThread(threading.Thread):
|
||||||
|
|
||||||
self.is_open = False
|
self.is_open = False
|
||||||
self.got_page_load_event = None
|
self.got_page_load_event = None
|
||||||
|
self.page_status = None # Loaded page HTTP status code
|
||||||
self.reached_limit = None
|
self.reached_limit = None
|
||||||
|
|
||||||
self.on_request = None
|
self.on_request = None
|
||||||
|
@ -202,8 +203,8 @@ class WebsockReceiverThread(threading.Thread):
|
||||||
message, exc_info=True)
|
message, exc_info=True)
|
||||||
|
|
||||||
def _network_response_received(self, message):
|
def _network_response_received(self, message):
|
||||||
if (message['params']['response']['status'] == 420
|
status = message['params']['response'].get('status')
|
||||||
and 'Warcprox-Meta' in CaseInsensitiveDict(
|
if (status == 420 and 'Warcprox-Meta' in CaseInsensitiveDict(
|
||||||
message['params']['response']['headers'])):
|
message['params']['response']['headers'])):
|
||||||
if not self.reached_limit:
|
if not self.reached_limit:
|
||||||
warcprox_meta = json.loads(CaseInsensitiveDict(
|
warcprox_meta = json.loads(CaseInsensitiveDict(
|
||||||
|
@ -220,6 +221,9 @@ class WebsockReceiverThread(threading.Thread):
|
||||||
if self.on_response:
|
if self.on_response:
|
||||||
self.on_response(message)
|
self.on_response(message)
|
||||||
|
|
||||||
|
if status and self.page_status is None:
|
||||||
|
self.page_status = status
|
||||||
|
|
||||||
def _javascript_dialog_opening(self, message):
|
def _javascript_dialog_opening(self, message):
|
||||||
self.logger.info('javascript dialog opened: %s', message)
|
self.logger.info('javascript dialog opened: %s', message)
|
||||||
if message['params']['type'] == 'alert':
|
if message['params']['type'] == 'alert':
|
||||||
|
@ -418,8 +422,8 @@ class Browser:
|
||||||
on_service_worker_version_updated=None, on_screenshot=None,
|
on_service_worker_version_updated=None, on_screenshot=None,
|
||||||
username=None, password=None, hashtags=None,
|
username=None, password=None, hashtags=None,
|
||||||
screenshot_full_page=False, skip_extract_outlinks=False,
|
screenshot_full_page=False, skip_extract_outlinks=False,
|
||||||
skip_visit_hashtags=False, skip_youtube_dl=False, page_timeout=300,
|
skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False,
|
||||||
behavior_timeout=900):
|
page_timeout=300, behavior_timeout=900):
|
||||||
'''
|
'''
|
||||||
Browses page in browser.
|
Browses page in browser.
|
||||||
|
|
||||||
|
@ -494,19 +498,30 @@ class Browser:
|
||||||
'login navigated away from %s; returning!',
|
'login navigated away from %s; returning!',
|
||||||
page_url)
|
page_url)
|
||||||
self.navigate_to_page(page_url, timeout=page_timeout)
|
self.navigate_to_page(page_url, timeout=page_timeout)
|
||||||
|
# If the target page HTTP status is 4xx/5xx, there is no point
|
||||||
|
# in running behaviors, outlink and hashtag extraction as we
|
||||||
|
# didn't get a valid page. Screenshot should run because it
|
||||||
|
# may be useful to have a picture of the error page.
|
||||||
|
# This is only enabled with option `simpler404`.
|
||||||
|
run_behaviors = True
|
||||||
|
if simpler404 and (self.websock_thread.page_status is None or
|
||||||
|
self.websock_thread.page_status >= 400):
|
||||||
|
run_behaviors = False
|
||||||
|
|
||||||
|
if run_behaviors:
|
||||||
behavior_script = brozzler.behavior_script(
|
behavior_script = brozzler.behavior_script(
|
||||||
page_url, behavior_parameters,
|
page_url, behavior_parameters,
|
||||||
behaviors_dir=behaviors_dir)
|
behaviors_dir=behaviors_dir)
|
||||||
self.run_behavior(behavior_script, timeout=behavior_timeout)
|
self.run_behavior(behavior_script, timeout=behavior_timeout)
|
||||||
|
final_page_url = self.url()
|
||||||
if on_screenshot:
|
if on_screenshot:
|
||||||
self._try_screenshot(on_screenshot, screenshot_full_page)
|
self._try_screenshot(on_screenshot, screenshot_full_page)
|
||||||
if skip_extract_outlinks:
|
if not run_behaviors or skip_extract_outlinks:
|
||||||
outlinks = []
|
outlinks = []
|
||||||
else:
|
else:
|
||||||
outlinks = self.extract_outlinks()
|
outlinks = self.extract_outlinks()
|
||||||
if not skip_visit_hashtags:
|
if run_behaviors and not skip_visit_hashtags:
|
||||||
self.visit_hashtags(self.url(), hashtags, outlinks)
|
self.visit_hashtags(final_page_url, hashtags, outlinks)
|
||||||
final_page_url = self.url()
|
|
||||||
return final_page_url, outlinks
|
return final_page_url, outlinks
|
||||||
except brozzler.ReachedLimit:
|
except brozzler.ReachedLimit:
|
||||||
# websock_thread has stashed the ReachedLimit exception with
|
# websock_thread has stashed the ReachedLimit exception with
|
||||||
|
@ -575,6 +590,7 @@ class Browser:
|
||||||
def navigate_to_page(self, page_url, timeout=300):
|
def navigate_to_page(self, page_url, timeout=300):
|
||||||
self.logger.info('navigating to page %s', page_url)
|
self.logger.info('navigating to page %s', page_url)
|
||||||
self.websock_thread.got_page_load_event = None
|
self.websock_thread.got_page_load_event = None
|
||||||
|
self.websock_thread.page_status = None
|
||||||
self.send_to_chrome(method='Page.navigate', params={'url': page_url})
|
self.send_to_chrome(method='Page.navigate', params={'url': page_url})
|
||||||
self._wait_for(
|
self._wait_for(
|
||||||
lambda: self.websock_thread.got_page_load_event,
|
lambda: self.websock_thread.got_page_load_event,
|
||||||
|
|
|
@ -164,6 +164,8 @@ def brozzle_page(argv=None):
|
||||||
action='store_true')
|
action='store_true')
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'--skip-youtube-dl', dest='skip_youtube_dl', action='store_true')
|
'--skip-youtube-dl', dest='skip_youtube_dl', action='store_true')
|
||||||
|
arg_parser.add_argument(
|
||||||
|
'--simpler404', dest='simpler404', action='store_true')
|
||||||
add_common_options(arg_parser, argv)
|
add_common_options(arg_parser, argv)
|
||||||
|
|
||||||
args = arg_parser.parse_args(args=argv[1:])
|
args = arg_parser.parse_args(args=argv[1:])
|
||||||
|
@ -182,6 +184,7 @@ def brozzle_page(argv=None):
|
||||||
skip_extract_outlinks=args.skip_extract_outlinks,
|
skip_extract_outlinks=args.skip_extract_outlinks,
|
||||||
skip_visit_hashtags=args.skip_visit_hashtags,
|
skip_visit_hashtags=args.skip_visit_hashtags,
|
||||||
skip_youtube_dl=args.skip_youtube_dl,
|
skip_youtube_dl=args.skip_youtube_dl,
|
||||||
|
simpler404=args.simpler404,
|
||||||
screenshot_full_page=args.screenshot_full_page)
|
screenshot_full_page=args.screenshot_full_page)
|
||||||
|
|
||||||
def on_screenshot(screenshot_jpeg):
|
def on_screenshot(screenshot_jpeg):
|
||||||
|
|
|
@ -51,7 +51,7 @@ class BrozzlerWorker:
|
||||||
self, frontier, service_registry=None, max_browsers=1,
|
self, frontier, service_registry=None, max_browsers=1,
|
||||||
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
|
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
|
||||||
skip_extract_outlinks=False, skip_visit_hashtags=False,
|
skip_extract_outlinks=False, skip_visit_hashtags=False,
|
||||||
skip_youtube_dl=False, screenshot_full_page=False,
|
skip_youtube_dl=False, simpler404=False, screenshot_full_page=False,
|
||||||
page_timeout=300, behavior_timeout=900):
|
page_timeout=300, behavior_timeout=900):
|
||||||
self._frontier = frontier
|
self._frontier = frontier
|
||||||
self._service_registry = service_registry
|
self._service_registry = service_registry
|
||||||
|
@ -64,6 +64,7 @@ class BrozzlerWorker:
|
||||||
self._skip_extract_outlinks = skip_extract_outlinks
|
self._skip_extract_outlinks = skip_extract_outlinks
|
||||||
self._skip_visit_hashtags = skip_visit_hashtags
|
self._skip_visit_hashtags = skip_visit_hashtags
|
||||||
self._skip_youtube_dl = skip_youtube_dl
|
self._skip_youtube_dl = skip_youtube_dl
|
||||||
|
self._simpler404 = simpler404
|
||||||
self._screenshot_full_page = screenshot_full_page
|
self._screenshot_full_page = screenshot_full_page
|
||||||
self._page_timeout = page_timeout
|
self._page_timeout = page_timeout
|
||||||
self._behavior_timeout = behavior_timeout
|
self._behavior_timeout = behavior_timeout
|
||||||
|
@ -302,6 +303,7 @@ class BrozzlerWorker:
|
||||||
skip_extract_outlinks=self._skip_extract_outlinks,
|
skip_extract_outlinks=self._skip_extract_outlinks,
|
||||||
skip_visit_hashtags=self._skip_visit_hashtags,
|
skip_visit_hashtags=self._skip_visit_hashtags,
|
||||||
skip_youtube_dl=self._skip_youtube_dl,
|
skip_youtube_dl=self._skip_youtube_dl,
|
||||||
|
simpler404=self._simpler404,
|
||||||
screenshot_full_page=self._screenshot_full_page,
|
screenshot_full_page=self._screenshot_full_page,
|
||||||
page_timeout=self._page_timeout,
|
page_timeout=self._page_timeout,
|
||||||
behavior_timeout=self._behavior_timeout)
|
behavior_timeout=self._behavior_timeout)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue