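Add option simpler404 to skip behaviors on error pages

When simpler404 is enabled and the page status is missing or >= 400, running behaviors, outlink and hashtag extraction are skipped; the screenshot is still taken.
It is disabled by default.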
This commit is contained in:
Vangelis Banos 2020-04-01 16:08:43 +00:00
parent 140c27abe8
commit 80341b9106
3 changed files with 11 additions and 5 deletions

View File

@ -422,8 +422,8 @@ class Browser:
on_service_worker_version_updated=None, on_screenshot=None, on_service_worker_version_updated=None, on_screenshot=None,
username=None, password=None, hashtags=None, username=None, password=None, hashtags=None,
screenshot_full_page=False, skip_extract_outlinks=False, screenshot_full_page=False, skip_extract_outlinks=False,
skip_visit_hashtags=False, skip_youtube_dl=False, page_timeout=300, skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False,
behavior_timeout=900): page_timeout=300, behavior_timeout=900):
''' '''
Browses page in browser. Browses page in browser.
@ -502,9 +502,10 @@ class Browser:
# in running behaviors, outlink and hashtag extraction as we # in running behaviors, outlink and hashtag extraction as we
# didn't get a valid page. Screenshot should run because it # didn't get a valid page. Screenshot should run because it
# may be useful to have a picture of the error page. # may be useful to have a picture of the error page.
# This is only enabled with option `simpler404`.
run_behaviors = True run_behaviors = True
if self.websock_thread.page_status is None or \ if simpler404 and (self.websock_thread.page_status is None or
self.websock_thread.page_status >= 400: self.websock_thread.page_status >= 400):
run_behaviors = False run_behaviors = False
if run_behaviors: if run_behaviors:

View File

@ -164,6 +164,8 @@ def brozzle_page(argv=None):
action='store_true') action='store_true')
arg_parser.add_argument( arg_parser.add_argument(
'--skip-youtube-dl', dest='skip_youtube_dl', action='store_true') '--skip-youtube-dl', dest='skip_youtube_dl', action='store_true')
arg_parser.add_argument(
'--simpler404', dest='simpler404', action='store_true')
add_common_options(arg_parser, argv) add_common_options(arg_parser, argv)
args = arg_parser.parse_args(args=argv[1:]) args = arg_parser.parse_args(args=argv[1:])
@ -182,6 +184,7 @@ def brozzle_page(argv=None):
skip_extract_outlinks=args.skip_extract_outlinks, skip_extract_outlinks=args.skip_extract_outlinks,
skip_visit_hashtags=args.skip_visit_hashtags, skip_visit_hashtags=args.skip_visit_hashtags,
skip_youtube_dl=args.skip_youtube_dl, skip_youtube_dl=args.skip_youtube_dl,
simpler404=args.simpler404,
screenshot_full_page=args.screenshot_full_page) screenshot_full_page=args.screenshot_full_page)
def on_screenshot(screenshot_jpeg): def on_screenshot(screenshot_jpeg):

View File

@ -51,7 +51,7 @@ class BrozzlerWorker:
self, frontier, service_registry=None, max_browsers=1, self, frontier, service_registry=None, max_browsers=1,
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None, chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
skip_extract_outlinks=False, skip_visit_hashtags=False, skip_extract_outlinks=False, skip_visit_hashtags=False,
skip_youtube_dl=False, screenshot_full_page=False, skip_youtube_dl=False, simpler404=False, screenshot_full_page=False,
page_timeout=300, behavior_timeout=900): page_timeout=300, behavior_timeout=900):
self._frontier = frontier self._frontier = frontier
self._service_registry = service_registry self._service_registry = service_registry
@ -64,6 +64,7 @@ class BrozzlerWorker:
self._skip_extract_outlinks = skip_extract_outlinks self._skip_extract_outlinks = skip_extract_outlinks
self._skip_visit_hashtags = skip_visit_hashtags self._skip_visit_hashtags = skip_visit_hashtags
self._skip_youtube_dl = skip_youtube_dl self._skip_youtube_dl = skip_youtube_dl
self._simpler404 = simpler404
self._screenshot_full_page = screenshot_full_page self._screenshot_full_page = screenshot_full_page
self._page_timeout = page_timeout self._page_timeout = page_timeout
self._behavior_timeout = behavior_timeout self._behavior_timeout = behavior_timeout
@ -302,6 +303,7 @@ class BrozzlerWorker:
skip_extract_outlinks=self._skip_extract_outlinks, skip_extract_outlinks=self._skip_extract_outlinks,
skip_visit_hashtags=self._skip_visit_hashtags, skip_visit_hashtags=self._skip_visit_hashtags,
skip_youtube_dl=self._skip_youtube_dl, skip_youtube_dl=self._skip_youtube_dl,
simpler404=self._simpler404,
screenshot_full_page=self._screenshot_full_page, screenshot_full_page=self._screenshot_full_page,
page_timeout=self._page_timeout, page_timeout=self._page_timeout,
behavior_timeout=self._behavior_timeout) behavior_timeout=self._behavior_timeout)