From 80341b9106be53dc58132655346f04cec5110c00 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Wed, 1 Apr 2020 16:08:43 +0000 Subject: [PATCH] Add option simpler404 to enable this behavior It is disabled by default. --- brozzler/browser.py | 9 +++++---- brozzler/cli.py | 3 +++ brozzler/worker.py | 4 +++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index 9031128..6c210ff 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -422,8 +422,8 @@ class Browser: on_service_worker_version_updated=None, on_screenshot=None, username=None, password=None, hashtags=None, screenshot_full_page=False, skip_extract_outlinks=False, - skip_visit_hashtags=False, skip_youtube_dl=False, page_timeout=300, - behavior_timeout=900): + skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False, + page_timeout=300, behavior_timeout=900): ''' Browses page in browser. @@ -502,9 +502,10 @@ class Browser: # in running behaviors, outlink and hashtag extraction as we # didn't get a valid page. Screenshot should run because i # may be useful to have a picture of the error page. + # This is only enabled with option `simpler404`. run_behaviors = True - if self.websock_thread.page_status is None or \ - self.websock_thread.page_status >= 400: + if simpler404 and (self.websock_thread.page_status is None or + self.websock_thread.page_status >= 400): run_behaviors = False if run_behaviors: diff --git a/brozzler/cli.py b/brozzler/cli.py index 1cb5912..347487f 100644 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -164,6 +164,8 @@ def brozzle_page(argv=None): action='store_true') arg_parser.add_argument( '--skip-youtube-dl', dest='skip_youtube_dl', action='store_true') + arg_parser.add_argument( + '--simpler404', dest='simpler404', action='store_true') add_common_options(arg_parser, argv) args = arg_parser.parse_args(args=argv[1:]) @@ -182,6 +184,7 @@ def brozzle_page(argv=None): skip_extract_outlinks=args.skip_extract_outlinks, skip_visit_hashtags=args.skip_visit_hashtags, skip_youtube_dl=args.skip_youtube_dl, + simpler404=args.simpler404, screenshot_full_page=args.screenshot_full_page) def on_screenshot(screenshot_jpeg): diff --git a/brozzler/worker.py b/brozzler/worker.py index 7437927..c8279b2 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -51,7 +51,7 @@ class BrozzlerWorker: self, frontier, service_registry=None, max_browsers=1, chrome_exe="chromium-browser", warcprox_auto=False, proxy=None, skip_extract_outlinks=False, skip_visit_hashtags=False, - skip_youtube_dl=False, screenshot_full_page=False, + skip_youtube_dl=False, simpler404=False, screenshot_full_page=False, page_timeout=300, behavior_timeout=900): self._frontier = frontier self._service_registry = service_registry @@ -64,6 +64,7 @@ class BrozzlerWorker: self._skip_extract_outlinks = skip_extract_outlinks self._skip_visit_hashtags = skip_visit_hashtags self._skip_youtube_dl = skip_youtube_dl + self._simpler404 = simpler404 self._screenshot_full_page = screenshot_full_page self._page_timeout = page_timeout self._behavior_timeout = behavior_timeout @@ -302,6 +303,7 @@ class BrozzlerWorker: skip_extract_outlinks=self._skip_extract_outlinks, skip_visit_hashtags=self._skip_visit_hashtags, skip_youtube_dl=self._skip_youtube_dl, + simpler404=self._simpler404, screenshot_full_page=self._screenshot_full_page, page_timeout=self._page_timeout, behavior_timeout=self._behavior_timeout)