Add --skip-extract-outlinks and --skip-visit-hashtags options to the
brozzler worker.

Both flags default to off, so outlink extraction and hashtag visiting still
happen unless explicitly skipped. The flags are parsed in cli.py, stored on
BrozzlerWorker, and forwarded to Browser.browse_page(), which returns an empty
outlinks list when extraction is skipped.

NOTE(review): the leading whitespace of the hunk lines below was reconstructed
from a whitespace-collapsed copy of this patch -- confirm it against the target
files before applying. The cli.py help strings were reworded, so the cli.py
"index" blob id no longer reflects the patched content.

diff --git a/brozzler/browser.py b/brozzler/browser.py
index 7830f5f..43c8929 100644
--- a/brozzler/browser.py
+++ b/brozzler/browser.py
@@ -379,7 +379,8 @@ class Browser:
             self, page_url, ignore_cert_errors=False, extra_headers=None,
             user_agent=None, behavior_parameters=None,
             on_request=None, on_response=None, on_screenshot=None,
-            username=None, password=None, hashtags=None):
+            username=None, password=None, hashtags=None,
+            skip_extract_outlinks=False, skip_visit_hashtags=False):
         '''
         Browses page in browser.
 
@@ -447,8 +448,12 @@ class Browser:
                 behavior_script = brozzler.behavior_script(
                         page_url, behavior_parameters)
                 self.run_behavior(behavior_script, timeout=900)
-                outlinks = self.extract_outlinks()
-                self.visit_hashtags(page_url, hashtags, outlinks)
+                if skip_extract_outlinks:
+                    outlinks = []
+                else:
+                    outlinks = self.extract_outlinks()
+                if not skip_visit_hashtags:
+                    self.visit_hashtags(page_url, hashtags, outlinks)
                 final_page_url = self.url()
                 return final_page_url, outlinks
         except brozzler.ReachedLimit:
diff --git a/brozzler/cli.py b/brozzler/cli.py
index e2b0d18..19f8b22 100644
--- a/brozzler/cli.py
+++ b/brozzler/cli.py
@@ -299,6 +299,12 @@ def brozzler_worker(argv=None):
             help=(
                 'when needed, choose an available instance of warcprox from '
                 'the rethinkdb service registry'))
+    arg_parser.add_argument(
+            '--skip-extract-outlinks', dest='skip_extract_outlinks',
+            action='store_true', help='do not extract page outlinks')
+    arg_parser.add_argument(
+            '--skip-visit-hashtags', dest='skip_visit_hashtags',
+            action='store_true', help='do not visit page hashtags')
     add_common_options(arg_parser, argv)
 
     args = arg_parser.parse_args(args=argv[1:])
@@ -331,7 +337,9 @@ def brozzler_worker(argv=None):
     worker = brozzler.worker.BrozzlerWorker(
             frontier, service_registry, max_browsers=int(args.max_browsers),
             chrome_exe=args.chrome_exe, proxy=args.proxy,
-            warcprox_auto=args.warcprox_auto)
+            warcprox_auto=args.warcprox_auto,
+            skip_extract_outlinks=args.skip_extract_outlinks,
+            skip_visit_hashtags=args.skip_visit_hashtags)
 
     signal.signal(signal.SIGQUIT, dump_state)
     signal.signal(signal.SIGTERM, lambda s,f: worker.stop())
diff --git a/brozzler/worker.py b/brozzler/worker.py
index c2a76be..3169e47 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -103,7 +103,8 @@ class BrozzlerWorker:
 
     def __init__(
             self, frontier, service_registry=None, max_browsers=1,
-            chrome_exe="chromium-browser", warcprox_auto=False, proxy=None):
+            chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
+            skip_extract_outlinks=False, skip_visit_hashtags=False):
         self._frontier = frontier
         self._service_registry = service_registry
         self._max_browsers = max_browsers
@@ -112,6 +113,8 @@ class BrozzlerWorker:
         self._proxy = proxy
         assert not (warcprox_auto and proxy)
         self._proxy_is_warcprox = None
+        self._skip_extract_outlinks = skip_extract_outlinks
+        self._skip_visit_hashtags = skip_visit_hashtags
 
         self._browser_pool = brozzler.browser.BrowserPool(
                 max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True)
@@ -406,7 +409,9 @@ class BrozzlerWorker:
                 username=site.get('username'), password=site.get('password'),
                 user_agent=site.get('user_agent'),
                 on_screenshot=_on_screenshot, on_response=_on_response,
-                hashtags=page.hashtags)
+                hashtags=page.hashtags,
+                skip_extract_outlinks=self._skip_extract_outlinks,
+                skip_visit_hashtags=self._skip_visit_hashtags)
         if final_page_url != page.url:
             page.note_redirect(final_page_url)
         return outlinks