diff --git a/brozzler/browser.py b/brozzler/browser.py index 988f8b2..9442d92 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -379,7 +379,8 @@ class Browser: self, page_url, ignore_cert_errors=False, extra_headers=None, user_agent=None, behavior_parameters=None, on_request=None, on_response=None, on_screenshot=None, - username=None, password=None, hashtags=None): + username=None, password=None, hashtags=None, + skip_extract_outlinks=False, skip_visit_hashtags=False): ''' Browses page in browser. @@ -447,8 +448,12 @@ class Browser: behavior_script = brozzler.behavior_script( page_url, behavior_parameters) self.run_behavior(behavior_script, timeout=900) - outlinks = self.extract_outlinks() - self.visit_hashtags(page_url, hashtags, outlinks) + if skip_extract_outlinks: + outlinks = [] + else: + outlinks = self.extract_outlinks() + if not skip_visit_hashtags: + self.visit_hashtags(page_url, hashtags, outlinks) final_page_url = self.url() return final_page_url, outlinks except brozzler.ReachedLimit: diff --git a/brozzler/cli.py b/brozzler/cli.py index e2b0d18..38ea689 100644 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -154,6 +154,12 @@ def brozzle_page(argv=None): help='use this password to try to log in if a login form is found') arg_parser.add_argument( '--proxy', dest='proxy', default=None, help='http proxy') + arg_parser.add_argument( + '--skip-extract-outlinks', dest='skip_extract_outlinks', + action='store_true', help=argparse.SUPPRESS) + arg_parser.add_argument( + '--skip-visit-hashtags', dest='skip_visit_hashtags', + action='store_true', help=argparse.SUPPRESS) add_common_options(arg_parser, argv) args = arg_parser.parse_args(args=argv[1:]) @@ -166,7 +172,9 @@ def brozzle_page(argv=None): 'id': -1, 'seed': args.url, 'behavior_parameters': behavior_parameters, 'username': args.username, 'password': args.password}) page = brozzler.Page(None, {'url': args.url, 'site_id': site.id}) - worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy) 
+ worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy, + skip_extract_outlinks=args.skip_extract_outlinks, + skip_visit_hashtags=args.skip_visit_hashtags) def on_screenshot(screenshot_png): OK_CHARS = (string.ascii_letters + string.digits) @@ -299,6 +307,12 @@ def brozzler_worker(argv=None): help=( 'when needed, choose an available instance of warcprox from ' 'the rethinkdb service registry')) + arg_parser.add_argument( + '--skip-extract-outlinks', dest='skip_extract_outlinks', + action='store_true', help=argparse.SUPPRESS) + arg_parser.add_argument( + '--skip-visit-hashtags', dest='skip_visit_hashtags', + action='store_true', help=argparse.SUPPRESS) add_common_options(arg_parser, argv) args = arg_parser.parse_args(args=argv[1:]) @@ -331,7 +345,9 @@ def brozzler_worker(argv=None): worker = brozzler.worker.BrozzlerWorker( frontier, service_registry, max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe, proxy=args.proxy, - warcprox_auto=args.warcprox_auto) + warcprox_auto=args.warcprox_auto, + skip_extract_outlinks=args.skip_extract_outlinks, + skip_visit_hashtags=args.skip_visit_hashtags) signal.signal(signal.SIGQUIT, dump_state) signal.signal(signal.SIGTERM, lambda s,f: worker.stop()) diff --git a/brozzler/worker.py b/brozzler/worker.py index c2a76be..3169e47 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -103,7 +103,8 @@ class BrozzlerWorker: def __init__( self, frontier, service_registry=None, max_browsers=1, - chrome_exe="chromium-browser", warcprox_auto=False, proxy=None): 
+ chrome_exe="chromium-browser", warcprox_auto=False, proxy=None, + skip_extract_outlinks=False, skip_visit_hashtags=False): self._frontier = frontier self._service_registry = service_registry self._max_browsers = max_browsers @@ -112,6 +113,8 @@ class BrozzlerWorker: self._proxy = proxy assert not (warcprox_auto and proxy) self._proxy_is_warcprox = None + self._skip_extract_outlinks = skip_extract_outlinks + self._skip_visit_hashtags = skip_visit_hashtags self._browser_pool = brozzler.browser.BrowserPool( max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True) @@ -406,7 +409,9 @@ class BrozzlerWorker: username=site.get('username'), password=site.get('password'), user_agent=site.get('user_agent'), on_screenshot=_on_screenshot, on_response=_on_response, - hashtags=page.hashtags) + hashtags=page.hashtags, + skip_extract_outlinks=self._skip_extract_outlinks, + skip_visit_hashtags=self._skip_visit_hashtags) if final_page_url != page.url: page.note_redirect(final_page_url) return outlinks diff --git a/setup.py b/setup.py index f102c15..743bccb 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b12.dev264', + version='1.1b12.dev265', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',