Merge branch 'ari-5210' into qa

This commit is contained in:
Neil Minton 2017-07-12 17:30:06 -07:00
commit 512931b6c8
5 changed files with 38 additions and 8 deletions

View File

@@ -379,7 +379,8 @@ class Browser:
self, page_url, ignore_cert_errors=False, extra_headers=None,
user_agent=None, behavior_parameters=None,
on_request=None, on_response=None, on_screenshot=None,
username=None, password=None, hashtags=None):
username=None, password=None, hashtags=None,
skip_extract_outlinks=False, skip_visit_hashtags=False):
'''
Browses page in browser.
@@ -447,8 +448,12 @@ class Browser:
behavior_script = brozzler.behavior_script(
page_url, behavior_parameters)
self.run_behavior(behavior_script, timeout=900)
outlinks = self.extract_outlinks()
self.visit_hashtags(page_url, hashtags, outlinks)
if skip_extract_outlinks:
outlinks = []
else:
outlinks = self.extract_outlinks()
if not skip_visit_hashtags:
self.visit_hashtags(page_url, hashtags, outlinks)
final_page_url = self.url()
return final_page_url, outlinks
except brozzler.ReachedLimit:

View File

@@ -154,6 +154,12 @@ def brozzle_page(argv=None):
help='use this password to try to log in if a login form is found')
arg_parser.add_argument(
'--proxy', dest='proxy', default=None, help='http proxy')
arg_parser.add_argument(
'--skip-extract-outlinks', dest='skip_extract_outlinks',
action='store_true', help=argparse.SUPPRESS)
arg_parser.add_argument(
'--skip-visit-hashtags', dest='skip_visit_hashtags',
action='store_true', help=argparse.SUPPRESS)
add_common_options(arg_parser, argv)
args = arg_parser.parse_args(args=argv[1:])
@@ -166,7 +172,9 @@ def brozzle_page(argv=None):
'id': -1, 'seed': args.url, 'behavior_parameters': behavior_parameters,
'username': args.username, 'password': args.password})
page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy)
worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy,
skip_extract_outlinks=args.skip_extract_outlinks,
skip_visit_hashtags=args.skip_visit_hashtags)
def on_screenshot(screenshot_png):
OK_CHARS = (string.ascii_letters + string.digits)
@@ -299,6 +307,12 @@ def brozzler_worker(argv=None):
help=(
'when needed, choose an available instance of warcprox from '
'the rethinkdb service registry'))
arg_parser.add_argument(
'--skip-extract-outlinks', dest='skip_extract_outlinks',
action='store_true', help=argparse.SUPPRESS)
arg_parser.add_argument(
'--skip-visit-hashtags', dest='skip_visit_hashtags',
action='store_true', help=argparse.SUPPRESS)
add_common_options(arg_parser, argv)
args = arg_parser.parse_args(args=argv[1:])
@@ -331,7 +345,9 @@ def brozzler_worker(argv=None):
worker = brozzler.worker.BrozzlerWorker(
frontier, service_registry, max_browsers=int(args.max_browsers),
chrome_exe=args.chrome_exe, proxy=args.proxy,
warcprox_auto=args.warcprox_auto)
warcprox_auto=args.warcprox_auto,
skip_extract_outlinks=args.skip_extract_outlinks,
skip_visit_hashtags=args.skip_visit_hashtags)
signal.signal(signal.SIGQUIT, dump_state)
signal.signal(signal.SIGTERM, lambda s,f: worker.stop())

View File

@@ -74,7 +74,11 @@ var umbraBehavior = {
somethingLeftAbove = true;
}
}
<<<<<<< HEAD
=======
>>>>>>> @{-1}
var clickTargets = documents[j].querySelectorAll(cssSelector);
for (var i = 0; i < clickTargets.length; i++) {
if (!this.isVisible(clickTargets[i])) {

View File

@@ -103,7 +103,8 @@ class BrozzlerWorker:
def __init__(
self, frontier, service_registry=None, max_browsers=1,
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None):
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
skip_extract_outlinks=False, skip_visit_hashtags=False):
self._frontier = frontier
self._service_registry = service_registry
self._max_browsers = max_browsers
@@ -112,6 +113,8 @@ class BrozzlerWorker:
self._proxy = proxy
assert not (warcprox_auto and proxy)
self._proxy_is_warcprox = None
self._skip_extract_outlinks = skip_extract_outlinks
self._skip_visit_hashtags = skip_visit_hashtags
self._browser_pool = brozzler.browser.BrowserPool(
max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True)
@@ -406,7 +409,9 @@ class BrozzlerWorker:
username=site.get('username'), password=site.get('password'),
user_agent=site.get('user_agent'),
on_screenshot=_on_screenshot, on_response=_on_response,
hashtags=page.hashtags)
hashtags=page.hashtags,
skip_extract_outlinks=self._skip_extract_outlinks,
skip_visit_hashtags=self._skip_visit_hashtags)
if final_page_url != page.url:
page.note_redirect(final_page_url)
return outlinks

View File

@@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b12.dev264',
version='1.1b12.dev265',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',