mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-21 16:16:28 -04:00
Merge branch 'ari-5210' into qa
This commit is contained in:
commit
512931b6c8
@ -379,7 +379,8 @@ class Browser:
|
||||
self, page_url, ignore_cert_errors=False, extra_headers=None,
|
||||
user_agent=None, behavior_parameters=None,
|
||||
on_request=None, on_response=None, on_screenshot=None,
|
||||
username=None, password=None, hashtags=None):
|
||||
username=None, password=None, hashtags=None,
|
||||
skip_extract_outlinks=False, skip_visit_hashtags=False):
|
||||
'''
|
||||
Browses page in browser.
|
||||
|
||||
@ -447,8 +448,12 @@ class Browser:
|
||||
behavior_script = brozzler.behavior_script(
|
||||
page_url, behavior_parameters)
|
||||
self.run_behavior(behavior_script, timeout=900)
|
||||
outlinks = self.extract_outlinks()
|
||||
self.visit_hashtags(page_url, hashtags, outlinks)
|
||||
if skip_extract_outlinks:
|
||||
outlinks = []
|
||||
else:
|
||||
outlinks = self.extract_outlinks()
|
||||
if not skip_visit_hashtags:
|
||||
self.visit_hashtags(page_url, hashtags, outlinks)
|
||||
final_page_url = self.url()
|
||||
return final_page_url, outlinks
|
||||
except brozzler.ReachedLimit:
|
||||
|
@ -154,6 +154,12 @@ def brozzle_page(argv=None):
|
||||
help='use this password to try to log in if a login form is found')
|
||||
arg_parser.add_argument(
|
||||
'--proxy', dest='proxy', default=None, help='http proxy')
|
||||
arg_parser.add_argument(
|
||||
'--skip-extract-outlinks', dest='skip_extract_outlinks',
|
||||
action='store_true', help=argparse.SUPPRESS)
|
||||
arg_parser.add_argument(
|
||||
'--skip-visit-hashtags', dest='skip_visit_hashtags',
|
||||
action='store_true', help=argparse.SUPPRESS)
|
||||
add_common_options(arg_parser, argv)
|
||||
|
||||
args = arg_parser.parse_args(args=argv[1:])
|
||||
@ -166,7 +172,9 @@ def brozzle_page(argv=None):
|
||||
'id': -1, 'seed': args.url, 'behavior_parameters': behavior_parameters,
|
||||
'username': args.username, 'password': args.password})
|
||||
page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
|
||||
worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy)
|
||||
worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy,
|
||||
skip_extract_outlinks=args.skip_extract_outlinks,
|
||||
skip_visit_hashtags=args.skip_visit_hashtags)
|
||||
|
||||
def on_screenshot(screenshot_png):
|
||||
OK_CHARS = (string.ascii_letters + string.digits)
|
||||
@ -299,6 +307,12 @@ def brozzler_worker(argv=None):
|
||||
help=(
|
||||
'when needed, choose an available instance of warcprox from '
|
||||
'the rethinkdb service registry'))
|
||||
arg_parser.add_argument(
|
||||
'--skip-extract-outlinks', dest='skip_extract_outlinks',
|
||||
action='store_true', help=argparse.SUPPRESS)
|
||||
arg_parser.add_argument(
|
||||
'--skip-visit-hashtags', dest='skip_visit_hashtags',
|
||||
action='store_true', help=argparse.SUPPRESS)
|
||||
add_common_options(arg_parser, argv)
|
||||
|
||||
args = arg_parser.parse_args(args=argv[1:])
|
||||
@ -331,7 +345,9 @@ def brozzler_worker(argv=None):
|
||||
worker = brozzler.worker.BrozzlerWorker(
|
||||
frontier, service_registry, max_browsers=int(args.max_browsers),
|
||||
chrome_exe=args.chrome_exe, proxy=args.proxy,
|
||||
warcprox_auto=args.warcprox_auto)
|
||||
warcprox_auto=args.warcprox_auto,
|
||||
skip_extract_outlinks=args.skip_extract_outlinks,
|
||||
skip_visit_hashtags=args.skip_visit_hashtags)
|
||||
|
||||
signal.signal(signal.SIGQUIT, dump_state)
|
||||
signal.signal(signal.SIGTERM, lambda s,f: worker.stop())
|
||||
|
@ -74,7 +74,11 @@ var umbraBehavior = {
|
||||
somethingLeftAbove = true;
|
||||
}
|
||||
}
|
||||
<<<<<<< HEAD
|
||||
|
||||
=======
|
||||
|
||||
>>>>>>> @{-1}
|
||||
var clickTargets = documents[j].querySelectorAll(cssSelector);
|
||||
for (var i = 0; i < clickTargets.length; i++) {
|
||||
if (!this.isVisible(clickTargets[i])) {
|
||||
|
@ -103,7 +103,8 @@ class BrozzlerWorker:
|
||||
|
||||
def __init__(
|
||||
self, frontier, service_registry=None, max_browsers=1,
|
||||
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None):
|
||||
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
|
||||
skip_extract_outlinks=False, skip_visit_hashtags=False):
|
||||
self._frontier = frontier
|
||||
self._service_registry = service_registry
|
||||
self._max_browsers = max_browsers
|
||||
@ -112,6 +113,8 @@ class BrozzlerWorker:
|
||||
self._proxy = proxy
|
||||
assert not (warcprox_auto and proxy)
|
||||
self._proxy_is_warcprox = None
|
||||
self._skip_extract_outlinks = skip_extract_outlinks
|
||||
self._skip_visit_hashtags = skip_visit_hashtags
|
||||
|
||||
self._browser_pool = brozzler.browser.BrowserPool(
|
||||
max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True)
|
||||
@ -406,7 +409,9 @@ class BrozzlerWorker:
|
||||
username=site.get('username'), password=site.get('password'),
|
||||
user_agent=site.get('user_agent'),
|
||||
on_screenshot=_on_screenshot, on_response=_on_response,
|
||||
hashtags=page.hashtags)
|
||||
hashtags=page.hashtags,
|
||||
skip_extract_outlinks=self._skip_extract_outlinks,
|
||||
skip_visit_hashtags=self._skip_visit_hashtags)
|
||||
if final_page_url != page.url:
|
||||
page.note_redirect(final_page_url)
|
||||
return outlinks
|
||||
|
Loading…
x
Reference in New Issue
Block a user