Move behavior_parameters out of site metadata and into the top level of the site configuration

This commit is contained in:
Noah Levitt 2016-11-07 18:16:04 -08:00
parent 185d65bd5b
commit 9d66f294ec
5 changed files with 17 additions and 14 deletions

View File

@ -120,8 +120,12 @@ def brozzle_page():
'-e', '--chrome-exe', dest='chrome_exe',
default=suggest_default_chrome_exe(),
help='executable to use to invoke chrome')
arg_parser.add_argument('--behavior-parameters', dest='behavior_parameters',
default=None, help='json blob of parameters to populate the javascript behavior template, e.g. {"parameter_username":"x","parameter_password":"y"}')
arg_parser.add_argument(
'--behavior-parameters', dest='behavior_parameters',
default=None, help=(
'json blob of parameters to populate the javascript behavior '
'template, e.g. {"parameter_username":"x",'
'"parameter_password":"y"}'))
arg_parser.add_argument(
'--proxy', dest='proxy', default=None,
help='http proxy')
@ -135,12 +139,13 @@ def brozzle_page():
args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args)
metadata = {}
behavior_parameters = {}
if args.behavior_parameters:
metadata["behavior_parameters"] = json.loads(args.behavior_parameters)
behavior_parameters = json.loads(args.behavior_parameters)
site = brozzler.Site(
id=-1, seed=args.url, proxy=args.proxy,
metadata=metadata, enable_warcprox_features=args.enable_warcprox_features)
enable_warcprox_features=args.enable_warcprox_features,
behavior_parameters=behavior_parameters)
page = brozzler.Page(url=args.url, site_id=site.id)
worker = brozzler.BrozzlerWorker(frontier=None)

View File

@ -69,6 +69,9 @@ id:
user_agent:
type: string
behavior_parameters:
type: dict
seeds:
type: list
required: true

View File

@ -96,7 +96,7 @@ class Site(brozzler.BaseDictable):
status="ACTIVE", claimed=False, start_time=None,
last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None,
cookie_db=None, user_agent=None):
cookie_db=None, user_agent=None, behavior_parameters=None):
self.seed = seed
self.id = id
@ -117,6 +117,7 @@ class Site(brozzler.BaseDictable):
self.remember_outlinks = remember_outlinks
self.cookie_db = cookie_db
self.user_agent = user_agent
self.behavior_parameters = behavior_parameters
self.scope = scope or {}
if not "surt" in self.scope:

View File

@ -269,17 +269,11 @@ class BrozzlerWorker:
if self._needs_browsing(page, ydl_spy):
self.logger.info('needs browsing: %s', page)
behavior_parameters = {}
if "login" in site.metadata:
behavior_parameters["parameter_username"] = site.metadata["login"]["username"]
behavior_parameters["parameter_password"] = site.metadata["login"]["password"]
if "behavior_parameters" in site.metadata:
behavior_parameters.update(site.metadata["behavior_parameters"])
if not browser.is_running():
browser.start(proxy=self._proxy(site), cookie_db=site.cookie_db)
outlinks = browser.browse_page(
page.url, extra_headers=site.extra_headers(),
behavior_parameters=behavior_parameters,
behavior_parameters=site.behavior_parameters,
user_agent=site.user_agent,
on_screenshot=_on_screenshot,
on_url_change=page.note_redirect)

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b7.dev112',
version='1.1b7.dev113',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',