diff --git a/brozzler/cli.py b/brozzler/cli.py index cf0373b..88c92cf 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -120,8 +120,12 @@ def brozzle_page(): '-e', '--chrome-exe', dest='chrome_exe', default=suggest_default_chrome_exe(), help='executable to use to invoke chrome') - arg_parser.add_argument('--behavior-parameters', dest='behavior_parameters', - default=None, help='json blob of parameters to populate the javascript behavior template, e.g. {"parameter_username":"x","parameter_password":"y"}') + arg_parser.add_argument( + '--behavior-parameters', dest='behavior_parameters', + default=None, help=( + 'json blob of parameters to populate the javascript behavior ' + 'template, e.g. {"parameter_username":"x",' + '"parameter_password":"y"}')) arg_parser.add_argument( '--proxy', dest='proxy', default=None, help='http proxy') @@ -135,12 +139,13 @@ def brozzle_page(): args = arg_parser.parse_args(args=sys.argv[1:]) _configure_logging(args) - metadata = {} + behavior_parameters = {} if args.behavior_parameters: - metadata["behavior_parameters"] = json.loads(args.behavior_parameters) + behavior_parameters = json.loads(args.behavior_parameters) site = brozzler.Site( id=-1, seed=args.url, proxy=args.proxy, - metadata=metadata, enable_warcprox_features=args.enable_warcprox_features) + enable_warcprox_features=args.enable_warcprox_features, + behavior_parameters=behavior_parameters) page = brozzler.Page(url=args.url, site_id=site.id) worker = brozzler.BrozzlerWorker(frontier=None) diff --git a/brozzler/job_schema.yaml b/brozzler/job_schema.yaml index 4bea483..a7f3d8e 100644 --- a/brozzler/job_schema.yaml +++ b/brozzler/job_schema.yaml @@ -69,6 +69,9 @@ id: user_agent: type: string + behavior_parameters: + type: dict + seeds: type: list required: true diff --git a/brozzler/site.py b/brozzler/site.py index 1dcd90f..5cccac8 100644 --- a/brozzler/site.py +++ b/brozzler/site.py @@ -96,7 +96,7 @@ class Site(brozzler.BaseDictable): status="ACTIVE", claimed=False, start_time=None, last_disclaimed=_EPOCH_UTC, last_claimed_by=None, last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None, - cookie_db=None, user_agent=None): + cookie_db=None, user_agent=None, behavior_parameters=None): self.seed = seed self.id = id @@ -117,6 +117,7 @@ class Site(brozzler.BaseDictable): self.remember_outlinks = remember_outlinks self.cookie_db = cookie_db self.user_agent = user_agent + self.behavior_parameters = behavior_parameters self.scope = scope or {} if not "surt" in self.scope: diff --git a/brozzler/worker.py b/brozzler/worker.py index 6fc8e32..ea6b5ec 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -269,17 +269,11 @@ class BrozzlerWorker: if self._needs_browsing(page, ydl_spy): self.logger.info('needs browsing: %s', page) - behavior_parameters = {} - if "login" in site.metadata: - behavior_parameters["parameter_username"] = site.metadata["login"]["username"] - behavior_parameters["parameter_password"] = site.metadata["login"]["password"] - if "behavior_parameters" in site.metadata: - behavior_parameters.update(site.metadata["behavior_parameters"]) if not browser.is_running(): browser.start(proxy=self._proxy(site), cookie_db=site.cookie_db) outlinks = browser.browse_page( page.url, extra_headers=site.extra_headers(), - behavior_parameters=behavior_parameters, + behavior_parameters=site.behavior_parameters, user_agent=site.user_agent, on_screenshot=_on_screenshot, on_url_change=page.note_redirect) diff --git a/setup.py b/setup.py index 4fcf4a7..ea86dab 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b7.dev112', + version='1.1b7.dev113', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',