mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-25 07:10:35 -04:00
move behavior_parameters into top level of site configuration
This commit is contained in:
parent
185d65bd5b
commit
9d66f294ec
5 changed files with 17 additions and 14 deletions
|
@ -120,8 +120,12 @@ def brozzle_page():
|
||||||
'-e', '--chrome-exe', dest='chrome_exe',
|
'-e', '--chrome-exe', dest='chrome_exe',
|
||||||
default=suggest_default_chrome_exe(),
|
default=suggest_default_chrome_exe(),
|
||||||
help='executable to use to invoke chrome')
|
help='executable to use to invoke chrome')
|
||||||
arg_parser.add_argument('--behavior-parameters', dest='behavior_parameters',
|
arg_parser.add_argument(
|
||||||
default=None, help='json blob of parameters to populate the javascript behavior template, e.g. {"parameter_username":"x","parameter_password":"y"}')
|
'--behavior-parameters', dest='behavior_parameters',
|
||||||
|
default=None, help=(
|
||||||
|
'json blob of parameters to populate the javascript behavior '
|
||||||
|
'template, e.g. {"parameter_username":"x",'
|
||||||
|
'"parameter_password":"y"}'))
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'--proxy', dest='proxy', default=None,
|
'--proxy', dest='proxy', default=None,
|
||||||
help='http proxy')
|
help='http proxy')
|
||||||
|
@ -135,12 +139,13 @@ def brozzle_page():
|
||||||
args = arg_parser.parse_args(args=sys.argv[1:])
|
args = arg_parser.parse_args(args=sys.argv[1:])
|
||||||
_configure_logging(args)
|
_configure_logging(args)
|
||||||
|
|
||||||
metadata = {}
|
behavior_parameters = {}
|
||||||
if args.behavior_parameters:
|
if args.behavior_parameters:
|
||||||
metadata["behavior_parameters"] = json.loads(args.behavior_parameters)
|
behavior_parameters = json.loads(args.behavior_parameters)
|
||||||
site = brozzler.Site(
|
site = brozzler.Site(
|
||||||
id=-1, seed=args.url, proxy=args.proxy,
|
id=-1, seed=args.url, proxy=args.proxy,
|
||||||
metadata=metadata, enable_warcprox_features=args.enable_warcprox_features)
|
enable_warcprox_features=args.enable_warcprox_features,
|
||||||
|
behavior_parameters=behavior_parameters)
|
||||||
page = brozzler.Page(url=args.url, site_id=site.id)
|
page = brozzler.Page(url=args.url, site_id=site.id)
|
||||||
worker = brozzler.BrozzlerWorker(frontier=None)
|
worker = brozzler.BrozzlerWorker(frontier=None)
|
||||||
|
|
||||||
|
|
|
@ -69,6 +69,9 @@ id:
|
||||||
user_agent:
|
user_agent:
|
||||||
type: string
|
type: string
|
||||||
|
|
||||||
|
behavior_parameters:
|
||||||
|
type: dict
|
||||||
|
|
||||||
seeds:
|
seeds:
|
||||||
type: list
|
type: list
|
||||||
required: true
|
required: true
|
||||||
|
|
|
@ -96,7 +96,7 @@ class Site(brozzler.BaseDictable):
|
||||||
status="ACTIVE", claimed=False, start_time=None,
|
status="ACTIVE", claimed=False, start_time=None,
|
||||||
last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
|
last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
|
||||||
last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None,
|
last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None,
|
||||||
cookie_db=None, user_agent=None):
|
cookie_db=None, user_agent=None, behavior_parameters=None):
|
||||||
|
|
||||||
self.seed = seed
|
self.seed = seed
|
||||||
self.id = id
|
self.id = id
|
||||||
|
@ -117,6 +117,7 @@ class Site(brozzler.BaseDictable):
|
||||||
self.remember_outlinks = remember_outlinks
|
self.remember_outlinks = remember_outlinks
|
||||||
self.cookie_db = cookie_db
|
self.cookie_db = cookie_db
|
||||||
self.user_agent = user_agent
|
self.user_agent = user_agent
|
||||||
|
self.behavior_parameters = behavior_parameters
|
||||||
|
|
||||||
self.scope = scope or {}
|
self.scope = scope or {}
|
||||||
if not "surt" in self.scope:
|
if not "surt" in self.scope:
|
||||||
|
|
|
@ -269,17 +269,11 @@ class BrozzlerWorker:
|
||||||
|
|
||||||
if self._needs_browsing(page, ydl_spy):
|
if self._needs_browsing(page, ydl_spy):
|
||||||
self.logger.info('needs browsing: %s', page)
|
self.logger.info('needs browsing: %s', page)
|
||||||
behavior_parameters = {}
|
|
||||||
if "login" in site.metadata:
|
|
||||||
behavior_parameters["parameter_username"] = site.metadata["login"]["username"]
|
|
||||||
behavior_parameters["parameter_password"] = site.metadata["login"]["password"]
|
|
||||||
if "behavior_parameters" in site.metadata:
|
|
||||||
behavior_parameters.update(site.metadata["behavior_parameters"])
|
|
||||||
if not browser.is_running():
|
if not browser.is_running():
|
||||||
browser.start(proxy=self._proxy(site), cookie_db=site.cookie_db)
|
browser.start(proxy=self._proxy(site), cookie_db=site.cookie_db)
|
||||||
outlinks = browser.browse_page(
|
outlinks = browser.browse_page(
|
||||||
page.url, extra_headers=site.extra_headers(),
|
page.url, extra_headers=site.extra_headers(),
|
||||||
behavior_parameters=behavior_parameters,
|
behavior_parameters=site.behavior_parameters,
|
||||||
user_agent=site.user_agent,
|
user_agent=site.user_agent,
|
||||||
on_screenshot=_on_screenshot,
|
on_screenshot=_on_screenshot,
|
||||||
on_url_change=page.note_redirect)
|
on_url_change=page.note_redirect)
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b7.dev112',
|
version='1.1b7.dev113',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue