Merge branch 'master' into fix-travis

* master:
  pass behavior_parameters from job configuration into Site objects
  add --behavior-parameters argument to brozzler-new-site
  fix bug in final_bounces (not sure what I was thinking)
This commit is contained in:
Noah Levitt 2016-11-09 16:16:16 -08:00
commit 119fa5ae5d
4 changed files with 19 additions and 9 deletions

View File

@@ -224,6 +224,12 @@ def brozzler_new_site():
'Warcprox-Meta http request header to send with each request; '
'must be a json blob, ignored unless warcprox features are '
'enabled'))
arg_parser.add_argument(
'--behavior-parameters', dest='behavior_parameters',
default=None, help=(
'json blob of parameters to populate the javascript behavior '
'template, e.g. {"parameter_username":"x",'
'"parameter_password":"y"}'))
_add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
@@ -234,8 +240,10 @@ def brozzler_new_site():
time_limit=int(args.time_limit) if args.time_limit else None,
ignore_robots=args.ignore_robots,
enable_warcprox_features=args.enable_warcprox_features,
warcprox_meta=(
json.loads(args.warcprox_meta) if args.warcprox_meta else None))
warcprox_meta=json.loads(
args.warcprox_meta) if args.warcprox_meta else None,
behavior_parameters=json.loads(
args.behavior_parameters) if args.behavior_parameters else None)
r = rethinkstuff.Rethinker(
args.rethinkdb_servers.split(","), args.rethinkdb_db)

View File

@@ -88,7 +88,8 @@ def new_job(frontier, job_conf):
warcprox_meta=merged_conf.get("warcprox_meta"),
metadata=merged_conf.get("metadata"),
remember_outlinks=merged_conf.get("remember_outlinks"),
user_agent=merged_conf.get("user_agent"))
user_agent=merged_conf.get("user_agent"),
behavior_parameters=merged_conf.get("behavior_parameters"))
sites.append(site)
# insert all the sites into database before the job

View File

@@ -48,6 +48,7 @@ class ExtraHeaderAdder(urllib.request.BaseHandler):
class YoutubeDLSpy(urllib.request.BaseHandler):
Transaction = collections.namedtuple('Transaction', ['request', 'response'])
logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self):
self.reset()
@@ -62,17 +63,17 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
self.transactions = []
def final_bounces(self, url):
"""Resolves redirect chains in self.transactions, returns a list of
"""
Resolves redirect chains in self.transactions, returns a list of
Transaction representing the final redirect destinations of the given
url. There could be more than one if for example youtube-dl hit the
same url with HEAD and then GET requests."""
same url with HEAD and then GET requests.
"""
redirects = {}
for txn in self.transactions:
# XXX check http status 301,302,303,307? check for "uri" header
# as well as "location"? see urllib.request.HTTPRedirectHandler
if ((txn.request.full_url == url
or txn.request.full_url in redirects)
and 'location' in txn.response.headers):
if 'location' in txn.response.headers:
redirects[txn.request.full_url] = txn
final_url = url

View File

@@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b7.dev118',
version='1.1b7.dev121',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',