From 953e50d9a6b2f87430c01b4bb11afb21f12d996b Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 9 Nov 2016 13:12:14 -0800 Subject: [PATCH 1/3] fix bug in final_bounces (not sure what I was thinking) --- brozzler/worker.py | 11 ++++++----- setup.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index ea6b5ec..c306d95 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -48,6 +48,7 @@ class ExtraHeaderAdder(urllib.request.BaseHandler): class YoutubeDLSpy(urllib.request.BaseHandler): Transaction = collections.namedtuple('Transaction', ['request', 'response']) + logger = logging.getLogger(__module__ + "." + __qualname__) def __init__(self): self.reset() @@ -62,17 +63,17 @@ class YoutubeDLSpy(urllib.request.BaseHandler): self.transactions = [] def final_bounces(self, url): - """Resolves redirect chains in self.transactions, returns a list of + """ + Resolves redirect chains in self.transactions, returns a list of Transaction representing the final redirect destinations of the given url. There could be more than one if for example youtube-dl hit the - same url with HEAD and then GET requests.""" + same url with HEAD and then GET requests. + """ redirects = {} for txn in self.transactions: # XXX check http status 301,302,303,307? check for "uri" header # as well as "location"? see urllib.request.HTTPRedirectHandler - if ((txn.request.full_url == url - or txn.request.full_url in redirects) - and 'location' in txn.response.headers): + if 'location' in txn.response.headers: redirects[txn.request.full_url] = txn final_url = url diff --git a/setup.py b/setup.py index 77b93b1..e25c755 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b7.dev118', + version='1.1b7.dev119', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', From 8e115b44fab640a36f7ca38df2c1173b1fe0cd71 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 9 Nov 2016 13:12:36 -0800 Subject: [PATCH 2/3] add --behavior-parameters argument to brozzler-new-site --- brozzler/cli.py | 12 ++++++++++-- setup.py | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index 88c92cf..a57396f 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -224,6 +224,12 @@ def brozzler_new_site(): 'Warcprox-Meta http request header to send with each request; ' 'must be a json blob, ignored unless warcprox features are ' 'enabled')) + arg_parser.add_argument( + '--behavior-parameters', dest='behavior_parameters', + default=None, help=( + 'json blob of parameters to populate the javascript behavior ' + 'template, e.g. {"parameter_username":"x",' + '"parameter_password":"y"}')) _add_common_options(arg_parser) args = arg_parser.parse_args(args=sys.argv[1:]) @@ -234,8 +240,10 @@ def brozzler_new_site(): time_limit=int(args.time_limit) if args.time_limit else None, ignore_robots=args.ignore_robots, enable_warcprox_features=args.enable_warcprox_features, - warcprox_meta=( - json.loads(args.warcprox_meta) if args.warcprox_meta else None)) + warcprox_meta=json.loads( + args.warcprox_meta) if args.warcprox_meta else None, + behavior_parameters=json.loads( + args.behavior_parameters) if args.behavior_parameters else None) r = rethinkstuff.Rethinker( args.rethinkdb_servers.split(","), args.rethinkdb_db) diff --git a/setup.py b/setup.py index e25c755..3b5fa87 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b7.dev119', + version='1.1b7.dev120', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', From 02bf23059e090bfbfb88deecd70cef7d7b752552 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 9 Nov 2016 13:43:10 -0800 Subject: [PATCH 3/3] pass behavior_parameters from job configuration into Site objects --- brozzler/job.py | 3 ++- setup.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/brozzler/job.py b/brozzler/job.py index f6b5f92..a213eae 100644 --- a/brozzler/job.py +++ b/brozzler/job.py @@ -88,7 +88,8 @@ def new_job(frontier, job_conf): warcprox_meta=merged_conf.get("warcprox_meta"), metadata=merged_conf.get("metadata"), remember_outlinks=merged_conf.get("remember_outlinks"), - user_agent=merged_conf.get("user_agent")) + user_agent=merged_conf.get("user_agent"), + behavior_parameters=merged_conf.get("behavior_parameters")) sites.append(site) # insert all the sites into database before the job diff --git a/setup.py b/setup.py index 3b5fa87..c55b580 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b7.dev120', + version='1.1b7.dev121', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',