mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
Merge branch 'master' into fix-travis
* master:
  - pass behavior_parameters from job configuration into Site objects
  - add --behavior-parameters argument to brozzler-new-site
  - fix bug in final_bounces (not sure what I was thinking)
This commit is contained in:
commit
119fa5ae5d
@@ -224,6 +224,12 @@ def brozzler_new_site():
|
||||
'Warcprox-Meta http request header to send with each request; '
|
||||
'must be a json blob, ignored unless warcprox features are '
|
||||
'enabled'))
|
||||
arg_parser.add_argument(
|
||||
'--behavior-parameters', dest='behavior_parameters',
|
||||
default=None, help=(
|
||||
'json blob of parameters to populate the javascript behavior '
|
||||
'template, e.g. {"parameter_username":"x",'
|
||||
'"parameter_password":"y"}'))
|
||||
_add_common_options(arg_parser)
|
||||
|
||||
args = arg_parser.parse_args(args=sys.argv[1:])
|
||||
@@ -234,8 +240,10 @@ def brozzler_new_site():
|
||||
time_limit=int(args.time_limit) if args.time_limit else None,
|
||||
ignore_robots=args.ignore_robots,
|
||||
enable_warcprox_features=args.enable_warcprox_features,
|
||||
warcprox_meta=(
|
||||
json.loads(args.warcprox_meta) if args.warcprox_meta else None))
|
||||
warcprox_meta=json.loads(
|
||||
args.warcprox_meta) if args.warcprox_meta else None,
|
||||
behavior_parameters=json.loads(
|
||||
args.behavior_parameters) if args.behavior_parameters else None)
|
||||
|
||||
r = rethinkstuff.Rethinker(
|
||||
args.rethinkdb_servers.split(","), args.rethinkdb_db)
|
||||
|
@@ -88,7 +88,8 @@ def new_job(frontier, job_conf):
|
||||
warcprox_meta=merged_conf.get("warcprox_meta"),
|
||||
metadata=merged_conf.get("metadata"),
|
||||
remember_outlinks=merged_conf.get("remember_outlinks"),
|
||||
user_agent=merged_conf.get("user_agent"))
|
||||
user_agent=merged_conf.get("user_agent"),
|
||||
behavior_parameters=merged_conf.get("behavior_parameters"))
|
||||
sites.append(site)
|
||||
|
||||
# insert all the sites into database before the job
|
||||
|
@@ -48,6 +48,7 @@ class ExtraHeaderAdder(urllib.request.BaseHandler):
|
||||
|
||||
class YoutubeDLSpy(urllib.request.BaseHandler):
|
||||
Transaction = collections.namedtuple('Transaction', ['request', 'response'])
|
||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||
|
||||
def __init__(self):
|
||||
self.reset()
|
||||
@@ -62,17 +63,17 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
|
||||
self.transactions = []
|
||||
|
||||
def final_bounces(self, url):
|
||||
"""Resolves redirect chains in self.transactions, returns a list of
|
||||
"""
|
||||
Resolves redirect chains in self.transactions, returns a list of
|
||||
Transaction representing the final redirect destinations of the given
|
||||
url. There could be more than one if for example youtube-dl hit the
|
||||
same url with HEAD and then GET requests."""
|
||||
same url with HEAD and then GET requests.
|
||||
"""
|
||||
redirects = {}
|
||||
for txn in self.transactions:
|
||||
# XXX check http status 301,302,303,307? check for "uri" header
|
||||
# as well as "location"? see urllib.request.HTTPRedirectHandler
|
||||
if ((txn.request.full_url == url
|
||||
or txn.request.full_url in redirects)
|
||||
and 'location' in txn.response.headers):
|
||||
if 'location' in txn.response.headers:
|
||||
redirects[txn.request.full_url] = txn
|
||||
|
||||
final_url = url
|
||||
|
Loading…
x
Reference in New Issue
Block a user