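Fix bug in final_bounces: record every transaction whose response has a 'location' header as a redirect, instead of only those whose request url equals the given url or was already recorded (not sure what I was thinking)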

This commit is contained in:
Noah Levitt 2016-11-09 13:12:14 -08:00
parent 8889e4ab20
commit 953e50d9a6
2 changed files with 7 additions and 6 deletions

View file

@ -48,6 +48,7 @@ class ExtraHeaderAdder(urllib.request.BaseHandler):
class YoutubeDLSpy(urllib.request.BaseHandler): class YoutubeDLSpy(urllib.request.BaseHandler):
Transaction = collections.namedtuple('Transaction', ['request', 'response']) Transaction = collections.namedtuple('Transaction', ['request', 'response'])
logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self): def __init__(self):
self.reset() self.reset()
@ -62,17 +63,17 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
self.transactions = [] self.transactions = []
def final_bounces(self, url): def final_bounces(self, url):
"""Resolves redirect chains in self.transactions, returns a list of """
Resolves redirect chains in self.transactions, returns a list of
Transaction representing the final redirect destinations of the given Transaction representing the final redirect destinations of the given
url. There could be more than one if for example youtube-dl hit the url. There could be more than one if for example youtube-dl hit the
same url with HEAD and then GET requests.""" same url with HEAD and then GET requests.
"""
redirects = {} redirects = {}
for txn in self.transactions: for txn in self.transactions:
# XXX check http status 301,302,303,307? check for "uri" header # XXX check http status 301,302,303,307? check for "uri" header
# as well as "location"? see urllib.request.HTTPRedirectHandler # as well as "location"? see urllib.request.HTTPRedirectHandler
if ((txn.request.full_url == url if 'location' in txn.response.headers:
or txn.request.full_url in redirects)
and 'location' in txn.response.headers):
redirects[txn.request.full_url] = txn redirects[txn.request.full_url] = txn
final_url = url final_url = url

View file

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup( setuptools.setup(
name='brozzler', name='brozzler',
version='1.1b7.dev118', version='1.1b7.dev119',
description='Distributed web crawling with browsers', description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler', url='https://github.com/internetarchive/brozzler',
author='Noah Levitt', author='Noah Levitt',