mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
fix issue where use of YoutubeDLSpy caused youtube-dl connections to remote servers to be kept open
This commit is contained in:
parent
b4f19e2594
commit
a1f1681cad
@@ -47,14 +47,19 @@ class ExtraHeaderAdder(urllib.request.BaseHandler):
|
|||||||
return req
|
return req
|
||||||
|
|
||||||
class YoutubeDLSpy(urllib.request.BaseHandler):
|
class YoutubeDLSpy(urllib.request.BaseHandler):
|
||||||
Transaction = collections.namedtuple('Transaction', ['request', 'response'])
|
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def _http_response(self, request, response):
|
def _http_response(self, request, response):
|
||||||
self.transactions.append(YoutubeDLSpy.Transaction(request, response))
|
txn = {
|
||||||
|
'url': request.full_url,
|
||||||
|
'method': request.get_method(),
|
||||||
|
'status_code': response.code,
|
||||||
|
'response_headers': response.headers,
|
||||||
|
}
|
||||||
|
self.transactions.append(txn)
|
||||||
return response
|
return response
|
||||||
|
|
||||||
http_response = https_response = _http_response
|
http_response = https_response = _http_response
|
||||||
@@ -73,16 +78,16 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
|
|||||||
for txn in self.transactions:
|
for txn in self.transactions:
|
||||||
# XXX check http status 301,302,303,307? check for "uri" header
|
# XXX check http status 301,302,303,307? check for "uri" header
|
||||||
# as well as "location"? see urllib.request.HTTPRedirectHandler
|
# as well as "location"? see urllib.request.HTTPRedirectHandler
|
||||||
if 'location' in txn.response.headers:
|
if 'location' in txn['response_headers']:
|
||||||
redirects[txn.request.full_url] = txn
|
redirects[txn['url']] = txn
|
||||||
|
|
||||||
final_url = url
|
final_url = url
|
||||||
while final_url in redirects:
|
while final_url in redirects:
|
||||||
final_url = redirects.pop(final_url).response.headers['location']
|
final_url = redirects.pop(final_url)['response_headers']['location']
|
||||||
|
|
||||||
final_bounces = []
|
final_bounces = []
|
||||||
for txn in self.transactions:
|
for txn in self.transactions:
|
||||||
if txn.request.full_url == final_url:
|
if txn['url'] == final_url:
|
||||||
final_bounces.append(txn)
|
final_bounces.append(txn)
|
||||||
|
|
||||||
return final_bounces
|
return final_bounces
|
||||||
@@ -316,15 +321,14 @@ class BrozzlerWorker:
|
|||||||
if not final_bounces:
|
if not final_bounces:
|
||||||
return True
|
return True
|
||||||
for txn in final_bounces:
|
for txn in final_bounces:
|
||||||
if txn.response.headers.get_content_type() in [
|
if txn['response_headers'].get_content_type() in [
|
||||||
'text/html', 'application/xhtml+xml']:
|
'text/html', 'application/xhtml+xml']:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _already_fetched(self, page, brozzler_spy):
|
def _already_fetched(self, page, brozzler_spy):
|
||||||
for txn in brozzler_spy.final_bounces(page.url):
|
for txn in brozzler_spy.final_bounces(page.url):
|
||||||
if (txn.request.get_method() == 'GET'
|
if (txn['method'] == 'GET' and txn['status_code'] == 200):
|
||||||
and txn.response.getcode() == 200):
|
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
2
setup.py
2
setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b9.dev195',
|
version='1.1b9.dev196',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
Loading…
x
Reference in New Issue
Block a user