Mirror of https://github.com/internetarchive/brozzler.git (synced 2025-06-20 04:44:12 -04:00)
restore handling of 420 Reached limit, with a rudimentary test
This commit is contained in:
Parent: e5fb6cb4b9
Commit: 70b67942a5
3 changed files with 62 additions and 12 deletions
|
@ -119,6 +119,7 @@ class WebsockReceiverThread(threading.Thread):
|
||||||
|
|
||||||
self.is_open = False
|
self.is_open = False
|
||||||
self.got_page_load_event = None
|
self.got_page_load_event = None
|
||||||
|
self.reached_limit = None
|
||||||
|
|
||||||
self.on_request = None
|
self.on_request = None
|
||||||
self.on_response = None
|
self.on_response = None
|
||||||
|
@ -182,15 +183,22 @@ class WebsockReceiverThread(threading.Thread):
|
||||||
self.websock.send(json.dumps(dict(id=0, method='Debugger.resume')))
|
self.websock.send(json.dumps(dict(id=0, method='Debugger.resume')))
|
||||||
|
|
||||||
def _network_response_received(self, message):
|
def _network_response_received(self, message):
|
||||||
# if (not self._reached_limit
|
if (message['params']['response']['status'] == 420
|
||||||
# and message['params']['response']['status'] == 420
|
and 'Warcprox-Meta' in CaseInsensitiveDict(
|
||||||
# and 'Warcprox-Meta' in CaseInsensitiveDict(
|
message['params']['response']['headers'])):
|
||||||
# message['params']['response']['headers'])):
|
if not self.reached_limit:
|
||||||
# warcprox_meta = json.loads(CaseInsensitiveDict(
|
warcprox_meta = json.loads(CaseInsensitiveDict(
|
||||||
# message['params']['response']['headers'])['Warcprox-Meta'])
|
message['params']['response']['headers'])['Warcprox-Meta'])
|
||||||
# self._reached_limit = brozzler.ReachedLimit(
|
self.reached_limit = brozzler.ReachedLimit(
|
||||||
# warcprox_meta=warcprox_meta)
|
warcprox_meta=warcprox_meta)
|
||||||
# self.logger.info('reached limit %s', self._reached_limit)
|
self.logger.info('reached limit %s', self.reached_limit)
|
||||||
|
brozzler.thread_raise(
|
||||||
|
self.calling_thread, brozzler.ReachedLimit)
|
||||||
|
else:
|
||||||
|
self.logger.info(
|
||||||
|
'reached limit but self.reached_limit is already set, '
|
||||||
|
'assuming the calling thread is already handling this',
|
||||||
|
self.reached_limit)
|
||||||
if self.on_response:
|
if self.on_response:
|
||||||
self.on_response(message)
|
self.on_response(message)
|
||||||
|
|
||||||
|
@ -422,6 +430,10 @@ class Browser:
|
||||||
## outlinks += retrieve_outlinks (60 sec)
|
## outlinks += retrieve_outlinks (60 sec)
|
||||||
final_page_url = self.url()
|
final_page_url = self.url()
|
||||||
return final_page_url, outlinks
|
return final_page_url, outlinks
|
||||||
|
except brozzler.ReachedLimit:
|
||||||
|
# websock_thread has stashed the ReachedLimit exception with
|
||||||
|
# more information, raise that one
|
||||||
|
raise self.websock_thread.reached_limit
|
||||||
except websocket.WebSocketConnectionClosedException as e:
|
except websocket.WebSocketConnectionClosedException as e:
|
||||||
self.logger.error('websocket closed, did chrome die?')
|
self.logger.error('websocket closed, did chrome die?')
|
||||||
raise BrowsingException(e)
|
raise BrowsingException(e)
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b9.dev155',
|
version='1.1b9.dev156',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
|
@ -25,18 +25,44 @@ import http.server
|
||||||
import threading
|
import threading
|
||||||
import argparse
|
import argparse
|
||||||
import urllib
|
import urllib
|
||||||
|
import json
|
||||||
|
|
||||||
args = argparse.Namespace()
|
args = argparse.Namespace()
|
||||||
args.log_level = logging.INFO
|
args.log_level = logging.INFO
|
||||||
brozzler.cli._configure_logging(args)
|
brozzler.cli._configure_logging(args)
|
||||||
|
|
||||||
|
WARCPROX_META_420 = {
|
||||||
|
'stats': {
|
||||||
|
'test_limits_bucket': {
|
||||||
|
'total': {'urls': 0, 'wire_bytes': 0},
|
||||||
|
'new': {'urls': 0, 'wire_bytes': 0},
|
||||||
|
'revisit': {'urls': 0, 'wire_bytes': 0},
|
||||||
|
'bucket': 'test_limits_bucket'
|
||||||
|
}
|
||||||
|
},
|
||||||
|
'reached-limit': {'test_limits_bucket/total/urls': 0}
|
||||||
|
}
|
||||||
|
|
||||||
@pytest.fixture(scope='module')
|
@pytest.fixture(scope='module')
|
||||||
def httpd(request):
|
def httpd(request):
|
||||||
|
class RequestHandler(http.server.SimpleHTTPRequestHandler):
|
||||||
|
def do_GET(self):
|
||||||
|
if self.path == '/420':
|
||||||
|
self.send_response(420, 'Reached limit')
|
||||||
|
self.send_header('Connection', 'close')
|
||||||
|
self.send_header('Warcprox-Meta', json.dumps(WARCPROX_META_420))
|
||||||
|
payload = b'request rejected by warcprox: reached limit test_limits_bucket/total/urls=0\n'
|
||||||
|
self.send_header('Content-Type', 'text/plain;charset=utf-8')
|
||||||
|
self.send_header('Content-Length', len(payload))
|
||||||
|
self.end_headers()
|
||||||
|
self.wfile.write(payload)
|
||||||
|
else:
|
||||||
|
super().do_GET()
|
||||||
|
|
||||||
# SimpleHTTPRequestHandler always uses CWD so we have to chdir
|
# SimpleHTTPRequestHandler always uses CWD so we have to chdir
|
||||||
os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
|
os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
|
||||||
|
|
||||||
httpd = http.server.HTTPServer(
|
httpd = http.server.HTTPServer(('localhost', 0), RequestHandler)
|
||||||
('localhost', 0), http.server.SimpleHTTPRequestHandler)
|
|
||||||
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
|
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
|
||||||
httpd_thread.start()
|
httpd_thread.start()
|
||||||
|
|
||||||
|
@ -68,6 +94,11 @@ def test_httpd(httpd):
|
||||||
|
|
||||||
assert payload1 == payload2
|
assert payload1 == payload2
|
||||||
|
|
||||||
|
url = 'http://localhost:%s/420' % httpd.server_port
|
||||||
|
with pytest.raises(urllib.error.HTTPError) as excinfo:
|
||||||
|
urllib.request.urlopen(url)
|
||||||
|
assert excinfo.value.getcode() == 420
|
||||||
|
|
||||||
def test_aw_snap_hes_dead_jim():
|
def test_aw_snap_hes_dead_jim():
|
||||||
chrome_exe = brozzler.suggest_default_chrome_exe()
|
chrome_exe = brozzler.suggest_default_chrome_exe()
|
||||||
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
||||||
|
@ -88,3 +119,10 @@ def test_on_response(httpd):
|
||||||
assert response_urls[1] == 'http://localhost:%s/site3/brozzler.svg' % httpd.server_port
|
assert response_urls[1] == 'http://localhost:%s/site3/brozzler.svg' % httpd.server_port
|
||||||
assert response_urls[2] == 'http://localhost:%s/favicon.ico' % httpd.server_port
|
assert response_urls[2] == 'http://localhost:%s/favicon.ico' % httpd.server_port
|
||||||
|
|
||||||
|
def test_420(httpd):
|
||||||
|
chrome_exe = brozzler.suggest_default_chrome_exe()
|
||||||
|
url = 'http://localhost:%s/420' % httpd.server_port
|
||||||
|
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
||||||
|
with pytest.raises(brozzler.ReachedLimit) as excinfo:
|
||||||
|
browser.browse_page(url)
|
||||||
|
assert excinfo.value.warcprox_meta == WARCPROX_META_420
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue