Handle warcprox 420 "Reached limit" responses while browsing.

When a response has status 420 and a Warcprox-Meta header, the websocket
receiver thread stashes a brozzler.ReachedLimit built from that header and
raises brozzler.ReachedLimit in the calling thread; Browser catches it and
re-raises the stashed instance. Tests add a /420 endpoint to the test httpd.

diff --git a/brozzler/browser.py b/brozzler/browser.py
index 9765ab1..214ea4f 100644
--- a/brozzler/browser.py
+++ b/brozzler/browser.py
@@ -119,6 +119,7 @@ class WebsockReceiverThread(threading.Thread):
 
         self.is_open = False
         self.got_page_load_event = None
+        self.reached_limit = None
 
         self.on_request = None
         self.on_response = None
@@ -182,15 +183,22 @@ class WebsockReceiverThread(threading.Thread):
         self.websock.send(json.dumps(dict(id=0, method='Debugger.resume')))
 
     def _network_response_received(self, message):
-        # if (not self._reached_limit
-        #         and message['params']['response']['status'] == 420
-        #         and 'Warcprox-Meta' in CaseInsensitiveDict(
-        #             message['params']['response']['headers'])):
-        #     warcprox_meta = json.loads(CaseInsensitiveDict(
-        #         message['params']['response']['headers'])['Warcprox-Meta'])
-        #     self._reached_limit = brozzler.ReachedLimit(
-        #             warcprox_meta=warcprox_meta)
-        #     self.logger.info('reached limit %s', self._reached_limit)
+        if (message['params']['response']['status'] == 420
+                and 'Warcprox-Meta' in CaseInsensitiveDict(
+                    message['params']['response']['headers'])):
+            if not self.reached_limit:
+                warcprox_meta = json.loads(CaseInsensitiveDict(
+                    message['params']['response']['headers'])['Warcprox-Meta'])
+                self.reached_limit = brozzler.ReachedLimit(
+                        warcprox_meta=warcprox_meta)
+                self.logger.info('reached limit %s', self.reached_limit)
+                brozzler.thread_raise(
+                        self.calling_thread, brozzler.ReachedLimit)
+            else:
+                self.logger.info(
+                    'reached limit but self.reached_limit is already set, '
+                    'assuming the calling thread is already handling this: %s',
+                    self.reached_limit)
         if self.on_response:
             self.on_response(message)
 
@@ -422,6 +430,10 @@ class Browser:
             ## outlinks += retrieve_outlinks (60 sec)
             final_page_url = self.url()
             return final_page_url, outlinks
+        except brozzler.ReachedLimit:
+            # websock_thread has stashed the ReachedLimit exception with
+            # more information, raise that one
+            raise self.websock_thread.reached_limit
         except websocket.WebSocketConnectionClosedException as e:
             self.logger.error('websocket closed, did chrome die?')
             raise BrowsingException(e)
diff --git a/setup.py b/setup.py
index c464b8a..8f88b8c 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b9.dev155',
+        version='1.1b9.dev156',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
diff --git a/tests/test_brozzling.py b/tests/test_brozzling.py
index 870ff02..563f106 100644
--- a/tests/test_brozzling.py
+++ b/tests/test_brozzling.py
@@ -25,18 +25,44 @@ import http.server
 import threading
 import argparse
 import urllib
+import json
 
 args = argparse.Namespace()
 args.log_level = logging.INFO
 brozzler.cli._configure_logging(args)
 
+WARCPROX_META_420 = {
+    'stats': {
+        'test_limits_bucket': {
+            'total': {'urls': 0, 'wire_bytes': 0},
+            'new': {'urls': 0, 'wire_bytes': 0},
+            'revisit': {'urls': 0, 'wire_bytes': 0},
+            'bucket': 'test_limits_bucket'
+        }
+    },
+    'reached-limit': {'test_limits_bucket/total/urls': 0}
+}
+
 @pytest.fixture(scope='module')
 def httpd(request):
+    class RequestHandler(http.server.SimpleHTTPRequestHandler):
+        def do_GET(self):
+            if self.path == '/420':
+                self.send_response(420, 'Reached limit')
+                self.send_header('Connection', 'close')
+                self.send_header('Warcprox-Meta', json.dumps(WARCPROX_META_420))
+                payload = b'request rejected by warcprox: reached limit test_limits_bucket/total/urls=0\n'
+                self.send_header('Content-Type', 'text/plain;charset=utf-8')
+                self.send_header('Content-Length', len(payload))
+                self.end_headers()
+                self.wfile.write(payload)
+            else:
+                super().do_GET()
+
     # SimpleHTTPRequestHandler always uses CWD so we have to chdir
     os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
 
-    httpd = http.server.HTTPServer(
-            ('localhost', 0), http.server.SimpleHTTPRequestHandler)
+    httpd = http.server.HTTPServer(('localhost', 0), RequestHandler)
     httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
     httpd_thread.start()
 
@@ -68,6 +94,11 @@ def test_httpd(httpd):
 
     assert payload1 == payload2
 
+    url = 'http://localhost:%s/420' % httpd.server_port
+    with pytest.raises(urllib.error.HTTPError) as excinfo:
+        urllib.request.urlopen(url)
+    assert excinfo.value.getcode() == 420
+
 def test_aw_snap_hes_dead_jim():
     chrome_exe = brozzler.suggest_default_chrome_exe()
     with brozzler.Browser(chrome_exe=chrome_exe) as browser:
@@ -88,3 +119,10 @@ def test_on_response(httpd):
     assert response_urls[1] == 'http://localhost:%s/site3/brozzler.svg' % httpd.server_port
     assert response_urls[2] == 'http://localhost:%s/favicon.ico' % httpd.server_port
 
+def test_420(httpd):
+    chrome_exe = brozzler.suggest_default_chrome_exe()
+    url = 'http://localhost:%s/420' % httpd.server_port
+    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
+        with pytest.raises(brozzler.ReachedLimit) as excinfo:
+            browser.browse_page(url)
+        assert excinfo.value.warcprox_meta == WARCPROX_META_420