From 349b41ab32dd41aaad8a58ba2153c197c6c80ab4 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 17 Apr 2017 18:14:02 -0700 Subject: [PATCH] raise new exception brozzler.ProxyError in case of proxy error browsing a page --- brozzler/__init__.py | 3 +++ brozzler/browser.py | 7 ++++++- setup.py | 2 +- tests/test_brozzling.py | 26 ++++++++++++++++++++++++++ 4 files changed, 36 insertions(+), 2 deletions(-) diff --git a/brozzler/__init__.py b/brozzler/__init__.py index 3443872..6a4303e 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -29,6 +29,9 @@ class NothingToClaim(Exception): class CrawlStopped(Exception): pass +class ProxyError(Exception): + pass + class ReachedLimit(Exception): def __init__(self, http_error=None, warcprox_meta=None, http_payload=None): import json diff --git a/brozzler/browser.py b/brozzler/browser.py index e5b236f..bd705f0 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -239,6 +239,10 @@ class WebsockReceiverThread(threading.Thread): message['params']['message']['text']) elif message['method'] == 'Page.javascriptDialogOpening': self._javascript_dialog_opening(message) + elif (message['method'] == 'Network.loadingFailed' + and 'params' in message and 'errorText' in message['params'] + and message['params']['errorText'] == 'net::ERR_PROXY_CONNECTION_FAILED'): + brozzler.thread_raise(self.calling_thread, brozzler.ProxyError) # else: # self.logger.debug("%s %s", message["method"], json_message) elif 'result' in message: @@ -411,7 +415,8 @@ class Browser: outlinks: a list of navigational links extracted from the page Raises: - BrowsingException: if browsing the page fails + brozzler.ProxyError: in case of proxy connection error + BrowsingException: if browsing the page fails in some other way ''' if not self.is_running(): raise BrowsingException('browser has not been started') diff --git a/setup.py b/setup.py index 6df17b6..271df9f 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b11.dev231', + version='1.1b11.dev232', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', diff --git a/tests/test_brozzling.py b/tests/test_brozzling.py index c43c2f1..3253954 100644 --- a/tests/test_brozzling.py +++ b/tests/test_brozzling.py @@ -26,6 +26,7 @@ import threading import argparse import urllib import json +import threading args = argparse.Namespace() args.log_level = logging.INFO @@ -186,3 +187,28 @@ def test_extract_outlinks(httpd): 'http://localhost:%s/site8/fdjisapofdjisap#2' % httpd.server_port } +def test_proxy_down(): + ''' + Test that browsing raises `brozzler.ProxyError` when proxy is down. + + See also `test_proxy_down` in test_units.py. + ''' + site = brozzler.Site(None, {'seed':'http://example.com/'}) + page = brozzler.Page(None, {'url': 'http://example.com/'}) + + # nobody listens on port 4 :) + not_listening_proxy = '127.0.0.1:4' + + ### binding and not listening produces another type of connection + ### error, which we could test, but it takes a while + # sock = socket.socket() + # sock.bind(('127.0.0.1', 0)) + # not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1] + + worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy) + chrome_exe = brozzler.suggest_default_chrome_exe() + + with brozzler.Browser(chrome_exe=chrome_exe) as browser: + with pytest.raises(brozzler.ProxyError): + worker.brozzle_page(browser, site, page) +