diff --git a/brozzler/robots.py b/brozzler/robots.py
index ef69ab9..046ef22 100644
--- a/brozzler/robots.py
+++ b/brozzler/robots.py
@@ -101,14 +101,14 @@ def is_permitted_by_robots(site, url, proxy=None):
             result = _robots_cache(site, proxy).allowed(
                     url, site.user_agent or "brozzler")
             return result
-        except BaseException as e:
-            if (isinstance(e, reppy.exceptions.ServerError)
-                    and isinstance(e.args[0], brozzler.ReachedLimit)) or (
-                            isinstance(e, reppy.exceptions.ConnectionException)
-                            and isinstance(
-                                e.args[0], requests.exceptions.ProxyError)):
-                # reppy has wrapped an exception that we want to bubble up
+        except Exception as e:
+            if isinstance(e, reppy.exceptions.ServerError) and isinstance(
+                    e.args[0], brozzler.ReachedLimit):
                 raise e.args[0]
+            elif hasattr(e, 'args') and isinstance(
+                    e.args[0], requests.exceptions.ProxyError):
+                # reppy has wrapped an exception that we want to bubble up
+                raise brozzler.ProxyError(e)
             else:
                 if tries_left > 0:
                     logging.warn(
diff --git a/brozzler/worker.py b/brozzler/worker.py
index 30b5e41..1aef258 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -251,10 +251,17 @@ class BrozzlerWorker:
         except BaseException as e:
             if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
                 pass
-            elif (hasattr(e, "exc_info") and e.exc_info[0] ==
-                    urllib.error.HTTPError and hasattr(e.exc_info[1], "code")
+            elif (hasattr(e, "exc_info")
+                    and e.exc_info[0] == urllib.error.HTTPError
+                    and hasattr(e.exc_info[1], "code")
                     and e.exc_info[1].code == 420):
                 raise brozzler.ReachedLimit(e.exc_info[1])
+            elif (hasattr(e, 'exc_info')
+                    and e.exc_info[0] == urllib.error.URLError
+                    and self._proxy_for(site)):
+                # connection problem when using a proxy == proxy error (XXX?)
+                raise brozzler.ProxyError(
+                        'youtube-dl hit apparent proxy error', e)
             else:
                 raise
 
@@ -285,6 +292,8 @@ class BrozzlerWorker:
             raise
         except brozzler.ShutdownRequested:
             raise
+        except brozzler.ProxyError:
+            raise
         except Exception as e:
             if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
                     and hasattr(e.exc_info[1], 'code')
@@ -294,7 +303,7 @@ class BrozzlerWorker:
                         e.exc_info[1].code, e.exc_info[1].msg, page.url)
             else:
                 self.logger.error(
-                        "youtube_dl raised exception on %s", page,
+                        'youtube_dl raised exception on %s', page,
                         exc_info=True)
 
         if self._needs_browsing(page, ydl_spy):
@@ -379,10 +388,13 @@ class BrozzlerWorker:
             }
 
         self.logger.info('fetching %s', page)
-        # response is ignored
-        requests.get(
-                page.url, proxies=proxies, headers=site.extra_headers(),
-                verify=False)
+        try:
+            # response is ignored
+            requests.get(
+                    page.url, proxies=proxies, headers=site.extra_headers(),
+                    verify=False)
+        except requests.exceptions.ProxyError as e:
+            raise brozzler.ProxyError(e)
 
     def _needs_browsing(self, page, brozzler_spy):
         final_bounces = brozzler_spy.final_bounces(page.url)
diff --git a/setup.py b/setup.py
index 271df9f..d3df130 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b11.dev232',
+        version='1.1b11.dev233',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
diff --git a/tests/test_units.py b/tests/test_units.py
index ca0f529..142a664 100644
--- a/tests/test_units.py
+++ b/tests/test_units.py
@@ -23,11 +23,11 @@ import threading
 import os
 import brozzler
 import brozzler.chrome
-import socket
 import logging
 import yaml
 import datetime
 import requests
+import tempfile
 
 @pytest.fixture(scope='module')
 def httpd(request):
@@ -108,18 +108,41 @@ blocks:
     assert site.is_in_scope(
             'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page)
 
-def test_robots_proxy_down(httpd):
+def test_proxy_down():
     '''
-    Test that exception fetching robots.txt bubbles up if proxy is down.
-    '''
-    url = 'http://localhost:%s/' % httpd.server_port
-    site = brozzler.Site(None, {'seed':url,'user_agent':'im/a/GoOdbot/yep'})
+    Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.
 
-    sock = socket.socket()
-    sock.bind(('127.0.0.1', 0))
-    not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1]
-    with pytest.raises(requests.exceptions.ProxyError):
-        brozzler.is_permitted_by_robots(site, url, proxy=not_listening_proxy)
+    This test needs to cover every possible fetch through the proxy other than
+    fetches from the browser. For that, see test_brozzling.py.
+    '''
+    # nobody listens on port 4 :)
+    not_listening_proxy = '127.0.0.1:4'
+
+    ### binding and not listening produces another type of connection
+    ### error, which we could test, but it takes a while
+    # sock = socket.socket()
+    # sock.bind(('127.0.0.1', 0))
+    # not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1]
+
+    worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
+
+    site = brozzler.Site(None, {'seed':'http://example.com/'})
+    page = brozzler.Page(None, {'url': 'http://example.com/'})
+
+    # robots.txt fetch
+    with pytest.raises(brozzler.ProxyError):
+        brozzler.is_permitted_by_robots(
+                site, 'http://example.com/', proxy=not_listening_proxy)
+
+    # youtube-dl fetch
+    with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
+        ydl = worker._youtube_dl(tempdir, site)
+        with pytest.raises(brozzler.ProxyError):
+            worker._try_youtube_dl(ydl, site, page)
+
+    # raw fetch
+    with pytest.raises(brozzler.ProxyError):
+        worker._fetch_url(site, page)
 
 def test_start_stop_backwards_compat():
     site = brozzler.Site(None, {'seed': 'http://example.com/'})
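
Note: the hunks above raise brozzler.ProxyError in robots.py and worker.py, but the diff does not include the file where that exception class is defined (presumably brozzler/__init__.py, which is not shown here). The following is only a sketch under the assumption that ProxyError is a plain Exception subclass exported by the brozzler package; fetch_via_proxy() is a hypothetical helper written just to illustrate the same wrap-and-reraise pattern as the _fetch_url() change, not brozzler's actual API.

    # Sketch only -- brozzler/__init__.py is not part of this diff, so this is
    # an assumption about how ProxyError might be defined, not the real source.
    import logging
    import requests

    class ProxyError(Exception):
        '''Raised when a fetch fails because the proxy itself is unreachable.'''

    def fetch_via_proxy(url, proxy):
        # hypothetical helper mirroring the _fetch_url() hunk above: translate
        # the requests-level proxy failure into the brozzler-level exception so
        # callers can react to proxy trouble specifically
        proxies = {'http': 'http://%s' % proxy, 'https': 'http://%s' % proxy}
        try:
            return requests.get(url, proxies=proxies, verify=False)
        except requests.exceptions.ProxyError as e:
            raise ProxyError(e)

    if __name__ == '__main__':
        try:
            fetch_via_proxy('http://example.com/', '127.0.0.1:4')
        except ProxyError as e:
            logging.warning('proxy appears to be down: %s', e)

This keeps ordinary fetch failures (HTTP errors, DNS problems on the target site) distinct from "the proxy is down", which is what the new test_proxy_down() test exercises for the robots.txt, youtube-dl, and raw-fetch code paths.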