bubble up proxy errors fetching robots.txt, with unit test, and documentation

Noah Levitt 2017-04-17 16:47:05 -07:00
parent df7734f2ca
commit 0884b4cd56
2 changed files with 45 additions and 3 deletions

@@ -73,6 +73,25 @@ def _robots_cache(site, proxy=None):
    return _robots_caches[site.id]

def is_permitted_by_robots(site, url, proxy=None):
    '''
    Checks if `url` is permitted by robots.txt.

    In case of problems fetching robots.txt, different things can happen.
    Reppy (the robots.txt parsing library) handles some exceptions internally
    and applies an appropriate policy. It bubbles up other exceptions. Of
    these, there are two kinds that this function raises for the caller to
    handle, described below. Yet other types of exceptions are caught, and the
    fetch is retried up to 10 times. In this case, after the 10th failure, the
    function returns `False` (i.e. forbidden by robots).

    Returns:
        bool: `True` if `site.ignore_robots` is set, or if `url` is permitted
            by robots.txt, `False` otherwise

    Raises:
        brozzler.ReachedLimit: if warcprox responded with 420 Reached Limit
        requests.exceptions.ProxyError: if the proxy is down
    '''
    if site.ignore_robots:
        return True
@ -83,13 +102,22 @@ def is_permitted_by_robots(site, url, proxy=None):
url, site.user_agent or "brozzler") url, site.user_agent or "brozzler")
return result return result
except BaseException as e: except BaseException as e:
if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit): if (isinstance(e, reppy.exceptions.ServerError)
and isinstance(e.args[0], brozzler.ReachedLimit)) or (
isinstance(e, reppy.exceptions.ConnectionException)
and isinstance(
e.args[0], requests.exceptions.ProxyError)):
# reppy has wrapped an exception that we want to bubble up
raise e.args[0] raise e.args[0]
else: else:
if tries_left > 0: if tries_left > 0:
logging.warn("caught exception fetching robots.txt (%s tries left) for %s: %s", tries_left, url, repr(e)) logging.warn(
"caught exception fetching robots.txt (%s tries "
"left) for %s: %s", tries_left, url, repr(e))
tries_left -= 1 tries_left -= 1
else: else:
logging.error("caught exception fetching robots.txt (0 tries left) for %s: %s", url, repr(e), exc_info=True) logging.error(
"caught exception fetching robots.txt (0 tries "
"left) for %s: %s", url, repr(e), exc_info=True)
return False return False
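The docstring above means callers of is_permitted_by_robots() now have two exception types to be ready for rather than only brozzler.ReachedLimit. A rough caller-side sketch (not part of this commit; the wrapper name and logging policy are invented purely for illustration):

# hypothetical caller-side sketch, not from this commit: one way to handle
# the exceptions that is_permitted_by_robots() now lets bubble up
import logging

import requests.exceptions

import brozzler

def robots_check(site, url, proxy=None):
    try:
        return brozzler.is_permitted_by_robots(site, url, proxy=proxy)
    except requests.exceptions.ProxyError:
        # the proxy is down; this is an operational failure, not a
        # "forbidden by robots" answer, so let the caller's retry logic
        # decide what to do with the site
        logging.warning('proxy error checking robots.txt for %s', url)
        raise
    except brozzler.ReachedLimit:
        # warcprox responded with 420 Reached Limit; stop crawling the site
        logging.info('reached limit checking robots.txt for %s', url)
        raise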

@@ -27,6 +27,7 @@ import socket
import logging
import yaml
import datetime
import requests

@pytest.fixture(scope='module')
def httpd(request):
@@ -107,6 +108,19 @@ blocks:
    assert site.is_in_scope(
            'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page)

def test_robots_proxy_down(httpd):
    '''
    Test that exception fetching robots.txt bubbles up if proxy is down.
    '''
    url = 'http://localhost:%s/' % httpd.server_port
    site = brozzler.Site(None, {'seed':url,'user_agent':'im/a/GoOdbot/yep'})
    sock = socket.socket()
    sock.bind(('127.0.0.1', 0))
    not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1]
    with pytest.raises(requests.exceptions.ProxyError):
        brozzler.is_permitted_by_robots(site, url, proxy=not_listening_proxy)

def test_start_stop_backwards_compat():
    site = brozzler.Site(None, {'seed': 'http://example.com/'})
    assert len(site.starts_and_stops) == 1
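The new test works because a TCP port that has been bound but never put into the listening state refuses connections, so using it as the proxy address makes requests raise ProxyError deterministically, with no external service involved. A standalone sketch of that behavior (illustrative only, not taken from the brozzler test suite):

# demonstrates the bound-but-not-listening trick used by
# test_robots_proxy_down: bind() reserves a port, and since listen() is
# never called, any connection attempt to it is refused
import socket

holder = socket.socket()
holder.bind(('127.0.0.1', 0))   # kernel picks a free ephemeral port
port = holder.getsockname()[1]

probe = socket.socket()
try:
    probe.connect(('127.0.0.1', port))
except ConnectionRefusedError:
    print('connection to 127.0.0.1:%s refused, as expected' % port)
finally:
    probe.close()
    holder.close()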