mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
bubble up proxy errors fetching robots.txt, with unit test, and documentation
This commit is contained in:
parent
df7734f2ca
commit
0884b4cd56
@ -73,6 +73,25 @@ def _robots_cache(site, proxy=None):
|
|||||||
return _robots_caches[site.id]
|
return _robots_caches[site.id]
|
||||||
|
|
||||||
def is_permitted_by_robots(site, url, proxy=None):
|
def is_permitted_by_robots(site, url, proxy=None):
|
||||||
|
'''
|
||||||
|
Checks if `url` is permitted by robots.txt.
|
||||||
|
|
||||||
|
In case of problems fetching robots.txt, different things can happen.
|
||||||
|
Reppy (the robots.txt parsing library) handles some exceptions internally
|
||||||
|
and applies an appropriate policy. It bubbles up other exceptions. Of
|
||||||
|
these, there are two kinds that this function raises for the caller to
|
||||||
|
handle, described below. Yet other types of exceptions are caught, and the
|
||||||
|
fetch is retried up to 10 times. In this case, after the 10th failure, the
|
||||||
|
function returns `False` (i.e. forbidden by robots).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
bool: `True` if `site.ignore_robots` is set, or if `url` is permitted
|
||||||
|
by robots.txt, `False` otherwise
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
brozzler.ReachedLimit: if warcprox responded with 420 Reached Limit
|
||||||
|
requests.exceptions.ProxyError: if the proxy is down
|
||||||
|
'''
|
||||||
if site.ignore_robots:
|
if site.ignore_robots:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@ -83,13 +102,22 @@ def is_permitted_by_robots(site, url, proxy=None):
|
|||||||
url, site.user_agent or "brozzler")
|
url, site.user_agent or "brozzler")
|
||||||
return result
|
return result
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
|
if (isinstance(e, reppy.exceptions.ServerError)
|
||||||
|
and isinstance(e.args[0], brozzler.ReachedLimit)) or (
|
||||||
|
isinstance(e, reppy.exceptions.ConnectionException)
|
||||||
|
and isinstance(
|
||||||
|
e.args[0], requests.exceptions.ProxyError)):
|
||||||
|
# reppy has wrapped an exception that we want to bubble up
|
||||||
raise e.args[0]
|
raise e.args[0]
|
||||||
else:
|
else:
|
||||||
if tries_left > 0:
|
if tries_left > 0:
|
||||||
logging.warn("caught exception fetching robots.txt (%s tries left) for %s: %s", tries_left, url, repr(e))
|
logging.warn(
|
||||||
|
"caught exception fetching robots.txt (%s tries "
|
||||||
|
"left) for %s: %s", tries_left, url, repr(e))
|
||||||
tries_left -= 1
|
tries_left -= 1
|
||||||
else:
|
else:
|
||||||
logging.error("caught exception fetching robots.txt (0 tries left) for %s: %s", url, repr(e), exc_info=True)
|
logging.error(
|
||||||
|
"caught exception fetching robots.txt (0 tries "
|
||||||
|
"left) for %s: %s", url, repr(e), exc_info=True)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
@ -27,6 +27,7 @@ import socket
|
|||||||
import logging
|
import logging
|
||||||
import yaml
|
import yaml
|
||||||
import datetime
|
import datetime
|
||||||
|
import requests
|
||||||
|
|
||||||
@pytest.fixture(scope='module')
|
@pytest.fixture(scope='module')
|
||||||
def httpd(request):
|
def httpd(request):
|
||||||
@ -107,6 +108,19 @@ blocks:
|
|||||||
assert site.is_in_scope(
|
assert site.is_in_scope(
|
||||||
'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page)
|
'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page)
|
||||||
|
|
||||||
|
def test_robots_proxy_down(httpd):
    '''
    Test that exception fetching robots.txt bubbles up if proxy is down.

    A socket is bound to an ephemeral port on 127.0.0.1 but never put into
    the listening state, and that address is passed as the proxy. The robots
    check is expected to raise `requests.exceptions.ProxyError` rather than
    swallowing the failure and returning a bool.

    Args:
        httpd: test http server fixture; only its `server_port` attribute is
            used here, to build the seed url
    '''
    url = 'http://localhost:%s/' % httpd.server_port
    site = brozzler.Site(None, {'seed':url,'user_agent':'im/a/GoOdbot/yep'})

    # Keep the socket open (bound, not listening) for the duration of the
    # call so the port stays reserved, then close it on exit instead of
    # leaking the file descriptor.
    with socket.socket() as sock:
        sock.bind(('127.0.0.1', 0))
        not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1]
        with pytest.raises(requests.exceptions.ProxyError):
            brozzler.is_permitted_by_robots(
                    site, url, proxy=not_listening_proxy)
|
||||||
|
|
||||||
def test_start_stop_backwards_compat():
|
def test_start_stop_backwards_compat():
|
||||||
site = brozzler.Site(None, {'seed': 'http://example.com/'})
|
site = brozzler.Site(None, {'seed': 'http://example.com/'})
|
||||||
assert len(site.starts_and_stops) == 1
|
assert len(site.starts_and_stops) == 1
|
||||||
|
Loading…
x
Reference in New Issue
Block a user