bubble up proxy errors fetching robots.txt, with unit test, and documentation

Noah Levitt 2017-04-17 16:47:05 -07:00
parent df7734f2ca
commit 0884b4cd56
2 changed files with 45 additions and 3 deletions

@@ -73,6 +73,25 @@ def _robots_cache(site, proxy=None):
    return _robots_caches[site.id]

def is_permitted_by_robots(site, url, proxy=None):
    '''
    Checks if `url` is permitted by robots.txt.

    In case of problems fetching robots.txt, different things can happen.
    Reppy (the robots.txt parsing library) handles some exceptions internally
    and applies an appropriate policy. It bubbles up other exceptions. Of
    these, there are two kinds that this function raises for the caller to
    handle, described below. Yet other types of exceptions are caught, and the
    fetch is retried up to 10 times. In this case, after the 10th failure, the
    function returns `False` (i.e. forbidden by robots).

    Returns:
        bool: `True` if `site.ignore_robots` is set, or if `url` is permitted
            by robots.txt, `False` otherwise

    Raises:
        brozzler.ReachedLimit: if warcprox responded with 420 Reached Limit
        requests.exceptions.ProxyError: if the proxy is down
    '''
    if site.ignore_robots:
        return True
@ -83,13 +102,22 @@ def is_permitted_by_robots(site, url, proxy=None):
url, site.user_agent or "brozzler") url, site.user_agent or "brozzler")
return result return result
except BaseException as e: except BaseException as e:
if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit): if (isinstance(e, reppy.exceptions.ServerError)
and isinstance(e.args[0], brozzler.ReachedLimit)) or (
isinstance(e, reppy.exceptions.ConnectionException)
and isinstance(
e.args[0], requests.exceptions.ProxyError)):
# reppy has wrapped an exception that we want to bubble up
raise e.args[0] raise e.args[0]
else: else:
if tries_left > 0: if tries_left > 0:
logging.warn("caught exception fetching robots.txt (%s tries left) for %s: %s", tries_left, url, repr(e)) logging.warn(
"caught exception fetching robots.txt (%s tries "
"left) for %s: %s", tries_left, url, repr(e))
tries_left -= 1 tries_left -= 1
else: else:
logging.error("caught exception fetching robots.txt (0 tries left) for %s: %s", url, repr(e), exc_info=True) logging.error(
"caught exception fetching robots.txt (0 tries "
"left) for %s: %s", url, repr(e), exc_info=True)
return False return False
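The docstring above means callers of is_permitted_by_robots() now have two exception types to be ready for rather than only brozzler.ReachedLimit. A rough caller-side sketch (not part of this commit; the wrapper name and logging policy are invented purely for illustration):

# hypothetical caller-side sketch, not from this commit: one way to handle
# the exceptions that is_permitted_by_robots() now lets bubble up
import logging

import requests.exceptions

import brozzler

def robots_check(site, url, proxy=None):
    try:
        return brozzler.is_permitted_by_robots(site, url, proxy=proxy)
    except requests.exceptions.ProxyError:
        # the proxy is down; this is an operational failure, not a
        # "forbidden by robots" answer, so let the caller's retry logic
        # decide what to do with the site
        logging.warning('proxy error checking robots.txt for %s', url)
        raise
    except brozzler.ReachedLimit:
        # warcprox responded with 420 Reached Limit; stop crawling the site
        logging.info('reached limit checking robots.txt for %s', url)
        raise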

@@ -27,6 +27,7 @@ import socket
import logging
import yaml
import datetime
import requests

@pytest.fixture(scope='module')
def httpd(request):
@@ -107,6 +108,19 @@ blocks:
    assert site.is_in_scope(
            'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page)

def test_robots_proxy_down(httpd):
    '''
    Test that exception fetching robots.txt bubbles up if proxy is down.
    '''
    url = 'http://localhost:%s/' % httpd.server_port
    site = brozzler.Site(None, {'seed':url,'user_agent':'im/a/GoOdbot/yep'})
    sock = socket.socket()
    sock.bind(('127.0.0.1', 0))
    not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1]
    with pytest.raises(requests.exceptions.ProxyError):
        brozzler.is_permitted_by_robots(site, url, proxy=not_listening_proxy)

def test_start_stop_backwards_compat():
    site = brozzler.Site(None, {'seed': 'http://example.com/'})
    assert len(site.starts_and_stops) == 1
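The new test works because a TCP port that has been bound but never put into the listening state refuses connections, so using it as the proxy address makes requests raise ProxyError deterministically, with no external service involved. A standalone sketch of that behavior (illustrative only, not taken from the brozzler test suite):

# demonstrates the bound-but-not-listening trick used by
# test_robots_proxy_down: bind() reserves a port, and since listen() is
# never called, any connection attempt to it is refused
import socket

holder = socket.socket()
holder.bind(('127.0.0.1', 0))   # kernel picks a free ephemeral port
port = holder.getsockname()[1]

probe = socket.socket()
try:
    probe.connect(('127.0.0.1', port))
except ConnectionRefusedError:
    print('connection to 127.0.0.1:%s refused, as expected' % port)
finally:
    probe.close()
    holder.close()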