treat any error fetching robots.txt as "allow all"

2025-08-21 20:38:20 -04:00 · 2018-06-22 14:50:57 -05:00 · 2018-06-22 14:50:57 -05:00 · aeb7c3f825
commit aeb7c3f825
parent f5f9a1a137
2 changed files with 110 additions and 44 deletions
--- a/brozzler/robots.py
+++ b/brozzler/robots.py
@ -46,20 +46,21 @@ def _reppy_rules_getitem(self, agent):
    return self.agents.get('*')
 reppy.parser.Rules.__getitem__ = _reppy_rules_getitem
 class _SessionRaiseOn420(requests.Session):
    '''
    `requests.Session` whose `get()` applies a default timeout and turns
    warcprox "limit reached" responses into `brozzler.ReachedLimit`.

    Attributes:
        timeout: default value passed to `requests` as `timeout` (seconds,
            per the requests documentation) when the caller does not supply
            one. It is a class attribute, so it can be overridden for every
            session at once.
    '''
    timeout = 60
    def get(self, url, *args, **kwargs):
        '''
        Issues a GET request for `url`.

        Returns:
            the response, unless it is a warcprox limit response

        Raises:
            brozzler.ReachedLimit: if the response status is 420 and it
                carries a `warcprox-meta` header
        '''
        # setdefault rather than passing timeout= explicitly: an explicit
        # keyword would collide with a caller-supplied `timeout` and raise
        # TypeError ("got multiple values for keyword argument")
        kwargs.setdefault('timeout', self.timeout)
        res = super().get(url, *args, **kwargs)
        if res.status_code == 420 and 'warcprox-meta' in res.headers:
            raise brozzler.ReachedLimit(
                    warcprox_meta=json.loads(res.headers['warcprox-meta']),
                    http_payload=res.text)
        return res
 _robots_caches = {}  # {site_id:reppy.cache.RobotsCache}
 def _robots_cache(site, proxy=None):
    class SessionRaiseOn420(requests.Session):
        def get(self, url, *args, **kwargs):
            res = super().get(url, *args, **kwargs)
            if res.status_code == 420 and 'warcprox-meta' in res.headers:
                raise brozzler.ReachedLimit(
                        warcprox_meta=json.loads(res.headers['warcprox-meta']),
                        http_payload=res.text)
            else:
                return res
    if not site.id in _robots_caches:
-        req_sesh = SessionRaiseOn420()
+        req_sesh = _SessionRaiseOn420()
        req_sesh.verify = False   # ignore cert errors
        if proxy:
            proxie = "http://%s" % proxy
@ -68,7 +69,8 @@ def _robots_cache(site, proxy=None):
            req_sesh.headers.update(site.extra_headers())
        if site.user_agent:
            req_sesh.headers['User-Agent'] = site.user_agent
-        _robots_caches[site.id] = reppy.cache.RobotsCache(session=req_sesh)
+        _robots_caches[site.id] = reppy.cache.RobotsCache(
                session=req_sesh, disallow_forbidden=False)
    return _robots_caches[site.id]
@ -76,13 +78,9 @@ def is_permitted_by_robots(site, url, proxy=None):
    '''
    Checks if `url` is permitted by robots.txt.
-    In case of problems fetching robots.txt, different things can happen.
+    Treats any kind of error fetching robots.txt as "allow all". See
-    Reppy (the robots.txt parsing library) handles some exceptions internally
+    http://builds.archive.org/javadoc/heritrix-3.x-snapshot/org/archive/modules/net/CrawlServer.html#updateRobots(org.archive.modules.CrawlURI)
-    and applies an appropriate policy. It bubbles up other exceptions. Of
+    for some background on that policy.
    these, there are two kinds that this function raises for the caller to
    handle, described below. Yet other types of exceptions are caught, and the
    fetch is retried up to 10 times. In this case, after the 10th failure, the
    function returns `False` (i.e. forbidden by robots).
    Returns:
        bool: `True` if `site.ignore_robots` is set, or if `url` is permitted
@ -95,29 +93,21 @@ def is_permitted_by_robots(site, url, proxy=None):
    if site.ignore_robots:
        return True
-    tries_left = 10
+    try:
-    while True:
+        result = _robots_cache(site, proxy).allowed(
-        try:
+                url, site.user_agent or "brozzler")
-            result = _robots_cache(site, proxy).allowed(
+        return result
-                    url, site.user_agent or "brozzler")
+    except Exception as e:
-            return result
+        if isinstance(e, reppy.exceptions.ServerError) and isinstance(
-        except Exception as e:
+                e.args[0], brozzler.ReachedLimit):
-            if isinstance(e, reppy.exceptions.ServerError) and isinstance(
+            raise e.args[0]
-                    e.args[0], brozzler.ReachedLimit):
+        elif hasattr(e, 'args') and isinstance(
-                raise e.args[0]
+                e.args[0], requests.exceptions.ProxyError):
-            elif hasattr(e, 'args') and isinstance(
+            # reppy has wrapped an exception that we want to bubble up
-                    e.args[0], requests.exceptions.ProxyError):
+            raise brozzler.ProxyError(e)
-                # reppy has wrapped an exception that we want to bubble up
+        else:
-                raise brozzler.ProxyError(e)
+            logging.warn(
-            else:
+                    "returning true (permitted) after problem fetching "
-                if tries_left > 0:
+                    "robots.txt for %r: %r", url, e)
-                    logging.warn(
+            return True
                            "caught exception fetching robots.txt (%r tries "
                            "left) for %r: %r", tries_left, url, e)
                    tries_left -= 1
                else:
                    logging.error(
                            "caught exception fetching robots.txt (0 tries "
                            "left) for %r: %r", url, e, exc_info=True)
                    return False
--- a/tests/test_units.py
+++ b/tests/test_units.py
@ -32,6 +32,7 @@ import uuid
 import socket
 import time
 import sys
 import threading
 logging.basicConfig(
        stream=sys.stderr, level=logging.INFO, format=(
@ -67,6 +68,81 @@ def test_robots(httpd):
    site = brozzler.Site(None, {'seed':url,'user_agent':'im/a bAdBOt/uh huh'})
    assert not brozzler.is_permitted_by_robots(site, url)
 def test_robots_http_statuses():
    '''
    Checks that `url` is considered permitted no matter which of the listed
    http status codes the server answers the request with.

    A throwaway local http server is started for each status code and torn
    down again before moving on to the next one.
    '''
    for status in (
            200, 204, 400, 401, 402, 403, 404, 405,
            500, 501, 502, 503, 504, 505):
        class Handler(http.server.BaseHTTPRequestHandler):
            # bound when the class is created, so the handler does not
            # depend on the value of the loop variable at request time
            status_code = status
            def do_GET(self):
                # %d, not %s: with bytes %-formatting, %s requires a
                # bytes-like object and raises TypeError for an int, which
                # would kill the handler before any response is sent
                response = (b'HTTP/1.1 %d Meaningless message\r\n'
                          + b'Content-length: 0\r\n'
                          + b'\r\n') % self.status_code
                # write the raw response ourselves so that arbitrary status
                # codes go out exactly as composed above
                self.connection.sendall(response)
        httpd = http.server.HTTPServer(('localhost', 0), Handler)
        httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
        httpd_thread.start()
        try:
            url = 'http://localhost:%s/' % httpd.server_port
            site = brozzler.Site(None, {'seed': url})
            assert brozzler.is_permitted_by_robots(site, url)
        finally:
            # always stop the server, even if the assertion fails
            httpd.shutdown()
            httpd.server_close()
            httpd_thread.join()
 def test_robots_empty_respone():
    '''
    Checks that `url` is considered permitted when the server drops the
    connection without sending any response at all.
    '''
    class Handler(http.server.BaseHTTPRequestHandler):
        def do_GET(self):
            # hang up on the client without writing a single byte
            conn = self.connection
            conn.shutdown(socket.SHUT_RDWR)
            conn.close()
    server = http.server.HTTPServer(('localhost', 0), Handler)
    server_thread = threading.Thread(
            name='httpd', target=server.serve_forever)
    server_thread.start()
    try:
        url = 'http://localhost:{}/'.format(server.server_port)
        site = brozzler.Site(None, {'seed': url})
        permitted = brozzler.is_permitted_by_robots(site, url)
        assert permitted
    finally:
        # tear the server down whether or not the check passed
        server.shutdown()
        server.server_close()
        server_thread.join()
 def test_robots_socket_timeout():
    '''
    Checks that `url` is considered permitted when the server stalls for
    longer than the (temporarily shortened) session timeout.
    '''
    stop_hanging = threading.Event()
    class Handler(http.server.BaseHTTPRequestHandler):
        def do_GET(self):
            # stall until the test releases us (or 60 seconds pass), and
            # only then answer
            stop_hanging.wait(60)
            reply = b'HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n'
            self.connection.sendall(reply)
    session_class = brozzler.robots._SessionRaiseOn420
    saved_timeout = session_class.timeout
    server = http.server.HTTPServer(('localhost', 0), Handler)
    server_thread = threading.Thread(
            name='httpd', target=server.serve_forever)
    server_thread.start()
    try:
        url = 'http://localhost:{}/'.format(server.server_port)
        site = brozzler.Site(None, {'seed': url})
        # shorten the timeout so the test does not wait the full default
        session_class.timeout = 2
        permitted = brozzler.is_permitted_by_robots(site, url)
        assert permitted
    finally:
        # restore the class-level timeout, release the stalled handler, then
        # stop the server
        session_class.timeout = saved_timeout
        stop_hanging.set()
        server.shutdown()
        server.server_close()
        server_thread.join()
 def test_robots_dns_failure():
    '''
    Checks that `url` is considered permitted when its hostname cannot be
    resolved.
    '''
    # .invalid. is guaranteed nonexistent per rfc 6761
    unresolvable_url = 'http://whatever.invalid./'
    site = brozzler.Site(None, {'seed': unresolvable_url})
    permitted = brozzler.is_permitted_by_robots(site, unresolvable_url)
    assert permitted
 def test_scoping():
    test_scope = yaml.load('''
 max_hops: 100