Merge pull request #110 from nlevitt/robots-errors

treat any error fetching robots.txt as "allow all"
Noah Levitt 2018-06-25 11:44:18 -05:00 committed by GitHub
commit 05ec6a68b0
3 changed files with 121 additions and 45 deletions

View File

@@ -46,20 +46,21 @@ def _reppy_rules_getitem(self, agent):
return self.agents.get('*')
reppy.parser.Rules.__getitem__ = _reppy_rules_getitem
class _SessionRaiseOn420(requests.Session):
timeout = 60
def get(self, url, *args, **kwargs):
res = super().get(url, timeout=self.timeout, *args, **kwargs)
if res.status_code == 420 and 'warcprox-meta' in res.headers:
raise brozzler.ReachedLimit(
warcprox_meta=json.loads(res.headers['warcprox-meta']),
http_payload=res.text)
else:
return res
_robots_caches = {} # {site_id:reppy.cache.RobotsCache}
def _robots_cache(site, proxy=None):
class SessionRaiseOn420(requests.Session):
def get(self, url, *args, **kwargs):
res = super().get(url, *args, **kwargs)
if res.status_code == 420 and 'warcprox-meta' in res.headers:
raise brozzler.ReachedLimit(
warcprox_meta=json.loads(res.headers['warcprox-meta']),
http_payload=res.text)
else:
return res
if not site.id in _robots_caches:
req_sesh = SessionRaiseOn420()
req_sesh = _SessionRaiseOn420()
req_sesh.verify = False # ignore cert errors
if proxy:
proxie = "http://%s" % proxy
@@ -68,7 +69,8 @@ def _robots_cache(site, proxy=None):
req_sesh.headers.update(site.extra_headers())
if site.user_agent:
req_sesh.headers['User-Agent'] = site.user_agent
_robots_caches[site.id] = reppy.cache.RobotsCache(session=req_sesh)
_robots_caches[site.id] = reppy.cache.RobotsCache(
session=req_sesh, disallow_forbidden=False)
return _robots_caches[site.id]
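
The cache construction above is half of the new policy: each site gets one reppy RobotsCache built around a requests session carrying the site's proxy, extra headers, and user agent, and `disallow_forbidden=False` appears, judging from the name and this PR's intent, to stop a 401/403 robots.txt response from being read as "disallow all" (not confirmed against reppy's docs). A minimal sketch of the same construction in isolation, using only calls that appear in this diff and assuming the reppy 0.3.x cache API:

import requests
import reppy.cache

# Minimal sketch mirroring _robots_cache() above; example.com URL is illustrative.
session = requests.Session()
session.verify = False  # ignore TLS certificate errors, as brozzler does for robots.txt

robots = reppy.cache.RobotsCache(session=session, disallow_forbidden=False)

# allowed() fetches and caches robots.txt on demand, then checks the URL
# against the rules for the given user agent.
print(robots.allowed('https://example.com/some/page', 'brozzler'))
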
@@ -76,13 +78,9 @@ def is_permitted_by_robots(site, url, proxy=None):
'''
Checks if `url` is permitted by robots.txt.
In case of problems fetching robots.txt, different things can happen.
Reppy (the robots.txt parsing library) handles some exceptions internally
and applies an appropriate policy. It bubbles up other exceptions. Of
these, there are two kinds that this function raises for the caller to
handle, described below. Yet other types of exceptions are caught, and the
fetch is retried up to 10 times. In this case, after the 10th failure, the
function returns `False` (i.e. forbidden by robots).
Treats any kind of error fetching robots.txt as "allow all". See
http://builds.archive.org/javadoc/heritrix-3.x-snapshot/org/archive/modules/net/CrawlServer.html#updateRobots(org.archive.modules.CrawlURI)
for some background on that policy.
Returns:
bool: `True` if `site.ignore_robots` is set, or if `url` is permitted
@@ -95,29 +93,21 @@
if site.ignore_robots:
return True
tries_left = 10
while True:
try:
result = _robots_cache(site, proxy).allowed(
url, site.user_agent or "brozzler")
return result
except Exception as e:
if isinstance(e, reppy.exceptions.ServerError) and isinstance(
e.args[0], brozzler.ReachedLimit):
raise e.args[0]
elif hasattr(e, 'args') and isinstance(
e.args[0], requests.exceptions.ProxyError):
# reppy has wrapped an exception that we want to bubble up
raise brozzler.ProxyError(e)
else:
if tries_left > 0:
logging.warn(
"caught exception fetching robots.txt (%r tries "
"left) for %r: %r", tries_left, url, e)
tries_left -= 1
else:
logging.error(
"caught exception fetching robots.txt (0 tries "
"left) for %r: %r", url, e, exc_info=True)
return False
try:
result = _robots_cache(site, proxy).allowed(
url, site.user_agent or "brozzler")
return result
except Exception as e:
if isinstance(e, reppy.exceptions.ServerError) and isinstance(
e.args[0], brozzler.ReachedLimit):
raise e.args[0]
elif hasattr(e, 'args') and isinstance(
e.args[0], requests.exceptions.ProxyError):
# reppy has wrapped an exception that we want to bubble up
raise brozzler.ProxyError(e)
else:
logging.warn(
"returning true (permitted) after problem fetching "
"robots.txt for %r: %r", url, e)
return True
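
The other half of the policy is the simplified exception handling above: the retry loop is gone, and any problem fetching or parsing robots.txt now yields `True` (permitted). Only two conditions still propagate to the caller: a warcprox 420 response, re-raised as `brozzler.ReachedLimit`, and a proxy failure, re-raised as `brozzler.ProxyError`. A minimal caller-side sketch of that contract (the `Site` construction follows the tests below; the URLs are illustrative):

import brozzler

site = brozzler.Site(None, {'seed': 'http://example.com/'})
url = 'http://example.com/some/page'

try:
    if brozzler.is_permitted_by_robots(site, url):
        ...  # robots.txt permits the url, or could not be fetched at all
    else:
        ...  # robots.txt was fetched successfully and disallows the url
except brozzler.ReachedLimit:
    ...  # warcprox replied 420: a configured crawl limit was reached
except brozzler.ProxyError:
    ...  # the proxy itself failed; the check can be retried later
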

View File

@@ -769,7 +769,7 @@ def test_time_limit(httpd):
rr = doublethink.Rethinker('localhost', db='brozzler')
frontier = brozzler.RethinkDbFrontier(rr)
# create a new job with three sites that could be crawled forever
# create a new job with one seed that could be crawled forever
job_conf = {'seeds': [{
'url': 'http://localhost:%s/infinite/foo/' % httpd.server_port,
'time_limit': 20}]}
@@ -789,6 +789,10 @@ def test_time_limit(httpd):
assert sites[0].status == 'FINISHED_TIME_LIMIT'
# all sites finished so job should be finished too
start = time.time()
job.refresh()
while not job.status == 'FINISHED' and time.time() - start < 10:
time.sleep(0.5)
job.refresh()
assert job.status == 'FINISHED'

View File

@@ -32,6 +32,7 @@ import uuid
import socket
import time
import sys
import threading
logging.basicConfig(
stream=sys.stderr, level=logging.INFO, format=(
@@ -67,6 +68,87 @@ def test_robots(httpd):
site = brozzler.Site(None, {'seed':url,'user_agent':'im/a bAdBOt/uh huh'})
assert not brozzler.is_permitted_by_robots(site, url)
def test_robots_http_statuses():
for status in (
200, 204, 400, 401, 402, 403, 404, 405,
500, 501, 502, 503, 504, 505):
class Handler(http.server.BaseHTTPRequestHandler):
def do_GET(self):
response = (('HTTP/1.1 %s Meaningless message\r\n'
+ 'Content-length: 0\r\n'
+ '\r\n') % status).encode('utf-8')
self.connection.sendall(response)
# self.send_response(status)
# self.end_headers()
httpd = http.server.HTTPServer(('localhost', 0), Handler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
httpd_thread.start()
try:
url = 'http://localhost:%s/' % httpd.server_port
site = brozzler.Site(None, {'seed': url})
assert brozzler.is_permitted_by_robots(site, url)
finally:
httpd.shutdown()
httpd.server_close()
httpd_thread.join()
def test_robots_empty_response():
class Handler(http.server.BaseHTTPRequestHandler):
def do_GET(self):
self.connection.shutdown(socket.SHUT_RDWR)
self.connection.close()
httpd = http.server.HTTPServer(('localhost', 0), Handler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
httpd_thread.start()
try:
url = 'http://localhost:%s/' % httpd.server_port
site = brozzler.Site(None, {'seed': url})
assert brozzler.is_permitted_by_robots(site, url)
finally:
httpd.shutdown()
httpd.server_close()
httpd_thread.join()
def test_robots_socket_timeout():
stop_hanging = threading.Event()
class Handler(http.server.BaseHTTPRequestHandler):
def do_GET(self):
stop_hanging.wait(60)
self.connection.sendall(
b'HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n')
orig_timeout = brozzler.robots._SessionRaiseOn420.timeout
httpd = http.server.HTTPServer(('localhost', 0), Handler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
httpd_thread.start()
try:
url = 'http://localhost:%s/' % httpd.server_port
site = brozzler.Site(None, {'seed': url})
brozzler.robots._SessionRaiseOn420.timeout = 2
assert brozzler.is_permitted_by_robots(site, url)
finally:
brozzler.robots._SessionRaiseOn420.timeout = orig_timeout
stop_hanging.set()
httpd.shutdown()
httpd.server_close()
httpd_thread.join()
def test_robots_dns_failure():
# .invalid. is guaranteed nonexistent per rfc 6761
url = 'http://whatever.invalid./'
site = brozzler.Site(None, {'seed': url})
assert brozzler.is_permitted_by_robots(site, url)
def test_robots_connection_failure():
url = 'http://localhost:4/' # nobody listens on port 4
site = brozzler.Site(None, {'seed': url})
assert brozzler.is_permitted_by_robots(site, url)
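
The new server-backed tests all follow the same shape: a throwaway `http.server.HTTPServer` bound to port 0 so the OS assigns a free port, served from a background thread, and torn down in a `finally` block via `shutdown()`, `server_close()`, and `join()`. If the pattern keeps growing it could be lifted into a small helper; here is a hypothetical context manager (not part of this change) sketching that:

import contextlib
import http.server
import threading

@contextlib.contextmanager
def ephemeral_httpd(handler_class):
    # Hypothetical helper, not in this PR: wraps the setup/teardown that the
    # tests above repeat, and yields the server's base url.
    httpd = http.server.HTTPServer(('localhost', 0), handler_class)
    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
    httpd_thread.start()
    try:
        yield 'http://localhost:%s/' % httpd.server_port
    finally:
        httpd.shutdown()
        httpd.server_close()
        httpd_thread.join()

With it, each of the server-backed tests above would reduce to a `with ephemeral_httpd(Handler) as url:` block around the assertion.
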
def test_scoping():
test_scope = yaml.load('''
max_hops: 100