diff --git a/brozzler/robots.py b/brozzler/robots.py
index aef9913..5b96423 100644
--- a/brozzler/robots.py
+++ b/brozzler/robots.py
@@ -46,20 +46,21 @@ def _reppy_rules_getitem(self, agent):
     return self.agents.get('*')
 reppy.parser.Rules.__getitem__ = _reppy_rules_getitem
 
+class _SessionRaiseOn420(requests.Session):
+    timeout = 60
+    def get(self, url, *args, **kwargs):
+        res = super().get(url, timeout=self.timeout, *args, **kwargs)
+        if res.status_code == 420 and 'warcprox-meta' in res.headers:
+            raise brozzler.ReachedLimit(
+                    warcprox_meta=json.loads(res.headers['warcprox-meta']),
+                    http_payload=res.text)
+        else:
+            return res
+
 _robots_caches = {}  # {site_id:reppy.cache.RobotsCache}
 def _robots_cache(site, proxy=None):
-    class SessionRaiseOn420(requests.Session):
-        def get(self, url, *args, **kwargs):
-            res = super().get(url, *args, **kwargs)
-            if res.status_code == 420 and 'warcprox-meta' in res.headers:
-                raise brozzler.ReachedLimit(
-                        warcprox_meta=json.loads(res.headers['warcprox-meta']),
-                        http_payload=res.text)
-            else:
-                return res
-
     if not site.id in _robots_caches:
-        req_sesh = SessionRaiseOn420()
+        req_sesh = _SessionRaiseOn420()
         req_sesh.verify = False  # ignore cert errors
         if proxy:
             proxie = "http://%s" % proxy
@@ -68,7 +69,8 @@ def _robots_cache(site, proxy=None):
         req_sesh.headers.update(site.extra_headers())
         if site.user_agent:
             req_sesh.headers['User-Agent'] = site.user_agent
-        _robots_caches[site.id] = reppy.cache.RobotsCache(session=req_sesh)
+        _robots_caches[site.id] = reppy.cache.RobotsCache(
+                session=req_sesh, disallow_forbidden=False)
 
     return _robots_caches[site.id]
 
@@ -76,13 +78,9 @@ def is_permitted_by_robots(site, url, proxy=None):
     '''
     Checks if `url` is permitted by robots.txt.
 
-    In case of problems fetching robots.txt, different things can happen.
-    Reppy (the robots.txt parsing library) handles some exceptions internally
-    and applies an appropriate policy. It bubbles up other exceptions. Of
-    these, there are two kinds that this function raises for the caller to
-    handle, described below. Yet other types of exceptions are caught, and the
-    fetch is retried up to 10 times. In this case, after the 10th failure, the
-    function returns `False` (i.e. forbidden by robots).
+    Treats any kind of error fetching robots.txt as "allow all". See
+    http://builds.archive.org/javadoc/heritrix-3.x-snapshot/org/archive/modules/net/CrawlServer.html#updateRobots(org.archive.modules.CrawlURI)
+    for some background on that policy.
 
     Returns:
         bool: `True` if `site.ignore_robots` is set, or if `url` is permitted
@@ -95,29 +93,21 @@ def is_permitted_by_robots(site, url, proxy=None):
     if site.ignore_robots:
         return True
 
-    tries_left = 10
-    while True:
-        try:
-            result = _robots_cache(site, proxy).allowed(
-                    url, site.user_agent or "brozzler")
-            return result
-        except Exception as e:
-            if isinstance(e, reppy.exceptions.ServerError) and isinstance(
-                    e.args[0], brozzler.ReachedLimit):
-                raise e.args[0]
-            elif hasattr(e, 'args') and isinstance(
-                    e.args[0], requests.exceptions.ProxyError):
-                # reppy has wrapped an exception that we want to bubble up
-                raise brozzler.ProxyError(e)
-            else:
-                if tries_left > 0:
-                    logging.warn(
-                            "caught exception fetching robots.txt (%r tries "
-                            "left) for %r: %r", tries_left, url, e)
-                    tries_left -= 1
-                else:
-                    logging.error(
-                            "caught exception fetching robots.txt (0 tries "
-                            "left) for %r: %r", url, e, exc_info=True)
-                    return False
+    try:
+        result = _robots_cache(site, proxy).allowed(
+                url, site.user_agent or "brozzler")
+        return result
+    except Exception as e:
+        if isinstance(e, reppy.exceptions.ServerError) and isinstance(
+                e.args[0], brozzler.ReachedLimit):
+            raise e.args[0]
+        elif hasattr(e, 'args') and isinstance(
+                e.args[0], requests.exceptions.ProxyError):
+            # reppy has wrapped an exception that we want to bubble up
+            raise brozzler.ProxyError(e)
+        else:
+            logging.warn(
+                    "returning true (permitted) after problem fetching "
+                    "robots.txt for %r: %r", url, e)
+            return True
 
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index 0ec5026..ae4f7a4 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -769,7 +769,7 @@ def test_time_limit(httpd):
     rr = doublethink.Rethinker('localhost', db='brozzler')
     frontier = brozzler.RethinkDbFrontier(rr)
 
-    # create a new job with three sites that could be crawled forever
+    # create a new job with one seed that could be crawled forever
     job_conf = {'seeds': [{
         'url': 'http://localhost:%s/infinite/foo/' % httpd.server_port,
         'time_limit': 20}]}
@@ -789,6 +789,10 @@ def test_time_limit(httpd):
     assert sites[0].status == 'FINISHED_TIME_LIMIT'
 
     # all sites finished so job should be finished too
+    start = time.time()
     job.refresh()
+    while not job.status == 'FINISHED' and time.time() - start < 10:
+        time.sleep(0.5)
+        job.refresh()
     assert job.status == 'FINISHED'
 
diff --git a/tests/test_units.py b/tests/test_units.py
index eed034e..47307d0 100644
--- a/tests/test_units.py
+++ b/tests/test_units.py
@@ -32,6 +32,7 @@ import uuid
 import socket
 import time
 import sys
+import threading
 
 logging.basicConfig(
         stream=sys.stderr, level=logging.INFO, format=(
@@ -67,6 +68,87 @@ def test_robots(httpd):
     site = brozzler.Site(None, {'seed':url,'user_agent':'im/a bAdBOt/uh huh'})
     assert not brozzler.is_permitted_by_robots(site, url)
 
+def test_robots_http_statuses():
+    for status in (
+            200, 204, 400, 401, 402, 403, 404, 405,
+            500, 501, 502, 503, 504, 505):
+        class Handler(http.server.BaseHTTPRequestHandler):
+            def do_GET(self):
+                response = (('HTTP/1.1 %s Meaningless message\r\n'
+                             + 'Content-length: 0\r\n'
+                             + '\r\n') % status).encode('utf-8')
+                self.connection.sendall(response)
+                # self.send_response(status)
+                # self.end_headers()
+        httpd = http.server.HTTPServer(('localhost', 0), Handler)
+        httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+        httpd_thread.start()
+
+        try:
+            url = 'http://localhost:%s/' % httpd.server_port
+            site = brozzler.Site(None, {'seed': url})
+            assert brozzler.is_permitted_by_robots(site, url)
+        finally:
+            httpd.shutdown()
+            httpd.server_close()
+            httpd_thread.join()
+
+def test_robots_empty_response():
+    class Handler(http.server.BaseHTTPRequestHandler):
+        def do_GET(self):
+            self.connection.shutdown(socket.SHUT_RDWR)
+            self.connection.close()
+    httpd = http.server.HTTPServer(('localhost', 0), Handler)
+    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+    httpd_thread.start()
+
+    try:
+        url = 'http://localhost:%s/' % httpd.server_port
+        site = brozzler.Site(None, {'seed': url})
+        assert brozzler.is_permitted_by_robots(site, url)
+    finally:
+        httpd.shutdown()
+        httpd.server_close()
+        httpd_thread.join()
+
+def test_robots_socket_timeout():
+    stop_hanging = threading.Event()
+    class Handler(http.server.BaseHTTPRequestHandler):
+        def do_GET(self):
+            stop_hanging.wait(60)
+            self.connection.sendall(
+                    b'HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n')
+
+    orig_timeout = brozzler.robots._SessionRaiseOn420.timeout
+
+    httpd = http.server.HTTPServer(('localhost', 0), Handler)
+    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+    httpd_thread.start()
+
+    try:
+        url = 'http://localhost:%s/' % httpd.server_port
+        site = brozzler.Site(None, {'seed': url})
+        brozzler.robots._SessionRaiseOn420.timeout = 2
+        assert brozzler.is_permitted_by_robots(site, url)
+    finally:
+        brozzler.robots._SessionRaiseOn420.timeout = orig_timeout
+        stop_hanging.set()
+        httpd.shutdown()
+        httpd.server_close()
+        httpd_thread.join()
+
+def test_robots_dns_failure():
+    # .invalid. is guaranteed nonexistent per rfc 6761
+    url = 'http://whatever.invalid./'
+    site = brozzler.Site(None, {'seed': url})
+    assert brozzler.is_permitted_by_robots(site, url)
+
+def test_robots_connection_failure():
+    # nobody listens on port 4
+    url = 'http://localhost:4/'
+    site = brozzler.Site(None, {'seed': url})
+    assert brozzler.is_permitted_by_robots(site, url)
+
 def test_scoping():
     test_scope = yaml.load('''
 max_hops: 100
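
A minimal caller-side sketch (illustrative only, not part of the patch) of the contract this change establishes: any failure fetching robots.txt now comes back as "permitted", while brozzler.ReachedLimit (warcprox responding 420) and brozzler.ProxyError still propagate for the caller to handle. The URL and proxy address below are placeholders.

import brozzler

url = 'http://example.com/some/page/'
site = brozzler.Site(None, {'seed': url})
try:
    # returns True even if the robots.txt fetch times out, 404s, 500s, etc.
    if brozzler.is_permitted_by_robots(site, url, proxy='localhost:8000'):
        pass  # go ahead and brozzle the url
except brozzler.ReachedLimit:
    pass  # warcprox returned 420; limit details come via its warcprox-meta header
except brozzler.ProxyError:
    pass  # problem talking to the proxy itself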