diff --git a/brozzler/robots.py b/brozzler/robots.py
index 04e4b80..583c5b9 100644
--- a/brozzler/robots.py
+++ b/brozzler/robots.py
@@ -33,13 +33,20 @@ def _robots_cache(site):
 
 def is_permitted_by_robots(site, url):
     if site.ignore_robots:
         return True
-    try:
-        result = _robots_cache(site).allowed(url, "brozzler")
-        return result
-    except BaseException as e:
-        if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
-            raise e.args[0]
-        else:
-            logging.error("problem with robots.txt for %s: %s", url, repr(e), exc_info=True)
-            return False
+
+    tries_left = 10
+    while True:
+        try:
+            result = _robots_cache(site).allowed(url, "brozzler")
+            return result
+        except BaseException as e:
+            if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
+                raise e.args[0]
+            else:
+                if tries_left > 0:
+                    logging.warn("caught exception %s fetching robots.txt (%s tries left) for %s", repr(e), tries_left, url)
+                    tries_left -= 1
+                else:
+                    logging.error("caught exception fetching robots.txt (0 tries left) for %s: %s", url, repr(e), exc_info=True)
+                    return False
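
Note (illustration, not part of the patch): a minimal, self-contained sketch of the bounded-retry behavior this change introduces. The names fetch_allowed and is_permitted_with_retries are hypothetical stand-ins for _robots_cache(site).allowed(url, "brozzler") and is_permitted_by_robots; transient failures are retried up to 10 times, after which the function logs an error and fails closed by returning False.

    import logging

    def fetch_allowed(url):
        # hypothetical stand-in for _robots_cache(site).allowed(url, "brozzler");
        # always fails here so the retry path is exercised
        raise IOError("simulated transient robots.txt fetch failure")

    def is_permitted_with_retries(url, max_tries=10):
        tries_left = max_tries
        while True:
            try:
                return fetch_allowed(url)
            except Exception as e:
                if tries_left > 0:
                    logging.warning(
                        "caught exception %r fetching robots.txt (%s tries left) for %s",
                        e, tries_left, url)
                    tries_left -= 1
                else:
                    logging.error(
                        "caught exception fetching robots.txt (0 tries left) for %s",
                        url, exc_info=True)
                    # fail closed: treat the URL as not permitted
                    return False

    # example: is_permitted_with_retries("https://example.com/") logs 10 warnings,
    # then one error, and returns False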