since I can't figure out what's causing these sporadic errors fetching certain robots.txt through warcprox, stick a retry loop around the fetch

Noah Levitt 2015-08-19 22:50:04 +00:00
parent ad543e6134
commit 8b45d7eb69


@@ -33,13 +33,20 @@ def _robots_cache(site):
 def is_permitted_by_robots(site, url):
     if site.ignore_robots:
         return True
-    try:
-        result = _robots_cache(site).allowed(url, "brozzler")
-        return result
-    except BaseException as e:
-        if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
-            raise e.args[0]
-        else:
-            logging.error("problem with robots.txt for %s: %s", url, repr(e), exc_info=True)
-            return False
+
+    tries_left = 10
+    while True:
+        try:
+            result = _robots_cache(site).allowed(url, "brozzler")
+            return result
+        except BaseException as e:
+            if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
+                raise e.args[0]
+            else:
+                if tries_left > 0:
+                    logging.warn("caught exception %s fetching robots.txt (%s tries left) for %s", repr(e), tries_left, url)
+                    tries_left -= 1
+                else:
+                    logging.error("caught exception fetching robots.txt (0 tries left) for %s: %s", url, repr(e), exc_info=True)
+                    return False
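
For illustration, below is a minimal standalone sketch of the bounded-retry pattern this diff introduces: retry on any exception, log at warning level while tries remain, and log an error once they run out. The fetch_with_retries function, the fetch callable, and the MAX_TRIES constant are hypothetical stand-ins for this sketch, not part of brozzler or reppy; it also returns None on final failure, whereas the commit returns False to deny permission.

    import logging

    MAX_TRIES = 10  # hypothetical constant mirroring tries_left in the commit

    def fetch_with_retries(fetch, url):
        # Call fetch(url), retrying on any exception until tries are exhausted.
        tries_left = MAX_TRIES
        while True:
            try:
                return fetch(url)
            except Exception as e:
                if tries_left > 0:
                    logging.warning("caught exception %r fetching %s (%s tries left)", e, url, tries_left)
                    tries_left -= 1
                else:
                    logging.error("caught exception fetching %s (0 tries left): %r", url, e, exc_info=True)
                    return None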