since I can't figure out what's causing these sporadic errors fetching certain robots.txt files through warcprox, stick a retry loop around the fetch

Noah Levitt 2015-08-19 22:50:04 +00:00
parent ad543e6134
commit 8b45d7eb69


@@ -33,6 +33,9 @@ def _robots_cache(site):
 def is_permitted_by_robots(site, url):
     if site.ignore_robots:
         return True
+
+    tries_left = 10
+    while True:
         try:
             result = _robots_cache(site).allowed(url, "brozzler")
             return result
@@ -40,6 +43,10 @@ def is_permitted_by_robots(site, url):
             if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
                 raise e.args[0]
             else:
-                logging.error("problem with robots.txt for %s: %s", url, repr(e), exc_info=True)
+                if tries_left > 0:
+                    logging.warn("caught exception %s fetching robots.txt (%s tries left) for %s", repr(e), tries_left, url)
+                    tries_left -= 1
+                else:
+                    logging.error("caught exception fetching robots.txt (0 tries left) for %s: %s", url, repr(e), exc_info=True)
                     return False
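
Read together, the two hunks wrap the reppy robots.txt lookup in a bounded retry: on any unexpected exception the fetch is simply attempted again, up to ten more times, and only once the tries are exhausted does the function log an error and fall back to returning False (treat the URL as not permitted). A minimal standalone sketch of that pattern, assuming nothing from brozzler itself; fetch_robots_with_retries, the fetch callable, and max_tries are illustrative names standing in for _robots_cache(site).allowed(url, "brozzler"):

import logging

def fetch_robots_with_retries(fetch, url, max_tries=10):
    # Hypothetical helper showing the bounded-retry shape this commit adds;
    # "fetch" is any callable that may raise on transient proxy/warcprox errors.
    tries_left = max_tries
    while True:
        try:
            return fetch(url)
        except Exception as e:
            if tries_left > 0:
                logging.warning(
                        "caught exception %s fetching robots.txt (%s tries left) for %s",
                        repr(e), tries_left, url)
                tries_left -= 1
            else:
                logging.error(
                        "caught exception fetching robots.txt (0 tries left) for %s: %s",
                        url, repr(e), exc_info=True)
                return False

# e.g.: fetch_robots_with_retries(lambda u: True, "https://example.com/robots.txt")

As in the diff, there is no sleep or backoff between attempts; failures are retried immediately, and giving up is treated the same as a disallow.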