Mirror of https://github.com/internetarchive/brozzler.git, synced 2025-02-24 08:39:59 -05:00
since I can't figure out what's causing these sporadic errors fetching certain robots.txt through warcprox, stick a retry loop around the fetch
This commit is contained in: parent ad543e6134, commit 8b45d7eb69
@@ -33,6 +33,9 @@ def _robots_cache(site):

def is_permitted_by_robots(site, url):
    if site.ignore_robots:
        return True

    tries_left = 10
    while True:
        try:
            result = _robots_cache(site).allowed(url, "brozzler")
            return result
@@ -40,6 +43,10 @@ def is_permitted_by_robots(site, url):
            if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
                raise e.args[0]
            else:
                logging.error("problem with robots.txt for %s: %s", url, repr(e), exc_info=True)
                if tries_left > 0:
                    logging.warn("caught exception %s fetching robots.txt (%s tries left) for %s", repr(e), tries_left, url)
                    tries_left -= 1
                else:
                    logging.error("caught exception fetching robots.txt (0 tries left) for %s: %s", url, repr(e), exc_info=True)
                    return False
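For reference, the bounded-retry pattern this commit wraps around the reppy call can be sketched in isolation. The snippet below is not brozzler code: permitted_with_retries, fetch_robots, and max_tries are hypothetical stand-ins for is_permitted_by_robots, the _robots_cache(site).allowed(url, "brozzler") call, and the ten-try budget added in the diff above.

    import logging

    def permitted_with_retries(fetch_robots, url, max_tries=10):
        # Sketch of the retry loop: keep retrying the robots.txt check,
        # and fall back to "not permitted" (False) once the budget runs out.
        # fetch_robots is an assumed callable that returns True/False or raises.
        tries_left = max_tries
        while True:
            try:
                return fetch_robots(url)
            except Exception as e:
                if tries_left > 0:
                    logging.warning(
                            "caught exception %s fetching robots.txt "
                            "(%s tries left) for %s", repr(e), tries_left, url)
                    tries_left -= 1
                else:
                    logging.error(
                            "caught exception fetching robots.txt "
                            "(0 tries left) for %s: %s", url, repr(e),
                            exc_info=True)
                    return False

Like the change in the diff, this sketch retries immediately with no backoff between attempts; each failure only decrements the counter, and only the final failure is logged as an error.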