mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
Since I can't figure out what's causing these sporadic errors fetching certain robots.txt files through warcprox, stick a retry loop around the fetch.
This commit is contained in:
parent
ad543e6134
commit
8b45d7eb69
@@ -33,13 +33,20 @@ def _robots_cache(site):
|
|||||||
def is_permitted_by_robots(site, url):
|
def is_permitted_by_robots(site, url):
|
||||||
if site.ignore_robots:
|
if site.ignore_robots:
|
||||||
return True
|
return True
|
||||||
try:
|
|
||||||
result = _robots_cache(site).allowed(url, "brozzler")
|
tries_left = 10
|
||||||
return result
|
while True:
|
||||||
except BaseException as e:
|
try:
|
||||||
if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
|
result = _robots_cache(site).allowed(url, "brozzler")
|
||||||
raise e.args[0]
|
return result
|
||||||
else:
|
except BaseException as e:
|
||||||
logging.error("problem with robots.txt for %s: %s", url, repr(e), exc_info=True)
|
if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
|
||||||
return False
|
raise e.args[0]
|
||||||
|
else:
|
||||||
|
if tries_left > 0:
|
||||||
|
logging.warn("caught exception %s fetching robots.txt (%s tries left) for %s", repr(e), tries_left, url)
|
||||||
|
tries_left -= 1
|
||||||
|
else:
|
||||||
|
logging.error("caught exception fetching robots.txt (0 tries left) for %s: %s", url, repr(e), exc_info=True)
|
||||||
|
return False
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user