Mirror of https://github.com/internetarchive/brozzler.git, synced 2025-02-24 08:39:59 -05:00
handle 420 Reached limit when fetching robots in brozzler-hq
commit e6eeca6ae2
parent 511e19ff4d
@@ -17,7 +17,7 @@ class ShutdownRequested(Exception):
     pass
 
 class ReachedLimit(Exception):
-    def __init__(self, http_error=None, warcprox_meta=None):
+    def __init__(self, http_error=None, warcprox_meta=None, http_payload=None):
         if http_error:
             if "warcprox-meta" in http_error.headers:
                 self.warcprox_meta = _json.loads(http_error.headers["warcprox-meta"])
@@ -26,7 +26,7 @@ class ReachedLimit(Exception):
             self.http_payload = http_error.read()
         elif warcprox_meta:
             self.warcprox_meta = warcprox_meta
-            self.http_payload = None
+            self.http_payload = http_payload
 
     def __repr__(self):
         return "ReachedLimit(warcprox_meta={},http_payload={})".format(repr(self.warcprox_meta), repr(self.http_payload))
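Note: a minimal sketch of what the widened constructor permits. ReachedLimit can now carry an http_payload when it is built directly from a warcprox-meta dict instead of from an HTTP error object. The stats payload below is invented for illustration; the only assumption is that ReachedLimit is importable as brozzler.ReachedLimit, as the other hunks in this commit use it.

# Illustrative only: the stats structure below is made up, not a real warcprox response.
import brozzler

meta = {"stats": {"bucket1": {"total": {"urls": 1000}}}}
e = brozzler.ReachedLimit(
        warcprox_meta=meta,
        http_payload="request rejected by warcprox: reached limit\n")
print(repr(e))   # ReachedLimit(warcprox_meta={...},http_payload='...')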
@@ -238,7 +238,7 @@ class Browser:
         if (not self._reached_limit
                 and message["params"]["response"]["status"] == 420
                 and "Warcprox-Meta" in CaseInsensitiveDict(message["params"]["response"]["headers"])):
-            warcprox_meta = json.loads(message["params"]["response"]["headers"]["Warcprox-Meta"])
+            warcprox_meta = json.loads(CaseInsensitiveDict(message["params"]["response"]["headers"])["Warcprox-Meta"])
             self._reached_limit = brozzler.ReachedLimit(warcprox_meta=warcprox_meta)
             self.logger.info("reached limit %s", self._reached_limit)
 
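Note: the point of switching the value lookup to CaseInsensitiveDict, sketched below, is that the browser may report the header with different casing than "Warcprox-Meta", so the membership test and the value lookup must both be case insensitive. This assumes the CaseInsensitiveDict in the hunk is requests.structures.CaseInsensitiveDict; the header value is made up.

from requests.structures import CaseInsensitiveDict

headers = {"warcprox-meta": '{"stats": {}}'}            # lower-case, as the browser may report it
print("Warcprox-Meta" in headers)                       # False: plain dict lookup is case sensitive
print("Warcprox-Meta" in CaseInsensitiveDict(headers))  # True
print(CaseInsensitiveDict(headers)["Warcprox-Meta"])    # prints the JSON string regardless of casing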
@@ -155,8 +155,16 @@ class BrozzlerHQ:
         self._robots_caches = {} # {site_id:reppy.cache.RobotsCache}
 
     def _robots_cache(self, site):
+        class SessionRaiseOn420(requests.Session):
+            def get(self, url, **kwargs):
+                res = super().get(url, **kwargs)
+                if res.status_code == 420 and 'warcprox-meta' in res.headers:
+                    raise brozzler.ReachedLimit(warcprox_meta=json.loads(res.headers['warcprox-meta']), http_payload=res.text)
+                else:
+                    return res
+
         if not site.id in self._robots_caches:
-            req_sesh = requests.Session()
+            req_sesh = SessionRaiseOn420()
             req_sesh.verify = False # ignore cert errors
             if site.proxy:
                 proxie = "http://{}".format(site.proxy)
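Note: a self-contained sketch of the SessionRaiseOn420 idea, run against a stub transport adapter so no warcprox instance or network is needed. Stub420Adapter, the URL, and the warcprox-meta payload are illustrative only; the sketch assumes nothing beyond requests' standard Session/HTTPAdapter machinery and the ReachedLimit class from this commit.

import json
import requests
import requests.adapters
import brozzler

class SessionRaiseOn420(requests.Session):
    # same idea as the hunk above: surface warcprox's 420 as a ReachedLimit
    def get(self, url, **kwargs):
        res = super().get(url, **kwargs)
        if res.status_code == 420 and 'warcprox-meta' in res.headers:
            raise brozzler.ReachedLimit(
                    warcprox_meta=json.loads(res.headers['warcprox-meta']),
                    http_payload=res.text)
        return res

class Stub420Adapter(requests.adapters.HTTPAdapter):
    # stands in for warcprox: every request gets a 420 with a warcprox-meta header
    def send(self, request, **kwargs):
        res = requests.Response()
        res.status_code = 420
        res.headers['warcprox-meta'] = json.dumps({"stats": {"bucket1": {}}})
        res._content = b"request rejected by warcprox: reached limit\n"
        res.url = request.url
        res.request = request
        return res

sesh = SessionRaiseOn420()
sesh.mount('http://', Stub420Adapter())
try:
    sesh.get('http://example.com/robots.txt')
except brozzler.ReachedLimit as e:
    print(e.warcprox_meta, repr(e.http_payload))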
@@ -171,8 +179,14 @@ class BrozzlerHQ:
         if site.ignore_robots:
             return True
         try:
-            return self._robots_cache(site).allowed(url, "brozzler")
+            self.logger.info("checking robots for %s", url)
+            result = self._robots_cache(site).allowed(url, "brozzler")
+            self.logger.info("robots allowed=%s for %s", result, url)
+            return result
         except BaseException as e:
-            self.logger.error("problem with robots.txt for {}: {}".format(url, e))
-            return False
+            if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
+                raise e.args[0]
+            else:
+                self.logger.error("problem with robots.txt for {}: {}".format(url, e))
+                return False
 
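Note: the unwrap in the except clause relies on reppy catching the session's exception during the robots.txt fetch and re-raising it wrapped in reppy.exceptions.ServerError, which leaves the original exception in args[0]. Below is a sketch of just that unwrapping, constructing the wrapper by hand rather than going through reppy's fetch path; it assumes ServerError accepts the original exception as its first argument, which is what the e.args[0] check in the hunk implies.

import reppy.exceptions
import brozzler

original = brozzler.ReachedLimit(warcprox_meta={"stats": {}})   # payload invented
wrapped = reppy.exceptions.ServerError(original)                # stand-in for what reppy's fetch raises

try:
    raise wrapped
except BaseException as e:
    if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
        print("re-raising", repr(e.args[0]))   # brozzler-hq re-raises the ReachedLimit here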
@@ -209,19 +223,23 @@ class BrozzlerHQ:
     def _new_site(self):
         try:
             msg = self._new_sites_q.get(block=False)
-            new_site = brozzler.Site(**msg.payload)
+            site = brozzler.Site(**msg.payload)
             msg.ack()
 
-            self.logger.info("new site {}".format(new_site))
-            site_id = self._db.new_site(new_site)
-            new_site.id = site_id
+            self.logger.info("new site {}".format(site))
+            site_id = self._db.new_site(site)
+            site.id = site_id
 
-            if self.is_permitted_by_robots(new_site, new_site.seed):
-                page = brozzler.Page(new_site.seed, site_id=new_site.id, hops_from_seed=0)
-                self._db.schedule_page(page, priority=1000)
-                self._unclaimed_sites_q.put(new_site.to_dict())
-            else:
-                self.logger.warn("seed url {} is blocked by robots.txt".format(new_site.seed))
+            try:
+                if self.is_permitted_by_robots(site, site.seed):
+                    page = brozzler.Page(site.seed, site_id=site.id, hops_from_seed=0)
+                    self._db.schedule_page(page, priority=1000)
+                    self._unclaimed_sites_q.put(site.to_dict())
+                else:
+                    self.logger.warn("seed url {} is blocked by robots.txt".format(site.seed))
+            except brozzler.ReachedLimit as e:
+                site.note_limit_reached(e)
+                self._db.update_site(site)
         except kombu.simple.Empty:
             pass
 