handle 420 Reached limit when fetching robots in brozzler-hq

Noah Levitt 2015-08-01 17:54:29 +00:00
parent 511e19ff4d
commit e6eeca6ae2
3 changed files with 35 additions and 17 deletions


@@ -17,7 +17,7 @@ class ShutdownRequested(Exception):
     pass

 class ReachedLimit(Exception):
-    def __init__(self, http_error=None, warcprox_meta=None):
+    def __init__(self, http_error=None, warcprox_meta=None, http_payload=None):
         if http_error:
             if "warcprox-meta" in http_error.headers:
                 self.warcprox_meta = _json.loads(http_error.headers["warcprox-meta"])
@@ -26,7 +26,7 @@ class ReachedLimit(Exception):
             self.http_payload = http_error.read()
         elif warcprox_meta:
             self.warcprox_meta = warcprox_meta
-            self.http_payload = None
+            self.http_payload = http_payload

     def __repr__(self):
         return "ReachedLimit(warcprox_meta={},http_payload={})".format(repr(self.warcprox_meta), repr(self.http_payload))

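The point of the new http_payload keyword is that ReachedLimit can now be built from a requests-style response, which has no .read() method, instead of only from an error object that carries headers and a readable body. A minimal sketch of that path, assuming brozzler (with the new signature) is importable; the FakeResponse header and body values are made up for illustration:

    import json
    import brozzler

    # stand-in for a warcprox "reached limit" response; values are hypothetical
    class FakeResponse:
        status_code = 420
        headers = {"warcprox-meta": '{"stats": {"new": 0}}'}
        text = "request rejected by warcprox: reached limit"

    res = FakeResponse()
    if res.status_code == 420 and "warcprox-meta" in res.headers:
        # requests-style path: pass the body explicitly via http_payload
        limit = brozzler.ReachedLimit(
                warcprox_meta=json.loads(res.headers["warcprox-meta"]),
                http_payload=res.text)
        print(repr(limit))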

@@ -238,7 +238,7 @@ class Browser:
         if (not self._reached_limit
                 and message["params"]["response"]["status"] == 420
                 and "Warcprox-Meta" in CaseInsensitiveDict(message["params"]["response"]["headers"])):
-            warcprox_meta = json.loads(message["params"]["response"]["headers"]["Warcprox-Meta"])
+            warcprox_meta = json.loads(CaseInsensitiveDict(message["params"]["response"]["headers"])["Warcprox-Meta"])
             self._reached_limit = brozzler.ReachedLimit(warcprox_meta=warcprox_meta)
             self.logger.info("reached limit %s", self._reached_limit)

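The one-line browser.py change exists because the membership test above it already goes through CaseInsensitiveDict while the old lookup indexed the plain headers dict, so a browser reporting the header as lowercase warcprox-meta would pass the check and then fail with a KeyError on the lookup. A small illustration of the difference, assuming the CaseInsensitiveDict used in browser.py is the one from requests.structures (the import is not visible in this hunk):

    from requests.structures import CaseInsensitiveDict

    # header name as a browser might report it; case varies
    headers = {"warcprox-meta": '{"stats": {}}'}

    print("Warcprox-Meta" in CaseInsensitiveDict(headers))   # True
    print(CaseInsensitiveDict(headers)["Warcprox-Meta"])     # {"stats": {}}
    print(headers.get("Warcprox-Meta"))                      # None -- plain dict lookup is case sensitive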

@@ -155,8 +155,16 @@ class BrozzlerHQ:
         self._robots_caches = {} # {site_id:reppy.cache.RobotsCache}

     def _robots_cache(self, site):
+        class SessionRaiseOn420(requests.Session):
+            def get(self, url, **kwargs):
+                res = super().get(url, **kwargs)
+                if res.status_code == 420 and 'warcprox-meta' in res.headers:
+                    raise brozzler.ReachedLimit(warcprox_meta=json.loads(res.headers['warcprox-meta']), http_payload=res.text)
+                else:
+                    return res
+
         if not site.id in self._robots_caches:
-            req_sesh = requests.Session()
+            req_sesh = SessionRaiseOn420()
             req_sesh.verify = False # ignore cert errors
             if site.proxy:
                 proxie = "http://{}".format(site.proxy)
@@ -171,8 +179,14 @@ class BrozzlerHQ:
         if site.ignore_robots:
             return True
         try:
-            return self._robots_cache(site).allowed(url, "brozzler")
+            self.logger.info("checking robots for %s", url)
+            result = self._robots_cache(site).allowed(url, "brozzler")
+            self.logger.info("robots allowed=%s for %s", result, url)
+            return result
         except BaseException as e:
+            if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
+                raise e.args[0]
+            else:
                 self.logger.error("problem with robots.txt for {}: {}".format(url, e))
                 return False
@@ -209,19 +223,23 @@ class BrozzlerHQ:
     def _new_site(self):
         try:
             msg = self._new_sites_q.get(block=False)
-            new_site = brozzler.Site(**msg.payload)
+            site = brozzler.Site(**msg.payload)
             msg.ack()
-            self.logger.info("new site {}".format(new_site))
-            site_id = self._db.new_site(new_site)
-            new_site.id = site_id
+            self.logger.info("new site {}".format(site))
+            site_id = self._db.new_site(site)
+            site.id = site_id
-            if self.is_permitted_by_robots(new_site, new_site.seed):
-                page = brozzler.Page(new_site.seed, site_id=new_site.id, hops_from_seed=0)
+            try:
+                if self.is_permitted_by_robots(site, site.seed):
+                    page = brozzler.Page(site.seed, site_id=site.id, hops_from_seed=0)
                     self._db.schedule_page(page, priority=1000)
-                self._unclaimed_sites_q.put(new_site.to_dict())
+                    self._unclaimed_sites_q.put(site.to_dict())
                 else:
-                self.logger.warn("seed url {} is blocked by robots.txt".format(new_site.seed))
+                    self.logger.warn("seed url {} is blocked by robots.txt".format(site.seed))
+            except brozzler.ReachedLimit as e:
+                site.note_limit_reached(e)
+                self._db.update_site(site)
         except kombu.simple.Empty:
             pass
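Taken together, the hq.py changes let a 420 from warcprox during a robots.txt fetch travel all the way up to _new_site: SessionRaiseOn420 raises ReachedLimit, reppy wraps whatever the fetch raises, is_permitted_by_robots unwraps e.args[0] and re-raises it, and _new_site records the limit on the site instead of treating it as an ordinary robots failure. A self-contained sketch of that wrap/unwrap pattern, with a stand-in WrapperError in place of reppy.exceptions.ServerError (the diff relies on the original exception being available as args[0]):

    class ReachedLimit(Exception):
        pass

    class WrapperError(Exception):
        # stand-in for reppy.exceptions.ServerError; keeps the cause as args[0]
        pass

    def fetch_robots():
        # pretend warcprox answered the robots.txt request with 420 Reached limit
        raise ReachedLimit()

    def allowed(url):
        # reppy-style behavior assumed here: wrap whatever the fetch raises
        try:
            fetch_robots()
            return True
        except BaseException as e:
            raise WrapperError(e)

    def is_permitted_by_robots(url):
        try:
            return allowed(url)
        except BaseException as e:
            # re-raise the interesting exception, swallow everything else
            if isinstance(e, WrapperError) and isinstance(e.args[0], ReachedLimit):
                raise e.args[0]
            return False

    try:
        is_permitted_by_robots("http://example.com/")
    except ReachedLimit:
        print("reached limit while checking robots.txt")  # this branch runs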