mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-07 14:43:04 -04:00
handle 420 Reached limit when fetching robots in brozzler-hq
This commit is contained in:
parent
511e19ff4d
commit
e6eeca6ae2
3 changed files with 35 additions and 17 deletions
|
@ -17,7 +17,7 @@ class ShutdownRequested(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class ReachedLimit(Exception):
|
class ReachedLimit(Exception):
|
||||||
def __init__(self, http_error=None, warcprox_meta=None):
|
def __init__(self, http_error=None, warcprox_meta=None, http_payload=None):
|
||||||
if http_error:
|
if http_error:
|
||||||
if "warcprox-meta" in http_error.headers:
|
if "warcprox-meta" in http_error.headers:
|
||||||
self.warcprox_meta = _json.loads(http_error.headers["warcprox-meta"])
|
self.warcprox_meta = _json.loads(http_error.headers["warcprox-meta"])
|
||||||
|
@ -26,7 +26,7 @@ class ReachedLimit(Exception):
|
||||||
self.http_payload = http_error.read()
|
self.http_payload = http_error.read()
|
||||||
elif warcprox_meta:
|
elif warcprox_meta:
|
||||||
self.warcprox_meta = warcprox_meta
|
self.warcprox_meta = warcprox_meta
|
||||||
self.http_payload = None
|
self.http_payload = http_payload
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return "ReachedLimit(warcprox_meta={},http_payload={})".format(repr(self.warcprox_meta), repr(self.http_payload))
|
return "ReachedLimit(warcprox_meta={},http_payload={})".format(repr(self.warcprox_meta), repr(self.http_payload))
|
||||||
|
|
|
@ -238,7 +238,7 @@ class Browser:
|
||||||
if (not self._reached_limit
|
if (not self._reached_limit
|
||||||
and message["params"]["response"]["status"] == 420
|
and message["params"]["response"]["status"] == 420
|
||||||
and "Warcprox-Meta" in CaseInsensitiveDict(message["params"]["response"]["headers"])):
|
and "Warcprox-Meta" in CaseInsensitiveDict(message["params"]["response"]["headers"])):
|
||||||
warcprox_meta = json.loads(message["params"]["response"]["headers"]["Warcprox-Meta"])
|
warcprox_meta = json.loads(CaseInsensitiveDict(message["params"]["response"]["headers"])["Warcprox-Meta"])
|
||||||
self._reached_limit = brozzler.ReachedLimit(warcprox_meta=warcprox_meta)
|
self._reached_limit = brozzler.ReachedLimit(warcprox_meta=warcprox_meta)
|
||||||
self.logger.info("reached limit %s", self._reached_limit)
|
self.logger.info("reached limit %s", self._reached_limit)
|
||||||
|
|
||||||
|
|
|
@ -155,8 +155,16 @@ class BrozzlerHQ:
|
||||||
self._robots_caches = {} # {site_id:reppy.cache.RobotsCache}
|
self._robots_caches = {} # {site_id:reppy.cache.RobotsCache}
|
||||||
|
|
||||||
def _robots_cache(self, site):
|
def _robots_cache(self, site):
|
||||||
|
class SessionRaiseOn420(requests.Session):
    """requests.Session that turns a 420 "reached limit" reply into an exception.

    A response with status code 420 that carries a ``warcprox-meta`` header
    is raised as ``brozzler.ReachedLimit`` instead of being returned, so the
    caller can tell it apart from an ordinary HTTP response.
    """

    def get(self, url, **kwargs):
        """Perform a GET request, raising on a 420 limit response.

        Args:
            url: URL to fetch.
            **kwargs: passed through unchanged to ``requests.Session.get``.

        Returns:
            The response object from ``requests.Session.get`` for any
            response that is not a 420 with a ``warcprox-meta`` header.

        Raises:
            brozzler.ReachedLimit: if the response status is 420 and a
                ``warcprox-meta`` header is present; the exception carries
                the JSON-decoded header value and the response body text.
        """
        res = super().get(url, **kwargs)
        if res.status_code == 420 and 'warcprox-meta' in res.headers:
            raise brozzler.ReachedLimit(
                    warcprox_meta=json.loads(res.headers['warcprox-meta']),
                    http_payload=res.text)
        # Return the response we actually fetched. The previous code
        # returned the undefined name `response`, which raised NameError
        # on every non-420 fetch.
        return res
|
||||||
|
|
||||||
if not site.id in self._robots_caches:
|
if not site.id in self._robots_caches:
|
||||||
req_sesh = requests.Session()
|
req_sesh = SessionRaiseOn420()
|
||||||
req_sesh.verify = False # ignore cert errors
|
req_sesh.verify = False # ignore cert errors
|
||||||
if site.proxy:
|
if site.proxy:
|
||||||
proxie = "http://{}".format(site.proxy)
|
proxie = "http://{}".format(site.proxy)
|
||||||
|
@ -171,10 +179,16 @@ class BrozzlerHQ:
|
||||||
if site.ignore_robots:
|
if site.ignore_robots:
|
||||||
return True
|
return True
|
||||||
try:
|
try:
|
||||||
return self._robots_cache(site).allowed(url, "brozzler")
|
self.logger.info("checking robots for %s", url)
|
||||||
|
result = self._robots_cache(site).allowed(url, "brozzler")
|
||||||
|
self.logger.info("robots allowed=%s for %s", result, url)
|
||||||
|
return result
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
self.logger.error("problem with robots.txt for {}: {}".format(url, e))
|
if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
|
||||||
return False
|
raise e.args[0]
|
||||||
|
else:
|
||||||
|
self.logger.error("problem with robots.txt for {}: {}".format(url, e))
|
||||||
|
return False
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
try:
|
try:
|
||||||
|
@ -209,19 +223,23 @@ class BrozzlerHQ:
|
||||||
def _new_site(self):
|
def _new_site(self):
|
||||||
try:
|
try:
|
||||||
msg = self._new_sites_q.get(block=False)
|
msg = self._new_sites_q.get(block=False)
|
||||||
new_site = brozzler.Site(**msg.payload)
|
site = brozzler.Site(**msg.payload)
|
||||||
msg.ack()
|
msg.ack()
|
||||||
|
|
||||||
self.logger.info("new site {}".format(new_site))
|
self.logger.info("new site {}".format(site))
|
||||||
site_id = self._db.new_site(new_site)
|
site_id = self._db.new_site(site)
|
||||||
new_site.id = site_id
|
site.id = site_id
|
||||||
|
|
||||||
if self.is_permitted_by_robots(new_site, new_site.seed):
|
try:
|
||||||
page = brozzler.Page(new_site.seed, site_id=new_site.id, hops_from_seed=0)
|
if self.is_permitted_by_robots(site, site.seed):
|
||||||
self._db.schedule_page(page, priority=1000)
|
page = brozzler.Page(site.seed, site_id=site.id, hops_from_seed=0)
|
||||||
self._unclaimed_sites_q.put(new_site.to_dict())
|
self._db.schedule_page(page, priority=1000)
|
||||||
else:
|
self._unclaimed_sites_q.put(site.to_dict())
|
||||||
self.logger.warn("seed url {} is blocked by robots.txt".format(new_site.seed))
|
else:
|
||||||
|
self.logger.warn("seed url {} is blocked by robots.txt".format(site.seed))
|
||||||
|
except brozzler.ReachedLimit as e:
|
||||||
|
site.note_limit_reached(e)
|
||||||
|
self._db.update_site(site)
|
||||||
except kombu.simple.Empty:
|
except kombu.simple.Empty:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue