handle 420 Reached limit when fetching robots in brozzler-hq

Noah Levitt 2015-08-01 17:54:29 +00:00
parent 511e19ff4d
commit e6eeca6ae2
3 changed files with 35 additions and 17 deletions


@@ -17,7 +17,7 @@ class ShutdownRequested(Exception):
     pass

 class ReachedLimit(Exception):
-    def __init__(self, http_error=None, warcprox_meta=None):
+    def __init__(self, http_error=None, warcprox_meta=None, http_payload=None):
         if http_error:
             if "warcprox-meta" in http_error.headers:
                 self.warcprox_meta = _json.loads(http_error.headers["warcprox-meta"])
@@ -26,7 +26,7 @@ class ReachedLimit(Exception):
             self.http_payload = http_error.read()
         elif warcprox_meta:
             self.warcprox_meta = warcprox_meta
-            self.http_payload = None
+            self.http_payload = http_payload

     def __repr__(self):
         return "ReachedLimit(warcprox_meta={},http_payload={})".format(repr(self.warcprox_meta), repr(self.http_payload))

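The point of the new http_payload keyword is that ReachedLimit can now be built from a requests-style response, which has no .read() method, instead of only from an error object that carries headers and a readable body. A minimal sketch of that path, assuming brozzler (with the new signature) is importable; the FakeResponse header and body values are made up for illustration:

    import json
    import brozzler

    # stand-in for a warcprox "reached limit" response; values are hypothetical
    class FakeResponse:
        status_code = 420
        headers = {"warcprox-meta": '{"stats": {"new": 0}}'}
        text = "request rejected by warcprox: reached limit"

    res = FakeResponse()
    if res.status_code == 420 and "warcprox-meta" in res.headers:
        # requests-style path: pass the body explicitly via http_payload
        limit = brozzler.ReachedLimit(
                warcprox_meta=json.loads(res.headers["warcprox-meta"]),
                http_payload=res.text)
        print(repr(limit))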

@@ -238,7 +238,7 @@ class Browser:
         if (not self._reached_limit
                 and message["params"]["response"]["status"] == 420
                 and "Warcprox-Meta" in CaseInsensitiveDict(message["params"]["response"]["headers"])):
-            warcprox_meta = json.loads(message["params"]["response"]["headers"]["Warcprox-Meta"])
+            warcprox_meta = json.loads(CaseInsensitiveDict(message["params"]["response"]["headers"])["Warcprox-Meta"])
             self._reached_limit = brozzler.ReachedLimit(warcprox_meta=warcprox_meta)
             self.logger.info("reached limit %s", self._reached_limit)

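The one-line browser.py change exists because the membership test above it already goes through CaseInsensitiveDict while the old lookup indexed the plain headers dict, so a browser reporting the header as lowercase warcprox-meta would pass the check and then fail with a KeyError on the lookup. A small illustration of the difference, assuming the CaseInsensitiveDict used in browser.py is the one from requests.structures (the import is not visible in this hunk):

    from requests.structures import CaseInsensitiveDict

    # header name as a browser might report it; case varies
    headers = {"warcprox-meta": '{"stats": {}}'}

    print("Warcprox-Meta" in CaseInsensitiveDict(headers))   # True
    print(CaseInsensitiveDict(headers)["Warcprox-Meta"])     # {"stats": {}}
    print(headers.get("Warcprox-Meta"))                      # None -- plain dict lookup is case sensitive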

@@ -155,8 +155,16 @@ class BrozzlerHQ:
         self._robots_caches = {} # {site_id:reppy.cache.RobotsCache}

     def _robots_cache(self, site):
+        class SessionRaiseOn420(requests.Session):
+            def get(self, url, **kwargs):
+                res = super().get(url, **kwargs)
+                if res.status_code == 420 and 'warcprox-meta' in res.headers:
+                    raise brozzler.ReachedLimit(warcprox_meta=json.loads(res.headers['warcprox-meta']), http_payload=res.text)
+                else:
+                    return res
+
         if not site.id in self._robots_caches:
-            req_sesh = requests.Session()
+            req_sesh = SessionRaiseOn420()
             req_sesh.verify = False # ignore cert errors
             if site.proxy:
                 proxie = "http://{}".format(site.proxy)
@@ -171,8 +179,14 @@ class BrozzlerHQ:
         if site.ignore_robots:
             return True
         try:
-            return self._robots_cache(site).allowed(url, "brozzler")
+            self.logger.info("checking robots for %s", url)
+            result = self._robots_cache(site).allowed(url, "brozzler")
+            self.logger.info("robots allowed=%s for %s", result, url)
+            return result
         except BaseException as e:
+            if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
+                raise e.args[0]
+            else:
                 self.logger.error("problem with robots.txt for {}: {}".format(url, e))
                 return False
@@ -209,19 +223,23 @@ class BrozzlerHQ:
     def _new_site(self):
         try:
             msg = self._new_sites_q.get(block=False)
-            new_site = brozzler.Site(**msg.payload)
+            site = brozzler.Site(**msg.payload)
             msg.ack()
-            self.logger.info("new site {}".format(new_site))
-            site_id = self._db.new_site(new_site)
-            new_site.id = site_id
+            self.logger.info("new site {}".format(site))
+            site_id = self._db.new_site(site)
+            site.id = site_id
-            if self.is_permitted_by_robots(new_site, new_site.seed):
-                page = brozzler.Page(new_site.seed, site_id=new_site.id, hops_from_seed=0)
+            try:
+                if self.is_permitted_by_robots(site, site.seed):
+                    page = brozzler.Page(site.seed, site_id=site.id, hops_from_seed=0)
                     self._db.schedule_page(page, priority=1000)
-                self._unclaimed_sites_q.put(new_site.to_dict())
+                    self._unclaimed_sites_q.put(site.to_dict())
                 else:
-                self.logger.warn("seed url {} is blocked by robots.txt".format(new_site.seed))
+                    self.logger.warn("seed url {} is blocked by robots.txt".format(site.seed))
+            except brozzler.ReachedLimit as e:
+                site.note_limit_reached(e)
+                self._db.update_site(site)
         except kombu.simple.Empty:
             pass
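Taken together, the hq.py changes let a 420 from warcprox during a robots.txt fetch travel all the way up to _new_site: SessionRaiseOn420 raises ReachedLimit, reppy wraps whatever the fetch raises, is_permitted_by_robots unwraps e.args[0] and re-raises it, and _new_site records the limit on the site instead of treating it as an ordinary robots failure. A self-contained sketch of that wrap/unwrap pattern, with a stand-in WrapperError in place of reppy.exceptions.ServerError (the diff relies on the original exception being available as args[0]):

    class ReachedLimit(Exception):
        pass

    class WrapperError(Exception):
        # stand-in for reppy.exceptions.ServerError; keeps the cause as args[0]
        pass

    def fetch_robots():
        # pretend warcprox answered the robots.txt request with 420 Reached limit
        raise ReachedLimit()

    def allowed(url):
        # reppy-style behavior assumed here: wrap whatever the fetch raises
        try:
            fetch_robots()
            return True
        except BaseException as e:
            raise WrapperError(e)

    def is_permitted_by_robots(url):
        try:
            return allowed(url)
        except BaseException as e:
            # re-raise the interesting exception, swallow everything else
            if isinstance(e, WrapperError) and isinstance(e.args[0], ReachedLimit):
                raise e.args[0]
            return False

    try:
        is_permitted_by_robots("http://example.com/")
    except ReachedLimit:
        print("reached limit while checking robots.txt")  # this branch runs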