From 18ca996216135a4c6e9ef38321285fca1397a05e Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Mon, 13 Jul 2015 15:56:54 -0700
Subject: [PATCH] rudimentary robots.txt support

---
 bin/brozzler-hq | 55 +++++++++++++++++++++++++++----------------------
 umbra/hq.py     | 29 ++++++++++++++++++++++++--
 2 files changed, 57 insertions(+), 27 deletions(-)

diff --git a/bin/brozzler-hq b/bin/brozzler-hq
index bed16e2..1014287 100644
--- a/bin/brozzler-hq
+++ b/bin/brozzler-hq
@@ -148,20 +148,19 @@ class BrozzlerHQ:
             new_site = umbra.hq.Site(**msg.payload)
             msg.ack()
+            self.logger.info("new site {}".format(new_site))
 
             site_id = self._db.new_site(new_site)
             new_site.id = site_id
 
-            self._schedule_seed(site_id, new_site.seed)
-
-            self._unclaimed_sites_q.put(new_site.to_dict())
-            self.logger.info("new site {}".format(new_site))
+            if new_site.is_permitted_by_robots(new_site.seed):
+                crawl_url = umbra.CrawlUrl(new_site.seed, site_id=new_site.id, hops_from_seed=0)
+                self._db.schedule_url(crawl_url, priority=1000)
+                self._unclaimed_sites_q.put(new_site.to_dict())
+            else:
+                self.logger.warn("seed url {} is blocked by robots.txt".format(new_site.seed))
         except kombu.simple.Empty:
             pass
 
-    def _schedule_seed(self, site_id, seed_url):
-        crawl_url = umbra.CrawlUrl(seed_url, site_id=site_id, hops_from_seed=0)
-        self._db.schedule_url(crawl_url, priority=1000)
-
     def _feed_crawl_urls(self):
         for site in self._db.sites():
             q = self._conn.SimpleQueue("brozzler.sites.{}.crawl_urls".format(site.id))
@@ -171,6 +170,27 @@ class BrozzlerHQ:
                     self.logger.info("feeding {} to {}".format(url, q.queue.name))
                     q.put(url)
 
+    def _scope_and_schedule_outlinks(self, site, parent_url):
+        counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
+        if parent_url.outlinks:
+            for url in parent_url.outlinks:
+                if site.is_in_scope(url):
+                    if site.is_permitted_by_robots(url):
+                        crawl_url = umbra.CrawlUrl(url, site_id=site.id, hops_from_seed=parent_url.hops_from_seed+1)
+                        try:
+                            self._db.update_crawl_url(crawl_url)
+                            counts["updated"] += 1
+                        except KeyError:
+                            self._db.schedule_url(crawl_url, priority=crawl_url.calc_priority())
+                            counts["added"] += 1
+                    else:
+                        counts["blocked"] += 1
+                else:
+                    counts["rejected"] += 1
+
+        self.logger.info("{} new links added, {} existing links updated, {} links rejected, {} links blocked by robots from {}".format(
+            counts["added"], counts["updated"], counts["rejected"], counts["blocked"], parent_url))
+
     def _consume_completed_url(self):
         for site in self._db.sites():
             q = self._conn.SimpleQueue("brozzler.sites.{}.completed_urls".format(site.id))
@@ -179,23 +199,7 @@ class BrozzlerHQ:
                 completed_url = umbra.CrawlUrl(**msg.payload)
                 msg.ack()
                 self._db.completed(completed_url)
-
-                counts = {"added":0,"updated":0,"rejected":0}
-                if completed_url.outlinks:
-                    for url in completed_url.outlinks:
-                        if site.is_in_scope(url):
-                            crawl_url = umbra.CrawlUrl(url, site_id=site.id, hops_from_seed=completed_url.hops_from_seed+1)
-                            try:
-                                self._db.update_crawl_url(crawl_url)
-                                counts["updated"] += 1
-                            except KeyError:
-                                self._db.schedule_url(crawl_url, priority=crawl_url.calc_priority())
-                                counts["added"] += 1
-                        else:
-                            counts["rejected"] += 1
-
-                self.logger.info("{} new links added, {} existing links updated, {} links rejected from {}".format(
-                    counts["added"], counts["updated"], counts["rejected"], completed_url))
+                self._scope_and_schedule_outlinks(site, completed_url)
             except kombu.simple.Empty:
                 pass
 
@@ -221,3 +225,4 @@ try:
     hq.run()
 except ShutdownRequested as e:
     logging.info("{}".format(e))
+
diff --git a/umbra/hq.py b/umbra/hq.py
index fc11db6..9cadb15 100644
--- a/umbra/hq.py
+++ b/umbra/hq.py
@@ -4,6 +4,28 @@ import surt
 import kombu
 import json
 import logging
+import urllib.robotparser
+
+# robots_url : RobotFileParser
+_robots_cache = {}
+def robots(robots_url):
+    if not robots_url in _robots_cache:
+        robots_txt = urllib.robotparser.RobotFileParser(robots_url)
+        logging.info("fetching {}".format(robots_url))
+        try:
+            robots_txt.read() # XXX should fetch through proxy
+            _robots_cache[robots_url] = robots_txt
+        except BaseException as e:
+            logging.error("problem fetching {}".format(robots_url))
+
+    return _robots_cache[robots_url]
+
+def robots_url(url):
+    hurl = surt.handyurl.parse(url)
+    hurl.path = "/robots.txt"
+    hurl.query = None
+    hurl.hash = None
+    return hurl.geturl()
 
 class Site:
     logger = logging.getLogger(__module__ + "." + __qualname__)
@@ -13,12 +35,15 @@ class Site:
         self.id = id
         self.scope_surt = surt.surt(seed, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
 
+    def is_permitted_by_robots(self, url):
+        return robots(robots_url(url)).can_fetch("*", url)
+
     def is_in_scope(self, url):
         try:
             surtt = surt.surt(url, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
             return surtt.startswith(self.scope_surt)
         except:
-            self.logger.warn("""problem parsing url "{}" """.format(url), exc_info=True)
+            self.logger.warn("""problem parsing url "{}" """.format(url))
             return False
 
     def to_dict(self):
@@ -26,4 +51,4 @@ class Site:
 
     def to_json(self):
         return json.dumps(self.to_dict(), separators=(',', ':'))
-
+
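
For readers who want to try the robots.txt handling outside of brozzler, here is a minimal standalone sketch of the same cache-then-check pattern the patch introduces. It mirrors the robots()/robots_url()/is_permitted_by_robots() flow added in umbra/hq.py, but the helper names, the "*" user agent default, and the use of urllib.parse in place of surt are illustrative assumptions; like the patch, it fetches robots.txt directly rather than through a proxy, and it omits error handling.

# Standalone sketch of the cache-then-check robots.txt pattern; not brozzler code.
import urllib.parse
import urllib.robotparser

_robots_cache = {}  # maps robots.txt URL -> parsed RobotFileParser

def robots_txt_url(page_url):
    # Build the robots.txt URL for the site that serves page_url.
    parts = urllib.parse.urlsplit(page_url)
    return urllib.parse.urlunsplit((parts.scheme, parts.netloc, "/robots.txt", "", ""))

def is_permitted_by_robots(page_url, user_agent="*"):
    # Fetch and parse robots.txt once per site, then answer can_fetch() from the cache.
    rurl = robots_txt_url(page_url)
    if rurl not in _robots_cache:
        parser = urllib.robotparser.RobotFileParser(rurl)
        parser.read()
        _robots_cache[rurl] = parser
    return _robots_cache[rurl].can_fetch(user_agent, page_url)

if __name__ == "__main__":
    print(is_permitted_by_robots("https://example.com/some/page"))

Caching one parser per robots.txt URL is what keeps the check cheap: each site is fetched once per process, and every outlink check after that is an in-memory can_fetch() call.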