brozzler (mirror of https://github.com/internetarchive/brozzler.git)
commit 18ca996216 (parent eb74967fed)

    rudimentary robots.txt support
@@ -148,20 +148,19 @@ class BrozzlerHQ:
             new_site = umbra.hq.Site(**msg.payload)
             msg.ack()

-            self.logger.info("new site {}".format(new_site))
             site_id = self._db.new_site(new_site)
             new_site.id = site_id

-            self._schedule_seed(site_id, new_site.seed)
-
-            self._unclaimed_sites_q.put(new_site.to_dict())
+            if new_site.is_permitted_by_robots(new_site.seed):
+                crawl_url = umbra.CrawlUrl(new_site.seed, site_id=new_site.id, hops_from_seed=0)
+                self._db.schedule_url(crawl_url, priority=1000)
+                self._unclaimed_sites_q.put(new_site.to_dict())
+                self.logger.info("new site {}".format(new_site))
+            else:
+                self.logger.warn("seed url {} is blocked by robots.txt".format(new_site.seed))
         except kombu.simple.Empty:
             pass

-    def _schedule_seed(self, site_id, seed_url):
-        crawl_url = umbra.CrawlUrl(seed_url, site_id=site_id, hops_from_seed=0)
-        self._db.schedule_url(crawl_url, priority=1000)
-
     def _feed_crawl_urls(self):
         for site in self._db.sites():
             q = self._conn.SimpleQueue("brozzler.sites.{}.crawl_urls".format(site.id))
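The net effect of the hunk above: a newly submitted site is queued for brozzling only if its seed passes a robots.txt check, and a blocked seed is logged and dropped instead of scheduled. A self-contained sketch of that gating using only urllib.robotparser and a canned robots.txt (the rules, URLs, and printed messages are made up for illustration; brozzler's own check is Site.is_permitted_by_robots, shown further down):

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.parse("""\
User-agent: *
Disallow: /private/
""".splitlines())

for seed in ("http://example.com/", "http://example.com/private/page"):
    if rp.can_fetch("*", seed):
        print("would schedule seed {} with priority 1000".format(seed))
    else:
        print("seed url {} is blocked by robots.txt".format(seed))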
@@ -171,6 +170,27 @@ class BrozzlerHQ:
                 self.logger.info("feeding {} to {}".format(url, q.queue.name))
                 q.put(url)

+    def _scope_and_schedule_outlinks(self, site, parent_url):
+        counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
+        if parent_url.outlinks:
+            for url in parent_url.outlinks:
+                if site.is_in_scope(url):
+                    if site.is_permitted_by_robots(url):
+                        crawl_url = umbra.CrawlUrl(url, site_id=site.id, hops_from_seed=parent_url.hops_from_seed+1)
+                        try:
+                            self._db.update_crawl_url(crawl_url)
+                            counts["updated"] += 1
+                        except KeyError:
+                            self._db.schedule_url(crawl_url, priority=crawl_url.calc_priority())
+                            counts["added"] += 1
+                    else:
+                        counts["blocked"] += 1
+                else:
+                    counts["rejected"] += 1
+
+        self.logger.info("{} new links added, {} existing links updated, {} links rejected, {} links blocked by robots from {}".format(
+            counts["added"], counts["updated"], counts["rejected"], counts["blocked"], parent_url))
+
     def _consume_completed_url(self):
         for site in self._db.sites():
             q = self._conn.SimpleQueue("brozzler.sites.{}.completed_urls".format(site.id))
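The try/except KeyError in the new method is an update-or-schedule idiom: _db.update_crawl_url() raises KeyError for a URL the database has not seen, and only then is the URL scheduled as new, so repeat discoveries update the existing record instead of re-queuing it. A minimal in-memory sketch of the same idiom, where frontier, update_crawl_url, and schedule_url are hypothetical stand-ins for the HQ database:

frontier = {}

def update_crawl_url(url):
    # like the db: updating an unknown url raises KeyError
    if url not in frontier:
        raise KeyError(url)
    frontier[url] += 1  # e.g. bump a count of times the link was seen

def schedule_url(url, priority):
    frontier[url] = 1

for url in ["http://example.com/a", "http://example.com/a", "http://example.com/b"]:
    try:
        update_crawl_url(url)
    except KeyError:
        schedule_url(url, priority=50)

print(frontier)  # {'http://example.com/a': 2, 'http://example.com/b': 1}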
@@ -179,23 +199,7 @@ class BrozzlerHQ:
                 completed_url = umbra.CrawlUrl(**msg.payload)
                 msg.ack()
                 self._db.completed(completed_url)
-
-                counts = {"added":0,"updated":0,"rejected":0}
-                if completed_url.outlinks:
-                    for url in completed_url.outlinks:
-                        if site.is_in_scope(url):
-                            crawl_url = umbra.CrawlUrl(url, site_id=site.id, hops_from_seed=completed_url.hops_from_seed+1)
-                            try:
-                                self._db.update_crawl_url(crawl_url)
-                                counts["updated"] += 1
-                            except KeyError:
-                                self._db.schedule_url(crawl_url, priority=crawl_url.calc_priority())
-                                counts["added"] += 1
-                        else:
-                            counts["rejected"] += 1
-
-                self.logger.info("{} new links added, {} existing links updated, {} links rejected from {}".format(
-                    counts["added"], counts["updated"], counts["rejected"], completed_url))
+                self._scope_and_schedule_outlinks(site, completed_url)
             except kombu.simple.Empty:
                 pass

@@ -221,3 +225,4 @@ try:
     hq.run()
 except ShutdownRequested as e:
     logging.info("{}".format(e))
+
umbra/hq.py (27 changed lines)
@@ -4,6 +4,28 @@ import surt
 import kombu
 import json
 import logging
+import urllib.robotparser
+
+# robots_url : RobotFileParser
+_robots_cache = {}
+def robots(robots_url):
+    if not robots_url in _robots_cache:
+        robots_txt = urllib.robotparser.RobotFileParser(robots_url)
+        logging.info("fetching {}".format(robots_url))
+        try:
+            robots_txt.read() # XXX should fetch through proxy
+            _robots_cache[robots_url] = robots_txt
+        except BaseException as e:
+            logging.error("problem fetching {}".format(robots_url))
+
+    return _robots_cache[robots_url]
+
+def robots_url(url):
+    hurl = surt.handyurl.parse(url)
+    hurl.path = "/robots.txt"
+    hurl.query = None
+    hurl.hash = None
+    return hurl.geturl()

 class Site:
     logger = logging.getLogger(__module__ + "." + __qualname__)
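As committed, robots() has a rough edge: if read() raises, nothing is cached for that URL, so the final return statement raises KeyError, and every later call refetches (note also that the robots_url parameter shadows the helper function of the same name, which is harmless since robots() never calls it). A hedged sketch of a more defensive variant, assuming the intended failure policy is to treat an unfetchable robots.txt as permitting everything; brozzler's actual policy may differ:

import logging
import urllib.robotparser

_robots_cache = {}

def robots(robots_url):
    if robots_url not in _robots_cache:
        robots_txt = urllib.robotparser.RobotFileParser(robots_url)
        logging.info("fetching {}".format(robots_url))
        try:
            robots_txt.read() # XXX should fetch through proxy
        except BaseException:
            logging.error("problem fetching {}".format(robots_url), exc_info=True)
            # fall back to an empty rule set: a parser that has parsed
            # no rules answers True to every can_fetch() query
            robots_txt = urllib.robotparser.RobotFileParser()
            robots_txt.parse([])
        _robots_cache[robots_url] = robots_txt
    return _robots_cache[robots_url]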
@@ -13,12 +35,15 @@ class Site:
         self.id = id
         self.scope_surt = surt.surt(seed, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)

+    def is_permitted_by_robots(self, url):
+        return robots(robots_url(url)).can_fetch("*", url)
+
     def is_in_scope(self, url):
         try:
             surtt = surt.surt(url, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
             return surtt.startswith(self.scope_surt)
         except:
-            self.logger.warn("""problem parsing url "{}" """.format(url), exc_info=True)
+            self.logger.warn("""problem parsing url "{}" """.format(url))
             return False

     def to_dict(self):
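Site.is_permitted_by_robots() composes the two new module-level helpers: robots_url() reduces a page URL to its host's /robots.txt address, and robots() returns the cached parser for that address, so the check itself is a single can_fetch("*", url) call. The commit derives the address with surt.handyurl; the same reduction can be sketched with the standard library alone (the asserted URL is illustrative):

from urllib.parse import urlsplit, urlunsplit

def robots_url(url):
    # keep scheme and host, force the path, drop query and fragment
    parts = urlsplit(url)
    return urlunsplit((parts.scheme, parts.netloc, "/robots.txt", "", ""))

assert robots_url("http://example.com/a/b?c=d#e") == "http://example.com/robots.txt"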