Add support for max_hops scope rule

This commit is contained in:
Noah Levitt 2015-08-07 22:36:39 +00:00
parent a47292dab5
commit e96b16e19a
2 changed files with 7 additions and 6 deletions

View File

@ -179,15 +179,13 @@ class BrozzlerHQ:
if site.ignore_robots:
return True
try:
self.logger.info("checking robots for %s", url)
result = self._robots_cache(site).allowed(url, "brozzler")
self.logger.info("robots allowed=%s for %s", result, url)
return result
except BaseException as e:
if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
raise e.args[0]
else:
self.logger.error("problem with robots.txt for {}: {}".format(url, e))
self.logger.error("problem with robots.txt for %s: %s", url, repr(e))
return False
def run(self):
@ -262,7 +260,7 @@ class BrozzlerHQ:
counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
if parent_page.outlinks:
for url in parent_page.outlinks:
if site.is_in_scope(url):
if site.is_in_scope(url, parent_page):
if self.is_permitted_by_robots(site, url):
child_page = brozzler.Page(url, site_id=site.id, hops_from_seed=parent_page.hops_from_seed+1)
try:

View File

@ -21,7 +21,7 @@ class Site:
self.reached_limit = reached_limit
self.scope = scope or {}
if not "surt" in scope:
if not "surt" in self.scope:
self.scope["surt"] = surt.GoogleURLCanonicalizer.canonicalize(surt.handyurl.parse(seed)).getURLString(surt=True, trailing_comma=True)
def __repr__(self):
@ -45,7 +45,10 @@ class Site:
else:
self.reached_limit = e.warcprox_meta["reached-limit"]
def is_in_scope(self, url):
def is_in_scope(self, url, parent_page=None):
if parent_page and "max_hops" in self.scope and parent_page.hops_from_seed >= self.scope["max_hops"]:
return False
try:
hurl = surt.handyurl.parse(url)