mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
support for max_hops scope rule
This commit is contained in:
parent
a47292dab5
commit
e96b16e19a
@ -179,15 +179,13 @@ class BrozzlerHQ:
|
||||
if site.ignore_robots:
|
||||
return True
|
||||
try:
|
||||
self.logger.info("checking robots for %s", url)
|
||||
result = self._robots_cache(site).allowed(url, "brozzler")
|
||||
self.logger.info("robots allowed=%s for %s", result, url)
|
||||
return result
|
||||
except BaseException as e:
|
||||
if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
|
||||
raise e.args[0]
|
||||
else:
|
||||
self.logger.error("problem with robots.txt for {}: {}".format(url, e))
|
||||
self.logger.error("problem with robots.txt for %s: %s", url, repr(e))
|
||||
return False
|
||||
|
||||
def run(self):
|
||||
@ -262,7 +260,7 @@ class BrozzlerHQ:
|
||||
counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
|
||||
if parent_page.outlinks:
|
||||
for url in parent_page.outlinks:
|
||||
if site.is_in_scope(url):
|
||||
if site.is_in_scope(url, parent_page):
|
||||
if self.is_permitted_by_robots(site, url):
|
||||
child_page = brozzler.Page(url, site_id=site.id, hops_from_seed=parent_page.hops_from_seed+1)
|
||||
try:
|
||||
|
@ -21,7 +21,7 @@ class Site:
|
||||
self.reached_limit = reached_limit
|
||||
|
||||
self.scope = scope or {}
|
||||
if not "surt" in scope:
|
||||
if not "surt" in self.scope:
|
||||
self.scope["surt"] = surt.GoogleURLCanonicalizer.canonicalize(surt.handyurl.parse(seed)).getURLString(surt=True, trailing_comma=True)
|
||||
|
||||
def __repr__(self):
|
||||
@ -45,7 +45,10 @@ class Site:
|
||||
else:
|
||||
self.reached_limit = e.warcprox_meta["reached-limit"]
|
||||
|
||||
def is_in_scope(self, url):
|
||||
def is_in_scope(self, url, parent_page=None):
|
||||
if parent_page and "max_hops" in self.scope and parent_page.hops_from_seed >= self.scope["max_hops"]:
|
||||
return False
|
||||
|
||||
try:
|
||||
hurl = surt.handyurl.parse(url)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user