mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 16:49:56 -05:00
support for max_hops scope rule
This commit is contained in:
parent
a47292dab5
commit
e96b16e19a
@ -179,15 +179,13 @@ class BrozzlerHQ:
|
|||||||
if site.ignore_robots:
|
if site.ignore_robots:
|
||||||
return True
|
return True
|
||||||
try:
|
try:
|
||||||
self.logger.info("checking robots for %s", url)
|
|
||||||
result = self._robots_cache(site).allowed(url, "brozzler")
|
result = self._robots_cache(site).allowed(url, "brozzler")
|
||||||
self.logger.info("robots allowed=%s for %s", result, url)
|
|
||||||
return result
|
return result
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
|
if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
|
||||||
raise e.args[0]
|
raise e.args[0]
|
||||||
else:
|
else:
|
||||||
self.logger.error("problem with robots.txt for {}: {}".format(url, e))
|
self.logger.error("problem with robots.txt for %s: %s", url, repr(e))
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
@ -262,7 +260,7 @@ class BrozzlerHQ:
|
|||||||
counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
|
counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
|
||||||
if parent_page.outlinks:
|
if parent_page.outlinks:
|
||||||
for url in parent_page.outlinks:
|
for url in parent_page.outlinks:
|
||||||
if site.is_in_scope(url):
|
if site.is_in_scope(url, parent_page):
|
||||||
if self.is_permitted_by_robots(site, url):
|
if self.is_permitted_by_robots(site, url):
|
||||||
child_page = brozzler.Page(url, site_id=site.id, hops_from_seed=parent_page.hops_from_seed+1)
|
child_page = brozzler.Page(url, site_id=site.id, hops_from_seed=parent_page.hops_from_seed+1)
|
||||||
try:
|
try:
|
||||||
|
@ -21,7 +21,7 @@ class Site:
|
|||||||
self.reached_limit = reached_limit
|
self.reached_limit = reached_limit
|
||||||
|
|
||||||
self.scope = scope or {}
|
self.scope = scope or {}
|
||||||
if not "surt" in scope:
|
if not "surt" in self.scope:
|
||||||
self.scope["surt"] = surt.GoogleURLCanonicalizer.canonicalize(surt.handyurl.parse(seed)).getURLString(surt=True, trailing_comma=True)
|
self.scope["surt"] = surt.GoogleURLCanonicalizer.canonicalize(surt.handyurl.parse(seed)).getURLString(surt=True, trailing_comma=True)
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
@ -45,7 +45,10 @@ class Site:
|
|||||||
else:
|
else:
|
||||||
self.reached_limit = e.warcprox_meta["reached-limit"]
|
self.reached_limit = e.warcprox_meta["reached-limit"]
|
||||||
|
|
||||||
def is_in_scope(self, url):
|
def is_in_scope(self, url, parent_page=None):
|
||||||
|
if parent_page and "max_hops" in self.scope and parent_page.hops_from_seed >= self.scope["max_hops"]:
|
||||||
|
return False
|
||||||
|
|
||||||
try:
|
try:
|
||||||
hurl = surt.handyurl.parse(url)
|
hurl = surt.handyurl.parse(url)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user