mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
support for one-hop-off (or n-hop-off) scoping
This commit is contained in:
parent
7bc726f717
commit
fee008266f
@ -231,26 +231,32 @@ class RethinkDbFrontier:
|
||||
|
||||
def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
    """
    Schedule the in-scope, robots-permitted outlinks of `parent_page`.

    Each outlink is converted to its SURT form and checked with
    `site.is_in_scope()`, then with `brozzler.is_permitted_by_robots()`.
    A link that passes both becomes a `brozzler.Page` one hop further from
    the seed than `parent_page`. If a page with the same id already exists,
    its priority is increased by the new page's priority and it is updated;
    otherwise the new page is stored.

    Args:
        site: site being crawled; supplies `id`, `job_id`, `scope` and the
            scope check
        parent_page: page on which the outlinks were found
        outlinks: iterable of url strings; may be None or empty

    The tallies of added, updated, rejected and robots-blocked links are
    written to the log; nothing is returned.
    """
    counts = {"added": 0, "updated": 0, "rejected": 0, "blocked": 0}

    for url in outlinks or []:
        try:
            surt_ = brozzler.site.to_surt(url)
        except Exception:
            # A single malformed outlink must not abort scheduling of the
            # remaining ones; treat it as out of scope.
            self.logger.warning("problem parsing url %s", repr(url))
            counts["rejected"] += 1
            continue

        if not site.is_in_scope(surt_, parent_page):
            counts["rejected"] += 1
            continue

        if not brozzler.is_permitted_by_robots(site, url):
            counts["blocked"] += 1
            continue

        # Links under the scope surt reset the counter; anything else is
        # one more hop away from the scope than the parent was.
        if surt_.startswith(site.scope["surt"]):
            hops_off_surt = 0
        else:
            hops_off_surt = parent_page.hops_off_surt + 1

        new_child_page = brozzler.Page(
                url, site_id=site.id, job_id=site.job_id,
                hops_from_seed=parent_page.hops_from_seed + 1,
                via_page_id=parent_page.id,
                hops_off_surt=hops_off_surt)

        existing_child_page = self.page(new_child_page.id)
        if existing_child_page:
            existing_child_page.priority += new_child_page.priority
            self.update_page(existing_child_page)
            counts["updated"] += 1
        else:
            self.new_page(new_child_page)
            counts["added"] += 1

    self.logger.info(
            "%s new links added, %s existing links updated, %s links "
            "rejected, %s links blocked by robots from %s",
            counts["added"], counts["updated"], counts["rejected"],
            counts["blocked"], parent_page)
|
||||
|
@ -60,30 +60,27 @@ class Site(brozzler.BaseDictable):
|
||||
def note_seed_redirect(self, url):
    """
    Move the site's scope when the seed redirects outside of it.

    The SURT of `url` is computed with `self._to_surt()`. If it does not
    start with the current `self.scope["surt"]`, the change is logged and
    the scope surt is replaced with the new value. Otherwise nothing
    happens.

    Args:
        url: the url the seed redirected to
    """
    new_scope_surt = self._to_surt(url)
    if new_scope_surt.startswith(self.scope["surt"]):
        # Redirect target is still covered by the existing scope.
        return

    self.logger.info("changing site scope surt from {} to {}".format(
            self.scope["surt"], new_scope_surt))
    self.scope["surt"] = new_scope_surt
|
||||
|
||||
def is_in_scope(self, surt_, parent_page=None):
    """
    Decide whether a link, given in SURT form, is in scope for this site.

    Rules, applied in order:
      1. If `parent_page` is given, the scope defines "max_hops", and the
         parent is already at or beyond that many hops from the seed, the
         link is out of scope.
      2. If `surt_` starts with the scope's "surt" prefix, it is in scope.
      3. If `parent_page` is given and its `hops_off_surt` is still below
         the scope's "max_hops_off_surt" (0 when absent), it is in scope.
      4. Otherwise it is out of scope.

    No scheme or syntax validation happens here; `surt_` is only compared
    as a string prefix.

    Args:
        surt_: SURT string of the candidate url
        parent_page: page the link was found on, or None

    Returns:
        True if the link is in scope, False otherwise.
    """
    if (parent_page and "max_hops" in self.scope
            and parent_page.hops_from_seed >= self.scope["max_hops"]):
        return False

    if surt_.startswith(self.scope["surt"]):
        return True

    # Off-surt links are only allowed while the parent has hops to spare.
    max_hops_off_surt = self.scope.get("max_hops_off_surt", 0)
    return bool(
            parent_page and parent_page.hops_off_surt < max_hops_off_surt)
|
||||
|
||||
class Page(brozzler.BaseDictable):
|
||||
def __init__(self, url, id=None, site_id=None, job_id=None,
|
||||
hops_from_seed=0, redirect_url=None, priority=None, claimed=False,
|
||||
brozzle_count=0, via_page_id=None, last_claimed_by=None):
|
||||
def __init__(
|
||||
self, url, id=None, site_id=None, job_id=None, hops_from_seed=0,
|
||||
redirect_url=None, priority=None, claimed=False, brozzle_count=0,
|
||||
via_page_id=None, last_claimed_by=None, hops_off_surt=0):
|
||||
self.site_id = site_id
|
||||
self.job_id = job_id
|
||||
self.url = url
|
||||
@ -93,6 +90,7 @@ class Page(brozzler.BaseDictable):
|
||||
self.last_claimed_by = last_claimed_by
|
||||
self.brozzle_count = brozzle_count
|
||||
self.via_page_id = via_page_id
|
||||
self.hops_off_surt = hops_off_surt
|
||||
self._canon_hurl = None
|
||||
|
||||
if priority is not None:
|
||||
@ -103,7 +101,8 @@ class Page(brozzler.BaseDictable):
|
||||
if id is not None:
|
||||
self.id = id
|
||||
else:
|
||||
digest_this = "site_id:{},canon_url:{}".format(self.site_id, self.canon_url())
|
||||
digest_this = "site_id:{},canon_url:{}".format(
|
||||
self.site_id, self.canon_url())
|
||||
self.id = hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
|
||||
|
||||
def __repr__(self):
|
||||
@ -125,3 +124,7 @@ class Page(brozzler.BaseDictable):
|
||||
surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl)
|
||||
return self._canon_hurl.geturl()
|
||||
|
||||
def to_surt(url):
    """
    Return the canonicalized SURT string for `url`.

    The url is parsed with `surt.handyurl.parse()`, canonicalized with
    `surt.GoogleURLCanonicalizer`, and rendered with
    `getURLString(surt=True, trailing_comma=True)`.

    Args:
        url: url string to convert

    Any exception raised by the `surt` library while parsing or
    canonicalizing propagates to the caller.
    """
    hurl = surt.handyurl.parse(url)
    canonical = surt.GoogleURLCanonicalizer.canonicalize(hurl)
    return canonical.getURLString(surt=True, trailing_comma=True)
|
||||
|
Loading…
x
Reference in New Issue
Block a user