support for one-hop-off (or n-hop-off) scoping

This commit is contained in:
Noah Levitt 2016-04-21 17:41:30 +00:00
parent 7bc726f717
commit fee008266f
3 changed files with 47 additions and 38 deletions

View file

@ -231,26 +231,32 @@ class RethinkDbFrontier:
def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
    """
    Scope-check the outlinks found on parent_page and schedule the accepted
    ones for crawling.

    Each url is converted to surt form and then:
      * counted as "rejected" if it cannot be parsed or if
        site.is_in_scope() says it is out of scope,
      * counted as "blocked" if brozzler.is_permitted_by_robots() refuses it,
      * otherwise turned into a brozzler.Page that is either merged into an
        already-known page with the same id (priorities are summed,
        "updated") or stored as a new page ("added").

    Args:
        site: the site being crawled; its scope["surt"] is the in-scope
            surt prefix
        parent_page: the page the outlinks were found on
        outlinks: iterable of url strings, or None/empty for no links

    Returns:
        None; a summary of the counts is logged at info level.
    """
    counts = {"added": 0, "updated": 0, "rejected": 0, "blocked": 0}

    for url in outlinks or []:
        try:
            surt_ = brozzler.site.to_surt(url)
        except Exception:
            # A single malformed outlink must not abort scheduling of the
            # remaining links; treat it like any other rejected url.
            self.logger.warning("problem parsing url %s", repr(url))
            counts["rejected"] += 1
            continue

        if not site.is_in_scope(surt_, parent_page):
            counts["rejected"] += 1
            continue
        if not brozzler.is_permitted_by_robots(site, url):
            counts["blocked"] += 1
            continue

        # Track how many consecutive hops this link is outside the site's
        # surt prefix; landing back inside the prefix resets the counter.
        if surt_.startswith(site.scope["surt"]):
            hops_off_surt = 0
        else:
            hops_off_surt = parent_page.hops_off_surt + 1

        new_child_page = brozzler.Page(
                url, site_id=site.id, job_id=site.job_id,
                hops_from_seed=parent_page.hops_from_seed+1,
                via_page_id=parent_page.id,
                hops_off_surt=hops_off_surt)

        # self.page() is treated as falsy when no page with that id exists
        existing_child_page = self.page(new_child_page.id)
        if existing_child_page:
            existing_child_page.priority += new_child_page.priority
            self.update_page(existing_child_page)
            counts["updated"] += 1
        else:
            self.new_page(new_child_page)
            counts["added"] += 1

    self.logger.info(
            "%s new links added, %s existing links updated, %s links rejected, %s links blocked by robots from %s",
            counts["added"], counts["updated"], counts["rejected"],
            counts["blocked"], parent_page)

View file

@ -60,30 +60,27 @@ class Site(brozzler.BaseDictable):
def note_seed_redirect(self, url):
    """
    Adjust the site's scope after the seed url redirected to `url`.

    If the surt form of `url` does not start with the current
    self.scope["surt"], the scope surt is replaced by it and the change is
    logged at info level. Otherwise nothing happens.

    Args:
        url: the url the seed redirected to
    """
    new_scope_surt = self._to_surt(url)
    if new_scope_surt.startswith(self.scope["surt"]):
        # redirect target is already covered by the current scope
        return

    # lazy %-style args: the message is only built if info is enabled
    self.logger.info(
            "changing site scope surt from %s to %s",
            self.scope["surt"], new_scope_surt)
    self.scope["surt"] = new_scope_surt
def is_in_scope(self, surt_, parent_page=None):
    """
    Decide whether a link, given in surt form, is in scope for this site.

    Rules, applied in order:
      1. If parent_page is given, "max_hops" is set in self.scope and
         parent_page.hops_from_seed has reached it, the link is out of
         scope.
      2. If surt_ starts with self.scope["surt"], it is in scope.
      3. If parent_page is given and parent_page.hops_off_surt is below
         self.scope["max_hops_off_surt"] (default 0), it is in scope; this
         is what allows one-hop-off (or n-hop-off) links.
      4. Otherwise it is out of scope.

    Args:
        surt_: surt-form string of the candidate url
        parent_page: page the link was found on, or None

    Returns:
        bool
    """
    scope = self.scope

    if (parent_page and "max_hops" in scope
            and parent_page.hops_from_seed >= scope["max_hops"]):
        return False

    if surt_.startswith(scope["surt"]):
        return True

    max_hops_off_surt = scope.get("max_hops_off_surt", 0)
    return bool(parent_page) and parent_page.hops_off_surt < max_hops_off_surt
class Page(brozzler.BaseDictable): class Page(brozzler.BaseDictable):
def __init__(self, url, id=None, site_id=None, job_id=None, def __init__(
hops_from_seed=0, redirect_url=None, priority=None, claimed=False, self, url, id=None, site_id=None, job_id=None, hops_from_seed=0,
brozzle_count=0, via_page_id=None, last_claimed_by=None): redirect_url=None, priority=None, claimed=False, brozzle_count=0,
via_page_id=None, last_claimed_by=None, hops_off_surt=0):
self.site_id = site_id self.site_id = site_id
self.job_id = job_id self.job_id = job_id
self.url = url self.url = url
@ -93,6 +90,7 @@ class Page(brozzler.BaseDictable):
self.last_claimed_by = last_claimed_by self.last_claimed_by = last_claimed_by
self.brozzle_count = brozzle_count self.brozzle_count = brozzle_count
self.via_page_id = via_page_id self.via_page_id = via_page_id
self.hops_off_surt = hops_off_surt
self._canon_hurl = None self._canon_hurl = None
if priority is not None: if priority is not None:
@ -103,7 +101,8 @@ class Page(brozzler.BaseDictable):
if id is not None: if id is not None:
self.id = id self.id = id
else: else:
digest_this = "site_id:{},canon_url:{}".format(self.site_id, self.canon_url()) digest_this = "site_id:{},canon_url:{}".format(
self.site_id, self.canon_url())
self.id = hashlib.sha1(digest_this.encode("utf-8")).hexdigest() self.id = hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
def __repr__(self): def __repr__(self):
@ -125,3 +124,7 @@ class Page(brozzler.BaseDictable):
surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl) surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl)
return self._canon_hurl.geturl() return self._canon_hurl.geturl()
def to_surt(url):
    """
    Return the canonicalized surt-form string for `url`.

    The url is parsed with surt.handyurl.parse(), canonicalized with
    surt.GoogleURLCanonicalizer, and rendered with
    getURLString(surt=True, trailing_comma=True).
    """
    canonical = surt.GoogleURLCanonicalizer.canonicalize(
            surt.handyurl.parse(url))
    return canonical.getURLString(surt=True, trailing_comma=True)

View file

@ -2,7 +2,7 @@ import setuptools
import glob import glob
setuptools.setup(name='brozzler', setuptools.setup(name='brozzler',
version='1.1.dev3', version='1.1.dev4',
description='Distributed web crawling with browsers', description='Distributed web crawling with browsers',
url='https://github.com/nlevitt/brozzler', url='https://github.com/nlevitt/brozzler',
author='Noah Levitt', author='Noah Levitt',