mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-19 12:24:20 -04:00
support for one-hop-off (or n-hop-off) scoping
This commit is contained in:
parent
7bc726f717
commit
fee008266f
3 changed files with 47 additions and 38 deletions
|
@@ -231,14 +231,20 @@ class RethinkDbFrontier:
|
||||||
|
|
||||||
def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
|
def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
|
||||||
counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
|
counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
|
||||||
if outlinks:
|
for url in outlinks or []:
|
||||||
for url in outlinks:
|
surt_ = brozzler.site.to_surt(url)
|
||||||
if site.is_in_scope(url, parent_page):
|
|
||||||
|
if site.is_in_scope(surt_, parent_page):
|
||||||
if brozzler.is_permitted_by_robots(site, url):
|
if brozzler.is_permitted_by_robots(site, url):
|
||||||
|
if not surt_.startswith(site.scope["surt"]):
|
||||||
|
hops_off_surt = parent_page.hops_off_surt + 1
|
||||||
|
else:
|
||||||
|
hops_off_surt = 0
|
||||||
new_child_page = brozzler.Page(
|
new_child_page = brozzler.Page(
|
||||||
url, site_id=site.id, job_id=site.job_id,
|
url, site_id=site.id, job_id=site.job_id,
|
||||||
hops_from_seed=parent_page.hops_from_seed+1,
|
hops_from_seed=parent_page.hops_from_seed+1,
|
||||||
via_page_id=parent_page.id)
|
via_page_id=parent_page.id,
|
||||||
|
hops_off_surt=hops_off_surt)
|
||||||
existing_child_page = self.page(new_child_page.id)
|
existing_child_page = self.page(new_child_page.id)
|
||||||
if existing_child_page:
|
if existing_child_page:
|
||||||
existing_child_page.priority += new_child_page.priority
|
existing_child_page.priority += new_child_page.priority
|
||||||
|
|
|
@@ -60,30 +60,27 @@ class Site(brozzler.BaseDictable):
|
||||||
def note_seed_redirect(self, url):
|
def note_seed_redirect(self, url):
|
||||||
new_scope_surt = self._to_surt(url)
|
new_scope_surt = self._to_surt(url)
|
||||||
if not new_scope_surt.startswith(self.scope["surt"]):
|
if not new_scope_surt.startswith(self.scope["surt"]):
|
||||||
self.logger.info("changing site scope surt from {} to {}".format(self.scope["surt"], new_scope_surt))
|
self.logger.info("changing site scope surt from {} to {}".format(
|
||||||
|
self.scope["surt"], new_scope_surt))
|
||||||
self.scope["surt"] = new_scope_surt
|
self.scope["surt"] = new_scope_surt
|
||||||
|
|
||||||
def is_in_scope(self, url, parent_page=None):
|
def is_in_scope(self, surt_, parent_page=None):
|
||||||
if parent_page and "max_hops" in self.scope and parent_page.hops_from_seed >= self.scope["max_hops"]:
|
if (parent_page and "max_hops" in self.scope
|
||||||
|
and parent_page.hops_from_seed >= self.scope["max_hops"]):
|
||||||
return False
|
return False
|
||||||
|
elif surt_.startswith(self.scope["surt"]):
|
||||||
try:
|
return True
|
||||||
hurl = surt.handyurl.parse(url)
|
elif parent_page and parent_page.hops_off_surt < self.scope.get(
|
||||||
|
"max_hops_off_surt", 0):
|
||||||
# XXX doesn't belong here probably (where? worker ignores unknown schemes?)
|
return True
|
||||||
if hurl.scheme != "http" and hurl.scheme != "https":
|
else:
|
||||||
return False
|
|
||||||
|
|
||||||
surtt = surt.GoogleURLCanonicalizer.canonicalize(hurl).getURLString(surt=True, trailing_comma=True)
|
|
||||||
return surtt.startswith(self.scope["surt"])
|
|
||||||
except:
|
|
||||||
self.logger.warn("problem parsing url %s", repr(url))
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
class Page(brozzler.BaseDictable):
|
class Page(brozzler.BaseDictable):
|
||||||
def __init__(self, url, id=None, site_id=None, job_id=None,
|
def __init__(
|
||||||
hops_from_seed=0, redirect_url=None, priority=None, claimed=False,
|
self, url, id=None, site_id=None, job_id=None, hops_from_seed=0,
|
||||||
brozzle_count=0, via_page_id=None, last_claimed_by=None):
|
redirect_url=None, priority=None, claimed=False, brozzle_count=0,
|
||||||
|
via_page_id=None, last_claimed_by=None, hops_off_surt=0):
|
||||||
self.site_id = site_id
|
self.site_id = site_id
|
||||||
self.job_id = job_id
|
self.job_id = job_id
|
||||||
self.url = url
|
self.url = url
|
||||||
|
@@ -93,6 +90,7 @@ class Page(brozzler.BaseDictable):
|
||||||
self.last_claimed_by = last_claimed_by
|
self.last_claimed_by = last_claimed_by
|
||||||
self.brozzle_count = brozzle_count
|
self.brozzle_count = brozzle_count
|
||||||
self.via_page_id = via_page_id
|
self.via_page_id = via_page_id
|
||||||
|
self.hops_off_surt = hops_off_surt
|
||||||
self._canon_hurl = None
|
self._canon_hurl = None
|
||||||
|
|
||||||
if priority is not None:
|
if priority is not None:
|
||||||
|
@@ -103,7 +101,8 @@ class Page(brozzler.BaseDictable):
|
||||||
if id is not None:
|
if id is not None:
|
||||||
self.id = id
|
self.id = id
|
||||||
else:
|
else:
|
||||||
digest_this = "site_id:{},canon_url:{}".format(self.site_id, self.canon_url())
|
digest_this = "site_id:{},canon_url:{}".format(
|
||||||
|
self.site_id, self.canon_url())
|
||||||
self.id = hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
|
self.id = hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
@@ -125,3 +124,7 @@ class Page(brozzler.BaseDictable):
|
||||||
surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl)
|
surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl)
|
||||||
return self._canon_hurl.geturl()
|
return self._canon_hurl.geturl()
|
||||||
|
|
||||||
|
def to_surt(url):
|
||||||
|
hurl = surt.handyurl.parse(url)
|
||||||
|
return surt.GoogleURLCanonicalizer.canonicalize(
|
||||||
|
hurl).getURLString(surt=True, trailing_comma=True)
|
||||||
|
|
2
setup.py
2
setup.py
|
@@ -2,7 +2,7 @@ import setuptools
|
||||||
import glob
|
import glob
|
||||||
|
|
||||||
setuptools.setup(name='brozzler',
|
setuptools.setup(name='brozzler',
|
||||||
version='1.1.dev3',
|
version='1.1.dev4',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/nlevitt/brozzler',
|
url='https://github.com/nlevitt/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue