tests for new approach without scope['surt']

replaced by an accept rule (two rules in some cases of seed redirects)
Noah Levitt 2018-03-22 13:39:29 -07:00
parent 245e27a21a
commit 526a4d718f
2 changed files with 96 additions and 82 deletions
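In scope terms, the change swaps the special-case scope['surt'] string for an ordinary entry in the scope['accepts'] rule list, keyed on ssurt. A minimal sketch of the two shapes, using a made-up seed URL and plain urlcanon calls (the patch itself goes through brozzler.site_surt_canon):

    import urlcanon

    seed = 'http://example.com/a/'  # hypothetical seed

    # before: seed scoping lived in a dedicated 'surt' field
    old_scope = {'surt': urlcanon.semantic(seed).surt().decode('ascii')}

    # after: seed scoping is just the first accept rule; a seed redirect
    # may append a second one (the "two rules" case in the message above)
    new_scope = {'accepts': [{'ssurt': urlcanon.semantic(seed).ssurt()}]}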


@@ -291,75 +291,80 @@ class RethinkDbFrontier:
                 {"start":doublethink.utcnow(), "stop":None})
         site.save()

+    def _build_fresh_page(self, site, parent_page, url, hops_off=0):
+        url_for_scoping = urlcanon.semantic(url)
+        url_for_crawling = urlcanon.whatwg(url)
+        hashtag = (url_for_crawling.hash_sign
+                + url_for_crawling.fragment).decode('utf-8')
+        urlcanon.canon.remove_fragment(url_for_crawling)
+        page = brozzler.Page(self.rr, {
+            'url': str(url_for_crawling),
+            'site_id': site.id,
+            'job_id': site.job_id,
+            'hops_from_seed': parent_page.hops_from_seed + 1,
+            'via_page_id': parent_page.id,
+            'hops_off_surt': hops_off,
+            'hashtags': [hashtag] if hashtag else []})
+        return page
+
+    def _merge_page(self, existing_page, fresh_page):
+        '''
+        Utility method for merging info from `brozzler.Page` instances
+        representing the same url but with possibly different metadata.
+        '''
+        existing_page.priority += fresh_page.priority
+        existing_page.hashtags = list(set(
+                existing_page.hashtags + fresh_page.hashtags))
+        existing_page.hops_off_surt = min(
+                existing_page.hops_off_surt, fresh_page.hops_off_surt)
+
     def _scope_and_enforce_robots(self, site, parent_page, outlinks):
         '''
         Returns tuple (
-            set of in scope urls (uncanonicalized) accepted by robots policy,
+            dict of {page_id: Page} of fresh `brozzler.Page` representing in
+                scope links accepted by robots policy,
             set of in scope urls (canonicalized) blocked by robots policy,
             set of out-of-scope urls (canonicalized)).
         '''
-        in_scope = set()
+        pages = {} # {page_id: Page, ...}
         blocked = set()
         out_of_scope = set()
         for url in outlinks or []:
             url_for_scoping = urlcanon.semantic(url)
             url_for_crawling = urlcanon.whatwg(url)
-            urlcanon.canon.remove_fragment(url_for_crawling)
-            if site.is_in_scope(url_for_scoping, parent_page=parent_page):
+            decision = site.accept_reject_or_neither(
+                    url_for_scoping, parent_page=parent_page)
+            if decision is True:
+                hops_off = 0
+            elif decision is None:
+                decision = parent_page.hops_off_surt < site.scope.get(
+                        'max_hops_off_surt', 0)
+                hops_off = parent_page.hops_off_surt + 1
+            if decision is True:
                 if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
-                    in_scope.add(url)
+                    fresh_page = self._build_fresh_page(
+                            site, parent_page, url, hops_off)
+                    if fresh_page.id in pages:
+                        self._merge_page(pages[fresh_page.id], fresh_page)
+                    else:
+                        pages[fresh_page.id] = fresh_page
                 else:
                     blocked.add(str(url_for_crawling))
             else:
                 out_of_scope.add(str(url_for_crawling))
-        return in_scope, blocked, out_of_scope
+        return pages, blocked, out_of_scope

-    def _build_fresh_pages(self, site, parent_page, urls):
-        '''
-        Returns a dict of page_id => brozzler.Page.
-        '''
-        pages = {}
-        for url in urls:
-            url_for_scoping = urlcanon.semantic(url)
-            url_for_crawling = urlcanon.whatwg(url)
-            hashtag = (url_for_crawling.hash_sign
-                    + url_for_crawling.fragment).decode('utf-8')
-            urlcanon.canon.remove_fragment(url_for_crawling)
-            if not url_for_scoping.surt().startswith(
-                    site.scope['surt'].encode('utf-8')):
-                hops_off_surt = parent_page.hops_off_surt + 1
-            else:
-                hops_off_surt = 0
-            page = brozzler.Page(self.rr, {
-                'url': str(url_for_crawling),
-                'site_id': site.id,
-                'job_id': site.job_id,
-                'hops_from_seed': parent_page.hops_from_seed + 1,
-                'via_page_id': parent_page.id,
-                'hops_off_surt': hops_off_surt,
-                'hashtags': []})
-            if page.id in pages:
-                pages[page.id].priority += page.priority
-                page = pages[page.id]
-            else:
-                pages[page.id] = page
-            if hashtag:
-                page.hashtags = list(set(page.hashtags + [hashtag]))
-        return pages
-
     def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
         decisions = {'accepted':set(),'blocked':set(),'rejected':set()}
         counts = {'added':0,'updated':0,'rejected':0,'blocked':0}
-        in_scope, blocked, out_of_scope = self._scope_and_enforce_robots(
+        fresh_pages, blocked, out_of_scope = self._scope_and_enforce_robots(
                 site, parent_page, outlinks)
         decisions['blocked'] = blocked
         decisions['rejected'] = out_of_scope
         counts['blocked'] += len(blocked)
         counts['rejected'] += len(out_of_scope)
-        fresh_pages = self._build_fresh_pages(site, parent_page, in_scope)

         # get existing pages from rethinkdb
         results = self.rr.table('pages').get_all(*fresh_pages.keys()).run()
         pages = {doc['id']: brozzler.Page(self.rr, doc) for doc in results}
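The frontier side of the change replaces the boolean is_in_scope() check with a three-way decision, and only consults max_hops_off_surt when no rule fires. A condensed sketch of how the loop above resolves that decision (site and parent_page stand in for the real brozzler objects):

    def resolve_decision(decision, parent_page, scope):
        # decision is site.accept_reject_or_neither(...): True/False/None
        if decision is True:
            return True, 0  # accepted by a rule: back "on the surt"
        if decision is None:
            # no rule fired; accept only while hops-off budget remains
            accept = parent_page.hops_off_surt < scope.get(
                    'max_hops_off_surt', 0)
            return accept, parent_page.hops_off_surt + 1
        return False, None  # rejected by max_hops or a block rule

So a link two hops past the last accept-rule match is crawled only if max_hops_off_surt is at least 2.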


@@ -183,9 +183,9 @@ class Site(doublethink.Document, ElapsedMixIn):
         self.last_claimed = brozzler.EPOCH_UTC
         if not "scope" in self:
             self.scope = {}
-        if not "surt" in self.scope and self.seed:
-            self.scope["surt"] = brozzler.site_surt_canon(
-                    self.seed).surt().decode('ascii')
+        if self.seed:
+            self._accept_ssurt_if_not_redundant(
+                    brozzler.site_surt_canon(self.seed).ssurt())

         if not "starts_and_stops" in self:
             if self.get("start_time"):   # backward compatibility
@@ -201,14 +201,20 @@ class Site(doublethink.Document, ElapsedMixIn):
     def __str__(self):
         return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)

-    def note_seed_redirect(self, url):
-        new_scope_surt = brozzler.site_surt_canon(url).surt().decode("ascii")
+    def _accept_ssurt_if_not_redundant(self, ssurt):
         if not "accepts" in self.scope:
             self.scope["accepts"] = []
-        if not new_scope_surt.startswith(self.scope["surt"]):
+        simple_rule_ssurts = (
+                rule["ssurt"] for rule in self.scope["accepts"]
+                if set(rule.keys()) == {'ssurt'})
+        if not any(ssurt.startswith(ss) for ss in simple_rule_ssurts):
             self.logger.info(
-                    "adding surt %s to scope accept rules", new_scope_surt)
-            self.scope.accepts.append({"surt": new_scope_surt})
+                    "adding ssurt %s to scope accept rules", ssurt)
+            self.scope["accepts"].append({"ssurt": ssurt})
+
+    def note_seed_redirect(self, url):
+        self._accept_ssurt_if_not_redundant(
+                brozzler.site_surt_canon(url).ssurt())

     def extra_headers(self):
         hdrs = {}
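The startswith() check in _accept_ssurt_if_not_redundant() is what makes seed redirects yield "two rules in some cases": a redirect target gets its own accept rule only when no existing simple ssurt rule already covers it. A rough illustration (the ssurt byte strings are shortened stand-ins, not exact urlcanon output):

    accepts = [{'ssurt': b'com,example,...'}]  # rule from the original seed

    def is_redundant(ssurt, accepts):
        simple = (r['ssurt'] for r in accepts if set(r.keys()) == {'ssurt'})
        return any(ssurt.startswith(ss) for ss in simple)

    # redirect within the seed's prefix: covered, no second rule added
    is_redundant(b'com,example,...deeper/path', accepts)  # True
    # redirect to another host: not covered, so a second rule is appended
    is_redundant(b'org,example,...', accepts)  # False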
@@ -217,9 +223,20 @@ class Site(doublethink.Document, ElapsedMixIn):
                     self.warcprox_meta, separators=(',', ':'))
         return hdrs

-    def is_in_scope(self, url, parent_page=None):
+    def accept_reject_or_neither(self, url, parent_page=None):
+        '''
+        Returns `True` (accepted), `False` (rejected), or `None` (no decision).
+        `None` usually means rejected, unless `max_hops_off` comes into play.
+        '''
         if not isinstance(url, urlcanon.ParsedUrl):
             url = urlcanon.semantic(url)
+
+        if not url.scheme in (b'http', b'https'):
+            # XXX doesn't belong here maybe (where? worker ignores unknown
+            # schemes?)
+            return False
+
         try_parent_urls = []
         if parent_page:
             try_parent_urls.append(urlcanon.semantic(parent_page.url))
@@ -227,44 +244,36 @@ class Site(doublethink.Document, ElapsedMixIn):
                 try_parent_urls.append(
                         urlcanon.semantic(parent_page.redirect_url))

-        might_accept = False
-        if not url.scheme in (b'http', b'https'):
-            # XXX doesn't belong here maybe (where? worker ignores unknown
-            # schemes?)
-            return False
-        elif (parent_page and "max_hops" in self.scope
+        # enforce max_hops
+        if (parent_page and "max_hops" in self.scope
                 and parent_page.hops_from_seed >= self.scope["max_hops"]):
-            pass
-        elif url.surt().startswith(self.scope["surt"].encode("utf-8")):
-            might_accept = True
-        elif parent_page and parent_page.hops_off_surt < self.scope.get(
-                "max_hops_off_surt", 0):
-            might_accept = True
-        elif "accepts" in self.scope:
-            for accept_rule in self.scope["accepts"]:
-                rule = urlcanon.MatchRule(**accept_rule)
-                if try_parent_urls:
-                    for parent_url in try_parent_urls:
-                        if rule.applies(url, parent_url):
-                            might_accept = True
-                else:
-                    if rule.applies(url):
-                        might_accept = True
+            return False

-        if might_accept:
-            if "blocks" in self.scope:
-                for block_rule in self.scope["blocks"]:
-                    rule = urlcanon.MatchRule(**block_rule)
-                    if try_parent_urls:
-                        for parent_url in try_parent_urls:
-                            if rule.applies(url, parent_url):
-                                return False
-                    else:
-                        if rule.applies(url):
-                            return False
-            return True
-        else:
-            return False
+        # enforce reject rules
+        if "blocks" in self.scope:
+            for block_rule in self.scope["blocks"]:
+                rule = urlcanon.MatchRule(**block_rule)
+                if try_parent_urls:
+                    for parent_url in try_parent_urls:
+                        if rule.applies(url, parent_url):
+                            return False
+                else:
+                    if rule.applies(url):
+                        return False
+
+        # honor accept rules
+        for accept_rule in self.scope["accepts"]:
+            rule = urlcanon.MatchRule(**accept_rule)
+            if try_parent_urls:
+                for parent_url in try_parent_urls:
+                    if rule.applies(url, parent_url):
+                        return True
+            else:
+                if rule.applies(url):
+                    return True
+
+        # no decision if we reach here
+        return None

 class Page(doublethink.Document):
     logger = logging.getLogger(__module__ + "." + __qualname__)
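Putting the rewritten method together: reject rules are checked before accept rules, and None is reserved for URLs no rule covers, leaving the hops-off logic in the frontier to decide. A standalone sketch of that precedence using urlcanon directly (the rules and URLs are hypothetical):

    import urlcanon

    blocks = [urlcanon.MatchRule(
            ssurt=urlcanon.semantic('http://example.com/private/').ssurt())]
    accepts = [urlcanon.MatchRule(
            ssurt=urlcanon.semantic('http://example.com/').ssurt())]

    def accept_reject_or_neither(url):
        url = urlcanon.semantic(url)
        if any(rule.applies(url) for rule in blocks):
            return False  # reject rules win over accept rules
        if any(rule.applies(url) for rule in accepts):
            return True
        return None  # no decision; frontier may still crawl it hops-off

    accept_reject_or_neither('http://example.com/private/x')  # False
    accept_reject_or_neither('http://example.com/pub/')       # True
    accept_reject_or_neither('http://example.org/')           # None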