Mirror of https://github.com/internetarchive/brozzler.git (synced 2025-08-01 02:56:10 -04:00)
commit 526a4d718f (parent 245e27a21a)

    tests for new approach without scope['surt'], replaced by an accept rule
    (two rules in some cases of seed redirects)

2 changed files with 96 additions and 82 deletions
diff --git a/brozzler/frontier.py b/brozzler/frontier.py
@@ -291,75 +291,80 @@ class RethinkDbFrontier:
                     {"start":doublethink.utcnow(), "stop":None})
         site.save()
 
-    def _scope_and_enforce_robots(self, site, parent_page, outlinks):
-        '''
-        Returns tuple (
-        set of in scope urls (uncanonicalized) accepted by robots policy,
-        set of in scope urls (canonicalized) blocked by robots policy,
-        set of out-of-scope urls (canonicalized)).
-        '''
-        in_scope = set()
-        blocked = set()
-        out_of_scope = set()
-        for url in outlinks or []:
-            url_for_scoping = urlcanon.semantic(url)
-            url_for_crawling = urlcanon.whatwg(url)
-            urlcanon.canon.remove_fragment(url_for_crawling)
-            if site.is_in_scope(url_for_scoping, parent_page=parent_page):
-                if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
-                    in_scope.add(url)
-                else:
-                    blocked.add(str(url_for_crawling))
-            else:
-                out_of_scope.add(str(url_for_crawling))
-        return in_scope, blocked, out_of_scope
-
-    def _build_fresh_pages(self, site, parent_page, urls):
-        '''
-        Returns a dict of page_id => brozzler.Page.
-        '''
-        pages = {}
-        for url in urls:
-            url_for_scoping = urlcanon.semantic(url)
-            url_for_crawling = urlcanon.whatwg(url)
-            hashtag = (url_for_crawling.hash_sign
-                    + url_for_crawling.fragment).decode('utf-8')
-            urlcanon.canon.remove_fragment(url_for_crawling)
-            if not url_for_scoping.surt().startswith(
-                    site.scope['surt'].encode('utf-8')):
-                hops_off_surt = parent_page.hops_off_surt + 1
-            else:
-                hops_off_surt = 0
-            page = brozzler.Page(self.rr, {
-                'url': str(url_for_crawling),
-                'site_id': site.id,
-                'job_id': site.job_id,
-                'hops_from_seed': parent_page.hops_from_seed + 1,
-                'via_page_id': parent_page.id,
-                'hops_off_surt': hops_off_surt,
-                'hashtags': []})
-            if page.id in pages:
-                pages[page.id].priority += page.priority
-                page = pages[page.id]
-            else:
-                pages[page.id] = page
-            if hashtag:
-                page.hashtags = list(set(page.hashtags + [hashtag]))
-        return pages
+    def _build_fresh_page(self, site, parent_page, url, hops_off=0):
+        url_for_scoping = urlcanon.semantic(url)
+        url_for_crawling = urlcanon.whatwg(url)
+        hashtag = (url_for_crawling.hash_sign
+                + url_for_crawling.fragment).decode('utf-8')
+        urlcanon.canon.remove_fragment(url_for_crawling)
+        page = brozzler.Page(self.rr, {
+            'url': str(url_for_crawling),
+            'site_id': site.id,
+            'job_id': site.job_id,
+            'hops_from_seed': parent_page.hops_from_seed + 1,
+            'via_page_id': parent_page.id,
+            'hops_off_surt': hops_off,
+            'hashtags': [hashtag] if hashtag else []})
+        return page
+
+    def _merge_page(self, existing_page, fresh_page):
+        '''
+        Utility method for merging info from `brozzler.Page` instances
+        representing the same url but with possibly different metadata.
+        '''
+        existing_page.priority += fresh_page.priority
+        existing_page.hashtags = list(set(
+                existing_page.hashtags + fresh_page.hashtags))
+        existing_page.hops_off_surt = min(
+                existing_page.hops_off_surt, fresh_page.hops_off_surt)
+
+    def _scope_and_enforce_robots(self, site, parent_page, outlinks):
+        '''
+        Returns tuple (
+        dict of {page_id: Page} of fresh `brozzler.Page` representing in
+        scope links accepted by robots policy,
+        set of in scope urls (canonicalized) blocked by robots policy,
+        set of out-of-scope urls (canonicalized)).
+        '''
+        pages = {}  # {page_id: Page, ...}
+        blocked = set()
+        out_of_scope = set()
+        for url in outlinks or []:
+            url_for_scoping = urlcanon.semantic(url)
+            url_for_crawling = urlcanon.whatwg(url)
+            decision = site.accept_reject_or_neither(
+                    url_for_scoping, parent_page=parent_page)
+            if decision is True:
+                hops_off = 0
+            elif decision is None:
+                decision = parent_page.hops_off_surt < site.scope.get(
+                        'max_hops_off_surt', 0)
+                hops_off = parent_page.hops_off_surt + 1
+            if decision is True:
+                if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
+                    fresh_page = self._build_fresh_page(
+                            site, parent_page, url, hops_off)
+                    if fresh_page.id in pages:
+                        self._merge_page(pages[fresh_page.id], fresh_page)
+                    else:
+                        pages[fresh_page.id] = fresh_page
+                else:
+                    blocked.add(str(url_for_crawling))
+            else:
+                out_of_scope.add(str(url_for_crawling))
+        return pages, blocked, out_of_scope
+
     def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
         decisions = {'accepted':set(),'blocked':set(),'rejected':set()}
         counts = {'added':0,'updated':0,'rejected':0,'blocked':0}
 
-        in_scope, blocked, out_of_scope = self._scope_and_enforce_robots(
+        fresh_pages, blocked, out_of_scope = self._scope_and_enforce_robots(
                 site, parent_page, outlinks)
         decisions['blocked'] = blocked
         decisions['rejected'] = out_of_scope
         counts['blocked'] += len(blocked)
         counts['rejected'] += len(out_of_scope)
 
-        fresh_pages = self._build_fresh_pages(site, parent_page, in_scope)
-
         # get existing pages from rethinkdb
         results = self.rr.table('pages').get_all(*fresh_pages.keys()).run()
         pages = {doc['id']: brozzler.Page(self.rr, doc) for doc in results}
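Review note: the behavioral core of the frontier change above is that scoping is now a three-way decision rather than a boolean. A minimal sketch of the new control flow, using plain stand-ins instead of real brozzler/doublethink objects (scope_outlinks and robots_ok below are hypothetical names for illustration, not part of brozzler):

    # Hedged sketch of the tri-state decision loop; stand-in types, not
    # the real brozzler API.
    def scope_outlinks(site, parent_page, outlinks, robots_ok):
        pages, blocked, out_of_scope = {}, set(), set()
        for url in outlinks:
            decision = site.accept_reject_or_neither(url, parent_page)
            if decision is True:
                hops_off = 0  # explicit accept: back "on the surt"
            elif decision is None:
                # no rule fired either way: accept only while hops-off
                # budget remains, and charge this link one more hop
                decision = parent_page.hops_off_surt < site.scope.get(
                        'max_hops_off_surt', 0)
                hops_off = parent_page.hops_off_surt + 1
            if decision is True:
                if robots_ok(site, url):
                    pages[url] = hops_off  # stand-in for a fresh Page
                else:
                    blocked.add(url)
            else:
                out_of_scope.add(url)
        return pages, blocked, out_of_scope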
diff --git a/brozzler/site.py b/brozzler/site.py
@@ -183,9 +183,9 @@ class Site(doublethink.Document, ElapsedMixIn):
         self.last_claimed = brozzler.EPOCH_UTC
         if not "scope" in self:
             self.scope = {}
-        if not "surt" in self.scope and self.seed:
-            self.scope["surt"] = brozzler.site_surt_canon(
-                    self.seed).surt().decode('ascii')
+        if self.seed:
+            self._accept_ssurt_if_not_redundant(
+                    brozzler.site_surt_canon(self.seed).ssurt())
 
         if not "starts_and_stops" in self:
             if self.get("start_time"):   # backward compatibility
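After this hunk, seed scoping is expressed the same way as any other accept rule instead of via a special scope['surt'] value. A hedged sketch of the resulting site document shape (the literal ssurt bytes are whatever brozzler.site_surt_canon(seed).ssurt() returns; abbreviated here):

    # Sketch only; the real ssurt value comes from urlcanon.
    site_doc = {
        'seed': 'http://example.com/',
        'scope': {
            # before this commit: {'surt': '...'} computed once from the seed
            'accepts': [
                {'ssurt': b'com,example,//...'},  # seed's own ssurt prefix
            ],
        },
    }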
@@ -201,14 +201,20 @@ class Site(doublethink.Document, ElapsedMixIn):
     def __str__(self):
         return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)
 
-    def note_seed_redirect(self, url):
-        new_scope_surt = brozzler.site_surt_canon(url).surt().decode("ascii")
+    def _accept_ssurt_if_not_redundant(self, ssurt):
         if not "accepts" in self.scope:
             self.scope["accepts"] = []
-        if not new_scope_surt.startswith(self.scope["surt"]):
+        simple_rule_ssurts = (
+                rule["ssurt"] for rule in self.scope["accepts"]
+                if set(rule.keys()) == {'ssurt'})
+        if not any(ssurt.startswith(ss) for ss in simple_rule_ssurts):
             self.logger.info(
-                    "adding surt %s to scope accept rules", new_scope_surt)
-            self.scope.accepts.append({"surt": new_scope_surt})
+                    "adding ssurt %s to scope accept rules", ssurt)
+            self.scope["accepts"].append({"ssurt": ssurt})
+
+    def note_seed_redirect(self, url):
+        self._accept_ssurt_if_not_redundant(
+                brozzler.site_surt_canon(url).ssurt())
 
     def extra_headers(self):
         hdrs = {}
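The redundancy check above is what the commit message's "(two rules in some cases of seed redirects)" refers to: a seed redirect that stays under an existing simple ssurt prefix adds nothing, while a redirect to another host appends a second accept rule. A standalone sketch of the same logic over a plain dict (placeholder ssurt bytes, not real urlcanon output):

    # Hedged re-implementation of _accept_ssurt_if_not_redundant, for
    # illustration; operates on a plain dict instead of a Site document.
    def accept_ssurt_if_not_redundant(scope, ssurt):
        accepts = scope.setdefault('accepts', [])
        # only bare {'ssurt': ...} rules are prefix-comparable; rules with
        # other keys can't be assumed to subsume the new ssurt
        simple_rule_ssurts = (
                rule['ssurt'] for rule in accepts
                if set(rule.keys()) == {'ssurt'})
        if not any(ssurt.startswith(ss) for ss in simple_rule_ssurts):
            accepts.append({'ssurt': ssurt})

    scope = {}
    accept_ssurt_if_not_redundant(scope, b'com,example,//...')        # seed
    accept_ssurt_if_not_redundant(scope, b'com,example,//.../login')  # redundant
    accept_ssurt_if_not_redundant(scope, b'org,example,//...')        # off-host redirect
    assert len(scope['accepts']) == 2  # seed rule + one for the redirect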
@@ -217,9 +223,20 @@ class Site(doublethink.Document, ElapsedMixIn):
                     self.warcprox_meta, separators=(',', ':'))
         return hdrs
 
-    def is_in_scope(self, url, parent_page=None):
+    def accept_reject_or_neither(self, url, parent_page=None):
+        '''
+        Returns `True` (accepted), `False` (rejected), or `None` (no decision).
+
+        `None` usually means rejected, unless `max_hops_off` comes into play.
+        '''
         if not isinstance(url, urlcanon.ParsedUrl):
             url = urlcanon.semantic(url)
+
+        if not url.scheme in (b'http', b'https'):
+            # XXX doesn't belong here maybe (where? worker ignores unknown
+            # schemes?)
+            return False
+
         try_parent_urls = []
         if parent_page:
             try_parent_urls.append(urlcanon.semantic(parent_page.url))
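try_parent_urls collects both the parent page's url and, just below, its redirect_url, because a match rule can be conditioned on the linking page. A hedged example of such a rule; the kwargs are assumptions based on urlcanon's documented rule fields (substring, parent_url_regex), not something this diff adds:

    import urlcanon

    # hypothetical scope rule: accept /about only when linked from the blog
    rule = urlcanon.MatchRule(
            substring='example.com/about',
            parent_url_regex=r'^https?://example\.com/blog/.*$')
    url = urlcanon.semantic('https://example.com/about')
    parent = urlcanon.semantic('https://example.com/blog/post-1')
    assert rule.applies(url, parent)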
@@ -227,31 +244,12 @@ class Site(doublethink.Document, ElapsedMixIn):
             try_parent_urls.append(
                     urlcanon.semantic(parent_page.redirect_url))
 
-        might_accept = False
-        if not url.scheme in (b'http', b'https'):
-            # XXX doesn't belong here maybe (where? worker ignores unknown
-            # schemes?)
-            return False
-        elif (parent_page and "max_hops" in self.scope
+        # enforce max_hops
+        if (parent_page and "max_hops" in self.scope
                 and parent_page.hops_from_seed >= self.scope["max_hops"]):
-            pass
-        elif url.surt().startswith(self.scope["surt"].encode("utf-8")):
-            might_accept = True
-        elif parent_page and parent_page.hops_off_surt < self.scope.get(
-                "max_hops_off_surt", 0):
-            might_accept = True
-        elif "accepts" in self.scope:
-            for accept_rule in self.scope["accepts"]:
-                rule = urlcanon.MatchRule(**accept_rule)
-                if try_parent_urls:
-                    for parent_url in try_parent_urls:
-                        if rule.applies(url, parent_url):
-                            might_accept = True
-                else:
-                    if rule.applies(url):
-                        might_accept = True
-
-        if might_accept:
-            if "blocks" in self.scope:
-                for block_rule in self.scope["blocks"]:
-                    rule = urlcanon.MatchRule(**block_rule)
+            return False
+
+        # enforce reject rules
+        if "blocks" in self.scope:
+            for block_rule in self.scope["blocks"]:
+                rule = urlcanon.MatchRule(**block_rule)
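Net rule precedence after the two hunks above: max_hops rejects first, then block rules, then accept rules, and an unmatched url falls through to None for the caller to resolve against the hops-off budget. A toy model of that ordering (max_hops omitted; the predicates stand in for urlcanon.MatchRule):

    # Toy precedence model, not the real Site API.
    def decide(url, blocks, accepts):
        if any(rule(url) for rule in blocks):
            return False  # reject rules win over accepts
        if any(rule(url) for rule in accepts):
            return True
        return None       # "neither": caller applies the hops-off budget

    in_seed_site = lambda u: u.startswith('https://example.com/')
    is_private = lambda u: '/private/' in u

    assert decide('https://example.com/a', [is_private], [in_seed_site]) is True
    assert decide('https://example.com/private/a',
                  [is_private], [in_seed_site]) is False
    assert decide('https://other.org/', [is_private], [in_seed_site]) is None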
@@ -262,9 +260,20 @@ class Site(doublethink.Document, ElapsedMixIn):
-                    else:
-                        if rule.applies(url):
-                            return False
-            return True
-        else:
-            return False
+                else:
+                    if rule.applies(url):
+                        return False
+
+        # honor accept rules
+        for accept_rule in self.scope["accepts"]:
+            rule = urlcanon.MatchRule(**accept_rule)
+            if try_parent_urls:
+                for parent_url in try_parent_urls:
+                    if rule.applies(url, parent_url):
+                        return True
+            else:
+                if rule.applies(url):
+                    return True
+
+        # no decision if we reach here
+        return None
 
 class Page(doublethink.Document):
     logger = logging.getLogger(__module__ + "." + __qualname__)
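Since the commit message promises tests for the new approach, here is a sketch of the kind of assertion such a test would make (the constructor call mirrors brozzler's existing unit tests; the test name and exact urls are illustrative):

    # Illustrative pytest-style sketch; the tests shipped with this commit
    # may assert more, or differently.
    import brozzler

    def test_seed_redirect_accept_rules():
        site = brozzler.Site(None, {'seed': 'http://example.com/'})
        assert len(site.scope['accepts']) == 1  # the seed's own ssurt rule

        # same-host redirect: redundant with the seed rule, nothing added
        site.note_seed_redirect('http://example.com/welcome')
        assert len(site.scope['accepts']) == 1

        # cross-host redirect: a second accept rule is appended
        site.note_seed_redirect('https://www.example.org/')
        assert len(site.scope['accepts']) == 2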