diff --git a/brozzler/frontier.py b/brozzler/frontier.py
index a0f8ab4..5276f72 100644
--- a/brozzler/frontier.py
+++ b/brozzler/frontier.py
@@ -291,75 +291,80 @@ class RethinkDbFrontier:
                 {"start":doublethink.utcnow(), "stop":None})
         site.save()
 
+    def _build_fresh_page(self, site, parent_page, url, hops_off=0):
+        url_for_scoping = urlcanon.semantic(url)
+        url_for_crawling = urlcanon.whatwg(url)
+        hashtag = (url_for_crawling.hash_sign
+                + url_for_crawling.fragment).decode('utf-8')
+        urlcanon.canon.remove_fragment(url_for_crawling)
+        page = brozzler.Page(self.rr, {
+            'url': str(url_for_crawling),
+            'site_id': site.id,
+            'job_id': site.job_id,
+            'hops_from_seed': parent_page.hops_from_seed + 1,
+            'via_page_id': parent_page.id,
+            'hops_off_surt': hops_off,
+            'hashtags': [hashtag] if hashtag else []})
+        return page
+
+    def _merge_page(self, existing_page, fresh_page):
+        '''
+        Utility method for merging info from `brozzler.Page` instances
+        representing the same url but with possibly different metadata.
+        '''
+        existing_page.priority += fresh_page.priority
+        existing_page.hashtags = list(set(
+                existing_page.hashtags + fresh_page.hashtags))
+        existing_page.hops_off_surt = min(
+                existing_page.hops_off_surt, fresh_page.hops_off_surt)
+
     def _scope_and_enforce_robots(self, site, parent_page, outlinks):
         '''
         Returns tuple (
-        set of in scope urls (uncanonicalized) accepted by robots policy,
+        dict of {page_id: Page} of fresh `brozzler.Page` representing in
+        scope links accepted by robots policy,
         set of in scope urls (canonicalized) blocked by robots policy,
         set of out-of-scope urls (canonicalized)).
         '''
-        in_scope = set()
+        pages = {} # {page_id: Page, ...}
         blocked = set()
         out_of_scope = set()
         for url in outlinks or []:
             url_for_scoping = urlcanon.semantic(url)
             url_for_crawling = urlcanon.whatwg(url)
-            urlcanon.canon.remove_fragment(url_for_crawling)
-            if site.is_in_scope(url_for_scoping, parent_page=parent_page):
+            decision = site.accept_reject_or_neither(
+                    url_for_scoping, parent_page=parent_page)
+            if decision is True:
+                hops_off = 0
+            elif decision is None:
+                decision = parent_page.hops_off_surt < site.scope.get(
+                        'max_hops_off_surt', 0)
+                hops_off = parent_page.hops_off_surt + 1
+            if decision is True:
                 if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
-                    in_scope.add(url)
+                    fresh_page = self._build_fresh_page(
+                            site, parent_page, url, hops_off)
+                    if fresh_page.id in pages:
+                        self._merge_page(pages[fresh_page.id], fresh_page)
+                    else:
+                        pages[fresh_page.id] = fresh_page
                 else:
                     blocked.add(str(url_for_crawling))
             else:
                 out_of_scope.add(str(url_for_crawling))
-        return in_scope, blocked, out_of_scope
-
-    def _build_fresh_pages(self, site, parent_page, urls):
-        '''
-        Returns a dict of page_id => brozzler.Page.
-        '''
-        pages = {}
-        for url in urls:
-            url_for_scoping = urlcanon.semantic(url)
-            url_for_crawling = urlcanon.whatwg(url)
-            hashtag = (url_for_crawling.hash_sign
-                    + url_for_crawling.fragment).decode('utf-8')
-            urlcanon.canon.remove_fragment(url_for_crawling)
-            if not url_for_scoping.surt().startswith(
-                    site.scope['surt'].encode('utf-8')):
-                hops_off_surt = parent_page.hops_off_surt + 1
-            else:
-                hops_off_surt = 0
-            page = brozzler.Page(self.rr, {
-                'url': str(url_for_crawling),
-                'site_id': site.id,
-                'job_id': site.job_id,
-                'hops_from_seed': parent_page.hops_from_seed + 1,
-                'via_page_id': parent_page.id,
-                'hops_off_surt': hops_off_surt,
-                'hashtags': []})
-            if page.id in pages:
-                pages[page.id].priority += page.priority
-                page = pages[page.id]
-            else:
-                pages[page.id] = page
-            if hashtag:
-                page.hashtags = list(set(page.hashtags + [hashtag]))
-        return pages
+        return pages, blocked, out_of_scope
 
     def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
         decisions = {'accepted':set(),'blocked':set(),'rejected':set()}
         counts = {'added':0,'updated':0,'rejected':0,'blocked':0}
 
-        in_scope, blocked, out_of_scope = self._scope_and_enforce_robots(
+        fresh_pages, blocked, out_of_scope = self._scope_and_enforce_robots(
                 site, parent_page, outlinks)
         decisions['blocked'] = blocked
         decisions['rejected'] = out_of_scope
         counts['blocked'] += len(blocked)
         counts['rejected'] += len(out_of_scope)
 
-        fresh_pages = self._build_fresh_pages(site, parent_page, in_scope)
-
         # get existing pages from rethinkdb
         results = self.rr.table('pages').get_all(*fresh_pages.keys()).run()
         pages = {doc['id']: brozzler.Page(self.rr, doc) for doc in results}
diff --git a/brozzler/model.py b/brozzler/model.py
index 96c4ca7..6b35bc2 100644
--- a/brozzler/model.py
+++ b/brozzler/model.py
@@ -183,9 +183,9 @@ class Site(doublethink.Document, ElapsedMixIn):
             self.last_claimed = brozzler.EPOCH_UTC
         if not "scope" in self:
             self.scope = {}
-        if not "surt" in self.scope and self.seed:
-            self.scope["surt"] = brozzler.site_surt_canon(
-                    self.seed).surt().decode('ascii')
+        if self.seed:
+            self._accept_ssurt_if_not_redundant(
+                    brozzler.site_surt_canon(self.seed).ssurt())
 
         if not "starts_and_stops" in self:
             if self.get("start_time"): # backward compatibility
@@ -201,14 +201,20 @@ class Site(doublethink.Document, ElapsedMixIn):
     def __str__(self):
         return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)
 
-    def note_seed_redirect(self, url):
-        new_scope_surt = brozzler.site_surt_canon(url).surt().decode("ascii")
+    def _accept_ssurt_if_not_redundant(self, ssurt):
         if not "accepts" in self.scope:
             self.scope["accepts"] = []
-        if not new_scope_surt.startswith(self.scope["surt"]):
+        simple_rule_ssurts = (
+                rule["ssurt"] for rule in self.scope["accepts"]
+                if set(rule.keys()) == {'ssurt'})
+        if not any(ssurt.startswith(ss) for ss in simple_rule_ssurts):
             self.logger.info(
-                    "adding surt %s to scope accept rules", new_scope_surt)
-            self.scope.accepts.append({"surt": new_scope_surt})
+                    "adding ssurt %s to scope accept rules", ssurt)
+            self.scope["accepts"].append({"ssurt": ssurt})
+
+    def note_seed_redirect(self, url):
+        self._accept_ssurt_if_not_redundant(
+                brozzler.site_surt_canon(url).ssurt())
 
     def extra_headers(self):
         hdrs = {}
@@ -217,9 +223,20 @@ class Site(doublethink.Document, ElapsedMixIn):
                     self.warcprox_meta, separators=(',', ':'))
         return hdrs
 
-    def is_in_scope(self, url, parent_page=None):
+    def accept_reject_or_neither(self, url, parent_page=None):
+        '''
+        Returns `True` (accepted), `False` (rejected), or `None` (no decision).
+
+        `None` usually means rejected, unless `max_hops_off_surt` comes into play.
+        '''
         if not isinstance(url, urlcanon.ParsedUrl):
             url = urlcanon.semantic(url)
+
+        if not url.scheme in (b'http', b'https'):
+            # XXX doesn't belong here maybe (where? worker ignores unknown
+            # schemes?)
+            return False
+
         try_parent_urls = []
         if parent_page:
             try_parent_urls.append(urlcanon.semantic(parent_page.url))
@@ -227,44 +244,36 @@ class Site(doublethink.Document, ElapsedMixIn):
                 try_parent_urls.append(
                         urlcanon.semantic(parent_page.redirect_url))
 
-        might_accept = False
-        if not url.scheme in (b'http', b'https'):
-            # XXX doesn't belong here maybe (where? worker ignores unknown
-            # schemes?)
-            return False
-        elif (parent_page and "max_hops" in self.scope
+        # enforce max_hops
+        if (parent_page and "max_hops" in self.scope
                 and parent_page.hops_from_seed >= self.scope["max_hops"]):
-            pass
-        elif url.surt().startswith(self.scope["surt"].encode("utf-8")):
-            might_accept = True
-        elif parent_page and parent_page.hops_off_surt < self.scope.get(
-                "max_hops_off_surt", 0):
-            might_accept = True
-        elif "accepts" in self.scope:
-            for accept_rule in self.scope["accepts"]:
-                rule = urlcanon.MatchRule(**accept_rule)
+            return False
+
+        # enforce reject rules
+        if "blocks" in self.scope:
+            for block_rule in self.scope["blocks"]:
+                rule = urlcanon.MatchRule(**block_rule)
                 if try_parent_urls:
                     for parent_url in try_parent_urls:
                         if rule.applies(url, parent_url):
-                            might_accept = True
+                            return False
                 else:
                     if rule.applies(url):
-                        might_accept = True
+                        return False
 
-        if might_accept:
-            if "blocks" in self.scope:
-                for block_rule in self.scope["blocks"]:
-                    rule = urlcanon.MatchRule(**block_rule)
-                    if try_parent_urls:
-                        for parent_url in try_parent_urls:
-                            if rule.applies(url, parent_url):
-                                return False
-                    else:
-                        if rule.applies(url):
-                            return False
-            return True
-        else:
-            return False
+        # honor accept rules
+        for accept_rule in self.scope["accepts"]:
+            rule = urlcanon.MatchRule(**accept_rule)
+            if try_parent_urls:
+                for parent_url in try_parent_urls:
+                    if rule.applies(url, parent_url):
+                        return True
+            else:
+                if rule.applies(url):
+                    return True
+
+        # no decision if we reach here
+        return None
 
 class Page(doublethink.Document):
     logger = logging.getLogger(__module__ + "." + __qualname__)
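
The decision flow this patch splits between Site.accept_reject_or_neither and RethinkDbFrontier._scope_and_enforce_robots is: hard-reject non-http(s) schemes and anything past max_hops, then apply block rules, then accept rules, and otherwise return "no decision", which the frontier resolves by allowing up to max_hops_off_surt hops beyond the accepted scope. The sketch below is illustrative only and not brozzler code: rule matching is reduced to plain string-prefix checks in place of urlcanon.MatchRule, and the scope dict and 'ssurt' strings are hypothetical placeholders rather than exact ssurt serializations.

# Illustrative sketch of the scoping decision order (simplified, not brozzler's API).
def accept_reject_or_neither(url, scope, hops_from_seed=0):
    # hard reject: too many hops from the seed
    if 'max_hops' in scope and hops_from_seed >= scope['max_hops']:
        return False
    # block rules win unconditionally
    for rule in scope.get('blocks', []):
        if url.startswith(rule['ssurt']):
            return False
    # accept rules
    for rule in scope.get('accepts', []):
        if url.startswith(rule['ssurt']):
            return True
    # no decision; the frontier may still take it via max_hops_off_surt
    return None

def frontier_accepts(decision, parent_hops_off_surt, scope):
    # mirrors the new logic in _scope_and_enforce_robots:
    # True  -> in scope, hops_off reset to 0
    # None  -> in scope only while the parent is within max_hops_off_surt
    if decision is True:
        return True, 0
    if decision is None:
        allowed = parent_hops_off_surt < scope.get('max_hops_off_surt', 0)
        return allowed, parent_hops_off_surt + 1
    return False, None

if __name__ == '__main__':
    scope = {
        'accepts': [{'ssurt': 'com,example,'}],                  # placeholder prefix rule
        'blocks': [{'ssurt': 'com,example,//https:/private'}],   # placeholder prefix rule
        'max_hops_off_surt': 1,
    }
    print(accept_reject_or_neither('com,example,//https:/page', scope))       # True
    print(accept_reject_or_neither('com,example,//https:/private/x', scope))  # False
    print(accept_reject_or_neither('org,other,//https:/', scope))             # None
    # an undecided link one hop off the accepted scope is still crawled
    print(frontier_accepts(None, 0, scope))   # (True, 1)
    print(frontier_accepts(None, 1, scope))   # (False, 2)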