From 85a475752798db82c3b76a8b59cca4543bb95000 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 22 Mar 2018 17:18:36 -0700 Subject: [PATCH] s/max_hops_off_surt/max_hops_off/ --- brozzler/frontier.py | 2 +- brozzler/job_schema.yaml | 2 +- brozzler/model.py | 5 +++++ job-conf.rst | 4 ++-- tests/test_frontier.py | 4 ++++ 5 files changed, 13 insertions(+), 4 deletions(-) diff --git a/brozzler/frontier.py b/brozzler/frontier.py index b9785bc..2e076d3 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -338,7 +338,7 @@ class RethinkDbFrontier: hops_off = 0 elif decision is None: decision = parent_page.hops_off < site.scope.get( - 'max_hops_off_surt', 0) + 'max_hops_off', 0) hops_off = parent_page.hops_off + 1 if decision is True: if brozzler.is_permitted_by_robots(site, str(url_for_crawling)): diff --git a/brozzler/job_schema.yaml b/brozzler/job_schema.yaml index 14445bc..6069de8 100644 --- a/brozzler/job_schema.yaml +++ b/brozzler/job_schema.yaml @@ -65,7 +65,7 @@ id: max_hops: type: integer - max_hops_off_surt: + max_hops_off: type: integer metadata: diff --git a/brozzler/model.py b/brozzler/model.py index 9c6f482..5e787dc 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -183,6 +183,11 @@ class Site(doublethink.Document, ElapsedMixIn): self.last_claimed = brozzler.EPOCH_UTC if not "scope" in self: self.scope = {} + if ("max_hops_off_surt" in self.scope + and not "max_hops_off" in self.scope): + self.scope["max_hops_off"] = self.scope["max_hops_off_surt"] + if "max_hops_off_surt" in self.scope: + del self.scope["max_hops_off_surt"] if self.seed: self._accept_ssurt_if_not_redundant( brozzler.site_surt_canon(self.seed).ssurt()) diff --git a/job-conf.rst b/job-conf.rst index 4a0dbf5..22c6992 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -271,7 +271,7 @@ are not. 
Example:: substring: wp-login.php?action=logout - domain: malware.us max_hops: 20 - max_hops_off_surt: 0 + max_hops_off: 0 Toward the end of the process of brozzling a page, brozzler obtains a list of navigational links (``<a href="...">`` and similar) on the page, and evaluates @@ -356,7 +356,7 @@ List of scope rules. | number | no | *none* | +--------+----------+---------+ -``max_hops_off_surt`` +``max_hops_off`` ~~~~~~~~~~~~~~~~~~~~~ +--------+----------+---------+ | type | required | default | diff --git a/tests/test_frontier.py b/tests/test_frontier.py index c075edb..4906919 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -989,6 +989,10 @@ def test_max_hops_off(): brozzler.new_site(frontier, site) site.refresh() # get it back from the db + # renamed this param + assert not 'max_hops_off_surt' in site.scope + assert site.scope['max_hops_off'] == 1 + seed_page = frontier.seed_page(site.id) assert site.accept_reject_or_neither('http://foo.org/', seed_page) is None