s/max_hops_off_surt/max_hops_off/

This commit is contained in:
Noah Levitt 2018-03-22 17:18:36 -07:00
parent 5ebd2fb709
commit 85a4757527
5 changed files with 13 additions and 4 deletions

View File

@ -338,7 +338,7 @@ class RethinkDbFrontier:
hops_off = 0
elif decision is None:
decision = parent_page.hops_off < site.scope.get(
'max_hops_off_surt', 0)
'max_hops_off', 0)
hops_off = parent_page.hops_off + 1
if decision is True:
if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):

View File

@ -65,7 +65,7 @@ id:
max_hops:
type: integer
max_hops_off_surt:
max_hops_off:
type: integer
metadata:

View File

@ -183,6 +183,11 @@ class Site(doublethink.Document, ElapsedMixIn):
self.last_claimed = brozzler.EPOCH_UTC
if not "scope" in self:
self.scope = {}
if ("max_hops_off_surt" in self.scope
and not "max_hops_off" in self.scope):
self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
if "max_hops_off_surt" in self.scope:
del self.scope["max_hops_off_surt"]
if self.seed:
self._accept_ssurt_if_not_redundant(
brozzler.site_surt_canon(self.seed).ssurt())

View File

@ -271,7 +271,7 @@ are not. Example::
substring: wp-login.php?action=logout
- domain: malware.us
max_hops: 20
max_hops_off_surt: 0
max_hops_off: 0
Toward the end of the process of brozzling a page, brozzler obtains a list of
navigational links (``<a href="...">`` and similar) on the page, and evaluates
@ -356,7 +356,7 @@ List of scope rules.
| number | no | *none* |
+--------+----------+---------+
``max_hops_off_surt``
``max_hops_off``
~~~~~~~~~~~~~~~~~~~~~
+--------+----------+---------+
| type | required | default |

View File

@ -989,6 +989,10 @@ def test_max_hops_off():
brozzler.new_site(frontier, site)
site.refresh() # get it back from the db
# the scope param `max_hops_off_surt` was renamed to `max_hops_off`; the old key should be migrated
assert not 'max_hops_off_surt' in site.scope
assert site.scope['max_hops_off'] == 1
seed_page = frontier.seed_page(site.id)
assert site.accept_reject_or_neither('http://foo.org/', seed_page) is None