mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-19 07:15:52 -04:00
s/max_hops_off_surt/max_hops_off/
This commit is contained in:
parent
5ebd2fb709
commit
85a4757527
@ -338,7 +338,7 @@ class RethinkDbFrontier:
|
||||
hops_off = 0
|
||||
elif decision is None:
|
||||
decision = parent_page.hops_off < site.scope.get(
|
||||
'max_hops_off_surt', 0)
|
||||
'max_hops_off', 0)
|
||||
hops_off = parent_page.hops_off + 1
|
||||
if decision is True:
|
||||
if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
|
||||
|
@ -65,7 +65,7 @@ id:
|
||||
max_hops:
|
||||
type: integer
|
||||
|
||||
max_hops_off_surt:
|
||||
max_hops_off:
|
||||
type: integer
|
||||
|
||||
metadata:
|
||||
|
@ -183,6 +183,11 @@ class Site(doublethink.Document, ElapsedMixIn):
|
||||
self.last_claimed = brozzler.EPOCH_UTC
|
||||
if not "scope" in self:
|
||||
self.scope = {}
|
||||
if ("max_hops_off_surt" in self.scope
|
||||
and not "max_hops_off" in self.scope):
|
||||
self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
|
||||
if "max_hops_off_surt" in self.scope:
|
||||
del self.scope["max_hops_off_surt"]
|
||||
if self.seed:
|
||||
self._accept_ssurt_if_not_redundant(
|
||||
brozzler.site_surt_canon(self.seed).ssurt())
|
||||
|
@ -271,7 +271,7 @@ are not. Example::
|
||||
substring: wp-login.php?action=logout
|
||||
- domain: malware.us
|
||||
max_hops: 20
|
||||
max_hops_off_surt: 0
|
||||
max_hops_off: 0
|
||||
|
||||
Toward the end of the process of brozzling a page, brozzler obtains a list of
|
||||
navigational links (``<a href="...">`` and similar) on the page, and evaluates
|
||||
@ -356,7 +356,7 @@ List of scope rules.
|
||||
| number | no | *none* |
|
||||
+--------+----------+---------+
|
||||
|
||||
``max_hops_off_surt``
|
||||
``max_hops_off``
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
+--------+----------+---------+
|
||||
| type | required | default |
|
||||
|
@ -989,6 +989,10 @@ def test_max_hops_off():
|
||||
brozzler.new_site(frontier, site)
|
||||
site.refresh() # get it back from the db
|
||||
|
||||
# renamed this param
|
||||
assert not 'max_hops_off_surt' in site.scope
|
||||
assert site.scope['max_hops_off'] == 1
|
||||
|
||||
seed_page = frontier.seed_page(site.id)
|
||||
|
||||
assert site.accept_reject_or_neither('http://foo.org/', seed_page) is None
|
||||
|
Loading…
x
Reference in New Issue
Block a user