mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-19 12:24:20 -04:00
renaming scope rule "host" to "domain" to make it less confusing, since rules apply to subdomains as well
This commit is contained in:
parent
e64a4d6985
commit
77c800f6a2
2 changed files with 22 additions and 22 deletions
|
@ -1,20 +1,20 @@
|
||||||
#
|
'''
|
||||||
# brozzler/site.py - classes representing sites and pages
|
brozzler/site.py - classes representing sites and pages
|
||||||
#
|
|
||||||
# Copyright (C) 2014-2016 Internet Archive
|
Copyright (C) 2014-2016 Internet Archive
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
# you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
# You may obtain a copy of the License at
|
You may obtain a copy of the License at
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
Unless required by applicable law or agreed to in writing, software
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
limitations under the License.
|
||||||
#
|
'''
|
||||||
|
|
||||||
import surt
|
import surt
|
||||||
import json
|
import json
|
||||||
|
@ -170,7 +170,7 @@ class Site(brozzler.BaseDictable):
|
||||||
Examples of valid rules:
|
Examples of valid rules:
|
||||||
[
|
[
|
||||||
{
|
{
|
||||||
"host": "monkey.org",
|
"domain": "monkey.org",
|
||||||
"url_match": "STRING_MATCH",
|
"url_match": "STRING_MATCH",
|
||||||
"value": "bar",
|
"value": "bar",
|
||||||
},
|
},
|
||||||
|
@ -179,7 +179,7 @@ class Site(brozzler.BaseDictable):
|
||||||
"value": "http://(com,woop,)/fuh/",
|
"value": "http://(com,woop,)/fuh/",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"host": "badhost.com",
|
"domain": "bad.domain.com",
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
"""
|
"""
|
||||||
|
@ -188,7 +188,7 @@ class Site(brozzler.BaseDictable):
|
||||||
else:
|
else:
|
||||||
u = url
|
u = url
|
||||||
|
|
||||||
if "host" in rule and not u.matches_ip_or_domain(rule["host"]):
|
if "domain" in rule and not u.matches_ip_or_domain(rule["domain"]):
|
||||||
return False
|
return False
|
||||||
if "url_match" in rule:
|
if "url_match" in rule:
|
||||||
if rule["url_match"] == "STRING_MATCH":
|
if rule["url_match"] == "STRING_MATCH":
|
||||||
|
@ -207,7 +207,7 @@ class Site(brozzler.BaseDictable):
|
||||||
self.logger.warn("invalid rule.url_match=%s", rule.url_match)
|
self.logger.warn("invalid rule.url_match=%s", rule.url_match)
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
if "host" in rule:
|
if "domain" in rule:
|
||||||
# we already know that it matches from earlier check
|
# we already know that it matches from earlier check
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -21,7 +21,7 @@ import setuptools
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1.dev27',
|
version='1.1.dev28',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue