mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
Deprecate the current scope rule syntax and introduce a new syntax with slightly different semantics (to be documented); add the parent_url_regex scope rule; add a unit test for scoping
This commit is contained in:
parent
c0057e591a
commit
b409e49cfa
127
brozzler/site.py
127
brozzler/site.py
@ -201,54 +201,109 @@ class Site(brozzler.BaseDictable):
|
|||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _scope_rule_applies(self, rule, url):
|
def _normalize_rule(self, rule):
|
||||||
"""
|
"""
|
||||||
Examples of valid rules:
|
Normalizes a scope rule.
|
||||||
[
|
|
||||||
{
|
A scope rule is considered deprecated if it contains a `url_match` and
|
||||||
"domain": "monkey.org",
|
`value`. This method converts such scope rules to the preferred style
|
||||||
"url_match": "STRING_MATCH",
|
and returns the new rule. If `rule` is not a deprecated-style rule,
|
||||||
"value": "bar",
|
returns it unchanged.
|
||||||
},
|
"""
|
||||||
{
|
if "url_match" in rule and "value" in rule:
|
||||||
"url_match": "SURT_MATCH",
|
new_rule = dict(rule)
|
||||||
"value": "http://(com,woop,)/fuh/",
|
url_match = new_rule.pop("url_match")
|
||||||
},
|
if url_match == "REGEX_MATCH":
|
||||||
{
|
new_rule["regex"] = new_rule.pop("value")
|
||||||
"domain": "bad.domain.com",
|
elif url_match == "SURT_MATCH":
|
||||||
},
|
new_rule["surt"] = new_rule.pop("value")
|
||||||
]
|
elif url_match == "STRING_MATCH":
|
||||||
|
new_rule["substring"] = new_rule.pop("value")
|
||||||
|
else:
|
||||||
|
raise Exception("invalid scope rule")
|
||||||
|
return new_rule
|
||||||
|
else:
|
||||||
|
return rule
|
||||||
|
|
||||||
|
def _scope_rule_applies(self, rule, url, parent_page=None):
|
||||||
|
"""
|
||||||
|
Examples of valid rules expressed as yaml.
|
||||||
|
|
||||||
|
- domain: bad.domain.com
|
||||||
|
|
||||||
|
# preferred:
|
||||||
|
- domain: monkey.org
|
||||||
|
substring: bar
|
||||||
|
|
||||||
|
# deprecated version of the same:
|
||||||
|
- domain: monkey.org
|
||||||
|
url_match: STRING_MATCH
|
||||||
|
value: bar
|
||||||
|
|
||||||
|
# preferred:
|
||||||
|
- surt: http://(com,woop,)/fuh/
|
||||||
|
|
||||||
|
# deprecated version of the same:
|
||||||
|
- url_match: SURT_MATCH
|
||||||
|
value: http://(com,woop,)/fuh/
|
||||||
|
|
||||||
|
# preferred:
|
||||||
|
- regex: ^https?://(www.)?youtube.com/watch?.*$
|
||||||
|
parent_url_regex: ^https?://(www.)?youtube.com/user/.*$
|
||||||
|
|
||||||
|
# deprecated version of the same:
|
||||||
|
- url_match: REGEX_MATCH
|
||||||
|
value: ^https?://(www.)?youtube.com/watch?.*$
|
||||||
|
parent_url_regex: ^https?://(www.)?youtube.com/user/.*$
|
||||||
"""
|
"""
|
||||||
if not isinstance(url, Url):
|
if not isinstance(url, Url):
|
||||||
u = Url(url)
|
u = Url(url)
|
||||||
else:
|
else:
|
||||||
u = url
|
u = url
|
||||||
|
|
||||||
if "domain" in rule and not u.matches_ip_or_domain(rule["domain"]):
|
|
||||||
return False
|
|
||||||
if "url_match" in rule:
|
|
||||||
if rule["url_match"] == "STRING_MATCH":
|
|
||||||
return u.url.find(rule["value"]) >= 0
|
|
||||||
elif rule["url_match"] == "REGEX_MATCH":
|
|
||||||
try:
|
try:
|
||||||
return re.fullmatch(rule["value"], u.url)
|
rewl = self._normalize_rule(rule)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.warn(
|
self.logger.error(
|
||||||
"caught exception matching against regex %s: %s",
|
"problem normalizing scope rule %s - %s", rule, e)
|
||||||
rule["value"], e)
|
|
||||||
return False
|
return False
|
||||||
elif rule["url_match"] == "SURT_MATCH":
|
|
||||||
return u.surt.startswith(rule["value"])
|
invalid_keys = rewl.keys() - {
|
||||||
else:
|
"domain", "surt", "substring", "regex", "parent_url_regex"}
|
||||||
self.logger.warn("invalid rule.url_match=%s", rule.url_match)
|
if invalid_keys:
|
||||||
|
self.logger.error(
|
||||||
|
"invalid keys %s in scope rule %s", invalid_keys, rule)
|
||||||
return False
|
return False
|
||||||
else:
|
|
||||||
if "domain" in rule:
|
if "domain" in rewl and not u.matches_ip_or_domain(rewl["domain"]):
|
||||||
# we already know that it matches from earlier check
|
return False
|
||||||
|
if "surt" in rewl and not u.surt.startswith(rewl["surt"]):
|
||||||
|
return False
|
||||||
|
if "substring" in rewl and not u.url.find(rewl["substring"]) >= 0:
|
||||||
|
return False
|
||||||
|
if "regex" in rewl:
|
||||||
|
try:
|
||||||
|
if not re.fullmatch(rewl["regex"], u.url):
|
||||||
|
return False
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(
|
||||||
|
"caught exception matching against regex %s - %s",
|
||||||
|
rewl["regex"], e)
|
||||||
|
return False
|
||||||
|
if "parent_url_regex" in rewl:
|
||||||
|
if not parent_page:
|
||||||
|
return False
|
||||||
|
pu = Url(parent_page.url)
|
||||||
|
try:
|
||||||
|
if not re.fullmatch(rule["parent_url_regex"], pu.url):
|
||||||
|
return False
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(
|
||||||
|
"caught exception matching against regex %s - %s",
|
||||||
|
rule["parent_url_regex"], e)
|
||||||
|
return False
|
||||||
|
|
||||||
return True
|
return True
|
||||||
else:
|
|
||||||
self.logger.warn("unable to make sense of scope rule %s", rule)
|
|
||||||
return False
|
|
||||||
|
|
||||||
class Page(brozzler.BaseDictable):
|
class Page(brozzler.BaseDictable):
|
||||||
def __init__(
|
def __init__(
|
||||||
|
2
setup.py
2
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b9.dev188',
|
version='1.1b9.dev189',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
'''
|
'''
|
||||||
test_units.py - some unit tests for parts of brozzler amenable to that
|
test_units.py - some unit tests for parts of brozzler amenable to that
|
||||||
|
|
||||||
Copyright (C) 2016 Internet Archive
|
Copyright (C) 2016-2017 Internet Archive
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
@ -26,6 +26,7 @@ import brozzler.chrome
|
|||||||
import socket
|
import socket
|
||||||
import logging
|
import logging
|
||||||
import psutil
|
import psutil
|
||||||
|
import yaml
|
||||||
|
|
||||||
@pytest.fixture(scope='module')
|
@pytest.fixture(scope='module')
|
||||||
def httpd(request):
|
def httpd(request):
|
||||||
@ -73,3 +74,53 @@ def test_find_available_port():
|
|||||||
sock.close()
|
sock.close()
|
||||||
assert x._find_available_port(9800) == 9800
|
assert x._find_available_port(9800) == 9800
|
||||||
|
|
||||||
|
def test_scoping():
|
||||||
|
test_scope = yaml.load('''
|
||||||
|
max_hops: 100
|
||||||
|
accepts:
|
||||||
|
- url_match: REGEX_MATCH
|
||||||
|
value: ^.*/audio_file/.*\.mp3$
|
||||||
|
- url_match: SURT_MATCH
|
||||||
|
value: http://(com,vimeocdn,
|
||||||
|
- url_match: STRING_MATCH
|
||||||
|
value: ec-media.soundcloud.com
|
||||||
|
- regex: ^https?://twitter\.com.*$
|
||||||
|
- substring: facebook.com
|
||||||
|
- regex: ^https?://(www.)?youtube.com/watch?.*$
|
||||||
|
parent_url_regex: ^https?://(www.)?youtube.com/user/.*$
|
||||||
|
blocks:
|
||||||
|
- domain: twitter.com
|
||||||
|
url_match: REGEX_MATCH
|
||||||
|
value: ^.*lang=(?!en).*$
|
||||||
|
- bad_thing: bad rule should be ignored
|
||||||
|
''')
|
||||||
|
|
||||||
|
site = brozzler.Site(
|
||||||
|
seed='http://example.com/foo/bar?baz=quux#monkey', id=1,
|
||||||
|
scope=test_scope)
|
||||||
|
page = brozzler.Page(
|
||||||
|
url='http://example.com/foo/bar?baz=quux#monkey', site_id=site.id)
|
||||||
|
|
||||||
|
assert site.is_in_scope('http://example.com/foo/bar', page)
|
||||||
|
assert not site.is_in_scope('http://example.com/foo/baz', page)
|
||||||
|
|
||||||
|
assert not site.is_in_scope('http://foo.com/some.mp3', page)
|
||||||
|
assert site.is_in_scope('http://foo.com/blah/audio_file/some.mp3', page)
|
||||||
|
|
||||||
|
assert site.is_in_scope('http://a.b.vimeocdn.com/blahblah', page)
|
||||||
|
assert not site.is_in_scope('https://a.b.vimeocdn.com/blahblah', page)
|
||||||
|
|
||||||
|
assert site.is_in_scope('https://twitter.com/twit', page)
|
||||||
|
assert site.is_in_scope('https://twitter.com/twit?lang=en', page)
|
||||||
|
assert not site.is_in_scope('https://twitter.com/twit?lang=es', page)
|
||||||
|
|
||||||
|
assert site.is_in_scope('https://www.facebook.com/whatevz', page)
|
||||||
|
|
||||||
|
assert not site.is_in_scope(
|
||||||
|
'https://www.youtube.com/watch?v=dUIn5OAPS5s', page)
|
||||||
|
yt_user_page = brozzler.Page(
|
||||||
|
url='https://www.youtube.com/user/SonoraSantaneraVEVO',
|
||||||
|
site_id=site.id, hops_from_seed=10)
|
||||||
|
assert site.is_in_scope(
|
||||||
|
'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user