deprecate current scope rule syntax and create new syntax with slightly different semantics (to be documented), and add parent_url_regex scope rule; unit test for scoping

This commit is contained in:
Noah Levitt 2017-02-15 16:46:45 -08:00
parent c0057e591a
commit b409e49cfa
3 changed files with 145 additions and 39 deletions

View File

@ -201,54 +201,109 @@ class Site(brozzler.BaseDictable):
else: else:
return False return False
def _normalize_rule(self, rule):
    """
    Normalizes a scope rule.

    A scope rule is considered deprecated if it contains a `url_match` and
    `value`. This method converts such scope rules to the preferred style
    and returns the new rule. If `rule` is not a deprecated-style rule,
    returns it unchanged (the very same object).

    The input is never modified; a converted rule is a shallow copy with
    `url_match`/`value` replaced by one of `regex`, `surt` or `substring`.

    Raises:
        ValueError: if `url_match` is not one of REGEX_MATCH, SURT_MATCH or
            STRING_MATCH.
    """
    if "url_match" not in rule or "value" not in rule:
        return rule

    # deprecated `url_match` type -> key that carries the value in the
    # preferred syntax
    preferred_keys = {
        "REGEX_MATCH": "regex",
        "SURT_MATCH": "surt",
        "STRING_MATCH": "substring",
    }

    new_rule = dict(rule)
    url_match = new_rule.pop("url_match")
    try:
        preferred_key = preferred_keys[url_match]
    except (KeyError, TypeError):
        # TypeError covers an unhashable url_match (e.g. a list from yaml)
        raise ValueError(
            "invalid scope rule: unknown url_match %r" % (url_match,)
        ) from None
    new_rule[preferred_key] = new_rule.pop("value")
    return new_rule


def _regex_fullmatch(self, regex, s):
    """
    Returns True if `regex` matches the whole of `s`.

    Any exception raised while matching (for instance an invalid pattern)
    is logged and treated as "no match", so that one bad rule cannot break
    scoping altogether.
    """
    try:
        return bool(re.fullmatch(regex, s))
    except Exception as e:
        self.logger.error(
            "caught exception matching against regex %s - %s", regex, e)
        return False


def _scope_rule_applies(self, rule, url, parent_page=None):
    """
    Returns True if the scope rule `rule` applies to `url`.

    Every condition present in the rule has to hold for the rule to apply.
    A rule that cannot be normalized, or that contains a key other than
    `domain`, `surt`, `substring`, `regex` and `parent_url_regex`, is
    logged as an error and never applies. A rule with `parent_url_regex`
    never applies when no `parent_page` is given. `regex` and
    `parent_url_regex` have to match the whole url.

    Args:
        rule: dict, in either the preferred or the deprecated syntax
        url: `Url` instance, or a value `Url()` accepts
        parent_page: page on which `url` was found; only its `url`
            attribute is used, to evaluate `parent_url_regex`

    Examples of valid rules expressed as yaml.

    - domain: bad.domain.com

    # preferred:
    - domain: monkey.org
      substring: bar

    # deprecated version of the same:
    - domain: monkey.org
      url_match: STRING_MATCH
      value: bar

    # preferred:
    - surt: http://(com,woop,)/fuh/

    # deprecated version of the same:
    - url_match: SURT_MATCH
      value: http://(com,woop,)/fuh/

    # preferred:
    - regex: ^https?://(www.)?youtube.com/watch?.*$
      parent_url_regex: ^https?://(www.)?youtube.com/user/.*$

    # deprecated version of the same:
    - url_match: REGEX_MATCH
      value: ^https?://(www.)?youtube.com/watch?.*$
      parent_url_regex: ^https?://(www.)?youtube.com/user/.*$
    """
    u = url if isinstance(url, Url) else Url(url)

    try:
        normalized = self._normalize_rule(rule)
    except Exception as e:
        self.logger.error(
            "problem normalizing scope rule %s - %s", rule, e)
        return False

    invalid_keys = normalized.keys() - {
        "domain", "surt", "substring", "regex", "parent_url_regex"}
    if invalid_keys:
        self.logger.error(
            "invalid keys %s in scope rule %s", invalid_keys, rule)
        return False

    if ("domain" in normalized
            and not u.matches_ip_or_domain(normalized["domain"])):
        return False
    if "surt" in normalized and not u.surt.startswith(normalized["surt"]):
        return False
    if "substring" in normalized and normalized["substring"] not in u.url:
        return False
    if ("regex" in normalized
            and not self._regex_fullmatch(normalized["regex"], u.url)):
        return False
    if "parent_url_regex" in normalized:
        # without a parent page there is nothing to match against
        if not parent_page:
            return False
        parent_url = Url(parent_page.url)
        if not self._regex_fullmatch(
                normalized["parent_url_regex"], parent_url.url):
            return False

    return True
class Page(brozzler.BaseDictable): class Page(brozzler.BaseDictable):
def __init__( def __init__(

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup( setuptools.setup(
name='brozzler', name='brozzler',
version='1.1b9.dev188', version='1.1b9.dev189',
description='Distributed web crawling with browsers', description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler', url='https://github.com/internetarchive/brozzler',
author='Noah Levitt', author='Noah Levitt',

View File

@ -2,7 +2,7 @@
''' '''
test_units.py - some unit tests for parts of brozzler amenable to that test_units.py - some unit tests for parts of brozzler amenable to that
Copyright (C) 2016 Internet Archive Copyright (C) 2016-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
@ -26,6 +26,7 @@ import brozzler.chrome
import socket import socket
import logging import logging
import psutil import psutil
import yaml
@pytest.fixture(scope='module') @pytest.fixture(scope='module')
def httpd(request): def httpd(request):
@ -73,3 +74,53 @@ def test_find_available_port():
sock.close() sock.close()
assert x._find_available_port(9800) == 9800 assert x._find_available_port(9800) == 9800
def test_scoping():
    """
    Checks `Site.is_in_scope()` against a scope that mixes deprecated
    (`url_match`/`value`) and preferred (`regex`, `substring`, ...) rules,
    including a `parent_url_regex` rule and a rule with an invalid key.
    """
    # Raw string, so that the backslashes in the regexes reach the yaml
    # parser untouched. safe_load because plain yaml.load() without an
    # explicit Loader is deprecated and rejected by newer PyYAML releases.
    test_scope = yaml.safe_load(r'''
max_hops: 100
accepts:
- url_match: REGEX_MATCH
  value: ^.*/audio_file/.*\.mp3$
- url_match: SURT_MATCH
  value: http://(com,vimeocdn,
- url_match: STRING_MATCH
  value: ec-media.soundcloud.com
- regex: ^https?://twitter\.com.*$
- substring: facebook.com
- regex: ^https?://(www.)?youtube.com/watch?.*$
  parent_url_regex: ^https?://(www.)?youtube.com/user/.*$
blocks:
- domain: twitter.com
  url_match: REGEX_MATCH
  value: ^.*lang=(?!en).*$
- bad_thing: bad rule should be ignored
''')

    site = brozzler.Site(
        seed='http://example.com/foo/bar?baz=quux#monkey', id=1,
        scope=test_scope)
    page = brozzler.Page(
        url='http://example.com/foo/bar?baz=quux#monkey', site_id=site.id)

    # urls related to the seed
    assert site.is_in_scope('http://example.com/foo/bar', page)
    assert not site.is_in_scope('http://example.com/foo/baz', page)

    # deprecated-style regex accept rule
    assert not site.is_in_scope('http://foo.com/some.mp3', page)
    assert site.is_in_scope('http://foo.com/blah/audio_file/some.mp3', page)

    # deprecated-style surt accept rule; the scheme is part of the surt
    assert site.is_in_scope('http://a.b.vimeocdn.com/blahblah', page)
    assert not site.is_in_scope('https://a.b.vimeocdn.com/blahblah', page)

    # accepted by regex unless the block rule (domain + regex) applies
    assert site.is_in_scope('https://twitter.com/twit', page)
    assert site.is_in_scope('https://twitter.com/twit?lang=en', page)
    assert not site.is_in_scope('https://twitter.com/twit?lang=es', page)

    # preferred-style substring accept rule
    assert site.is_in_scope('https://www.facebook.com/whatevz', page)

    # the youtube watch rule applies only when the parent page is a
    # youtube user page
    assert not site.is_in_scope(
        'https://www.youtube.com/watch?v=dUIn5OAPS5s', page)

    yt_user_page = brozzler.Page(
        url='https://www.youtube.com/user/SonoraSantaneraVEVO',
        site_id=site.id, hops_from_seed=10)
    assert site.is_in_scope(
        'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page)