From b409e49cfaed54d9cfe5643f8eede78fdebd3566 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 15 Feb 2017 16:46:45 -0800 Subject: [PATCH] deprecate current scope rule syntax and create new syntax with slightly different semantics (to be documented), and add parent_url_regex scope rule; unit test for scoping --- brozzler/site.py | 129 +++++++++++++++++++++++++++++++------------- setup.py | 2 +- tests/test_units.py | 53 +++++++++++++++++- 3 files changed, 145 insertions(+), 39 deletions(-) diff --git a/brozzler/site.py b/brozzler/site.py index 17f8983..192a4fe 100644 --- a/brozzler/site.py +++ b/brozzler/site.py @@ -201,54 +201,109 @@ class Site(brozzler.BaseDictable): else: return False - def _scope_rule_applies(self, rule, url): + def _normalize_rule(self, rule): """ - Examples of valid rules: - [ - { - "domain": "monkey.org", - "url_match": "STRING_MATCH", - "value": "bar", - }, - { - "url_match": "SURT_MATCH", - "value": "http://(com,woop,)/fuh/", - }, - { - "domain": "bad.domain.com", - }, - ] + Normalizes a scope rule. + + A scope rule is considered deprecated if it contains a `url_match` and + `value`. This method converts such scope rules to the preferred style + and returns the new rule. If `rule` is not a deprecated-style rule, + returns it unchanged. + """ + if "url_match" in rule and "value" in rule: + new_rule = dict(rule) + url_match = new_rule.pop("url_match") + if url_match == "REGEX_MATCH": + new_rule["regex"] = new_rule.pop("value") + elif url_match == "SURT_MATCH": + new_rule["surt"] = new_rule.pop("value") + elif url_match == "STRING_MATCH": + new_rule["substring"] = new_rule.pop("value") + else: + raise Exception("invalid scope rule") + return new_rule + else: + return rule + + def _scope_rule_applies(self, rule, url, parent_page=None): + """ + Examples of valid rules expressed as yaml. + + - domain: bad.domain.com + + # preferred: + - domain: monkey.org + substring: bar + + # deprecated version of the same: + - domain: monkey.org + url_match: STRING_MATCH + value: bar + + # preferred: + - surt: http://(com,woop,)/fuh/ + + # deprecated version of the same: + - url_match: SURT_MATCH + value: http://(com,woop,)/fuh/ + + # preferred: + - regex: ^https?://(www.)?youtube.com/watch?.*$ + parent_url_regex: ^https?://(www.)?youtube.com/user/.*$ + + # deprecated version of the same: + - url_match: REGEX_MATCH + value: ^https?://(www.)?youtube.com/watch?.*$ + parent_url_regex: ^https?://(www.)?youtube.com/user/.*$ """ if not isinstance(url, Url): u = Url(url) else: u = url - if "domain" in rule and not u.matches_ip_or_domain(rule["domain"]): + try: + rewl = self._normalize_rule(rule) + except Exception as e: + self.logger.error( + "problem normalizing scope rule %s - %s", rule, e) return False - if "url_match" in rule: - if rule["url_match"] == "STRING_MATCH": - return u.url.find(rule["value"]) >= 0 - elif rule["url_match"] == "REGEX_MATCH": - try: - return re.fullmatch(rule["value"], u.url) - except Exception as e: - self.logger.warn( - "caught exception matching against regex %s: %s", - rule["value"], e) + + invalid_keys = rewl.keys() - { + "domain", "surt", "substring", "regex", "parent_url_regex"} + if invalid_keys: + self.logger.error( + "invalid keys %s in scope rule %s", invalid_keys, rule) + return False + + if "domain" in rewl and not u.matches_ip_or_domain(rewl["domain"]): + return False + if "surt" in rewl and not u.surt.startswith(rewl["surt"]): + return False + if "substring" in rewl and not u.url.find(rewl["substring"]) >= 0: + return False + if "regex" in rewl: + try: + if not re.fullmatch(rewl["regex"], u.url): return False - elif rule["url_match"] == "SURT_MATCH": - return u.surt.startswith(rule["value"]) - else: - self.logger.warn("invalid rule.url_match=%s", rule.url_match) + except Exception as e: + self.logger.error( + "caught exception matching against regex %s - %s", + rewl["regex"], e) return False - else: - if "domain" in rule: - # we already know that it matches from earlier check - return True - else: - self.logger.warn("unable to make sense of scope rule %s", rule) + if "parent_url_regex" in rewl: + if not parent_page: return False + pu = Url(parent_page.url) + try: + if not re.fullmatch(rule["parent_url_regex"], pu.url): + return False + except Exception as e: + self.logger.error( + "caught exception matching against regex %s - %s", + rule["parent_url_regex"], e) + return False + + return True class Page(brozzler.BaseDictable): def __init__( diff --git a/setup.py b/setup.py index f8e7dbc..9c0d2b5 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b9.dev188', + version='1.1b9.dev189', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', diff --git a/tests/test_units.py b/tests/test_units.py index ca47eb0..fc24a99 100644 --- a/tests/test_units.py +++ b/tests/test_units.py @@ -2,7 +2,7 @@ ''' test_units.py - some unit tests for parts of brozzler amenable to that -Copyright (C) 2016 Internet Archive +Copyright (C) 2016-2017 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -26,6 +26,7 @@ import brozzler.chrome import socket import logging import psutil +import yaml @pytest.fixture(scope='module') def httpd(request): @@ -73,3 +74,53 @@ def test_find_available_port(): sock.close() assert x._find_available_port(9800) == 9800 +def test_scoping(): + test_scope = yaml.load(''' +max_hops: 100 +accepts: +- url_match: REGEX_MATCH + value: ^.*/audio_file/.*\.mp3$ +- url_match: SURT_MATCH + value: http://(com,vimeocdn, +- url_match: STRING_MATCH + value: ec-media.soundcloud.com +- regex: ^https?://twitter\.com.*$ +- substring: facebook.com +- regex: ^https?://(www.)?youtube.com/watch?.*$ + parent_url_regex: ^https?://(www.)?youtube.com/user/.*$ +blocks: +- domain: twitter.com + url_match: REGEX_MATCH + value: ^.*lang=(?!en).*$ +- bad_thing: bad rule should be ignored +''') + + site = brozzler.Site( + seed='http://example.com/foo/bar?baz=quux#monkey', id=1, + scope=test_scope) + page = brozzler.Page( + url='http://example.com/foo/bar?baz=quux#monkey', site_id=site.id) + + assert site.is_in_scope('http://example.com/foo/bar', page) + assert not site.is_in_scope('http://example.com/foo/baz', page) + + assert not site.is_in_scope('http://foo.com/some.mp3', page) + assert site.is_in_scope('http://foo.com/blah/audio_file/some.mp3', page) + + assert site.is_in_scope('http://a.b.vimeocdn.com/blahblah', page) + assert not site.is_in_scope('https://a.b.vimeocdn.com/blahblah', page) + + assert site.is_in_scope('https://twitter.com/twit', page) + assert site.is_in_scope('https://twitter.com/twit?lang=en', page) + assert not site.is_in_scope('https://twitter.com/twit?lang=es', page) + + assert site.is_in_scope('https://www.facebook.com/whatevz', page) + + assert not site.is_in_scope( + 'https://www.youtube.com/watch?v=dUIn5OAPS5s', page) + yt_user_page = brozzler.Page( + url='https://www.youtube.com/user/SonoraSantaneraVEVO', + site_id=site.id, hops_from_seed=10) + assert site.is_in_scope( + 'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page) +