deprecate current scope rule syntax and create new syntax with slightly different semantics (to be documented), and add parent_url_regex scope rule; unit test for scoping

2025-07-11 00:59:35 -04:00 · 2017-02-15 16:46:45 -08:00 · 2017-02-15 16:46:45 -08:00 · b409e49cfa
commit b409e49cfa
parent c0057e591a
3 changed files with 145 additions and 39 deletions
--- a/brozzler/site.py
+++ b/brozzler/site.py
@ -201,54 +201,109 @@ class Site(brozzler.BaseDictable):
        else:
            return False
-    def _scope_rule_applies(self, rule, url):
+    def _normalize_rule(self, rule):
        """
-        Examples of valid rules:
+        Normalizes a scope rule.
-        [
+
-            {
+        A scope rule is considered deprecated if it contains a `url_match` and
-                "domain": "monkey.org",
+        `value`. This method converts such scope rules to the preferred style
-                "url_match": "STRING_MATCH",
+        and returns the new rule. If `rule` is not a deprecated-style rule,
-                "value": "bar",
+        returns it unchanged.
-            },
+        """
-            {
+        if "url_match" in rule and "value" in rule:
-                "url_match": "SURT_MATCH",
+            new_rule = dict(rule)
-                "value": "http://(com,woop,)/fuh/",
+            url_match = new_rule.pop("url_match")
-            },
+            if url_match == "REGEX_MATCH":
-            {
+                new_rule["regex"] = new_rule.pop("value")
-                "domain": "bad.domain.com",
+            elif url_match == "SURT_MATCH":
-            },
+                new_rule["surt"] = new_rule.pop("value")
-        ]
+            elif url_match == "STRING_MATCH":
                new_rule["substring"] = new_rule.pop("value")
            else:
                raise Exception("invalid scope rule")
            return new_rule
        else:
            return rule
    def _scope_rule_applies(self, rule, url, parent_page=None):
        """
        Examples of valid rules expressed as yaml.
        - domain: bad.domain.com
        # preferred:
        - domain: monkey.org
          substring: bar
        # deprecated version of the same:
        - domain: monkey.org
          url_match: STRING_MATCH
          value: bar
        # preferred:
        - surt: http://(com,woop,)/fuh/
        # deprecated version of the same:
        - url_match: SURT_MATCH
          value: http://(com,woop,)/fuh/
        # preferred:
        - regex: ^https?://(www.)?youtube.com/watch?.*$
          parent_url_regex: ^https?://(www.)?youtube.com/user/.*$
        # deprecated version of the same:
        - url_match: REGEX_MATCH
          value: ^https?://(www.)?youtube.com/watch?.*$
          parent_url_regex: ^https?://(www.)?youtube.com/user/.*$
        """
        if not isinstance(url, Url):
            u = Url(url)
        else:
            u = url
-        if "domain" in rule and not u.matches_ip_or_domain(rule["domain"]):
+        try:
            rewl = self._normalize_rule(rule)
        except Exception as e:
            self.logger.error(
                    "problem normalizing scope rule %s - %s", rule, e)
            return False
-        if "url_match" in rule:
+
-            if rule["url_match"] == "STRING_MATCH":
+        invalid_keys = rewl.keys() - {
-                return u.url.find(rule["value"]) >= 0
+                "domain", "surt", "substring", "regex", "parent_url_regex"}
-            elif rule["url_match"] == "REGEX_MATCH":
+        if invalid_keys:
-                try:
+            self.logger.error(
-                    return re.fullmatch(rule["value"], u.url)
+                    "invalid keys %s in scope rule %s", invalid_keys, rule)
-                except Exception as e:
+            return False
-                    self.logger.warn(
+
-                            "caught exception matching against regex %s: %s",
+        if "domain" in rewl and not u.matches_ip_or_domain(rewl["domain"]):
-                            rule["value"], e)
+            return False
        if "surt" in rewl and not u.surt.startswith(rewl["surt"]):
            return False
        if "substring" in rewl and not u.url.find(rewl["substring"]) >= 0:
            return False
        if "regex" in rewl:
            try:
                if not re.fullmatch(rewl["regex"], u.url):
                    return False
-            elif rule["url_match"] == "SURT_MATCH":
+            except Exception as e:
-                return u.surt.startswith(rule["value"])
+                self.logger.error(
-            else:
+                        "caught exception matching against regex %s - %s",
-                self.logger.warn("invalid rule.url_match=%s", rule.url_match)
+                        rewl["regex"], e)
                return False
-        else:
+        if "parent_url_regex" in rewl:
-            if "domain" in rule:
+            if not parent_page:
                # we already know that it matches from earlier check
                return True
            else:
                self.logger.warn("unable to make sense of scope rule %s", rule)
                return False
            pu = Url(parent_page.url)
            try:
                if not re.fullmatch(rule["parent_url_regex"], pu.url):
                    return False
            except Exception as e:
                self.logger.error(
                        "caught exception matching against regex %s - %s",
                        rule["parent_url_regex"], e)
                return False
        return True
 class Page(brozzler.BaseDictable):
    def __init__(
--- a/setup.py
+++ b/setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
 setuptools.setup(
        name='brozzler',
-        version='1.1b9.dev188',
+        version='1.1b9.dev189',
        description='Distributed web crawling with browsers',
        url='https://github.com/internetarchive/brozzler',
        author='Noah Levitt',
--- a/tests/test_units.py
+++ b/tests/test_units.py
@ -2,7 +2,7 @@
 '''
 test_units.py - some unit tests for parts of brozzler amenable to that
-Copyright (C) 2016 Internet Archive
+Copyright (C) 2016-2017 Internet Archive
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@ -26,6 +26,7 @@ import brozzler.chrome
 import socket
 import logging
 import psutil
 import yaml
@pytest.fixture(scope='module')
 def httpd(request):
@ -73,3 +74,53 @@ def test_find_available_port():
    sock.close()
    assert x._find_available_port(9800) == 9800
 def test_scoping():
    test_scope = yaml.load('''
 max_hops: 100
 accepts:
 - url_match: REGEX_MATCH
  value: ^.*/audio_file/.*\.mp3$
 - url_match: SURT_MATCH
  value: http://(com,vimeocdn,
 - url_match: STRING_MATCH
  value: ec-media.soundcloud.com
 - regex: ^https?://twitter\.com.*$
 - substring: facebook.com
 - regex: ^https?://(www.)?youtube.com/watch?.*$
  parent_url_regex: ^https?://(www.)?youtube.com/user/.*$
 blocks:
 - domain: twitter.com
  url_match: REGEX_MATCH
  value: ^.*lang=(?!en).*$
 - bad_thing: bad rule should be ignored
 ''')
    site = brozzler.Site(
            seed='http://example.com/foo/bar?baz=quux#monkey', id=1,
            scope=test_scope)
    page = brozzler.Page(
            url='http://example.com/foo/bar?baz=quux#monkey', site_id=site.id)
    assert site.is_in_scope('http://example.com/foo/bar', page)
    assert not site.is_in_scope('http://example.com/foo/baz', page)
    assert not site.is_in_scope('http://foo.com/some.mp3', page)
    assert site.is_in_scope('http://foo.com/blah/audio_file/some.mp3', page)
    assert site.is_in_scope('http://a.b.vimeocdn.com/blahblah', page)
    assert not site.is_in_scope('https://a.b.vimeocdn.com/blahblah', page)
    assert site.is_in_scope('https://twitter.com/twit', page)
    assert site.is_in_scope('https://twitter.com/twit?lang=en', page)
    assert not site.is_in_scope('https://twitter.com/twit?lang=es', page)
    assert site.is_in_scope('https://www.facebook.com/whatevz', page)
    assert not site.is_in_scope(
            'https://www.youtube.com/watch?v=dUIn5OAPS5s', page)
    yt_user_page = brozzler.Page(
            url='https://www.youtube.com/user/SonoraSantaneraVEVO',
            site_id=site.id, hops_from_seed=10)
    assert site.is_in_scope(
            'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page)