From b409e49cfaed54d9cfe5643f8eede78fdebd3566 Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Wed, 15 Feb 2017 16:46:45 -0800
Subject: [PATCH] deprecate current scope rule syntax and create new syntax
 with slightly different semantics (to be documented), and add
 parent_url_regex scope rule; unit test for scoping

---
 brozzler/site.py    | 129 +++++++++++++++++++++++++++++++-------------
 setup.py            |   2 +-
 tests/test_units.py |  53 +++++++++++++++++-
 3 files changed, 145 insertions(+), 39 deletions(-)

diff --git a/brozzler/site.py b/brozzler/site.py
index 17f8983..192a4fe 100644
--- a/brozzler/site.py
+++ b/brozzler/site.py
@@ -201,54 +201,109 @@ class Site(brozzler.BaseDictable):
         else:
             return False
 
-    def _scope_rule_applies(self, rule, url):
+    def _normalize_rule(self, rule):
         """
-        Examples of valid rules:
-        [
-            {
-                "domain": "monkey.org",
-                "url_match": "STRING_MATCH",
-                "value": "bar",
-            },
-            {
-                "url_match": "SURT_MATCH",
-                "value": "http://(com,woop,)/fuh/",
-            },
-            {
-                "domain": "bad.domain.com",
-            },
-        ]
+        Normalizes a scope rule.
+
+        A scope rule is deprecated if it contains both `url_match` and `value`.
+        Such a rule is converted to the preferred style and returned as a new
+        dict (`rule` itself is not modified); any other rule is returned
+        unchanged. Raises ValueError if `url_match` is not a known match type.
+        """
+        if "url_match" in rule and "value" in rule:
+            new_rule = dict(rule)
+            url_match = new_rule.pop("url_match")
+            if url_match == "REGEX_MATCH":
+                new_rule["regex"] = new_rule.pop("value")
+            elif url_match == "SURT_MATCH":
+                new_rule["surt"] = new_rule.pop("value")
+            elif url_match == "STRING_MATCH":
+                new_rule["substring"] = new_rule.pop("value")
+            else:
+                raise ValueError("invalid scope rule")
+            return new_rule
+        else:
+            return rule
+
+    def _scope_rule_applies(self, rule, url, parent_page=None):
+        """
+        Returns True if `rule` is valid and all its conditions match. Examples:
+
+        - domain: bad.domain.com
+
+        # preferred:
+        - domain: monkey.org
+          substring: bar
+
+        # deprecated version of the same:
+        - domain: monkey.org
+          url_match: STRING_MATCH
+          value: bar
+
+        # preferred:
+        - surt: http://(com,woop,)/fuh/
+
+        # deprecated version of the same:
+        - url_match: SURT_MATCH
+          value: http://(com,woop,)/fuh/
+
+        # preferred:
+        - regex: ^https?://(www.)?youtube.com/watch?.*$
+          parent_url_regex: ^https?://(www.)?youtube.com/user/.*$
+
+        # deprecated version of the same:
+        - url_match: REGEX_MATCH
+          value: ^https?://(www.)?youtube.com/watch?.*$
+          parent_url_regex: ^https?://(www.)?youtube.com/user/.*$
         """
         if not isinstance(url, Url):
             u = Url(url)
         else:
             u = url
 
-        if "domain" in rule and not u.matches_ip_or_domain(rule["domain"]):
+        try:
+            rewl = self._normalize_rule(rule)  # deprecated -> preferred keys
+        except Exception as e:
+            self.logger.error(
+                    "problem normalizing scope rule %s - %s", rule, e)
             return False
-        if "url_match" in rule:
-            if rule["url_match"] == "STRING_MATCH":
-                return u.url.find(rule["value"]) >= 0
-            elif rule["url_match"] == "REGEX_MATCH":
-                try:
-                    return re.fullmatch(rule["value"], u.url)
-                except Exception as e:
-                    self.logger.warn(
-                            "caught exception matching against regex %s: %s",
-                            rule["value"], e)
+
+        invalid_keys = rewl.keys() - {
+                "domain", "surt", "substring", "regex", "parent_url_regex"}
+        if invalid_keys:
+            self.logger.error(
+                    "invalid keys %s in scope rule %s", invalid_keys, rule)
+            return False
+
+        if "domain" in rewl and not u.matches_ip_or_domain(rewl["domain"]):
+            return False
+        if "surt" in rewl and not u.surt.startswith(rewl["surt"]):
+            return False
+        if "substring" in rewl and rewl["substring"] not in u.url:
+            return False
+        if "regex" in rewl:
+            try:
+                if not re.fullmatch(rewl["regex"], u.url):
                     return False
-            elif rule["url_match"] == "SURT_MATCH":
-                return u.surt.startswith(rule["value"])
-            else:
-                self.logger.warn("invalid rule.url_match=%s", rule.url_match)
+            except Exception as e:
+                self.logger.error(
+                        "caught exception matching against regex %s - %s",
+                        rewl["regex"], e)
                 return False
-        else:
-            if "domain" in rule:
-                # we already know that it matches from earlier check
-                return True
-            else:
-                self.logger.warn("unable to make sense of scope rule %s", rule)
+        if "parent_url_regex" in rewl:
+            if not parent_page:  # nothing to test the parent regex against
                 return False
+            pu = Url(parent_page.url)
+            try:
+                if not re.fullmatch(rewl["parent_url_regex"], pu.url):
+                    return False
+            except Exception as e:
+                self.logger.error(
+                        "caught exception matching against regex %s - %s",
+                        rewl["parent_url_regex"], e)
+                return False
+
+        return True
 
 class Page(brozzler.BaseDictable):
     def __init__(
diff --git a/setup.py b/setup.py
index f8e7dbc..9c0d2b5 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b9.dev188',
+        version='1.1b9.dev189',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
diff --git a/tests/test_units.py b/tests/test_units.py
index ca47eb0..fc24a99 100644
--- a/tests/test_units.py
+++ b/tests/test_units.py
@@ -2,7 +2,7 @@
 '''
 test_units.py - some unit tests for parts of brozzler amenable to that
 
-Copyright (C) 2016 Internet Archive
+Copyright (C) 2016-2017 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -26,6 +26,7 @@ import brozzler.chrome
 import socket
 import logging
 import psutil
+import yaml
 
 @pytest.fixture(scope='module')
 def httpd(request):
@@ -73,3 +74,53 @@ def test_find_available_port():
     sock.close()
     assert x._find_available_port(9800) == 9800
 
+def test_scoping():
+    '''Old- and new-style scope rules accept and block the expected urls.'''
+    # raw string keeps the regex backslashes literal; safe_load needs no Loader
+    test_scope = yaml.safe_load(r'''
+max_hops: 100
+accepts:
+- url_match: REGEX_MATCH
+  value: ^.*/audio_file/.*\.mp3$
+- url_match: SURT_MATCH
+  value: http://(com,vimeocdn,
+- url_match: STRING_MATCH
+  value: ec-media.soundcloud.com
+- regex: ^https?://twitter\.com.*$
+- substring: facebook.com
+- regex: ^https?://(www.)?youtube.com/watch?.*$
+  parent_url_regex: ^https?://(www.)?youtube.com/user/.*$
+blocks:
+- domain: twitter.com
+  url_match: REGEX_MATCH
+  value: ^.*lang=(?!en).*$
+- bad_thing: bad rule should be ignored
+''')
+
+    seed = 'http://example.com/foo/bar?baz=quux#monkey'
+    site = brozzler.Site(seed=seed, id=1, scope=test_scope)
+    page = brozzler.Page(url=seed, site_id=site.id)
+
+    assert site.is_in_scope('http://example.com/foo/bar', page)
+    assert not site.is_in_scope('http://example.com/foo/baz', page)
+
+    assert not site.is_in_scope('http://foo.com/some.mp3', page)
+    assert site.is_in_scope('http://foo.com/blah/audio_file/some.mp3', page)
+
+    assert site.is_in_scope('http://a.b.vimeocdn.com/blahblah', page)
+    assert not site.is_in_scope('https://a.b.vimeocdn.com/blahblah', page)
+
+    assert site.is_in_scope('https://twitter.com/twit', page)
+    assert site.is_in_scope('https://twitter.com/twit?lang=en', page)
+    assert not site.is_in_scope('https://twitter.com/twit?lang=es', page)
+
+    assert site.is_in_scope('https://www.facebook.com/whatevz', page)
+
+    assert not site.is_in_scope(
+            'https://www.youtube.com/watch?v=dUIn5OAPS5s', page)
+    yt_user_page = brozzler.Page(
+            url='https://www.youtube.com/user/SonoraSantaneraVEVO',
+            site_id=site.id, hops_from_seed=10)
+    assert site.is_in_scope(
+            'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page)
+