From 0af00bb3d5cad84bb52f68ffea4fc297c4f23a82 Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Tue, 3 May 2016 20:52:22 +0000
Subject: [PATCH] support for host rules in outlink scoping

---
 brozzler/frontier.py |   6 +-
 brozzler/site.py     | 155 +++++++++++++++++++++++++++++++------------
 brozzler/worker.py   |   1 +
 setup.py             |   2 +-
 4 files changed, 119 insertions(+), 45 deletions(-)

diff --git a/brozzler/frontier.py b/brozzler/frontier.py
index 76840db..72934a8 100644
--- a/brozzler/frontier.py
+++ b/brozzler/frontier.py
@@ -258,10 +258,10 @@ class RethinkDbFrontier:
     def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
         counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
         for url in outlinks or []:
-            surt_ = brozzler.site.to_surt(url)
-            if site.is_in_scope(url, surt_=surt_, parent_page=parent_page):
+            u = brozzler.site.Url(url)
+            if site.is_in_scope(u, parent_page=parent_page):
                 if brozzler.is_permitted_by_robots(site, url):
-                    if not surt_.startswith(site.scope["surt"]):
+                    if not u.surt.startswith(site.scope["surt"]):
                         hops_off_surt = parent_page.hops_off_surt + 1
                     else:
                         hops_off_surt = 0
diff --git a/brozzler/site.py b/brozzler/site.py
index 74e692b..d774335 100644
--- a/brozzler/site.py
+++ b/brozzler/site.py
@@ -25,8 +25,62 @@ import time
 import rethinkstuff
 import datetime
 import re
+import ipaddress
 
-_EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=rethinkstuff.UTC)
+_EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(
+        tzinfo=rethinkstuff.UTC)
+
+class Url:
+    """A url string with lazily computed, cached surt and host."""
+    def __init__(self, url):
+        self.url = url
+        self._surt = None
+        self._host = None
+
+    @property
+    def surt(self):
+        """Canonicalized SURT form of the url, query and fragment dropped."""
+        if not self._surt:
+            hurl = surt.handyurl.parse(self.url)
+            surt.GoogleURLCanonicalizer.canonicalize(hurl)
+            hurl.query = None
+            hurl.hash = None
+            # XXX chop off path after last slash??
+            self._surt = hurl.getURLString(surt=True, trailing_comma=True)
+        return self._surt
+
+    @property
+    def host(self):
+        """Host as parsed by surt.handyurl (not canonicalized)."""
+        if not self._host:
+            self._host = surt.handyurl.parse(self.url).host
+        return self._host
+
+    def matches_ip_or_domain(self, ip_or_domain):
+        """Returns true if
+           - ip_or_domain is an ip address and self.host is the same ip address
+           - ip_or_domain is a domain and self.host is the same domain
+           - ip_or_domain is a domain and self.host is a subdomain of it
+        """
+        if ip_or_domain == self.host:
+            return True
+
+        # if either ip_or_domain or self.host is an ip address, and they're
+        # not identical (previous check), not a match; ip_address() raises
+        # ValueError for anything that is not an ip address
+        for candidate in (ip_or_domain, self.host):
+            try:
+                ipaddress.ip_address(candidate)
+            except ValueError:
+                continue
+            return False
+
+        # if we get here, we're looking at two hostnames
+        # XXX do we need to handle case of one punycoded idn, other not?
+        domain_parts = ip_or_domain.split(".")
+        host_parts = self.host.split(".")
+
+        return host_parts[-len(domain_parts):] == domain_parts
 
 class Site(brozzler.BaseDictable):
     logger = logging.getLogger(__module__ + "." + __qualname__)
@@ -58,7 +112,7 @@ class Site(brozzler.BaseDictable):
 
         self.scope = scope or {}
         if not "surt" in self.scope:
-            self.scope["surt"] = self._to_surt(seed)
+            self.scope["surt"] = Url(seed).surt
 
     def __repr__(self):
         return """Site(id={},seed={},scope={},proxy={},enable_warcprox_features={},ignore_robots={},extra_headers={},reached_limit={})""".format(
@@ -69,72 +123,95 @@ class Site(brozzler.BaseDictable):
     def __str__(self):
         return "Site-%s-%s" % (self.id, self.seed)
 
-    def _to_surt(self, url):
-        hurl = surt.handyurl.parse(url)
-        surt.GoogleURLCanonicalizer.canonicalize(hurl)
-        hurl.query = None
-        hurl.hash = None
-        # XXX chop off path after last slash??
-        return hurl.getURLString(surt=True, trailing_comma=True)
-
     def note_seed_redirect(self, url):
-        new_scope_surt = self._to_surt(url)
+        new_scope_surt = Url(url).surt  # canonical surt, no query/fragment
         if not new_scope_surt.startswith(self.scope["surt"]):
             self.logger.info("changing site scope surt from {} to {}".format(
                 self.scope["surt"], new_scope_surt))
             self.scope["surt"] = new_scope_surt
 
-    def is_in_scope(self, url, surt_=None, parent_page=None):
-        if not surt_:
-            surt_ = to_surt(url)
-        might_accept = False
+    def is_in_scope(self, url, parent_page=None):
+        """Returns true if url (a Url or a url string) is in scope for this
+        site, considering the scheme, hop limits and "accepts"/"blocks"
+        scope rules. parent_page, if given, supplies the hop counts."""
+        u = url if isinstance(url, Url) else Url(url)
 
-        if not surt_.startswith("http://") and not surt_.startswith("https://"):
+        might_accept = False
+        if not u.surt.startswith(("http://", "https://")):
             # XXX doesn't belong here maybe (where? worker ignores unknown
             # schemes?)
             return False
         elif (parent_page and "max_hops" in self.scope
                 and parent_page.hops_from_seed >= self.scope["max_hops"]):
             pass
-        elif surt_.startswith(self.scope["surt"]):
+        elif u.surt.startswith(self.scope["surt"]):
             might_accept = True
         elif parent_page and parent_page.hops_off_surt < self.scope.get(
                 "max_hops_off_surt", 0):
             might_accept = True
         elif "accepts" in self.scope:
             for rule in self.scope["accepts"]:
-                if self._scope_rule_applies(rule, url, surt_):
+                if self._scope_rule_applies(rule, u):
                     might_accept = True
+                    break
 
         if might_accept:
             if "blocks" in self.scope:
                 for rule in self.scope["blocks"]:
-                    if self._scope_rule_applies(rule, url, surt_):
+                    if self._scope_rule_applies(rule, u):
                         return False
             return True
         else:
             return False
 
-    def _scope_rule_applies(self, rule, url, surt_):
-        if not "url_match" in rule or not "value" in rule:
-            self.logger.warn("unable to make sense of scope rule %s", rule)
-            return False
-        if rule["url_match"] == "STRING_MATCH":
-            return url.find(rule["value"]) >= 0
-        elif rule["url_match"] == "REGEX_MATCH":
-            try:
-                return re.fullmatch(rule["value"], url)
-            except Exception as e:
-                self.logger.warn(
-                        "caught exception matching against regex %s: %s",
-                        rule["value"], e)
-                return False
-        elif rule["url_match"] == "SURT_MATCH":
-            return surt_.startswith(rule["value"])
+    def _scope_rule_applies(self, rule, url):
+        """
+        Returns true if scope rule `rule` applies to `url` (Url or string).
+
+        Examples of valid rules:
+            {
+                "host": "monkey.org",
+                "url_match": "STRING_MATCH",
+                "value": "bar",
+            },
+            {
+                "url_match": "SURT_MATCH",
+                "value": "http://(com,woop,)/fuh/",
+            },
+            {
+                "host": "badhost.com",
+            },
+        """
+        if not isinstance(url, Url):
+            u = Url(url)
         else:
-            self.logger.warn("invalid rule.url_match=%s", rule.url_match)
-            return False
+            u = url
 
+        if "host" in rule and not u.matches_ip_or_domain(rule["host"]):
+            return False
+        if "url_match" in rule and "value" in rule:
+            if rule["url_match"] == "STRING_MATCH":
+                return u.url.find(rule["value"]) >= 0
+            elif rule["url_match"] == "REGEX_MATCH":
+                try:
+                    return re.fullmatch(rule["value"], u.url) is not None
+                except Exception as e:
+                    self.logger.warning(
+                            "caught exception matching against regex %s: %s",
+                            rule["value"], e)
+                    return False
+            elif rule["url_match"] == "SURT_MATCH":
+                return u.surt.startswith(rule["value"])
+            else:
+                self.logger.warning(
+                        "invalid rule.url_match=%s", rule["url_match"])
+                return False
+        elif "host" in rule and "url_match" not in rule:
+            # host-only rule, and the host check above already passed
+            return True
+        else:  # nothing usable, e.g. "url_match" without a "value"
+            self.logger.warning("unable to make sense of scope rule %s", rule)
+            return False
 
 class Page(brozzler.BaseDictable):
     def __init__(
@@ -183,7 +260,3 @@ class Page(brozzler.BaseDictable):
             surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl)
         return self._canon_hurl.geturl()
 
-def to_surt(url):
-    hurl = surt.handyurl.parse(url)
-    return surt.GoogleURLCanonicalizer.canonicalize(
-            hurl).getURLString(surt=True, trailing_comma=True)
diff --git a/brozzler/worker.py b/brozzler/worker.py
index 8cd7c65..c3ee4b5 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -34,6 +34,7 @@ import socket
 import datetime
 import collections
 import requests
+import rethinkstuff
 
 class ExtraHeaderAdder(urllib.request.BaseHandler):
     def __init__(self, extra_headers):
diff --git a/setup.py b/setup.py
index 6dab8ec..fdf118b 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,7 @@ import setuptools
 import glob
 
 setuptools.setup(name='brozzler',
-        version='1.1.dev6',
+        version='1.1.dev7',
         description='Distributed web crawling with browsers',
         url='https://github.com/nlevitt/brozzler',
         author='Noah Levitt',