support for extra "blocks" and "accepts" scope rules

2025-08-03 03:56:24 -04:00 · 2016-04-21 22:22:44 +00:00 · 2016-04-21 22:22:44 +00:00 · 2825ffea15
commit 2825ffea15
parent 68abb3cb94
4 changed files with 51 additions and 10 deletions
--- a/brozzler/frontier.py
+++ b/brozzler/frontier.py
@@ -233,8 +233,7 @@ class RethinkDbFrontier:
        counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
        for url in outlinks or []:
            surt_ = brozzler.site.to_surt(url)
-
-            if site.is_in_scope(surt_, parent_page):
+            if site.is_in_scope(url, surt_=surt_, parent_page=parent_page):
                if brozzler.is_permitted_by_robots(site, url):
                    if not surt_.startswith(site.scope["surt"]):
                        hops_off_surt = parent_page.hops_off_surt + 1
@@ -258,8 +257,11 @@ class RethinkDbFrontier:
            else:
                counts["rejected"] += 1

-        self.logger.info("%s new links added, %s existing links updated, %s links rejected, %s links blocked by robots from %s",
-            counts["added"], counts["updated"], counts["rejected"], counts["blocked"], parent_page)
+        self.logger.info(
+                "%s new links added, %s existing links updated, %s links "
+                "rejected, %s links blocked by robots from %s",
+                counts["added"], counts["updated"], counts["rejected"],
+                counts["blocked"], parent_page)

    def reached_limit(self, site, e):
        self.logger.info("reached_limit site=%s e=%s", site, e)
--- a/brozzler/site.py
+++ b/brozzler/site.py
@@ -6,6 +6,7 @@ import hashlib
 import time
 import rethinkstuff
 import datetime
+import re

 _EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=rethinkstuff.UTC)

@@ -64,22 +65,58 @@ class Site(brozzler.BaseDictable):
                self.scope["surt"], new_scope_surt))
            self.scope["surt"] = new_scope_surt

-    def is_in_scope(self, surt_, parent_page=None):
+    def is_in_scope(self, url, surt_=None, parent_page=None):
+        if not surt_:  # caller did not supply a precomputed surt
+            surt_ = to_surt(url)
+        might_accept = False  # provisional; a "blocks" rule can still veto
+
        if not surt_.startswith("http://") and not surt_.startswith("https://"):
            # XXX doesn't belong here maybe (where? worker ignores unknown
            # schemes?)
            return False
        elif (parent_page and "max_hops" in self.scope
                and parent_page.hops_from_seed >= self.scope["max_hops"]):
-            return False
+            pass  # at/over max_hops: might_accept stays False, so rejected
        elif surt_.startswith(self.scope["surt"]):
-            return True
+            might_accept = True  # surt_ is under the site's scope surt prefix
        elif parent_page and parent_page.hops_off_surt < self.scope.get(
                "max_hops_off_surt", 0):
+            might_accept = True  # parent still under max_hops_off_surt
+        elif "accepts" in self.scope:  # extra opt-in rules, consulted last
+            for rule in self.scope["accepts"]:
+                if self._scope_rule_applies(rule, url, surt_):
+                    might_accept = True
+
+        if might_accept:
+            if "blocks" in self.scope:  # any matching block rule rejects
+                for rule in self.scope["blocks"]:
+                    if self._scope_rule_applies(rule, url, surt_):
+                        return False
            return True
        else:
            return False

+    def _scope_rule_applies(self, rule, url, surt_):
+        """Return True iff scope rule dict `rule` matches `url`/`surt_`."""
+        if "url_match" not in rule or "value" not in rule:
+            self.logger.warning("unable to make sense of scope rule %s", rule)
+            return False
+        if rule["url_match"] == "STRING_MATCH":  # value is a substring of url
+            return rule["value"] in url
+        elif rule["url_match"] == "REGEX_MATCH":  # value must match whole url
+            try:
+                return bool(re.fullmatch(rule["value"], url))
+            except Exception as e:  # e.g. bad pattern: log, treat as no match
+                self.logger.warning("caught exception matching against regex "
+                        "%s: %s", rule["value"], e)
+                return False
+        elif rule["url_match"] == "SURT_MATCH":  # value is a prefix of surt_
+            return surt_.startswith(rule["value"])
+        else:  # rule is a dict, so index it; rule.url_match would raise
+            self.logger.warning("invalid rule.url_match=%s", rule["url_match"])
+            return False
+
+
 class Page(brozzler.BaseDictable):
    def __init__(
            self, url, id=None, site_id=None, job_id=None, hops_from_seed=0,
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -307,8 +307,10 @@ class BrozzlerWorker:
                        site = self._frontier.claim_site(self._id)
                        self.logger.info("brozzling site %s", site)
                        ydl = self._youtube_dl(site)
-                        th = threading.Thread(target=lambda: self._brozzle_site(browser, ydl, site),
-                            name="BrowsingThread-{}".format(site.seed))
+                        th = threading.Thread(
+                                target=lambda: self._brozzle_site(
+                                    browser, ydl, site),
+                                name="BrowsingThread-{}".format(site.seed))
                        th.start()
                    except:
                        self._browser_pool.release(browser)
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@ import setuptools
 import glob

 setuptools.setup(name='brozzler',
-        version='1.1.dev5',
+        version='1.1.dev6',
        description='Distributed web crawling with browsers',
        url='https://github.com/nlevitt/brozzler',
        author='Noah Levitt',