support for one-hop-off (or n-hop-off) scoping

2025-08-07 05:52:27 -04:00 · 2016-04-21 17:41:30 +00:00 · 2016-04-21 17:41:30 +00:00 · fee008266f
commit fee008266f
parent 7bc726f717
3 changed files with 47 additions and 38 deletions
--- a/brozzler/frontier.py
+++ b/brozzler/frontier.py
@@ -231,26 +231,32 @@ class RethinkDbFrontier:

    def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
        counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
-        if outlinks:
-            for url in outlinks:
-                if site.is_in_scope(url, parent_page):
-                    if brozzler.is_permitted_by_robots(site, url):
-                        new_child_page = brozzler.Page(
-                                url, site_id=site.id, job_id=site.job_id,
-                                hops_from_seed=parent_page.hops_from_seed+1,
-                                via_page_id=parent_page.id)
-                        existing_child_page = self.page(new_child_page.id)
-                        if existing_child_page:
-                            existing_child_page.priority += new_child_page.priority
-                            self.update_page(existing_child_page)
-                            counts["updated"] += 1
-                        else:
-                            self.new_page(new_child_page)
-                            counts["added"] += 1
+        for url in outlinks or []:
+            surt_ = brozzler.site.to_surt(url)
+
+            if site.is_in_scope(surt_, parent_page):
+                if brozzler.is_permitted_by_robots(site, url):
+                    if not surt_.startswith(site.scope["surt"]):
+                        hops_off_surt = parent_page.hops_off_surt + 1
                    else:
-                        counts["blocked"] += 1
+                        hops_off_surt = 0
+                    new_child_page = brozzler.Page(
+                            url, site_id=site.id, job_id=site.job_id,
+                            hops_from_seed=parent_page.hops_from_seed+1,
+                            via_page_id=parent_page.id,
+                            hops_off_surt=hops_off_surt)
+                    existing_child_page = self.page(new_child_page.id)
+                    if existing_child_page:
+                        existing_child_page.priority += new_child_page.priority
+                        self.update_page(existing_child_page)
+                        counts["updated"] += 1
+                    else:
+                        self.new_page(new_child_page)
+                        counts["added"] += 1
                else:
-                    counts["rejected"] += 1
+                    counts["blocked"] += 1
+            else:
+                counts["rejected"] += 1

        self.logger.info("%s new links added, %s existing links updated, %s links rejected, %s links blocked by robots from %s",
            counts["added"], counts["updated"], counts["rejected"], counts["blocked"], parent_page)
--- a/brozzler/site.py
+++ b/brozzler/site.py
@@ -60,30 +60,27 @@ class Site(brozzler.BaseDictable):
    def note_seed_redirect(self, url):
        new_scope_surt = self._to_surt(url)
        if not new_scope_surt.startswith(self.scope["surt"]):
-            self.logger.info("changing site scope surt from {} to {}".format(self.scope["surt"], new_scope_surt))
+            self.logger.info("changing site scope surt from {} to {}".format(
+                self.scope["surt"], new_scope_surt))
            self.scope["surt"] = new_scope_surt

-    def is_in_scope(self, url, parent_page=None):
-        if parent_page and "max_hops" in self.scope and parent_page.hops_from_seed >= self.scope["max_hops"]:
+    def is_in_scope(self, surt_, parent_page=None):
+        if (parent_page and "max_hops" in self.scope
+                and parent_page.hops_from_seed >= self.scope["max_hops"]):
            return False
-
-        try:
-            hurl = surt.handyurl.parse(url)
-
-            # XXX doesn't belong here probably (where? worker ignores unknown schemes?)
-            if hurl.scheme != "http" and hurl.scheme != "https":
-                return False
-
-            surtt = surt.GoogleURLCanonicalizer.canonicalize(hurl).getURLString(surt=True, trailing_comma=True)
-            return surtt.startswith(self.scope["surt"])
-        except:
-            self.logger.warn("problem parsing url %s", repr(url))
+        elif surt_.startswith(self.scope["surt"]):
+            return True
+        elif parent_page and parent_page.hops_off_surt < self.scope.get(
+                "max_hops_off_surt", 0):
+            return True
+        else:
            return False

 class Page(brozzler.BaseDictable):
-    def __init__(self, url, id=None, site_id=None, job_id=None,
-            hops_from_seed=0, redirect_url=None, priority=None, claimed=False,
-            brozzle_count=0, via_page_id=None, last_claimed_by=None):
+    def __init__(
+            self, url, id=None, site_id=None, job_id=None, hops_from_seed=0,
+            redirect_url=None, priority=None, claimed=False, brozzle_count=0,
+            via_page_id=None, last_claimed_by=None, hops_off_surt=0):
        self.site_id = site_id
        self.job_id = job_id
        self.url = url
@@ -93,6 +90,7 @@ class Page(brozzler.BaseDictable):
        self.last_claimed_by = last_claimed_by
        self.brozzle_count = brozzle_count
        self.via_page_id = via_page_id
+        self.hops_off_surt = hops_off_surt
        self._canon_hurl = None

        if priority is not None:
@@ -103,7 +101,8 @@ class Page(brozzler.BaseDictable):
        if id is not None:
            self.id = id
        else:
-            digest_this = "site_id:{},canon_url:{}".format(self.site_id, self.canon_url())
+            digest_this = "site_id:{},canon_url:{}".format(
+                    self.site_id, self.canon_url())
            self.id = hashlib.sha1(digest_this.encode("utf-8")).hexdigest()

    def __repr__(self):
@@ -125,3 +124,7 @@ class Page(brozzler.BaseDictable):
            surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl)
        return self._canon_hurl.geturl()

+def to_surt(url):
+    hurl = surt.handyurl.parse(url)
+    return surt.GoogleURLCanonicalizer.canonicalize(
+            hurl).getURLString(surt=True, trailing_comma=True)
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@ import setuptools
 import glob

 setuptools.setup(name='brozzler',
-        version='1.1.dev3',
+        version='1.1.dev4',
        description='Distributed web crawling with browsers',
        url='https://github.com/nlevitt/brozzler',
        author='Noah Levitt',