diff --git a/.travis.yml b/.travis.yml index d8fd44f..5cb807b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ before_install: - sudo pip install ansible==2.1.3.0 install: - ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml -- pip install $TRAVIS_BUILD_DIR 'warcprox>=2.4b1' pytest +- pip install $TRAVIS_BUILD_DIR git+https://github.com/internetarchive/warcprox.git#egg=warcprox pytest - chromium-browser --version - sudo apt-get update - sudo apt-get install --only-upgrade chromium-browser diff --git a/brozzler/frontier.py b/brozzler/frontier.py index a0f8ab4..2e076d3 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -291,75 +291,80 @@ class RethinkDbFrontier: {"start":doublethink.utcnow(), "stop":None}) site.save() + def _build_fresh_page(self, site, parent_page, url, hops_off=0): + url_for_scoping = urlcanon.semantic(url) + url_for_crawling = urlcanon.whatwg(url) + hashtag = (url_for_crawling.hash_sign + + url_for_crawling.fragment).decode('utf-8') + urlcanon.canon.remove_fragment(url_for_crawling) + page = brozzler.Page(self.rr, { + 'url': str(url_for_crawling), + 'site_id': site.id, + 'job_id': site.job_id, + 'hops_from_seed': parent_page.hops_from_seed + 1, + 'via_page_id': parent_page.id, + 'hops_off_surt': hops_off, + 'hashtags': [hashtag] if hashtag else []}) + return page + + def _merge_page(self, existing_page, fresh_page): + ''' + Utility method for merging info from `brozzler.Page` instances + representing the same url but with possibly different metadata. + ''' + existing_page.priority += fresh_page.priority + existing_page.hashtags = list(set( + existing_page.hashtags + fresh_page.hashtags)) + existing_page.hops_off = min( + existing_page.hops_off, fresh_page.hops_off) + def _scope_and_enforce_robots(self, site, parent_page, outlinks): ''' Returns tuple ( - set of in scope urls (uncanonicalized) accepted by robots policy, + dict of {page_id: Page} of fresh `brozzler.Page` representing in + scope links accepted by robots policy, set of in scope urls (canonicalized) blocked by robots policy, set of out-of-scope urls (canonicalized)). ''' - in_scope = set() + pages = {} # {page_id: Page, ...} blocked = set() out_of_scope = set() for url in outlinks or []: url_for_scoping = urlcanon.semantic(url) url_for_crawling = urlcanon.whatwg(url) - urlcanon.canon.remove_fragment(url_for_crawling) - if site.is_in_scope(url_for_scoping, parent_page=parent_page): + decision = site.accept_reject_or_neither( + url_for_scoping, parent_page=parent_page) + if decision is True: + hops_off = 0 + elif decision is None: + decision = parent_page.hops_off < site.scope.get( + 'max_hops_off', 0) + hops_off = parent_page.hops_off + 1 + if decision is True: if brozzler.is_permitted_by_robots(site, str(url_for_crawling)): - in_scope.add(url) + fresh_page = self._build_fresh_page( + site, parent_page, url, hops_off) + if fresh_page.id in pages: + self._merge_page(pages[fresh_page.id], fresh_page) + else: + pages[fresh_page.id] = fresh_page else: blocked.add(str(url_for_crawling)) else: out_of_scope.add(str(url_for_crawling)) - return in_scope, blocked, out_of_scope - - def _build_fresh_pages(self, site, parent_page, urls): - ''' - Returns a dict of page_id => brozzler.Page. 
- ''' - pages = {} - for url in urls: - url_for_scoping = urlcanon.semantic(url) - url_for_crawling = urlcanon.whatwg(url) - hashtag = (url_for_crawling.hash_sign - + url_for_crawling.fragment).decode('utf-8') - urlcanon.canon.remove_fragment(url_for_crawling) - if not url_for_scoping.surt().startswith( - site.scope['surt'].encode('utf-8')): - hops_off_surt = parent_page.hops_off_surt + 1 - else: - hops_off_surt = 0 - page = brozzler.Page(self.rr, { - 'url': str(url_for_crawling), - 'site_id': site.id, - 'job_id': site.job_id, - 'hops_from_seed': parent_page.hops_from_seed + 1, - 'via_page_id': parent_page.id, - 'hops_off_surt': hops_off_surt, - 'hashtags': []}) - if page.id in pages: - pages[page.id].priority += page.priority - page = pages[page.id] - else: - pages[page.id] = page - if hashtag: - page.hashtags = list(set(page.hashtags + [hashtag])) - return pages + return pages, blocked, out_of_scope def scope_and_schedule_outlinks(self, site, parent_page, outlinks): decisions = {'accepted':set(),'blocked':set(),'rejected':set()} counts = {'added':0,'updated':0,'rejected':0,'blocked':0} - in_scope, blocked, out_of_scope = self._scope_and_enforce_robots( + fresh_pages, blocked, out_of_scope = self._scope_and_enforce_robots( site, parent_page, outlinks) decisions['blocked'] = blocked decisions['rejected'] = out_of_scope counts['blocked'] += len(blocked) counts['rejected'] += len(out_of_scope) - fresh_pages = self._build_fresh_pages(site, parent_page, in_scope) - # get existing pages from rethinkdb results = self.rr.table('pages').get_all(*fresh_pages.keys()).run() pages = {doc['id']: brozzler.Page(self.rr, doc) for doc in results} diff --git a/brozzler/job_schema.yaml b/brozzler/job_schema.yaml index 14445bc..6069de8 100644 --- a/brozzler/job_schema.yaml +++ b/brozzler/job_schema.yaml @@ -65,7 +65,7 @@ id: max_hops: type: integer - max_hops_off_surt: + max_hops_off: type: integer metadata: diff --git a/brozzler/model.py b/brozzler/model.py index 9c1a60f..f3c8679 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -99,7 +99,7 @@ def new_job(frontier, job_conf): def new_site(frontier, site): site.id = str(uuid.uuid4()) - logging.info("new site {}".format(site)) + logging.info("new site %s", site) # insert the Page into the database before the Site, to avoid situation # where a brozzler worker immediately claims the site, finds no pages # to crawl, and decides the site is finished @@ -183,9 +183,24 @@ class Site(doublethink.Document, ElapsedMixIn): self.last_claimed = brozzler.EPOCH_UTC if not "scope" in self: self.scope = {} - if not "surt" in self.scope and self.seed: - self.scope["surt"] = brozzler.site_surt_canon( - self.seed).surt().decode('ascii') + + # backward compatibility + if "surt" in self.scope: + if not "accepts" in self.scope: + self.scope["accepts"] = [] + self.scope["accepts"].append({"surt": self.scope["surt"]}) + del self.scope["surt"] + + # backward compatibility + if ("max_hops_off_surt" in self.scope + and not "max_hops_off" in self.scope): + self.scope["max_hops_off"] = self.scope["max_hops_off_surt"] + if "max_hops_off_surt" in self.scope: + del self.scope["max_hops_off_surt"] + + if self.seed: + self._accept_ssurt_if_not_redundant( + brozzler.site_surt_canon(self.seed).ssurt().decode('ascii')) if not "starts_and_stops" in self: if self.get("start_time"): # backward compatibility @@ -201,12 +216,20 @@ class Site(doublethink.Document, ElapsedMixIn): def __str__(self): return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed) + def 
_accept_ssurt_if_not_redundant(self, ssurt): + if not "accepts" in self.scope: + self.scope["accepts"] = [] + simple_rule_ssurts = ( + rule["ssurt"] for rule in self.scope["accepts"] + if set(rule.keys()) == {'ssurt'}) + if not any(ssurt.startswith(ss) for ss in simple_rule_ssurts): + self.logger.info( + "adding ssurt %s to scope accept rules", ssurt) + self.scope["accepts"].append({"ssurt": ssurt}) + def note_seed_redirect(self, url): - new_scope_surt = brozzler.site_surt_canon(url).surt().decode("ascii") - if not new_scope_surt.startswith(self.scope["surt"]): - self.logger.info("changing site scope surt from {} to {}".format( - self.scope["surt"], new_scope_surt)) - self.scope["surt"] = new_scope_surt + self._accept_ssurt_if_not_redundant( + brozzler.site_surt_canon(url).ssurt().decode('ascii')) def extra_headers(self): hdrs = {} @@ -215,9 +238,20 @@ class Site(doublethink.Document, ElapsedMixIn): self.warcprox_meta, separators=(',', ':')) return hdrs - def is_in_scope(self, url, parent_page=None): + def accept_reject_or_neither(self, url, parent_page=None): + ''' + Returns `True` (accepted), `False` (rejected), or `None` (no decision). + + `None` usually means rejected, unless `max_hops_off` comes into play. + ''' if not isinstance(url, urlcanon.ParsedUrl): url = urlcanon.semantic(url) + + if not url.scheme in (b'http', b'https'): + # XXX doesn't belong here maybe (where? worker ignores unknown + # schemes?) + return False + try_parent_urls = [] if parent_page: try_parent_urls.append(urlcanon.semantic(parent_page.url)) @@ -225,44 +259,36 @@ class Site(doublethink.Document, ElapsedMixIn): try_parent_urls.append( urlcanon.semantic(parent_page.redirect_url)) - might_accept = False - if not url.scheme in (b'http', b'https'): - # XXX doesn't belong here maybe (where? worker ignores unknown - # schemes?) - return False - elif (parent_page and "max_hops" in self.scope + # enforce max_hops + if (parent_page and "max_hops" in self.scope and parent_page.hops_from_seed >= self.scope["max_hops"]): - pass - elif url.surt().startswith(self.scope["surt"].encode("utf-8")): - might_accept = True - elif parent_page and parent_page.hops_off_surt < self.scope.get( - "max_hops_off_surt", 0): - might_accept = True - elif "accepts" in self.scope: - for accept_rule in self.scope["accepts"]: - rule = urlcanon.MatchRule(**accept_rule) + return False + + # enforce reject rules + if "blocks" in self.scope: + for block_rule in self.scope["blocks"]: + rule = urlcanon.MatchRule(**block_rule) if try_parent_urls: for parent_url in try_parent_urls: if rule.applies(url, parent_url): - might_accept = True + return False else: if rule.applies(url): - might_accept = True + return False - if might_accept: - if "blocks" in self.scope: - for block_rule in self.scope["blocks"]: - rule = urlcanon.MatchRule(**block_rule) - if try_parent_urls: - for parent_url in try_parent_urls: - if rule.applies(url, parent_url): - return False - else: - if rule.applies(url): - return False - return True - else: - return False + # honor accept rules + for accept_rule in self.scope["accepts"]: + rule = urlcanon.MatchRule(**accept_rule) + if try_parent_urls: + for parent_url in try_parent_urls: + if rule.applies(url, parent_url): + return True + else: + if rule.applies(url): + return True + + # no decision if we reach here + return None class Page(doublethink.Document): logger = logging.getLogger(__module__ + "." 
+ __qualname__) @@ -280,8 +306,12 @@ class Page(doublethink.Document): self.brozzle_count = 0 if not "claimed" in self: self.claimed = False - if not "hops_off_surt" in self: - self.hops_off_surt = 0 + if "hops_off_surt" in self and not "hops_off" in self: + self.hops_off = self.hops_off_surt + if "hops_off_surt" in self: + del self["hops_off_surt"] + if not "hops_off" in self: + self.hops_off = 0 if not "needs_robots_check" in self: self.needs_robots_check = False if not "priority" in self: diff --git a/job-conf.rst b/job-conf.rst index 1174f1a..1fa5bc6 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -1,17 +1,19 @@ -brozzler job configuration +Brozzler Job Configuration ************************** -Jobs are defined using yaml files. Options may be specified either at the -top-level or on individual seeds. At least one seed url must be specified, +Jobs are defined using yaml files. At least one seed url must be specified; everything else is optional. -an example -========== +.. contents:: + +Example +======= :: id: myjob time_limit: 60 # seconds + proxy: 127.0.0.1:8000 # point at warcprox for archiving ignore_robots: false max_claimed_sites: 2 warcprox_meta: @@ -35,15 +37,14 @@ an example scope: surt: http://(org,example, -how inheritance works +How inheritance works ===================== -Most of the available options apply to seeds. Such options can also be -specified at the top level, in which case the seeds inherit the options. If -an option is specified both at the top level and at the level of an individual -seed, the results are merged with the seed-level value taking precedence in -case of conflicts. It's probably easiest to make sense of this by way of an -example. +Most of the settings that apply to seeds can also be specified at the top +level, in which case all seeds inherit those settings. If an option is +specified both at the top level and at seed level, the results are merged with +the seed-level value taking precedence in case of conflicts. It's probably +easiest to make sense of this by way of an example. In the example yaml above, ``warcprox_meta`` is specified at the top level and at the seed level for the seed http://one.example.org/. At the top level we @@ -79,101 +80,150 @@ Notice that: - Since ``buckets`` is a list, the merged result includes all the values from both the top level and the seed level. -settings reference -================== +Settings +======== + +Top-level settings +------------------ ``id`` ------- -+-----------+--------+----------+--------------------------+ -| scope | type | required | default | -+===========+========+==========+==========================+ -| top-level | string | no | *generated by rethinkdb* | -+-----------+--------+----------+--------------------------+ +~~~~~~ ++--------+----------+--------------------------+ +| type | required | default | ++========+==========+==========================+ +| string | no | *generated by rethinkdb* | ++--------+----------+--------------------------+ An arbitrary identifier for this job. Must be unique across this deployment of brozzler. -``seeds`` ---------- -+-----------+------------------------+----------+---------+ -| scope | type | required | default | -+===========+========================+==========+=========+ -| top-level | list (of dictionaries) | yes | *n/a* | -+-----------+------------------------+----------+---------+ -List of seeds. Each item in the list is a dictionary (associative array) which -defines the seed.
It must specify ``url`` (see below) and can additionally -specify any of the settings of scope *seed-level*. - ``max_claimed_sites`` ---------------------- +-----------+--------+----------+---------+ | scope | type | required | default | +===========+========+==========+=========+ | top-level | number | no | *none* | +-----------+--------+----------+---------+ +~~~~~~~~~~~~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| number | no | *none* | ++--------+----------+---------+ Puts a cap on the number of sites belonging to a given job that can be brozzled simultaneously across the cluster. Addresses the problem of a job with many seeds starving out other jobs. +``seeds`` +~~~~~~~~~ ++------------------------+----------+---------+ +| type | required | default | ++========================+==========+=========+ +| list (of dictionaries) | yes | *n/a* | ++------------------------+----------+---------+ +List of seeds. Each item in the list is a dictionary (associative array) which +defines the seed. It must specify ``url`` (see below) and can additionally +specify any seed settings. + +Seed-level-only settings +------------------------ +These settings can be specified only at the seed level, unlike most seed +settings, which can also be specified at the top level. + ``url`` -------- -+------------+--------+----------+---------+ -| scope | type | required | default | -+============+========+==========+=========+ -| seed-level | string | yes | *n/a* | -+------------+--------+----------+---------+ -The seed url. +~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | yes | *n/a* | ++--------+----------+---------+ +The seed url. Crawling starts here. + +``username`` +~~~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ +If set, used to populate automatically detected login forms. See explanation at +"password" below. + +``password`` +~~~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ +If set, used to populate automatically detected login forms. If ``username`` +and ``password`` are configured for a seed, brozzler will look for a login form +on each page it crawls for that seed. A form that has a single text or email +field (the username), a single password field (``<input type="password">``), +and has ``method="POST"`` is considered to be a login form. The form may have +other fields like checkboxes and hidden fields. For these, brozzler will leave +the default values in place. Brozzler submits login forms after page load. +Then brozzling proceeds as usual. + +Seed-level / top-level settings +------------------------------- +These are seed settings that can also be specified at the top level, in which +case they are inherited by all seeds.
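+For example, in this hypothetical job (the urls and values are illustrative
+only), ``time_limit`` and ``ignore_robots`` are set once at the top level and
+inherited by both seeds, while the second seed overrides ``time_limit``::
+
+    time_limit: 3600     # inherited by every seed below
+    ignore_robots: true  # inherited by every seed below
+    seeds:
+    - url: http://one.example.org/
+    - url: http://two.example.org/
+      time_limit: 7200   # seed-level value takes precedence
+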
``metadata`` ------------- -+-----------------------+------------+----------+---------+ -| scope | type | required | default | -+=======================+============+==========+=========+ -| seed-level, top-level | dictionary | no | *none* | -+-----------------------+------------+----------+---------+ +~~~~~~~~~~~~ ++------------+----------+---------+ +| type | required | default | ++============+==========+=========+ +| dictionary | no | *none* | ++------------+----------+---------+ Arbitrary information about the crawl job or site. Merely informative, not used by brozzler for anything. Could be of use to some external process. ``time_limit`` --------------- -+-----------------------+--------+----------+---------+ -| scope | type | required | default | -+=======================+========+==========+=========+ -| seed-level, top-level | number | no | *none* | -+-----------------------+--------+----------+---------+ -Time limit in seconds. If not specified, there no time limit. Time limit is +~~~~~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| number | no | *none* | ++--------+----------+---------+ +Time limit in seconds. If not specified, there is no time limit. Time limit is enforced at the seed level. If a time limit is specified at the top level, it is inherited by each seed as described above, and enforced individually on each seed. +``proxy`` +~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ +HTTP proxy, with the format ``host:port``. Typically configured to point to +warcprox for archival crawling. + ``ignore_robots`` ------------------ -+-----------------------+---------+----------+-----------+ -| scope | type | required | default | -+=======================+=========+==========+===========+ -| seed-level, top-level | boolean | no | ``false`` | -+-----------------------+---------+----------+-----------+ +~~~~~~~~~~~~~~~~~ ++---------+----------+-----------+ +| type | required | default | ++=========+==========+===========+ +| boolean | no | ``false`` | ++---------+----------+-----------+ If set to ``true``, brozzler will happily crawl pages that would otherwise be blocked by robots.txt rules. ``user_agent`` --------------- -+-----------------------+---------+----------+---------+ -| scope | type | required | default | -+=======================+=========+==========+=========+ -| seed-level, top-level | string | no | *none* | -+-----------------------+---------+----------+---------+ +~~~~~~~~~~~~~~ ++---------+----------+---------+ +| type | required | default | ++=========+==========+=========+ +| string | no | *none* | ++---------+----------+---------+ The ``User-Agent`` header brozzler will send to identify itself to web servers. It's good etiquette to include a project URL with a notice to webmasters that explains why you're crawling, how to block the crawler with robots.txt and how to contact the operator if the crawl is causing problems.
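+For example (the crawler name and contact url here are illustrative only)::
+
+    user_agent: Mozilla/5.0 (compatible; mycrawlbot/1.0; +http://example.org/crawlbot.html)
+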
``warcprox_meta`` ------------------ -+-----------------------+------------+----------+-----------+ -| scope | type | required | default | -+=======================+============+==========+===========+ -| seed-level, top-level | dictionary | no | ``false`` | -+-----------------------+------------+----------+-----------+ +~~~~~~~~~~~~~~~~~ ++------------+----------+-----------+ +| type | required | default | ++============+==========+===========+ +| dictionary | no | ``false`` | ++------------+----------+-----------+ Specifies the Warcprox-Meta header to send with every request, if ``proxy`` is configured. The value of the Warcprox-Meta header is a json blob. It is used to pass settings and information to warcprox. Warcprox does not forward the header @@ -195,36 +245,217 @@ becomes:: Warcprox-Meta: {"warc-prefix":"job1-seed1","stats":{"buckets":["job1-stats","job1-seed1-stats"]}} ``scope`` ---------- -+-----------------------+------------+----------+-----------+ -| scope | type | required | default | -+=======================+============+==========+===========+ -| seed-level, top-level | dictionary | no | ``false`` | -+-----------------------+------------+----------+-----------+ -Scope rules. *TODO* +~~~~~~~~~ ++------------+----------+-----------+ +| type | required | default | ++============+==========+===========+ +| dictionary | no | ``false`` | ++------------+----------+-----------+ +Scope specification for the seed. See the "Scoping" section which follows. -``surt`` -------- -+-------------+--------+----------+---------------------------+ -| scope | type | required | default | -+=============+========+==========+===========================+ -| scope-level | string | no | *generated from seed url* | -+-------------+--------+----------+---------------------------+ +Scoping +======= + +The scope of a seed determines which links are scheduled for crawling and which +are not. Example:: + + scope: + accepts: + - ssurt: com,example,//https:/ + - parent_url_regex: ^https?://(www\.)?youtube.com/(user|channel)/.*$ + regex: ^https?://(www\.)?youtube.com/watch\?.*$ + - surt: http://(com,google,video, + - surt: http://(com,googlevideo, + blocks: + - domain: youngscholars.unimelb.edu.au + substring: wp-login.php?action=logout + - domain: malware.us + max_hops: 20 + max_hops_off: 0 + +Toward the end of the process of brozzling a page, brozzler obtains a list of +navigational links (``<a href>`` and similar) on the page, and evaluates +each link to determine whether it is in scope or out of scope for the crawl. +Then, newly discovered links that are in scope are scheduled to be crawled, and +previously discovered links get a priority bump. + +How brozzler applies scope rules +-------------------------------- + +Each scope rule has one or more conditions. If all of the conditions match, +then the scope rule as a whole matches. For example:: + + - domain: youngscholars.unimelb.edu.au + substring: wp-login.php?action=logout + +This rule applies if the domain of the url is "youngscholars.unimelb.edu.au" or +a subdomain, and the string "wp-login.php?action=logout" is found somewhere in +the url. + +Brozzler applies these logical steps to decide whether a url is in or out of +scope: + +1. If the number of hops from seed is greater than ``max_hops``, the url is + **out of scope**. +2. Otherwise, if any ``block`` rule matches, the url is **out of scope**. +3. Otherwise, if any ``accept`` rule matches, the url is **in scope**. +4.
Otherwise, if the url is at most ``max_hops_off`` hops from the last page + that was in scope thanks to an ``accept`` rule, the url is **in scope**. +5. Otherwise (no rules match), the url is **out of scope**. + +Notably, ``block`` rules take precedence over ``accept`` rules. + +It may also be helpful to think about a list of scope rules as a boolean +expression. For example:: + + blocks: + - domain: youngscholars.unimelb.edu.au + substring: wp-login.php?action=logout + - domain: malware.us + +means block the url IF:: + + ("domain: youngscholars.unimelb.edu.au" AND "substring: wp-login.php?action=logout") OR "domain: malware.us" + +Automatic scoping based on seed urls +------------------------------------ +Brozzler usually generates an ``accept`` scope rule based on the seed url. It +does this to fulfill the usual expectation that everything "under" the seed +will be crawled. + +To generate the rule, brozzler canonicalizes the seed url using the `urlcanon +<https://github.com/iipc/urlcanon>`_ library's "semantic" canonicalizer, then +removes the query string if any, and finally serializes the result in SSURT +[1]_ form. For example, a seed url of +``https://www.EXAMPLE.com:443/foo//bar?a=b&c=d#fdiap`` becomes +``com,example,www,//https:/foo/bar``. + +If the url in the browser location bar at the end of brozzling the seed page +differs from the seed url, brozzler automatically adds a second ``accept`` rule +to ensure the site is in scope, as if the new url were the original seed url. +It does this so that, for example, if ``http://example.com/`` redirects to +``http://www.example.com/``, the rest of the ``www.example.com`` site is in +scope. + +Brozzler derives its general approach to the seed surt from Heritrix, but +differs in a few respects: + +1. Unlike Heritrix, brozzler does not strip the path segment after the last + slash. +2. Canonicalization does not attempt to match Heritrix exactly, though it + usually does match. +3. When generating a surt for an https url, Heritrix changes the scheme to + http. For example, the Heritrix surt for ``https://www.example.com/`` is + ``http://(com,example,www,)`` and this means that all of + ``http://www.example.com/*`` and ``https://www.example.com/*`` are in + scope. It also means that a manually specified surt with scheme "https" does + not match anything. Brozzler does no scheme munging. +4. Brozzler identifies seed "redirects" by retrieving the url from the + browser's location bar at the end of brozzling the seed page, whereas + Heritrix follows http 3xx redirects. +5. Brozzler uses ssurt instead of surt. +6. There is currently no brozzler option to disable the automatically generated + ``accept`` rules. + +Scope settings +-------------- ``accepts`` ------------ -+-------------+------+----------+---------+ -| scope | type | required | default | -+=============+======+==========+=========+ -| scope-level | list | no | *none* | -+-------------+------+----------+---------+ +~~~~~~~~~~~ ++------+----------+---------+ +| type | required | default | ++======+==========+=========+ +| list | no | *none* | ++------+----------+---------+ +List of scope rules. If any of the rules match, and the url is within +``max_hops`` from seed, and none of the ``block`` rules apply, the url is in +scope.
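+For example, this sketch (the urls are illustrative only) accepts everything
+under one host by ssurt prefix, plus a single page on another host by regex::
+
+    scope:
+      accepts:
+      - ssurt: org,example,//http:/
+      - regex: ^https?://www\.example\.net/interesting/page\.html$
+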
``blocks`` ------------ -+-------------+------+----------+---------+ -| scope | type | required | default | -+=============+======+==========+=========+ -| scope-level | list | no | *none* | -+-------------+------+----------+---------+ +~~~~~~~~~~~ ++------+----------+---------+ +| type | required | default | ++======+==========+=========+ +| list | no | *none* | ++------+----------+---------+ +List of scope rules. If any of the rules match, the url is deemed out of scope. +``max_hops`` +~~~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| number | no | *none* | ++--------+----------+---------+ +Maximum number of hops from seed. +``max_hops_off`` +~~~~~~~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| number | no | 0 | ++--------+----------+---------+ +Expands the scope to include urls up to this many hops from the last page that +was in scope thanks to an ``accept`` rule. + +Scope rule conditions +--------------------- + +``domain`` +~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ +Matches if the host part of the canonicalized url is ``domain`` or a +subdomain. + +``substring`` +~~~~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ +Matches if ``substring`` is found anywhere in the canonicalized url. + +``regex`` +~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ +Matches if the full canonicalized url matches ``regex``. + +``ssurt`` +~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ +Matches if the canonicalized url in SSURT [1]_ form starts with ``ssurt``. + +``surt`` +~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ +Matches if the canonicalized url in SURT [2]_ form starts with ``surt``. + +``parent_url_regex`` +~~~~~~~~~~~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ +Matches if the full canonicalized parent url matches ``parent_url_regex``. The +parent url is the url of the page in which the link was found. + +.. [1] SSURT is described at https://github.com/iipc/urlcanon/blob/master/ssurt.rst +.. [2] SURT is described at http://crawler.archive.org/articles/user_manual/glossary.html diff --git a/setup.py b/setup.py index 289d224..e7f7da3 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ ''' setup.py - brozzler setup script -Copyright (C) 2014-2017 Internet Archive +Copyright (C) 2014-2018 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
@@ -69,8 +69,8 @@ setuptools.setup( 'requests', 'websocket-client!=0.39.0', 'pillow==3.3.0', - 'urlcanon>=0.1.dev16', - 'doublethink>=0.2.0.dev81', + 'urlcanon>=0.1.dev23', + 'doublethink>=0.2.0.dev88', 'rethinkdb>=2.3,<2.4', 'cerberus==1.0.1', 'jinja2', @@ -79,7 +79,7 @@ setuptools.setup( extras_require={ 'dashboard': ['flask>=0.11', 'gunicorn'], 'easy': [ - 'warcprox>=2.4b1.dev145', + 'warcprox>=2.4b2.dev173', 'pywb<2', 'flask>=0.11', 'gunicorn' diff --git a/tests/test_cluster.py b/tests/test_cluster.py index 2559e07..0ec5026 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -448,13 +448,13 @@ def test_login(httpd): assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url def test_seed_redirect(httpd): - test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat() + test_id = 'test_seed_redirect-%s' % datetime.datetime.utcnow().isoformat() rr = doublethink.Rethinker('localhost', db='brozzler') seed_url = 'http://localhost:%s/site5/redirect/' % httpd.server_port site = brozzler.Site(rr, { 'seed': 'http://localhost:%s/site5/redirect/' % httpd.server_port, 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) - assert site.scope['surt'] == 'http://(localhost:%s,)/site5/redirect/' % httpd.server_port + assert site.scope == {'accepts': [{'ssurt': 'localhost,//%s:http:/site5/redirect/' % httpd.server_port}]} frontier = brozzler.RethinkDbFrontier(rr) brozzler.new_site(frontier, site) @@ -478,7 +478,9 @@ def test_seed_redirect(httpd): assert pages[1].url == 'http://localhost:%s/site5/destination/page2.html' % httpd.server_port # check that scope has been updated properly - assert site.scope['surt'] == 'http://(localhost:%s,)/site5/destination/' % httpd.server_port + assert site.scope == {'accepts': [ + {'ssurt': 'localhost,//%s:http:/site5/redirect/' % httpd.server_port}, + {'ssurt': 'localhost,//%s:http:/site5/destination/' % httpd.server_port}]} def test_hashtags(httpd): test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat() diff --git a/tests/test_frontier.py b/tests/test_frontier.py index 9b80a4f..d66773e 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -73,9 +73,7 @@ def test_basics(): 'job_id': job.id, 'last_claimed': brozzler.EPOCH_UTC, 'last_disclaimed': brozzler.EPOCH_UTC, - 'scope': { - 'surt': 'http://(com,example,)/' - }, + 'scope': {'accepts': [{'ssurt': 'com,example,//http:/'}]}, 'seed': 'http://example.com', 'starts_and_stops': [ { @@ -91,9 +89,7 @@ def test_basics(): 'job_id': job.id, 'last_claimed': brozzler.EPOCH_UTC, 'last_disclaimed': brozzler.EPOCH_UTC, - 'scope': { - 'surt': 'https://(org,example,)/', - }, + 'scope': {'accepts': [{'ssurt': 'org,example,//https:/'}]}, 'seed': 'https://example.org/', 'starts_and_stops': [ { @@ -110,7 +106,7 @@ def test_basics(): 'brozzle_count': 0, 'claimed': False, 'hops_from_seed': 0, - 'hops_off_surt': 0, + 'hops_off': 0, 'id': brozzler.Page.compute_id(sites[0].id, 'http://example.com'), 'job_id': job.id, 'needs_robots_check': True, @@ -124,7 +120,7 @@ def test_basics(): 'brozzle_count': 0, 'claimed': False, 'hops_from_seed': 0, - 'hops_off_surt': 0, + 'hops_off': 0, 'id': brozzler.Page.compute_id(sites[1].id, 'https://example.org/'), 'job_id': job.id, 'needs_robots_check': True, @@ -443,8 +439,7 @@ def test_field_defaults(): brozzler.Site.table_ensure(rr) site = brozzler.Site(rr, {'seed': 'http://example.com/'}) assert site.id is None - assert site.scope - assert site.scope['surt'] == 'http://(com,example,)/' + 
assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/'}]} site.save() assert site.id assert site.scope @@ -638,11 +633,15 @@ def test_completed_page(): 'hops_from_seed': 0, 'redirect_url':'http://example.com/b/', }) page.save() - assert site.scope == {'surt': 'http://(com,example,)/a/'} + assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} frontier.completed_page(site, page) - assert site.scope == {'surt': 'http://(com,example,)/b/'} + assert site.scope == {'accepts': [ + {'ssurt': 'com,example,//http:/a/'}, + {'ssurt': 'com,example,//http:/b/'}]} site.refresh() - assert site.scope == {'surt': 'http://(com,example,)/b/'} + assert site.scope == {'accepts': [ + {'ssurt': 'com,example,//http:/a/'}, + {'ssurt': 'com,example,//http:/b/'}]} assert page.brozzle_count == 1 assert page.claimed == False page.refresh() @@ -661,11 +660,11 @@ def test_completed_page(): 'hops_from_seed': 0, 'redirect_url':'http://example.com/a/x/', }) page.save() - assert site.scope == {'surt': 'http://(com,example,)/a/'} + assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} frontier.completed_page(site, page) - assert site.scope == {'surt': 'http://(com,example,)/a/'} + assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} site.refresh() - assert site.scope == {'surt': 'http://(com,example,)/a/'} + assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} assert page.brozzle_count == 1 assert page.claimed == False page.refresh() @@ -683,11 +682,11 @@ def test_completed_page(): 'hops_from_seed': 1, 'redirect_url':'http://example.com/d/', }) page.save() - assert site.scope == {'surt': 'http://(com,example,)/a/'} + assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} frontier.completed_page(site, page) - assert site.scope == {'surt': 'http://(com,example,)/a/'} + assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} site.refresh() - assert site.scope == {'surt': 'http://(com,example,)/a/'} + assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} assert page.brozzle_count == 1 assert page.claimed == False page.refresh() @@ -727,7 +726,7 @@ def test_hashtag_seed(): site = brozzler.Site(rr, {'seed': 'http://example.org/'}) brozzler.new_site(frontier, site) - assert site.scope['surt'] == 'http://(org,example,)/' + assert site.scope == {'accepts': [{'ssurt': 'org,example,//http:/'}]} pages = list(frontier.site_pages(site.id)) assert len(pages) == 1 @@ -738,7 +737,7 @@ def test_hashtag_seed(): site = brozzler.Site(rr, {'seed': 'http://example.org/#hash'}) brozzler.new_site(frontier, site) - assert site.scope['surt'] == 'http://(org,example,)/' + assert site.scope == {'accepts': [{'ssurt': 'org,example,//http:/'}]} pages = list(frontier.site_pages(site.id)) assert len(pages) == 1 @@ -908,7 +907,7 @@ def test_choose_warcprox(): svcreg = doublethink.ServiceRegistry(rr) frontier = brozzler.RethinkDbFrontier(rr) - # avoid this of error: https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021 + # avoid this error: https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021 rr.table('sites').wait().run() rr.table('services').wait().run() rr.table('sites').index_wait().run() @@ -978,3 +977,136 @@ def test_choose_warcprox(): # clean up rr.table('sites').delete().run() rr.table('services').delete().run() + +def test_max_hops_off(): + rr = doublethink.Rethinker('localhost', db='ignoreme') + frontier = brozzler.RethinkDbFrontier(rr) + site = brozzler.Site(rr, { + 'seed': 
'http://example.com/', + 'scope': { + 'max_hops_off_surt': 1, + 'blocks': [{'ssurt': 'domain,bad,'}]}}) + brozzler.new_site(frontier, site) + site.refresh() # get it back from the db + + # renamed this param + assert not 'max_hops_off_surt' in site.scope + assert site.scope['max_hops_off'] == 1 + + seed_page = frontier.seed_page(site.id) + + assert site.accept_reject_or_neither('http://foo.org/', seed_page) is None + assert site.accept_reject_or_neither('https://example.com/toot', seed_page) is None + assert site.accept_reject_or_neither('http://example.com/toot', seed_page) is True + assert site.accept_reject_or_neither('https://some.bad.domain/something', seed_page) is False + + orig_is_permitted_by_robots = brozzler.is_permitted_by_robots + brozzler.is_permitted_by_robots = lambda *args: True + try: + # two of these are in scope because of max_hops_off + frontier.scope_and_schedule_outlinks(site, seed_page, [ + 'http://foo.org/', 'https://example.com/toot', + 'http://example.com/toot', 'https://some.bad.domain/something']) + finally: + brozzler.is_permitted_by_robots = orig_is_permitted_by_robots + + pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url) + + assert len(pages) == 4 + assert pages[0].url == 'http://example.com/' + assert pages[0].hops_off == 0 + assert not 'hops_off_surt' in pages[0] + assert set(pages[0].outlinks['accepted']) == { + 'https://example.com/toot', 'http://foo.org/', + 'http://example.com/toot'} + assert pages[0].outlinks['blocked'] == [] + assert pages[0].outlinks['rejected'] == [ + 'https://some.bad.domain/something'] + assert { + 'brozzle_count': 0, + 'claimed': False, + 'hashtags': [], + 'hops_from_seed': 1, + 'hops_off': 0, + 'id': brozzler.Page.compute_id(site.id, 'http://example.com/toot'), + 'job_id': None, + 'needs_robots_check': False, + 'priority': 12, + 'site_id': site.id, + 'url': 'http://example.com/toot', + 'via_page_id': seed_page.id + } in pages + assert { + 'brozzle_count': 0, + 'claimed': False, + 'hashtags': [], + 'hops_from_seed': 1, + 'hops_off': 1, + 'id': brozzler.Page.compute_id(site.id, 'http://foo.org/'), + 'job_id': None, + 'needs_robots_check': False, + 'priority': 12, + 'site_id': site.id, + 'url': 'http://foo.org/', + 'via_page_id': seed_page.id + } in pages + assert { + 'brozzle_count': 0, + 'claimed': False, + 'hashtags': [], + 'hops_from_seed': 1, + 'hops_off': 1, + 'id': brozzler.Page.compute_id(site.id, 'https://example.com/toot'), + 'job_id': None, + 'needs_robots_check': False, + 'priority': 12, + 'site_id': site.id, + 'url': 'https://example.com/toot', + 'via_page_id': seed_page.id + } in pages + + # next hop is past max_hops_off, but normal in scope url is in scope + foo_page = [pg for pg in pages if pg.url == 'http://foo.org/'][0] + orig_is_permitted_by_robots = brozzler.is_permitted_by_robots + brozzler.is_permitted_by_robots = lambda *args: True + try: + frontier.scope_and_schedule_outlinks(site, foo_page, [ + 'http://foo.org/bar', 'http://example.com/blah']) + finally: + brozzler.is_permitted_by_robots = orig_is_permitted_by_robots + assert foo_page == { + 'brozzle_count': 0, + 'claimed': False, + 'hashtags': [], + 'hops_from_seed': 1, + 'hops_off': 1, + 'id': brozzler.Page.compute_id(site.id, 'http://foo.org/'), + 'job_id': None, + 'needs_robots_check': False, + 'priority': 12, + 'site_id': site.id, + 'url': 'http://foo.org/', + 'via_page_id': seed_page.id, + 'outlinks': { + 'accepted': ['http://example.com/blah'], + 'blocked': [], + 'rejected': ['http://foo.org/bar'], + } + } + pages = 
sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url) + assert len(pages) == 5 + assert { + 'brozzle_count': 0, + 'claimed': False, + 'hashtags': [], + 'hops_from_seed': 2, + 'hops_off': 0, + 'id': brozzler.Page.compute_id(site.id, 'http://example.com/blah'), + 'job_id': None, + 'needs_robots_check': False, + 'priority': 11, + 'site_id': site.id, + 'url': 'http://example.com/blah', + 'via_page_id': foo_page.id + } in pages + diff --git a/tests/test_units.py b/tests/test_units.py index ce5067c..eed034e 100644 --- a/tests/test_units.py +++ b/tests/test_units.py @@ -94,28 +94,28 @@ blocks: 'url': 'http://example.com/foo/bar?baz=quux#monkey', 'site_id': site.id}) - assert site.is_in_scope('http://example.com/foo/bar', page) - assert not site.is_in_scope('http://example.com/foo/baz', page) + assert site.accept_reject_or_neither('http://example.com/foo/bar', page) is True + assert site.accept_reject_or_neither('http://example.com/foo/baz', page) is None - assert not site.is_in_scope('http://foo.com/some.mp3', page) - assert site.is_in_scope('http://foo.com/blah/audio_file/some.mp3', page) + assert site.accept_reject_or_neither('http://foo.com/some.mp3', page) is None + assert site.accept_reject_or_neither('http://foo.com/blah/audio_file/some.mp3', page) is True - assert site.is_in_scope('http://a.b.vimeocdn.com/blahblah', page) - assert not site.is_in_scope('https://a.b.vimeocdn.com/blahblah', page) + assert site.accept_reject_or_neither('http://a.b.vimeocdn.com/blahblah', page) is True + assert site.accept_reject_or_neither('https://a.b.vimeocdn.com/blahblah', page) is None - assert site.is_in_scope('https://twitter.com/twit', page) - assert site.is_in_scope('https://twitter.com/twit?lang=en', page) - assert not site.is_in_scope('https://twitter.com/twit?lang=es', page) + assert site.accept_reject_or_neither('https://twitter.com/twit', page) is True + assert site.accept_reject_or_neither('https://twitter.com/twit?lang=en', page) is True + assert site.accept_reject_or_neither('https://twitter.com/twit?lang=es', page) is False - assert site.is_in_scope('https://www.facebook.com/whatevz', page) + assert site.accept_reject_or_neither('https://www.facebook.com/whatevz', page) is True - assert not site.is_in_scope( - 'https://www.youtube.com/watch?v=dUIn5OAPS5s', page) + assert site.accept_reject_or_neither( + 'https://www.youtube.com/watch?v=dUIn5OAPS5s', page) is None yt_user_page = brozzler.Page(None, { 'url': 'https://www.youtube.com/user/SonoraSantaneraVEVO', 'site_id': site.id, 'hops_from_seed': 10}) - assert site.is_in_scope( - 'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page) + assert site.accept_reject_or_neither( + 'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page) is True def test_proxy_down(): '''