Merge branch 'master' into behavior-refactor

2025-04-21 16:16:28 -04:00 · 2017-07-21 16:24:36 -07:00 · 2017-07-21 16:24:36 -07:00 · 5c6184201f
commit 5c6184201f
parent 8524992840 a563e9eb0c
13 changed files with 467 additions and 58 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -5,6 +5,7 @@ python:
 - 3.6
 sudo: required
 dist: trusty
+group: deprecated-2017Q2  # https://blog.travis-ci.com/2017-06-21-trusty-updates-2017-Q2-launch
 before_install:
 - sudo pip install ansible==2.1.3.0
 install:
--- a/brozzler/behaviors.yaml
+++ b/brozzler/behaviors.yaml
@ -129,6 +129,13 @@
     click_css_selector: button#ird3-button-next
     click_until_hard_timeout: True
  request_idle_timeout_sec: 10
+- # https://webarchive.jira.com/browse/ARI-5389
+  url_regex: '^https?://pitchfork\.com/.*$'
+  behavior_js_template: pitchfork.js
+- # https://webarchive.jira.com/browse/ARI-5379
+  url_regex: '^https?://pm\.gc\.ca/eng/news.*$'
+  behavior_js_template: pm-ca.js
+  request_idle_timeout_sec: 10
 - # https://webarchive.jira.com/browse/ARI-4960
  url_regex: '^https?://(?:www\.)?fortstjames.ca/community-events-calendar/$'
  behavior_js_template: simpleclicks.js.j2
--- a/brozzler/browser.py
+++ b/brozzler/browser.py
@ -379,7 +379,8 @@ class Browser:
            self, page_url, ignore_cert_errors=False, extra_headers=None,
            user_agent=None, behavior_parameters=None,
            on_request=None, on_response=None, on_screenshot=None,
-            username=None, password=None, hashtags=None):
+            username=None, password=None, hashtags=None,
+            skip_extract_outlinks=False, skip_visit_hashtags=False):
        '''
        Browses page in browser.

@ -447,8 +448,12 @@ class Browser:
                behavior_script = brozzler.behavior_script(
                        page_url, behavior_parameters)
                self.run_behavior(behavior_script, timeout=900)
-                outlinks = self.extract_outlinks()
-                self.visit_hashtags(page_url, hashtags, outlinks)
+                if skip_extract_outlinks:
+                    outlinks = []
+                else:
+                    outlinks = self.extract_outlinks()
+                if not skip_visit_hashtags:
+                    self.visit_hashtags(page_url, hashtags, outlinks)
                final_page_url = self.url()
                return final_page_url, outlinks
        except brozzler.ReachedLimit:
--- a/brozzler/cli.py
+++ b/brozzler/cli.py
@ -154,6 +154,12 @@ def brozzle_page(argv=None):
            help='use this password to try to log in if a login form is found')
    arg_parser.add_argument(
            '--proxy', dest='proxy', default=None, help='http proxy')
+    arg_parser.add_argument(
+            '--skip-extract-outlinks', dest='skip_extract_outlinks',
+            action='store_true', help=argparse.SUPPRESS)
+    arg_parser.add_argument(
+            '--skip-visit-hashtags', dest='skip_visit_hashtags',
+            action='store_true', help=argparse.SUPPRESS)
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
@ -166,7 +172,9 @@ def brozzle_page(argv=None):
        'id': -1, 'seed': args.url, 'behavior_parameters': behavior_parameters,
        'username': args.username, 'password': args.password})
    page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
-    worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy)
+    worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy,
+        skip_extract_outlinks=args.skip_extract_outlinks,
+        skip_visit_hashtags=args.skip_visit_hashtags)

    def on_screenshot(screenshot_png):
        OK_CHARS = (string.ascii_letters + string.digits)
@ -299,6 +307,12 @@ def brozzler_worker(argv=None):
            help=(
                'when needed, choose an available instance of warcprox from '
                'the rethinkdb service registry'))
+    arg_parser.add_argument(
+            '--skip-extract-outlinks', dest='skip_extract_outlinks',
+            action='store_true', help=argparse.SUPPRESS)
+    arg_parser.add_argument(
+            '--skip-visit-hashtags', dest='skip_visit_hashtags',
+            action='store_true', help=argparse.SUPPRESS)
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
@ -331,7 +345,9 @@ def brozzler_worker(argv=None):
    worker = brozzler.worker.BrozzlerWorker(
            frontier, service_registry, max_browsers=int(args.max_browsers),
            chrome_exe=args.chrome_exe, proxy=args.proxy,
-            warcprox_auto=args.warcprox_auto)
+            warcprox_auto=args.warcprox_auto,
+            skip_extract_outlinks=args.skip_extract_outlinks,
+            skip_visit_hashtags=args.skip_visit_hashtags)

    signal.signal(signal.SIGQUIT, dump_state)
    signal.signal(signal.SIGTERM, lambda s,f: worker.stop())
@ -471,7 +487,9 @@ def brozzler_list_sites(argv=None):
    elif args.jobless:
        reql = reql.filter(~r.row.has_fields('job_id'))
    elif args.active:
-        reql = reql.filter({'status': 'ACTIVE'})
+        reql = reql.between(
+                ['ACTIVE', r.minval], ['ACTIVE', r.maxval],
+                index='sites_last_disclaimed')
    logging.debug('querying rethinkdb: %s', reql)
    results = reql.run()
    if args.yaml:
--- a/brozzler/frontier.py
+++ b/brozzler/frontier.py
@ -135,10 +135,11 @@ class RethinkDbFrontier:

    def _enforce_time_limit(self, site):
        if (site.time_limit and site.time_limit > 0
-                and site.elapsed() > site.time_limit):
+                and (site.active_brozzling_time or 0) > site.time_limit):
            self.logger.debug(
-                    "site FINISHED_TIME_LIMIT! time_limit=%s elapsed=%s %s",
-                    site.time_limit, site.elapsed(), site)
+                    "site FINISHED_TIME_LIMIT! time_limit=%s "
+                    "active_brozzling_time=%s %s", site.time_limit,
+                    site.active_brozzling_time, site)
            self.finished(site, "FINISHED_TIME_LIMIT")
            return True
        else:
--- a/brozzler/js-templates/facebook.js
+++ b/brozzler/js-templates/facebook.js
@ -35,7 +35,7 @@ var umbraAboveBelowOrOnScreen = function(e) {
 }

 // comments - 'a.UFIPagerLink > span, a.UFIPagerLink, span.UFIReplySocialSentenceLinkText'
-var UMBRA_THINGS_TO_CLICK_SELECTOR = 'a[href^="/browse/likes"], *[rel="theater"]';
+var UMBRA_THINGS_TO_CLICK_SELECTOR = 'a.uiMorePagerPrimary, a[href^="/browse/likes"], *[rel="theater"]';
 //div[class="phm pluginLikeboxStream"] = facebook widget embedded in 3rd party pages
 var UMBRA_THINGS_TO_SCROLL_SELECTOR = 'div[class="phm pluginLikeboxStream"]';
 var NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING = 5;
--- a/brozzler/js-templates/pitchfork.js
+++ b/brozzler/js-templates/pitchfork.js
@ -0,0 +1,171 @@
+/*
+ * brozzler/js-templates/pitchfork.js - behavior for http://pitchfork.com/festival/chicago/
+ *
+ * Copyright (C) 2014-2017 Internet Archive
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+var umbraBehavior = {
+	IDLE_TIMEOUT_SEC : 10,
+	idleSince : null,
+    itemsText : "",
+
+	// https://github.com/jquery/jquery/blob/master/src/css/hiddenVisibleSelectors.js
+	// n.b. returns true for elements with visibility:hidden, which occupy
+	// screen real estate but are not visible, or clickable with the ui
+	isVisible : function(elem) {
+		return !!(elem.offsetWidth || elem.offsetHeight || elem.getClientRects().length);
+	},
+
+	intervalFunc : function() {
+		var clickedSomething = false;
+		var somethingLeftBelow = false;
+		var somethingLeftAbove = false;
+		var cssSelector = "button.performer.full-lineup";
+        var closeSelector = ".pmf-artist-modal__close-btn";
+		var clickUntilTimeout = 10;
+
+		var iframes = document.querySelectorAll("iframe");
+		var documents = Array(iframes.length + 1);
+		documents[0] = document;
+
+		for (var i = 0; i < iframes.length; i++) {
+			documents[i+1] = iframes[i].contentWindow.document;
+		}
+
+		for (var j = 0; j < documents.length; j++) {
+            var closeTargets = documents[j].querySelectorAll(closeSelector);
+			for (var i = 0; i < closeTargets.length; i++) {
+				if (!this.isVisible(closeTargets[i])) {
+					continue;
+				}
+
+				var where = this.aboveBelowOrOnScreen(closeTargets[i]);
+
+				if (where == 0) {
+					// console.log("clicking on " + closeTargets[i].outerHTML);
+					// do mouse over event on click target
+					// since some urls are requested only on
+					// this event - see
+					// https://webarchive.jira.com/browse/AITFIVE-451
+					var mouseOverEvent = document.createEvent('Events');
+					mouseOverEvent.initEvent("mouseover",true, false);
+					closeTargets[i].dispatchEvent(mouseOverEvent);
+					closeTargets[i].click();
+					clickedSomething = true;
+					this.idleSince = null;
+
+					break; //break from closeTargets loop, but not from iframe loop
+				} else if (where > 0) {
+					somethingLeftBelow = true;
+				} else if (where < 0) {
+					somethingLeftAbove = true;
+				}
+			}
+
+			var clickTargets = documents[j].querySelectorAll(cssSelector);
+			for (var i = 0; i < clickTargets.length; i++) {
+				if (!this.isVisible(clickTargets[i])) {
+					continue;
+				}
+				if (this.itemsText.indexOf(clickTargets[i].innerText) > -1) {
+					continue;
+				}
+
+				var where = this.aboveBelowOrOnScreen(clickTargets[i]);
+
+				if (where == 0) {
+					// console.log("clicking on " + clickTargets[i].outerHTML);
+					// do mouse over event on click target
+					// since some urls are requested only on
+					// this event - see
+					// https://webarchive.jira.com/browse/AITFIVE-451
+					var mouseOverEvent = document.createEvent('Events');
+					mouseOverEvent.initEvent("mouseover",true, false);
+					clickTargets[i].dispatchEvent(mouseOverEvent);
+					clickTargets[i].click();
+					clickedSomething = true;
+					this.idleSince = null;
+					this.itemsText += clickTargets[i].innerText;
+
+					break; //break from clickTargets loop, but not from iframe loop
+				} else if (where > 0) {
+					somethingLeftBelow = true;
+				} else if (where < 0) {
+					somethingLeftAbove = true;
+				}
+			}
+		}
+
+		if (!clickedSomething) {
+			if (somethingLeftAbove) {
+				// console.log("scrolling UP because everything on this screen has been clicked but we missed something above");
+				window.scrollBy(0, -500);
+				this.idleSince = null;
+			} else if (somethingLeftBelow) {
+				// console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight="
+				//				+ document.body.clientHeight);
+				window.scrollBy(0, 200);
+				this.idleSince = null;
+			} else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
+				// console.log("scrolling because we're not to the bottom yet document.body.clientHeight="
+				//				+ document.body.clientHeight);
+				window.scrollBy(0, 200);
+				this.idleSince = null;
+			} else if (this.idleSince == null) {
+				this.idleSince = Date.now();
+			}
+		}
+
+		if (!this.idleSince) {
+			this.idleSince = Date.now();
+		}
+	},
+
+	start : function() {
+		var that = this;
+		this.intervalId = setInterval(function() {
+			that.intervalFunc()
+		}, 500);
+	},
+
+	isFinished : function() {
+		if (this.idleSince != null) {
+			var idleTimeMs = Date.now() - this.idleSince;
+			if (idleTimeMs / 1000 > this.IDLE_TIMEOUT_SEC) {
+				clearInterval(this.intervalId);
+				return true;
+			}
+		}
+		return false;
+	},
+
+	aboveBelowOrOnScreen : function(e) {
+		var eTop = e.getBoundingClientRect().top;
+		if (eTop < window.scrollY) {
+			return -1; // above
+		} else if (eTop > window.scrollY + window.innerHeight) {
+			return 1; // below
+		} else {
+			return 0; // on screen
+		}
+	},
+};
+
+// Called from outside of this script.
+var umbraBehaviorFinished = function() {
+	return umbraBehavior.isFinished()
+};
+
+umbraBehavior.start();
--- a/brozzler/js-templates/pm-ca.js
+++ b/brozzler/js-templates/pm-ca.js
@ -0,0 +1,141 @@
+/*
+ * brozzler/js-templates/pm-ca.js - behavior for http://pm.gc.ca/eng/news
+ *
+ * Copyright (C) 2014-2017 Internet Archive
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+var umbraBehavior = {
+	IDLE_TIMEOUT_SEC : 10,
+	idleSince : null,
+    itemsText : "",
+
+	// https://github.com/jquery/jquery/blob/master/src/css/hiddenVisibleSelectors.js
+	// n.b. returns true for elements with visibility:hidden, which occupy
+	// screen real estate but are not visible, or clickable with the ui
+	isVisible : function(elem) {
+		return !!(elem.offsetWidth || elem.offsetHeight || elem.getClientRects().length);
+	},
+
+	intervalFunc : function() {
+		var clickedSomething = false;
+		var somethingLeftBelow = false;
+		var somethingLeftAbove = false;
+		var cssSelector = "div.teaser";
+		var clickUntilTimeout = 10;
+
+		var iframes = document.querySelectorAll("iframe");
+		var documents = Array(iframes.length + 1);
+		documents[0] = document;
+
+		for (var i = 0; i < iframes.length; i++) {
+			documents[i+1] = iframes[i].contentWindow.document;
+		}
+
+		for (var j = 0; j < documents.length; j++) {
+			var clickTargets = documents[j].querySelectorAll(cssSelector);
+			for (var i = 0; i < clickTargets.length; i++) {
+				if (!this.isVisible(clickTargets[i])) {
+					continue;
+				}
+				if (this.itemsText.indexOf(clickTargets[i].innerText) > -1) {
+					continue;
+				}
+
+				var where = this.aboveBelowOrOnScreen(clickTargets[i]);
+
+				if (where == 0) {
+					// console.log("clicking on " + clickTargets[i].outerHTML);
+					// do mouse over event on click target
+					// since some urls are requested only on
+					// this event - see
+					// https://webarchive.jira.com/browse/AITFIVE-451
+					var mouseOverEvent = document.createEvent('Events');
+					mouseOverEvent.initEvent("mouseover",true, false);
+					clickTargets[i].dispatchEvent(mouseOverEvent);
+					clickTargets[i].click();
+					clickedSomething = true;
+					this.idleSince = null;
+					this.itemsText += clickTargets[i].innerText;
+
+					break; //break from clickTargets loop, but not from iframe loop
+				} else if (where > 0) {
+					somethingLeftBelow = true;
+				} else if (where < 0) {
+					somethingLeftAbove = true;
+				}
+			}
+		}
+
+		if (!clickedSomething) {
+			if (somethingLeftAbove) {
+				// console.log("scrolling UP because everything on this screen has been clicked but we missed something above");
+				window.scrollBy(0, -500);
+				this.idleSince = null;
+			} else if (somethingLeftBelow) {
+				// console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight="
+				//				+ document.body.clientHeight);
+				window.scrollBy(0, 200);
+				this.idleSince = null;
+			} else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
+				// console.log("scrolling because we're not to the bottom yet document.body.clientHeight="
+				//				+ document.body.clientHeight);
+				window.scrollBy(0, 200);
+				this.idleSince = null;
+			} else if (this.idleSince == null) {
+				this.idleSince = Date.now();
+			}
+		}
+
+		if (!this.idleSince) {
+			this.idleSince = Date.now();
+		}
+	},
+
+	start : function() {
+		var that = this;
+		this.intervalId = setInterval(function() {
+			that.intervalFunc()
+		}, 500);
+	},
+
+	isFinished : function() {
+		if (this.idleSince != null) {
+			var idleTimeMs = Date.now() - this.idleSince;
+			if (idleTimeMs / 1000 > this.IDLE_TIMEOUT_SEC) {
+				clearInterval(this.intervalId);
+				return true;
+			}
+		}
+		return false;
+	},
+
+	aboveBelowOrOnScreen : function(e) {
+		var eTop = e.getBoundingClientRect().top;
+		if (eTop < window.scrollY) {
+			return -1; // above
+		} else if (eTop > window.scrollY + window.innerHeight) {
+			return 1; // below
+		} else {
+			return 0; // on screen
+		}
+	},
+};
+
+// Called from outside of this script.
+var umbraBehaviorFinished = function() {
+	return umbraBehavior.isFinished()
+};
+
+umbraBehavior.start();
--- a/brozzler/model.py
+++ b/brozzler/model.py
@ -119,7 +119,15 @@ def new_site(frontier, site):

 class ElapsedMixIn(object):
    def elapsed(self):
-        '''Returns elapsed crawl time as a float in seconds.'''
+        '''
+        Returns elapsed crawl time as a float in seconds.
+
+        This metric includes all the time that a site was in active rotation,
+        including any time it spent waiting for its turn to be brozzled.
+
+        In contrast `Site.active_brozzling_time` only counts time when a
+        brozzler worker claimed the site and was actively brozzling it.
+        '''
        dt = 0
        for ss in self.starts_and_stops[:-1]:
            dt += (ss['stop'] - ss['start']).total_seconds()
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@ -36,6 +36,7 @@ import tempfile
 import urlcanon
 from requests.structures import CaseInsensitiveDict
 import rethinkdb as r
+import datetime

 class ExtraHeaderAdder(urllib.request.BaseHandler):
    def __init__(self, extra_headers):
@ -102,7 +103,8 @@ class BrozzlerWorker:

    def __init__(
            self, frontier, service_registry=None, max_browsers=1,
-            chrome_exe="chromium-browser", warcprox_auto=False, proxy=None):
+            chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
+            skip_extract_outlinks=False, skip_visit_hashtags=False):
        self._frontier = frontier
        self._service_registry = service_registry
        self._max_browsers = max_browsers
@ -111,6 +113,8 @@ class BrozzlerWorker:
        self._proxy = proxy
        assert not (warcprox_auto and proxy)
        self._proxy_is_warcprox = None
+        self._skip_extract_outlinks = skip_extract_outlinks
+        self._skip_visit_hashtags = skip_visit_hashtags

        self._browser_pool = brozzler.browser.BrowserPool(
                max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True)
@ -156,7 +160,23 @@ class BrozzlerWorker:
        else:
            return bool(site.proxy or self._warcprox_auto)

+
    def _youtube_dl(self, destdir, site):
+        def ydl_progress(*args, **kwargs):
+            # in case youtube-dl takes a long time, heartbeat site.last_claimed
+            # to prevent another brozzler-worker from claiming the site
+            try:
+                if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=7):
+                    self.logger.debug(
+                            'heartbeating site.last_claimed to prevent another '
+                            'brozzler-worker claiming this site id=%r', site.id)
+                    site.last_claimed = doublethink.utcnow()
+                    site.save()
+            except:
+                self.logger.debug(
+                        'problem heartbeating site.last_claimed site id=%r',
+                        site.id, exc_info=True)
+
        ydl_opts = {
            "outtmpl": "{}/ydl%(autonumber)s.out".format(destdir),
            "verbose": False,
@ -167,6 +187,11 @@ class BrozzlerWorker:
            "noprogress": True,
            "nopart": True,
            "no_color": True,
+            "progress_hooks": [ydl_progress],
+            # https://github.com/rg3/youtube-dl/blob/master/README.md#format-selection
+            # "best: Select the best quality format represented by a single
+            # file with video and audio."
+            "format": "best/bestvideo+bestaudio",
        }
        if self._proxy_for(site):
            ydl_opts["proxy"] = "http://{}".format(self._proxy_for(site))
@ -384,7 +409,9 @@ class BrozzlerWorker:
                username=site.get('username'), password=site.get('password'),
                user_agent=site.get('user_agent'),
                on_screenshot=_on_screenshot, on_response=_on_response,
-                hashtags=page.hashtags)
+                hashtags=page.hashtags,
+                skip_extract_outlinks=self._skip_extract_outlinks,
+                skip_visit_hashtags=self._skip_visit_hashtags)
        if final_page_url != page.url:
            page.note_redirect(final_page_url)
        return outlinks
@ -425,12 +452,12 @@ class BrozzlerWorker:

    def brozzle_site(self, browser, site):
        try:
+            start = time.time()
            page = None
            self._frontier.honor_stop_request(site)
            self.logger.info(
                    "brozzling site (proxy=%r) %r",
                    self._proxy_for(site), site)
-            start = time.time()
            while time.time() - start < 7 * 60:
                site.refresh()
                self._frontier.honor_stop_request(site)
@ -477,6 +504,8 @@ class BrozzlerWorker:
        except:
            self.logger.critical("unexpected exception", exc_info=True)
        finally:
+            if start:
+                site.active_brozzling_time = (site.active_brozzling_time or 0) + time.time() - start
            self._frontier.disclaim_site(site, page)

    def _brozzle_site_thread_target(self, browser, site):
--- a/job-conf.rst
+++ b/job-conf.rst
@ -12,7 +12,6 @@ an example

    id: myjob
    time_limit: 60 # seconds
-    proxy: 127.0.0.1:8000 # point at warcprox for archiving
    ignore_robots: false
    warcprox_meta:
      warc-prefix: job1
@ -82,8 +81,8 @@ Notice that:
 settings reference
 ==================

-id
--
+``id``
+------
 +-----------+--------+----------+--------------------------+
 | scope     | type   | required | default                  |
 +===========+========+==========+==========================+
@ -92,8 +91,8 @@ id
 An arbitrary identifier for this job. Must be unique across this deployment of
 brozzler.

-seeds
-----
+``seeds``
+---------
 +-----------+------------------------+----------+---------+
 | scope     | type                   | required | default |
 +===========+========================+==========+=========+
@ -103,8 +102,8 @@ List of seeds. Each item in the list is a dictionary (associative array) which
 defines the seed. It must specify ``url`` (see below) and can additionally
 specify any of the settings of scope *seed-level*.

-url
---
+``url``
+-------
 +------------+--------+----------+---------+
 | scope      | type   | required | default |
 +============+========+==========+=========+
@ -112,8 +111,11 @@ url
 +------------+--------+----------+---------+
 The seed url.

-time_limit
----------
+``metadata``
+------------
+
+``time_limit``
+--------------
 +-----------------------+--------+----------+---------+
 | scope                 | type   | required | default |
 +=======================+========+==========+=========+
@ -124,28 +126,18 @@ enforced at the seed level. If a time limit is specified at the top level, it
 is inherited by each seed as described above, and enforced individually on each
 seed.

-proxy
-----
-+-----------------------+--------+----------+---------+
-| scope                 | type   | required | default |
-+=======================+========+==========+=========+
-| seed-level, top-level | string | no       | *none*  |
-+-----------------------+--------+----------+---------+
-HTTP proxy, with the format ``host:port``. Typically configured to point to
-warcprox for archival crawling.
-
-ignore_robots
-------------
-+-----------------------+---------+----------+---------+
-| scope                 | type    | required | default |
-+=======================+=========+==========+=========+
-| seed-level, top-level | boolean | no       | false   |
-+-----------------------+---------+----------+---------+
+``ignore_robots``
+-----------------
+-----------------------+---------+----------+-----------+
+| scope                 | type    | required | default   |
+=======================+=========+==========+===========+
+| seed-level, top-level | boolean | no       | ``false`` |
+-----------------------+---------+----------+-----------+
 If set to ``true``, brozzler will happily crawl pages that would otherwise be
 blocked by robots.txt rules.

-user_agent
----------
+``user_agent``
+--------------
 +-----------------------+---------+----------+---------+
 | scope                 | type    | required | default |
 +=======================+=========+==========+=========+
@ -156,13 +148,13 @@ It's good ettiquette to include a project URL with a notice to webmasters that
 explains why you're crawling, how to block the crawler robots.txt and how to
 contact the operator if the crawl is causing problems.

-warcprox_meta
-------------
-+-----------------------+------------+----------+---------+
-| scope                 | type       | required | default |
-+=======================+============+==========+=========+
-| seed-level, top-level | dictionary | no       | false   |
-+-----------------------+------------+----------+---------+
+``warcprox_meta``
+-----------------
+-----------------------+------------+----------+-----------+
+| scope                 | type       | required | default   |
+=======================+============+==========+===========+
+| seed-level, top-level | dictionary | no       | ``false`` |
+-----------------------+------------+----------+-----------+
 Specifies the Warcprox-Meta header to send with every request, if ``proxy`` is
 configured. The value of the Warcprox-Meta header is a json blob. It is used to
 pass settings and information to warcprox. Warcprox does not forward the header
@ -183,11 +175,37 @@ becomes::

    Warcprox-Meta: {"warc-prefix":"job1-seed1","stats":{"buckets":["job1-stats","job1-seed1-stats"]}}

-scope
-----
-+-----------------------+------------+----------+---------+
-| scope                 | type       | required | default |
-+=======================+============+==========+=========+
-| seed-level, top-level | dictionary | no       | false   |
-+-----------------------+------------+----------+---------+
+``scope``
+---------
+-----------------------+------------+----------+-----------+
+| scope                 | type       | required | default   |
+=======================+============+==========+===========+
+| seed-level, top-level | dictionary | no       | ``false`` |
+-----------------------+------------+----------+-----------+
 Scope rules. *TODO*
+
+``surt``
+--------
+-------------+--------+----------+---------------------------+
+| scope       | type   | required | default                   |
+=============+========+==========+===========================+
+| scope-level | string | no       | *generated from seed url* |
+-------------+--------+----------+---------------------------+
+
+``accepts``
+-----------
+-------------+------+----------+---------+
+| scope       | type | required | default |
+=============+======+==========+=========+
+| scope-level | list | no       | *none*  |
+-------------+------+----------+---------+
+
+``blocks``
+-----------
+-------------+------+----------+---------+
+| scope       | type | required | default |
+=============+======+==========+=========+
+| scope-level | list | no       | *none*  |
+-------------+------+----------+---------+
+
+
--- a/setup.py
+++ b/setup.py
@ -32,7 +32,7 @@ def find_package_data(package):

 setuptools.setup(
        name='brozzler',
-        version='1.1b12.dev257',
+        version='1.1b12.dev265',
        description='Distributed web crawling with browsers',
        url='https://github.com/internetarchive/brozzler',
        author='Noah Levitt',
--- a/tests/test_frontier.py
+++ b/tests/test_frontier.py
@ -238,6 +238,9 @@ def test_resume_job():
    assert site.starts_and_stops[2]['stop'] > site.starts_and_stops[0]['start']

 def test_time_limit():
+    # XXX test not thoroughly adapted to change in time accounting, since
+    # starts_and_stops is no longer used to enforce time limits
+
    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
@ -277,9 +280,16 @@ def test_time_limit():
    site.claimed = True
    site.save()

-    time.sleep(0.1)
+    # time limit not reached yet
    frontier._enforce_time_limit(site)
+    assert site.status == 'ACTIVE'
+    assert len(site.starts_and_stops) == 2
+    assert site.starts_and_stops[1]['start']
+    assert site.starts_and_stops[1]['stop'] is None

+    site.active_brozzling_time = 0.2  # this is why the time limit will be hit
+
+    frontier._enforce_time_limit(site)
    assert site.status == 'FINISHED_TIME_LIMIT'
    assert not site.claimed
    assert len(site.starts_and_stops) == 2