Merge branch 'qa' of github.com:internetarchive/brozzler into qa

2025-04-21 08:06:27 -04:00 · 2017-02-05 21:49:40 -08:00 · 2017-02-05 21:49:40 -08:00 · 0ff6257bef
commit 0ff6257bef
parent 6bf8cfe893 aa22594928
16 changed files with 453 additions and 63 deletions
--- a/README.rst
+++ b/README.rst
@ -1,10 +1,10 @@
 .. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=master
    :target: https://travis-ci.org/internetarchive/brozzler
-    
+
 .. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b5/brozzler/webconsole/static/brozzler.svg
   :width: 7%

-|logo| brozzler 
+|logo| brozzler
 ===============
 "browser" \| "crawler" = "brozzler"

@ -97,9 +97,8 @@ Job Configuration
 -----------------

 Jobs are defined using yaml files. Options may be specified either at the
-top-level or on individual seeds. A job id and at least one seed url
-must be specified, everything else is optional. For details, see
-`<job-conf.rst>`_.
+top-level or on individual seeds. At least one seed url must be specified,
+everything else is optional. For details, see `<job-conf.rst>`_.

 ::

@ -139,6 +138,46 @@ To start the app, run

 See ``brozzler-dashboard --help`` for configuration options.

+Brozzler Wayback
+----------------
+
+Brozzler comes with a customized version of
+`pywb <https://github.com/ikreymer/pywb>`_ which supports using the rethinkdb
+"captures" table (populated by warcprox) as its index.
+
+To use, first install dependencies.
+
+::
+
+    pip install brozzler[easy]
+
+Write a configuration file pywb.yml.
+
+::
+
+    # 'archive_paths' should point to the output directory of warcprox
+    archive_paths: warcs/  # pywb will fail without a trailing slash
+    collections:
+      brozzler:
+        index_paths: !!python/object:brozzler.pywb.RethinkCDXSource
+          db: brozzler
+          table: captures
+          servers:
+          - localhost
+    enable_auto_colls: false
+    enable_cdx_api: true
+    framed_replay: true
+    port: 8880
+
+Run pywb like so:
+
+::
+
+    $ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback
+
+Then browse http://localhost:8880/brozzler/.
+
+
 Headless Chromium
 -----------------

@ -198,7 +237,7 @@ option:
    brozzler-worker --chrome-exe ~/bin/headless_chromium.sh

 To render Flash content, `download <https://get.adobe.com/flashplayer/otherversions/>`_
-and extract the Linux (.tar.gz) PPAPI plugin.  Configure Headless Chromium 
+and extract the Linux (.tar.gz) PPAPI plugin.  Configure Headless Chromium
 to load the plugin by adding this option to your wrapper script:

 ::
@ -208,7 +247,7 @@ to load the plugin by adding this option to your wrapper script:
 License
 -------

-Copyright 2015-2016 Internet Archive
+Copyright 2015-2017 Internet Archive

 Licensed under the Apache License, Version 2.0 (the "License"); you may
 not use this software except in compliance with the License. You may
--- a/brozzler/init.py
+++ b/brozzler/init.py
@ -64,19 +64,33 @@ class BaseDictable:
    def __repr__(self):
        return "{}(**{})".format(self.__class__.__name__, self.to_dict())

-def fixup(url):
+def fixup(url, hash_strip=False):
    '''
    Does rudimentary canonicalization, such as converting IDN to punycode.
    '''
    import surt
    hurl = surt.handyurl.parse(url)
+    if hash_strip:
+        hurl.hash = None
    # handyurl.parse() already lowercases the scheme via urlsplit
    if hurl.host:
        hurl.host = hurl.host.encode('idna').decode('ascii').lower()
    return hurl.getURLString()

-# logging level more fine-grained than logging.DEBUG==10
+# monkey-patch log level TRACE
 TRACE = 5
+import logging as _logging
+def _logging_trace(msg, *args, **kwargs):
+    if len(_logging.root.handlers) == 0:
+        basicConfig()
+    _logging.root.trace(msg, *args, **kwargs)
+def _logger_trace(self, msg, *args, **kwargs):
+    if self.isEnabledFor(TRACE):
+        self._log(TRACE, msg, args, **kwargs)
+_logging.trace = _logging_trace
+_logging.Logger.trace = _logger_trace
+_logging._levelToName[TRACE] = 'TRACE'
+_logging._nameToLevel['TRACE'] = TRACE

 _behaviors = None
 def behaviors():
--- a/brozzler/browser.py
+++ b/brozzler/browser.py
@ -200,8 +200,7 @@ class WebsockReceiverThread(threading.Thread):
            else:
                self.logger.info(
                        'reached limit but self.reached_limit is already set, '
-                        'assuming the calling thread is already handling this',
-                        self.reached_limit)
+                        'assuming the calling thread is already handling this')
        if self.on_response:
            self.on_response(message)

--- a/brozzler/chrome.py
+++ b/brozzler/chrome.py
@ -231,11 +231,11 @@ class Chrome:
                            b'CERT_PKIXVerifyCert for [^ ]* failed|'
                            b'^ALSA lib|ERROR:gl_surface_glx.cc|'
                            b'ERROR:gpu_child_thread.cc', buf):
-                        logging.log(
-                                brozzler.TRACE, 'chrome pid %s STDOUT %s',
+                        self.logger.trace(
+                                'chrome pid %s STDOUT %s',
                                self.chrome_process.pid, buf)
                    else:
-                        logging.debug(
+                        self.logger.debug(
                                'chrome pid %s STDOUT %s',
                                self.chrome_process.pid, buf)

@ -246,15 +246,15 @@ class Chrome:
                            b'CERT_PKIXVerifyCert for [^ ]* failed|'
                            b'^ALSA lib|ERROR:gl_surface_glx.cc|'
                            b'ERROR:gpu_child_thread.cc', buf):
-                        logging.log(
-                                brozzler.TRACE, 'chrome pid %s STDOUT %s',
+                        self.logger.trace(
+                                'chrome pid %s STDOUT %s',
                                self.chrome_process.pid, buf)
                    else:
-                        logging.debug(
+                        self.logger.debug(
                                'chrome pid %s STDERR %s',
                                self.chrome_process.pid, buf)
        except:
-            logging.error('unexpected exception', exc_info=True)
+            self.logger.error('unexpected exception', exc_info=True)

    def stop(self):
        if not self.chrome_process or self._shutdown.is_set():
--- a/brozzler/cli.py
+++ b/brozzler/cli.py
@ -91,8 +91,7 @@ def _add_proxy_options(arg_parser):

 def configure_logging(args):
    logging.basicConfig(
-            stream=sys.stderr, level=args.log_level,
-            format=(
+            stream=sys.stderr, level=args.log_level, format=(
                '%(asctime)s %(process)d %(levelname)s %(threadName)s '
                '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s'))
    logging.getLogger('requests.packages.urllib3').setLevel(logging.WARN)
@ -285,7 +284,7 @@ def brozzler_new_site():
                args.behavior_parameters) if args.behavior_parameters else None,
            username=args.username, password=args.password)

-    r = rethinker()
+    r = rethinker(args)
    frontier = brozzler.RethinkDbFrontier(r)
    brozzler.new_site(frontier, site)

--- a/brozzler/easy.py
+++ b/brozzler/easy.py
@ -149,6 +149,7 @@ class BrozzlerEasyController:
        brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
        brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init()
        brozzler.pywb.support_in_progress_warcs()
+        brozzler.pywb.monkey_patch_wburl()

        if args.warcs_dir.endswith('/'):
            warcs_dir = args.warcs_dir
--- a/brozzler/frontier.py
+++ b/brozzler/frontier.py
@ -1,7 +1,7 @@
 '''
 brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages

-Copyright (C) 2014-2016 Internet Archive
+Copyright (C) 2014-2017 Internet Archive

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@ -99,6 +99,7 @@ class RethinkDbFrontier:
        if not job.id:
            # only if "id" has not already been set
            job.id = result["generated_keys"][0]
+        return job

    def new_site(self, site):
        self.logger.info("inserting into 'sites' table %s", site)
@ -119,7 +120,7 @@ class RethinkDbFrontier:
        self._vet_result(result, replaced=[0,1], unchanged=[0,1])

    def update_page(self, page):
-        self.logger.debug("updating 'pages' table entry %s", page)
+        self.logger.trace("updating 'pages' table entry %s", page)
        result = self.r.table("pages").get(page.id).replace(page.to_dict()).run()
        self._vet_result(result, replaced=[0,1], unchanged=[0,1])

@ -176,9 +177,10 @@ class RethinkDbFrontier:

    def _enforce_time_limit(self, site):
        if (site.time_limit and site.time_limit > 0
-                and (rethinkstuff.utcnow() - site.start_time).total_seconds() > site.time_limit):
-            self.logger.debug("site FINISHED_TIME_LIMIT! time_limit=%s start_time=%s elapsed=%s %s",
-                    site.time_limit, site.start_time, rethinkstuff.utcnow() - site.start_time, site)
+                and site.elapsed() > site.time_limit):
+            self.logger.debug(
+                    "site FINISHED_TIME_LIMIT! time_limit=%s elapsed=%s %s",
+                    site.time_limit, site.elapsed(), site)
            self.finished(site, "FINISHED_TIME_LIMIT")
            return True
        else:
@ -276,14 +278,16 @@ class RethinkDbFrontier:
            n += 1

        self.logger.info("all %s sites finished, job %s is FINISHED!", n, job.id)
-        job.status = "FINISHED"
-        job.finished = rethinkstuff.utcnow()
+        job.finish()
        self.update_job(job)
        return True

    def finished(self, site, status):
        self.logger.info("%s %s", status, site)
        site.status = status
+        site.claimed = False
+        site.last_disclaimed = rethinkstuff.utcnow()
+        site.starts_and_stops[-1]["stop"] = rethinkstuff.utcnow()
        self.update_site(site)
        if site.job_id:
            self._maybe_finish_job(site.job_id)
@ -300,6 +304,30 @@ class RethinkDbFrontier:
            page.claimed = False
            self.update_page(page)

+    def resume_job(self, job):
+        job.status = "ACTIVE"
+        job.starts_and_stops.append(
+                {"start":rethinkstuff.utcnow(), "stop":None})
+        self.update_job(job)
+        for site in self.job_sites(job.id):
+            site.status = "ACTIVE"
+            site.starts_and_stops.append(
+                    {"start":rethinkstuff.utcnow(), "stop":None})
+            self.update_site(site)
+
+    def resume_site(self, site):
+        if site.job_id:
+            # can't call resume_job since that would resume jobs's other sites
+            job = self.job(site.job_id)
+            job.status = "ACTIVE"
+            job.starts_and_stops.append(
+                    {"start":rethinkstuff.utcnow(), "stop":None})
+            self.update_job(job)
+        site.status = "ACTIVE"
+        site.starts_and_stops.append(
+                {"start":rethinkstuff.utcnow(), "stop":None})
+        self.update_site(site)
+
    def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
        if site.remember_outlinks:
            parent_page.outlinks = {"accepted":[],"blocked":[],"rejected":[]}
--- a/brozzler/job.py
+++ b/brozzler/job.py
@ -2,7 +2,7 @@
 brozzler/job.py - Job class representing a brozzler crawl job, and functions
 for setting up a job with supplied configuration

-Copyright (C) 2014-2016 Internet Archive
+Copyright (C) 2014-2017 Internet Archive

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@ -61,17 +61,22 @@ def merge(a, b):
        return a

 def new_job_file(frontier, job_conf_file):
+    '''Returns new Job.'''
    logging.info("loading %s", job_conf_file)
    with open(job_conf_file) as f:
        job_conf = yaml.load(f)
-        new_job(frontier, job_conf)
+        return new_job(frontier, job_conf)

 def new_job(frontier, job_conf):
+    '''Returns new Job.'''
    validate_conf(job_conf)
    job = Job(
            id=job_conf.get("id"), conf=job_conf, status="ACTIVE",
            started=rethinkstuff.utcnow())

+    # insert the job now to make sure it has an id
+    job = frontier.new_job(job)
+
    sites = []
    for seed_conf in job_conf["seeds"]:
        merged_conf = merge(seed_conf, job_conf)
@ -92,11 +97,10 @@ def new_job(frontier, job_conf):
                password=merged_conf.get("password"))
        sites.append(site)

-    # insert all the sites into database before the job
    for site in sites:
        new_site(frontier, site)

-    frontier.new_job(job)
+    return job

 def new_site(frontier, site):
    site.id = str(uuid.uuid4())
@ -120,14 +124,29 @@ def new_site(frontier, site):
 class Job(brozzler.BaseDictable):
    logger = logging.getLogger(__module__ + "." + __qualname__)

-    def __init__(self, id=None, conf=None, status="ACTIVE", started=None,
-                 finished=None, stop_requested=None):
+    def __init__(
+            self, id=None, conf=None, status="ACTIVE", started=None,
+            finished=None, stop_requested=None, starts_and_stops=None):
        self.id = id
        self.conf = conf
        self.status = status
-        self.started = started
-        self.finished = finished
        self.stop_requested = stop_requested
+        self.starts_and_stops = starts_and_stops
+        if not self.starts_and_stops:
+            if started:   # backward compatibility
+                self.starts_and_stops = [{"start":started,"stop":finished}]
+            else:
+                self.starts_and_stops = [
+                        {"start":rethinkstuff.utcnow(),"stop":None}]
+
+    def finish(self):
+        if self.status == "FINISHED" or self.starts_and_stops[-1]["stop"]:
+            self.logger.error(
+                    "job is already finished status=%s "
+                    "starts_and_stops[-1]['stop']=%s", self.status,
+                    self.starts_and_stops[-1]["stop"])
+        self.status = "FINISHED"
+        self.starts_and_stops[-1]["stop"] = rethinkstuff.utcnow()

    def __str__(self):
        return 'Job(id=%s)' % self.id
--- a/brozzler/job_schema.yaml
+++ b/brozzler/job_schema.yaml
@ -2,7 +2,7 @@ id:
  type:
  - string
  - integer
-  required: true
+  required: false

 <<: &multi_level_options
  time_limit:
--- a/brozzler/pywb.py
+++ b/brozzler/pywb.py
@ -1,9 +1,9 @@
 '''
 brozzler/pywb.py - pywb customizations for brozzler including rethinkdb index,
-loading from warcs still being written to, and canonicalization rules matching
-brozzler conventions
+loading from warcs still being written to, canonicalization rules matching
+brozzler conventions, support for screenshot: and thumbnail: urls

-Copyright (C) 2016 Internet Archive
+Copyright (C) 2016-2017 Internet Archive

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@ -26,6 +26,8 @@ try:
    import pywb.cdx.cdxobject
    import pywb.cdx.cdxserver
    import pywb.webapp.query_handler
+    import pywb.framework.basehandlers
+    import pywb.rewrite.wburl
 except ImportError as e:
    logging.critical(
            '%s: %s\n\nYou might need to run "pip install '
@ -37,6 +39,7 @@ import rethinkdb
 import surt
 import json
 import brozzler
+import argparse

 class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
    def __init__(self, servers, db, table):
@ -65,7 +68,7 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
                'url': record['url'],
                'status': str(record['response_code']),
                'digest': record['sha1base32'],
-                'length': str(record['record_length']),
+                'length': str(record.get('record_length', '-')),
                'offset': str(record['offset']),
                'filename': record['filename'],
            }
@ -120,8 +123,7 @@ class TheGoodUrlCanonicalizer(object):
            # logging.debug('%s -> %s', url, key)
            return key
        except Exception as e:
-            raise pywb.utils.canonicalize.UrlCanonicalizeException(
-                    'Invalid Url: ' + url)
+            return url

    def replace_default_canonicalizer():
        '''Replace parent class of CustomUrlCanonicalizer with this class.'''
@ -193,11 +195,90 @@ def support_in_progress_warcs():
        return results
    pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call

+class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
+    def __init__(self, orig_url):
+        import re
+        import six
+
+        from six.moves.urllib.parse import urlsplit, urlunsplit
+        from six.moves.urllib.parse import quote_plus, quote, unquote_plus
+
+        from pywb.utils.loaders import to_native_str
+        from pywb.rewrite.wburl import WbUrl
+
+        pywb.rewrite.wburl.BaseWbUrl.__init__(self)
+
+        if six.PY2 and isinstance(orig_url, six.text_type):
+            orig_url = orig_url.encode('utf-8')
+            orig_url = quote(orig_url)
+
+        self._original_url = orig_url
+
+        if not self._init_query(orig_url):
+            if not self._init_replay(orig_url):
+                raise Exception('Invalid WbUrl: ', orig_url)
+
+        new_uri = WbUrl.to_uri(self.url)
+
+        self._do_percent_encode = True
+
+        self.url = new_uri
+
+        # begin brozzler changes
+        if (self.url.startswith('urn:') or self.url.startswith('screenshot:')
+                or self.url.startswith('thumbnail:')):
+            return
+        # end brozzler changes
+
+        # protocol agnostic url -> http://
+        # no protocol -> http://
+        #inx = self.url.find('://')
+        inx = -1
+        m = self.SCHEME_RX.match(self.url)
+        if m:
+            inx = m.span(1)[0]
+
+        #if inx < 0:
+            # check for other partially encoded variants
+        #    m = self.PARTIAL_ENC_RX.match(self.url)
+        #    if m:
+        #        len_ = len(m.group(0))
+        #        self.url = (urllib.unquote_plus(self.url[:len_]) +
+        #                    self.url[len_:])
+        #        inx = self.url.find(':/')
+
+        if inx < 0:
+            self.url = self.DEFAULT_SCHEME + self.url
+        else:
+            inx += 2
+            if inx < len(self.url) and self.url[inx] != '/':
+                self.url = self.url[:inx] + '/' + self.url[inx:]
+
+def _get_wburl_type(self):
+    return SomeWbUrl
+
+def monkey_patch_wburl():
+    pywb.framework.basehandlers.WbUrlHandler.get_wburl_type = _get_wburl_type
+
+class BrozzlerWaybackCli(pywb.apps.cli.WaybackCli):
+    def _extend_parser(self, arg_parser):
+        super()._extend_parser(arg_parser)
+        arg_parser._actions[4].help = argparse.SUPPRESS # --autoindex
+        arg_parser.formatter_class = argparse.RawDescriptionHelpFormatter
+        arg_parser.epilog = '''
+Run pywb like so:
+
+    $ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback
+
+See README.rst for more information.
+'''
+
 def main(argv=sys.argv):
    brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
    brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init()
    brozzler.pywb.support_in_progress_warcs()
-    wayback_cli = pywb.apps.cli.WaybackCli(
+    brozzler.pywb.monkey_patch_wburl()
+    wayback_cli = BrozzlerWaybackCli(
            args=argv[1:], default_port=8880,
            desc=('brozzler-wayback - pywb wayback (monkey-patched for use '
                  'with brozzler)'))
--- a/brozzler/site.py
+++ b/brozzler/site.py
@ -99,7 +99,7 @@ class Site(brozzler.BaseDictable):
            last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
            last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None,
            cookie_db=None, user_agent=None, behavior_parameters=None,
-            username=None, password=None):
+            username=None, password=None, starts_and_stops=None):

        self.seed = seed
        self.id = id
@ -113,7 +113,6 @@ class Site(brozzler.BaseDictable):
        self.status = status
        self.claimed = bool(claimed)
        self.last_claimed_by = last_claimed_by
-        self.start_time = start_time or rethinkstuff.utcnow()
        self.last_disclaimed = last_disclaimed
        self.last_claimed = last_claimed
        self.metadata = metadata
@ -123,11 +122,32 @@ class Site(brozzler.BaseDictable):
        self.behavior_parameters = behavior_parameters
        self.username = username
        self.password = password
+        self.starts_and_stops = starts_and_stops
+        if not self.starts_and_stops:
+            if start_time:   # backward compatibility
+                self.starts_and_stops = [{"start":start_time,"stop":None}]
+                if self.status != "ACTIVE":
+                    self.starts_and_stops[0]["stop"] = self.last_disclaimed
+            else:
+                self.starts_and_stops = [
+                        {"start":rethinkstuff.utcnow(),"stop":None}]

        self.scope = scope or {}
        if not "surt" in self.scope:
            self.scope["surt"] = Url(seed).surt

+    def elapsed(self):
+        '''Returns elapsed crawl time as a float in seconds.'''
+        dt = 0
+        for ss in self.starts_and_stops[:-1]:
+            dt += (ss['stop'] - ss['start']).total_seconds()
+        ss = self.starts_and_stops[-1]
+        if ss['stop']:
+            dt += (ss['stop'] - ss['start']).total_seconds()
+        else: # crawl is active
+            dt += (rethinkstuff.utcnow() - ss['start']).total_seconds()
+        return dt
+
    def __str__(self):
        return "Site-%s-%s" % (self.id, self.seed)

--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@ -237,17 +237,20 @@ class BrozzlerWorker:
            if on_screenshot:
                on_screenshot(screenshot_png)
            elif self._proxy(site) and self._enable_warcprox_features(site):
-                self.logger.info("sending WARCPROX_WRITE_RECORD request "
-                                 "to warcprox with screenshot for %s", page)
+                self.logger.info(
+                        "sending WARCPROX_WRITE_RECORD request to %s with "
+                        "screenshot for %s", self._proxy(site), page)
                screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
                        screenshot_png)
-                self._warcprox_write_record(warcprox_address=self._proxy(site),
-                        url="screenshot:%s" % brozzler.fixup(page.url),
+                self._warcprox_write_record(
+                        warcprox_address=self._proxy(site),
+                        url="screenshot:%s" % brozzler.fixup(page.url, True),
                        warc_type="resource", content_type="image/jpeg",
                        payload=screenshot_jpeg,
                        extra_headers=site.extra_headers())
-                self._warcprox_write_record(warcprox_address=self._proxy(site),
-                        url="thumbnail:%s" % brozzler.fixup(page.url),
+                self._warcprox_write_record(
+                        warcprox_address=self._proxy(site),
+                        url="thumbnail:%s" % brozzler.fixup(page.url, True),
                        warc_type="resource", content_type="image/jpeg",
                        payload=thumbnail_jpeg,
                        extra_headers=site.extra_headers())
@ -380,9 +383,8 @@ class BrozzlerWorker:

        try:
            self.status_info = self._service_registry.heartbeat(status_info)
-            self.logger.log(
-                    brozzler.TRACE, "status in service registry: %s",
-                    self.status_info)
+            self.logger.trace(
+                    "status in service registry: %s", self.status_info)
        except rethinkdb.ReqlError as e:
            self.logger.error(
                    "failed to send heartbeat and update service registry "
--- a/job-conf.rst
+++ b/job-conf.rst
@ -2,8 +2,8 @@ brozzler job configuration
 **************************

 Jobs are defined using yaml files. Options may be specified either at the
-top-level or on individual seeds. A job id and at least one seed url
-must be specified, everything else is optional.
+top-level or on individual seeds. At least one seed url must be specified,
+everything else is optional.

 an example
 ==========
@ -85,11 +85,11 @@ settings reference

 id
 --
-+-----------+--------+----------+---------+
-| scope     | type   | required | default |
-+===========+========+==========+=========+
-| top-level | string | yes?     | *n/a*   |
-+-----------+--------+----------+---------+
+-----------+--------+----------+--------------------------+
+| scope     | type   | required | default                  |
+===========+========+==========+==========================+
+| top-level | string | no       | *generated by rethinkdb* |
+-----------+--------+----------+--------------------------+
 An arbitrary identifier for this job. Must be unique across this deployment of
 brozzler.

--- a/setup.py
+++ b/setup.py
@ -32,7 +32,7 @@ def find_package_data(package):

 setuptools.setup(
        name='brozzler',
-        version='1.1b9.dev176',
+        version='1.1b9.dev185',
        description='Distributed web crawling with browsers',
        url='https://github.com/internetarchive/brozzler',
        author='Noah Levitt',
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@ -155,6 +155,20 @@ def test_brozzle_site(httpd):
        os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'), 'rb').read()
    assert requests.get(wb_url).content == expected_payload

+    url = 'screenshot:%s' % page1
+    t14 = captures_by_url[url]['timestamp'].strftime('%Y%m%d%H%M%S')
+    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, url)
+    response = requests.get(wb_url)
+    assert response.status_code == 200
+    assert response.headers['content-type'] == 'image/jpeg'
+
+    url = 'thumbnail:%s' % page1
+    t14 = captures_by_url[url]['timestamp'].strftime('%Y%m%d%H%M%S')
+    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, url)
+    response = requests.get(wb_url)
+    assert response.status_code == 200
+    assert response.headers['content-type'] == 'image/jpeg'
+
 def test_warcprox_selection(httpd):
    ''' When enable_warcprox_features is true, brozzler is expected to choose
    and instance of warcprox '''
--- a/tests/test_frontier.py
+++ b/tests/test_frontier.py
@ -0,0 +1,174 @@
+#!/usr/bin/env python
+'''
+test_frontier.py - fairly narrow tests of frontier management, requires
+rethinkdb running on localhost
+
+Copyright (C) 2017 Internet Archive
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+import brozzler
+import logging
+import argparse
+import rethinkstuff
+import time
+
+args = argparse.Namespace()
+args.log_level = logging.INFO
+brozzler.cli.configure_logging(args)
+
+def test_rethinkdb_up():
+    '''Checks that rethinkdb is listening and looks sane.'''
+    r = rethinkstuff.Rethinker(db='rethinkdb')  # built-in db
+    tbls = r.table_list().run()
+    assert len(tbls) > 10
+
+def test_resume_job():
+    '''
+    Tests that the right stuff gets twiddled in rethinkdb when we "start" and
+    "finish" crawling a job. Doesn't actually crawl anything.
+    '''
+    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
+    r = rethinkstuff.Rethinker(db='ignoreme')
+    frontier = brozzler.RethinkDbFrontier(r)
+    job_conf = {'seeds': [{'url': 'http://example.com/'}]}
+    job = brozzler.new_job(frontier, job_conf)
+    assert len(list(frontier.job_sites(job.id))) == 1
+    site = list(frontier.job_sites(job.id))[0]
+
+    assert job.status == 'ACTIVE'
+    assert len(job.starts_and_stops) == 1
+    assert job.starts_and_stops[0]['start']
+    assert job.starts_and_stops[0]['stop'] is None
+    assert site.status == 'ACTIVE'
+    assert len(site.starts_and_stops) == 1
+    assert site.starts_and_stops[0]['start']
+    assert site.starts_and_stops[0]['stop'] is None
+
+    frontier.finished(site, 'FINISHED')
+    job = frontier.job(job.id)
+
+    assert job.status == 'FINISHED'
+    assert len(job.starts_and_stops) == 1
+    assert job.starts_and_stops[0]['start']
+    assert job.starts_and_stops[0]['stop']
+    assert job.starts_and_stops[0]['stop'] > job.starts_and_stops[0]['start']
+    assert site.status == 'FINISHED'
+    assert len(site.starts_and_stops) == 1
+    assert site.starts_and_stops[0]['start']
+    assert site.starts_and_stops[0]['stop']
+    assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']
+
+    frontier.resume_site(site)
+    job = frontier.job(job.id)
+
+    assert job.status == 'ACTIVE'
+    assert len(job.starts_and_stops) == 2
+    assert job.starts_and_stops[1]['start']
+    assert job.starts_and_stops[1]['stop'] is None
+    assert site.status == 'ACTIVE'
+    assert len(site.starts_and_stops) == 2
+    assert site.starts_and_stops[1]['start']
+    assert site.starts_and_stops[1]['stop'] is None
+
+    frontier.finished(site, 'FINISHED')
+    job = frontier.job(job.id)
+
+    assert job.status == 'FINISHED'
+    assert len(job.starts_and_stops) == 2
+    assert job.starts_and_stops[1]['start']
+    assert job.starts_and_stops[1]['stop']
+    assert job.starts_and_stops[1]['stop'] > job.starts_and_stops[0]['start']
+    assert site.status == 'FINISHED'
+    assert len(site.starts_and_stops) == 2
+    assert site.starts_and_stops[1]['start']
+    assert site.starts_and_stops[1]['stop']
+    assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[0]['start']
+
+    # resuming a job == resuming all of its sites
+    frontier.resume_job(job)
+    site = list(frontier.job_sites(job.id))[0]
+
+    assert job.status == 'ACTIVE'
+    assert len(job.starts_and_stops) == 3
+    assert job.starts_and_stops[2]['start']
+    assert job.starts_and_stops[2]['stop'] is None
+    assert site.status == 'ACTIVE'
+    assert len(site.starts_and_stops) == 3
+    assert site.starts_and_stops[2]['start']
+    assert site.starts_and_stops[2]['stop'] is None
+
+    frontier.finished(site, 'FINISHED')
+    job = frontier.job(job.id)
+
+    assert job.status == 'FINISHED'
+    assert len(job.starts_and_stops) == 3
+    assert job.starts_and_stops[2]['start']
+    assert job.starts_and_stops[2]['stop']
+    assert job.starts_and_stops[2]['stop'] > job.starts_and_stops[0]['start']
+    assert site.status == 'FINISHED'
+    assert len(site.starts_and_stops) == 3
+    assert site.starts_and_stops[2]['start']
+    assert site.starts_and_stops[2]['stop']
+    assert site.starts_and_stops[2]['stop'] > site.starts_and_stops[0]['start']
+
+def test_time_limit():
+    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
+    r = rethinkstuff.Rethinker('localhost', db='ignoreme')
+    frontier = brozzler.RethinkDbFrontier(r)
+    site = brozzler.Site(seed='http://example.com/', time_limit=99999)
+    brozzler.new_site(frontier, site)
+
+    site = frontier.site(site.id)  # get it back from the db
+    assert site.status == 'ACTIVE'
+    assert len(site.starts_and_stops) == 1
+    assert site.starts_and_stops[0]['start']
+    assert site.starts_and_stops[0]['stop'] is None
+
+    frontier.finished(site, 'FINISHED')
+
+    assert site.status == 'FINISHED'
+    assert len(site.starts_and_stops) == 1
+    assert site.starts_and_stops[0]['start']
+    assert site.starts_and_stops[0]['stop']
+    assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']
+
+    frontier.resume_site(site)
+
+    assert site.status == 'ACTIVE'
+    assert len(site.starts_and_stops) == 2
+    assert site.starts_and_stops[1]['start']
+    assert site.starts_and_stops[1]['stop'] is None
+
+    # time limit not reached yet
+    frontier._enforce_time_limit(site)
+
+    assert site.status == 'ACTIVE'
+    assert len(site.starts_and_stops) == 2
+    assert site.starts_and_stops[1]['start']
+    assert site.starts_and_stops[1]['stop'] is None
+
+    site.time_limit = 0.1
+    site.claimed = True
+    frontier.update_site(site)
+
+    time.sleep(0.1)
+    frontier._enforce_time_limit(site)
+
+    assert site.status == 'FINISHED_TIME_LIMIT'
+    assert not site.claimed
+    assert len(site.starts_and_stops) == 2
+    assert site.starts_and_stops[1]['start']
+    assert site.starts_and_stops[1]['stop']
+    assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[0]['start']