Merge branch 'qa' of github.com:internetarchive/brozzler into qa

Barbara Miller 2017-02-05 21:49:40 -08:00
commit 0ff6257bef
16 changed files with 453 additions and 63 deletions

View File

@ -1,10 +1,10 @@
.. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=master
:target: https://travis-ci.org/internetarchive/brozzler
.. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b5/brozzler/webconsole/static/brozzler.svg
:width: 7%
|logo| brozzler
===============
"browser" \| "crawler" = "brozzler"
@ -97,9 +97,8 @@ Job Configuration
-----------------
Jobs are defined using yaml files. Options may be specified either at the
top-level or on individual seeds. A job id and at least one seed url
must be specified, everything else is optional. For details, see
`<job-conf.rst>`_.
top-level or on individual seeds. At least one seed url must be specified;
everything else is optional. For details, see `<job-conf.rst>`_.
::
@ -139,6 +138,46 @@ To start the app, run
See ``brozzler-dashboard --help`` for configuration options.
Brozzler Wayback
----------------
Brozzler comes with a customized version of
`pywb <https://github.com/ikreymer/pywb>`_ which supports using the rethinkdb
"captures" table (populated by warcprox) as its index.
To use, first install dependencies.
::
pip install brozzler[easy]
Write a configuration file pywb.yml.
::
# 'archive_paths' should point to the output directory of warcprox
archive_paths: warcs/ # pywb will fail without a trailing slash
collections:
brozzler:
index_paths: !!python/object:brozzler.pywb.RethinkCDXSource
db: brozzler
table: captures
servers:
- localhost
enable_auto_colls: false
enable_cdx_api: true
framed_replay: true
port: 8880
Run pywb like so:
::
$ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback
Then browse http://localhost:8880/brozzler/.
Headless Chromium
-----------------
@ -198,7 +237,7 @@ option:
brozzler-worker --chrome-exe ~/bin/headless_chromium.sh
To render Flash content, `download <https://get.adobe.com/flashplayer/otherversions/>`_
and extract the Linux (.tar.gz) PPAPI plugin. Configure Headless Chromium
to load the plugin by adding this option to your wrapper script:
::
@ -208,7 +247,7 @@ to load the plugin by adding this option to your wrapper script:
License
-------
Copyright 2015-2016 Internet Archive
Copyright 2015-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License"); you may
not use this software except in compliance with the License. You may

View File

@ -64,19 +64,33 @@ class BaseDictable:
def __repr__(self):
return "{}(**{})".format(self.__class__.__name__, self.to_dict())
def fixup(url):
def fixup(url, hash_strip=False):
'''
Does rudimentary canonicalization, such as converting IDN to punycode.
'''
import surt
hurl = surt.handyurl.parse(url)
if hash_strip:
hurl.hash = None
# handyurl.parse() already lowercases the scheme via urlsplit
if hurl.host:
hurl.host = hurl.host.encode('idna').decode('ascii').lower()
return hurl.getURLString()
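
For illustration, a sketch of the canonicalization (hedged; the exact output
depends on how surt's handyurl reassembles the parsed url)::

    import brozzler

    brozzler.fixup('http://EXAMPLE.com/page#frag')
    # expected roughly: 'http://example.com/page#frag'
    brozzler.fixup('http://EXAMPLE.com/page#frag', hash_strip=True)
    # expected roughly: 'http://example.com/page'
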
# logging level more fine-grained than logging.DEBUG==10
# monkey-patch log level TRACE
TRACE = 5
import logging as _logging
def _logging_trace(msg, *args, **kwargs):
if len(_logging.root.handlers) == 0:
_logging.basicConfig()
_logging.root.trace(msg, *args, **kwargs)
def _logger_trace(self, msg, *args, **kwargs):
if self.isEnabledFor(TRACE):
self._log(TRACE, msg, args, **kwargs)
_logging.trace = _logging_trace
_logging.Logger.trace = _logger_trace
_logging._levelToName[TRACE] = 'TRACE'
_logging._nameToLevel['TRACE'] = TRACE
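
A minimal sketch of exercising the patched level (assumes importing brozzler
applies the monkey-patch as above)::

    import logging
    import brozzler

    logging.basicConfig(level=brozzler.TRACE)
    logging.getLogger('example').trace('finer-grained than DEBUG')
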
_behaviors = None
def behaviors():

View File

@ -200,8 +200,7 @@ class WebsockReceiverThread(threading.Thread):
else:
self.logger.info(
'reached limit but self.reached_limit is already set, '
'assuming the calling thread is already handling this',
self.reached_limit)
'assuming the calling thread is already handling this')
if self.on_response:
self.on_response(message)

View File

@ -231,11 +231,11 @@ class Chrome:
b'CERT_PKIXVerifyCert for [^ ]* failed|'
b'^ALSA lib|ERROR:gl_surface_glx.cc|'
b'ERROR:gpu_child_thread.cc', buf):
logging.log(
brozzler.TRACE, 'chrome pid %s STDOUT %s',
self.logger.trace(
'chrome pid %s STDOUT %s',
self.chrome_process.pid, buf)
else:
logging.debug(
self.logger.debug(
'chrome pid %s STDOUT %s',
self.chrome_process.pid, buf)
@ -246,15 +246,15 @@ class Chrome:
b'CERT_PKIXVerifyCert for [^ ]* failed|'
b'^ALSA lib|ERROR:gl_surface_glx.cc|'
b'ERROR:gpu_child_thread.cc', buf):
logging.log(
brozzler.TRACE, 'chrome pid %s STDERR %s',
self.logger.trace(
'chrome pid %s STDERR %s',
self.chrome_process.pid, buf)
else:
logging.debug(
self.logger.debug(
'chrome pid %s STDERR %s',
self.chrome_process.pid, buf)
except:
logging.error('unexpected exception', exc_info=True)
self.logger.error('unexpected exception', exc_info=True)
def stop(self):
if not self.chrome_process or self._shutdown.is_set():

View File

@ -91,8 +91,7 @@ def _add_proxy_options(arg_parser):
def configure_logging(args):
logging.basicConfig(
stream=sys.stderr, level=args.log_level,
format=(
stream=sys.stderr, level=args.log_level, format=(
'%(asctime)s %(process)d %(levelname)s %(threadName)s '
'%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s'))
logging.getLogger('requests.packages.urllib3').setLevel(logging.WARN)
@ -285,7 +284,7 @@ def brozzler_new_site():
args.behavior_parameters) if args.behavior_parameters else None,
username=args.username, password=args.password)
r = rethinker()
r = rethinker(args)
frontier = brozzler.RethinkDbFrontier(r)
brozzler.new_site(frontier, site)

View File

@ -149,6 +149,7 @@ class BrozzlerEasyController:
brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init()
brozzler.pywb.support_in_progress_warcs()
brozzler.pywb.monkey_patch_wburl()
if args.warcs_dir.endswith('/'):
warcs_dir = args.warcs_dir

View File

@ -1,7 +1,7 @@
'''
brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages
Copyright (C) 2014-2016 Internet Archive
Copyright (C) 2014-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -99,6 +99,7 @@ class RethinkDbFrontier:
if not job.id:
# only if "id" has not already been set
job.id = result["generated_keys"][0]
return job
def new_site(self, site):
self.logger.info("inserting into 'sites' table %s", site)
@ -119,7 +120,7 @@ class RethinkDbFrontier:
self._vet_result(result, replaced=[0,1], unchanged=[0,1])
def update_page(self, page):
self.logger.debug("updating 'pages' table entry %s", page)
self.logger.trace("updating 'pages' table entry %s", page)
result = self.r.table("pages").get(page.id).replace(page.to_dict()).run()
self._vet_result(result, replaced=[0,1], unchanged=[0,1])
@ -176,9 +177,10 @@ class RethinkDbFrontier:
def _enforce_time_limit(self, site):
if (site.time_limit and site.time_limit > 0
and (rethinkstuff.utcnow() - site.start_time).total_seconds() > site.time_limit):
self.logger.debug("site FINISHED_TIME_LIMIT! time_limit=%s start_time=%s elapsed=%s %s",
site.time_limit, site.start_time, rethinkstuff.utcnow() - site.start_time, site)
and site.elapsed() > site.time_limit):
self.logger.debug(
"site FINISHED_TIME_LIMIT! time_limit=%s elapsed=%s %s",
site.time_limit, site.elapsed(), site)
self.finished(site, "FINISHED_TIME_LIMIT")
return True
else:
@ -276,14 +278,16 @@ class RethinkDbFrontier:
n += 1
self.logger.info("all %s sites finished, job %s is FINISHED!", n, job.id)
job.status = "FINISHED"
job.finished = rethinkstuff.utcnow()
job.finish()
self.update_job(job)
return True
def finished(self, site, status):
self.logger.info("%s %s", status, site)
site.status = status
site.claimed = False
site.last_disclaimed = rethinkstuff.utcnow()
site.starts_and_stops[-1]["stop"] = rethinkstuff.utcnow()
self.update_site(site)
if site.job_id:
self._maybe_finish_job(site.job_id)
@ -300,6 +304,30 @@ class RethinkDbFrontier:
page.claimed = False
self.update_page(page)
def resume_job(self, job):
job.status = "ACTIVE"
job.starts_and_stops.append(
{"start":rethinkstuff.utcnow(), "stop":None})
self.update_job(job)
for site in self.job_sites(job.id):
site.status = "ACTIVE"
site.starts_and_stops.append(
{"start":rethinkstuff.utcnow(), "stop":None})
self.update_site(site)
def resume_site(self, site):
if site.job_id:
# can't call resume_job since that would resume the job's other sites
job = self.job(site.job_id)
job.status = "ACTIVE"
job.starts_and_stops.append(
{"start":rethinkstuff.utcnow(), "stop":None})
self.update_job(job)
site.status = "ACTIVE"
site.starts_and_stops.append(
{"start":rethinkstuff.utcnow(), "stop":None})
self.update_site(site)
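
For illustration, resuming a finished site might look like this (a sketch;
'some-site-id' is a hypothetical id, and a local rethinkdb is assumed)::

    import rethinkstuff
    import brozzler

    r = rethinkstuff.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(r)
    site = frontier.site('some-site-id')  # hypothetical site id
    frontier.resume_site(site)  # reactivates the site, and its job if any
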
def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
if site.remember_outlinks:
parent_page.outlinks = {"accepted":[],"blocked":[],"rejected":[]}

View File

@ -2,7 +2,7 @@
brozzler/job.py - Job class representing a brozzler crawl job, and functions
for setting up a job with supplied configuration
Copyright (C) 2014-2016 Internet Archive
Copyright (C) 2014-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -61,17 +61,22 @@ def merge(a, b):
return a
def new_job_file(frontier, job_conf_file):
'''Returns new Job.'''
logging.info("loading %s", job_conf_file)
with open(job_conf_file) as f:
job_conf = yaml.load(f)
new_job(frontier, job_conf)
return new_job(frontier, job_conf)
def new_job(frontier, job_conf):
'''Returns new Job.'''
validate_conf(job_conf)
job = Job(
id=job_conf.get("id"), conf=job_conf, status="ACTIVE",
started=rethinkstuff.utcnow())
# insert the job now to make sure it has an id
job = frontier.new_job(job)
sites = []
for seed_conf in job_conf["seeds"]:
merged_conf = merge(seed_conf, job_conf)
@ -92,11 +97,10 @@ def new_job(frontier, job_conf):
password=merged_conf.get("password"))
sites.append(site)
# insert all the sites into database before the job
for site in sites:
new_site(frontier, site)
frontier.new_job(job)
return job
def new_site(frontier, site):
site.id = str(uuid.uuid4())
@ -120,14 +124,29 @@ def new_site(frontier, site):
class Job(brozzler.BaseDictable):
logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self, id=None, conf=None, status="ACTIVE", started=None,
finished=None, stop_requested=None):
def __init__(
self, id=None, conf=None, status="ACTIVE", started=None,
finished=None, stop_requested=None, starts_and_stops=None):
self.id = id
self.conf = conf
self.status = status
self.started = started
self.finished = finished
self.stop_requested = stop_requested
self.starts_and_stops = starts_and_stops
if not self.starts_and_stops:
if started: # backward compatibility
self.starts_and_stops = [{"start":started,"stop":finished}]
else:
self.starts_and_stops = [
{"start":rethinkstuff.utcnow(),"stop":None}]
def finish(self):
if self.status == "FINISHED" or self.starts_and_stops[-1]["stop"]:
self.logger.error(
"job is already finished status=%s "
"starts_and_stops[-1]['stop']=%s", self.status,
self.starts_and_stops[-1]["stop"])
self.status = "FINISHED"
self.starts_and_stops[-1]["stop"] = rethinkstuff.utcnow()
def __str__(self):
return 'Job(id=%s)' % self.id
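
A sketch of the starts_and_stops bookkeeping (hypothetical values; assumes
brozzler.job.Job as defined above)::

    import rethinkstuff
    from brozzler.job import Job

    t = rethinkstuff.utcnow()
    # old-style record: started/finished migrate into starts_and_stops
    job = Job(id=1, started=t, finished=t)
    assert job.starts_and_stops == [{'start': t, 'stop': t}]

    # new-style: a fresh job gets an open interval, closed by finish()
    job = Job(id=2)
    assert job.starts_and_stops[-1]['stop'] is None
    job.finish()
    assert job.status == 'FINISHED'
    assert job.starts_and_stops[-1]['stop'] is not None
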

View File

@ -2,7 +2,7 @@ id:
type:
- string
- integer
required: true
required: false
<<: &multi_level_options
time_limit:

View File

@ -1,9 +1,9 @@
'''
brozzler/pywb.py - pywb customizations for brozzler including rethinkdb index,
loading from warcs still being written to, and canonicalization rules matching
brozzler conventions
loading from warcs still being written to, canonicalization rules matching
brozzler conventions, support for screenshot: and thumbnail: urls
Copyright (C) 2016 Internet Archive
Copyright (C) 2016-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -26,6 +26,8 @@ try:
import pywb.cdx.cdxobject
import pywb.cdx.cdxserver
import pywb.webapp.query_handler
import pywb.framework.basehandlers
import pywb.rewrite.wburl
except ImportError as e:
logging.critical(
'%s: %s\n\nYou might need to run "pip install '
@ -37,6 +39,7 @@ import rethinkdb
import surt
import json
import brozzler
import argparse
class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
def __init__(self, servers, db, table):
@ -65,7 +68,7 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
'url': record['url'],
'status': str(record['response_code']),
'digest': record['sha1base32'],
'length': str(record['record_length']),
'length': str(record.get('record_length', '-')),
'offset': str(record['offset']),
'filename': record['filename'],
}
@ -120,8 +123,7 @@ class TheGoodUrlCanonicalizer(object):
# logging.debug('%s -> %s', url, key)
return key
except Exception as e:
raise pywb.utils.canonicalize.UrlCanonicalizeException(
'Invalid Url: ' + url)
return url
def replace_default_canonicalizer():
'''Replace parent class of CustomUrlCanonicalizer with this class.'''
@ -193,11 +195,90 @@ def support_in_progress_warcs():
return results
pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call
class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
def __init__(self, orig_url):
import re
import six
from six.moves.urllib.parse import urlsplit, urlunsplit
from six.moves.urllib.parse import quote_plus, quote, unquote_plus
from pywb.utils.loaders import to_native_str
from pywb.rewrite.wburl import WbUrl
pywb.rewrite.wburl.BaseWbUrl.__init__(self)
if six.PY2 and isinstance(orig_url, six.text_type):
orig_url = orig_url.encode('utf-8')
orig_url = quote(orig_url)
self._original_url = orig_url
if not self._init_query(orig_url):
if not self._init_replay(orig_url):
raise Exception('Invalid WbUrl: ', orig_url)
new_uri = WbUrl.to_uri(self.url)
self._do_percent_encode = True
self.url = new_uri
# begin brozzler changes
if (self.url.startswith('urn:') or self.url.startswith('screenshot:')
or self.url.startswith('thumbnail:')):
return
# end brozzler changes
# protocol agnostic url -> http://
# no protocol -> http://
#inx = self.url.find('://')
inx = -1
m = self.SCHEME_RX.match(self.url)
if m:
inx = m.span(1)[0]
#if inx < 0:
# check for other partially encoded variants
# m = self.PARTIAL_ENC_RX.match(self.url)
# if m:
# len_ = len(m.group(0))
# self.url = (urllib.unquote_plus(self.url[:len_]) +
# self.url[len_:])
# inx = self.url.find(':/')
if inx < 0:
self.url = self.DEFAULT_SCHEME + self.url
else:
inx += 2
if inx < len(self.url) and self.url[inx] != '/':
self.url = self.url[:inx] + '/' + self.url[inx:]
def _get_wburl_type(self):
return SomeWbUrl
def monkey_patch_wburl():
pywb.framework.basehandlers.WbUrlHandler.get_wburl_type = _get_wburl_type
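
With the patch in place, pywb accepts screenshot: and thumbnail: replay urls;
a sketch of fetching one (hypothetical timestamp, assumes brozzler-wayback is
running on port 8880 as configured in the README)::

    import requests

    wb_url = ('http://localhost:8880/brozzler/20170205214940/'
              'screenshot:http://example.com/')
    response = requests.get(wb_url)
    assert response.headers['content-type'] == 'image/jpeg'
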
class BrozzlerWaybackCli(pywb.apps.cli.WaybackCli):
def _extend_parser(self, arg_parser):
super()._extend_parser(arg_parser)
arg_parser._actions[4].help = argparse.SUPPRESS # --autoindex
arg_parser.formatter_class = argparse.RawDescriptionHelpFormatter
arg_parser.epilog = '''
Run pywb like so:
$ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback
See README.rst for more information.
'''
def main(argv=sys.argv):
brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init()
brozzler.pywb.support_in_progress_warcs()
wayback_cli = pywb.apps.cli.WaybackCli(
brozzler.pywb.monkey_patch_wburl()
wayback_cli = BrozzlerWaybackCli(
args=argv[1:], default_port=8880,
desc=('brozzler-wayback - pywb wayback (monkey-patched for use '
'with brozzler)'))

View File

@ -99,7 +99,7 @@ class Site(brozzler.BaseDictable):
last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None,
cookie_db=None, user_agent=None, behavior_parameters=None,
username=None, password=None):
username=None, password=None, starts_and_stops=None):
self.seed = seed
self.id = id
@ -113,7 +113,6 @@ class Site(brozzler.BaseDictable):
self.status = status
self.claimed = bool(claimed)
self.last_claimed_by = last_claimed_by
self.start_time = start_time or rethinkstuff.utcnow()
self.last_disclaimed = last_disclaimed
self.last_claimed = last_claimed
self.metadata = metadata
@ -123,11 +122,32 @@ class Site(brozzler.BaseDictable):
self.behavior_parameters = behavior_parameters
self.username = username
self.password = password
self.starts_and_stops = starts_and_stops
if not self.starts_and_stops:
if start_time: # backward compatibility
self.starts_and_stops = [{"start":start_time,"stop":None}]
if self.status != "ACTIVE":
self.starts_and_stops[0]["stop"] = self.last_disclaimed
else:
self.starts_and_stops = [
{"start":rethinkstuff.utcnow(),"stop":None}]
self.scope = scope or {}
if not "surt" in self.scope:
self.scope["surt"] = Url(seed).surt
def elapsed(self):
'''Returns elapsed crawl time as a float in seconds.'''
dt = 0
for ss in self.starts_and_stops[:-1]:
dt += (ss['stop'] - ss['start']).total_seconds()
ss = self.starts_and_stops[-1]
if ss['stop']:
dt += (ss['stop'] - ss['start']).total_seconds()
else: # crawl is active
dt += (rethinkstuff.utcnow() - ss['start']).total_seconds()
return dt
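
A worked example of elapsed() over two crawl intervals (a sketch with
hypothetical timestamps)::

    import datetime
    import brozzler

    site = brozzler.Site(seed='http://example.com/')
    t0 = site.starts_and_stops[0]['start']
    site.starts_and_stops = [
        {'start': t0, 'stop': t0 + datetime.timedelta(seconds=60)},
        {'start': t0 + datetime.timedelta(seconds=300),
         'stop': t0 + datetime.timedelta(seconds=330)},
    ]
    assert site.elapsed() == 90.0
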
def __str__(self):
return "Site-%s-%s" % (self.id, self.seed)

View File

@ -237,17 +237,20 @@ class BrozzlerWorker:
if on_screenshot:
on_screenshot(screenshot_png)
elif self._proxy(site) and self._enable_warcprox_features(site):
self.logger.info("sending WARCPROX_WRITE_RECORD request "
"to warcprox with screenshot for %s", page)
self.logger.info(
"sending WARCPROX_WRITE_RECORD request to %s with "
"screenshot for %s", self._proxy(site), page)
screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
screenshot_png)
self._warcprox_write_record(warcprox_address=self._proxy(site),
url="screenshot:%s" % brozzler.fixup(page.url),
self._warcprox_write_record(
warcprox_address=self._proxy(site),
url="screenshot:%s" % brozzler.fixup(page.url, True),
warc_type="resource", content_type="image/jpeg",
payload=screenshot_jpeg,
extra_headers=site.extra_headers())
self._warcprox_write_record(warcprox_address=self._proxy(site),
url="thumbnail:%s" % brozzler.fixup(page.url),
self._warcprox_write_record(
warcprox_address=self._proxy(site),
url="thumbnail:%s" % brozzler.fixup(page.url, True),
warc_type="resource", content_type="image/jpeg",
payload=thumbnail_jpeg,
extra_headers=site.extra_headers())
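
Because hash_strip is passed to fixup(), any url fragment is dropped before
the record url is formed; a page brozzled at http://example.com/#main would
be recorded under (sketch)::

    screenshot:http://example.com/
    thumbnail:http://example.com/
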
@ -380,9 +383,8 @@ class BrozzlerWorker:
try:
self.status_info = self._service_registry.heartbeat(status_info)
self.logger.log(
brozzler.TRACE, "status in service registry: %s",
self.status_info)
self.logger.trace(
"status in service registry: %s", self.status_info)
except rethinkdb.ReqlError as e:
self.logger.error(
"failed to send heartbeat and update service registry "

View File

@ -2,8 +2,8 @@ brozzler job configuration
**************************
Jobs are defined using yaml files. Options may be specified either at the
top-level or on individual seeds. A job id and at least one seed url
must be specified, everything else is optional.
top-level or on individual seeds. At least one seed url must be specified;
everything else is optional.
an example
==========
@ -85,11 +85,11 @@ settings reference
id
--
+-----------+--------+----------+---------+
| scope | type | required | default |
+===========+========+==========+=========+
| top-level | string | yes? | *n/a* |
+-----------+--------+----------+---------+
+-----------+--------+----------+--------------------------+
| scope | type | required | default |
+===========+========+==========+==========================+
| top-level | string | no | *generated by rethinkdb* |
+-----------+--------+----------+--------------------------+
An arbitrary identifier for this job. Must be unique across this deployment of
brozzler.
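
Since the id is now optional, a minimal job configuration can be as small as
(a sketch mirroring the conf used in tests/test_frontier.py)::

    seeds:
      - url: http://example.com/
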

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b9.dev176',
version='1.1b9.dev185',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',

View File

@ -155,6 +155,20 @@ def test_brozzle_site(httpd):
os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'), 'rb').read()
assert requests.get(wb_url).content == expected_payload
url = 'screenshot:%s' % page1
t14 = captures_by_url[url]['timestamp'].strftime('%Y%m%d%H%M%S')
wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, url)
response = requests.get(wb_url)
assert response.status_code == 200
assert response.headers['content-type'] == 'image/jpeg'
url = 'thumbnail:%s' % page1
t14 = captures_by_url[url]['timestamp'].strftime('%Y%m%d%H%M%S')
wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, url)
response = requests.get(wb_url)
assert response.status_code == 200
assert response.headers['content-type'] == 'image/jpeg'
def test_warcprox_selection(httpd):
''' When enable_warcprox_features is true, brozzler is expected to choose
an instance of warcprox '''

tests/test_frontier.py (new file, 174 lines added)
View File

@ -0,0 +1,174 @@
#!/usr/bin/env python
'''
test_frontier.py - fairly narrow tests of frontier management, requires
rethinkdb running on localhost
Copyright (C) 2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
import brozzler
import logging
import argparse
import rethinkstuff
import time
args = argparse.Namespace()
args.log_level = logging.INFO
brozzler.cli.configure_logging(args)
def test_rethinkdb_up():
'''Checks that rethinkdb is listening and looks sane.'''
r = rethinkstuff.Rethinker(db='rethinkdb') # built-in db
tbls = r.table_list().run()
assert len(tbls) > 10
def test_resume_job():
'''
Tests that the right stuff gets twiddled in rethinkdb when we "start" and
"finish" crawling a job. Doesn't actually crawl anything.
'''
# vagrant brozzler-worker isn't configured to look at the "ignoreme" db
r = rethinkstuff.Rethinker(db='ignoreme')
frontier = brozzler.RethinkDbFrontier(r)
job_conf = {'seeds': [{'url': 'http://example.com/'}]}
job = brozzler.new_job(frontier, job_conf)
assert len(list(frontier.job_sites(job.id))) == 1
site = list(frontier.job_sites(job.id))[0]
assert job.status == 'ACTIVE'
assert len(job.starts_and_stops) == 1
assert job.starts_and_stops[0]['start']
assert job.starts_and_stops[0]['stop'] is None
assert site.status == 'ACTIVE'
assert len(site.starts_and_stops) == 1
assert site.starts_and_stops[0]['start']
assert site.starts_and_stops[0]['stop'] is None
frontier.finished(site, 'FINISHED')
job = frontier.job(job.id)
assert job.status == 'FINISHED'
assert len(job.starts_and_stops) == 1
assert job.starts_and_stops[0]['start']
assert job.starts_and_stops[0]['stop']
assert job.starts_and_stops[0]['stop'] > job.starts_and_stops[0]['start']
assert site.status == 'FINISHED'
assert len(site.starts_and_stops) == 1
assert site.starts_and_stops[0]['start']
assert site.starts_and_stops[0]['stop']
assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']
frontier.resume_site(site)
job = frontier.job(job.id)
assert job.status == 'ACTIVE'
assert len(job.starts_and_stops) == 2
assert job.starts_and_stops[1]['start']
assert job.starts_and_stops[1]['stop'] is None
assert site.status == 'ACTIVE'
assert len(site.starts_and_stops) == 2
assert site.starts_and_stops[1]['start']
assert site.starts_and_stops[1]['stop'] is None
frontier.finished(site, 'FINISHED')
job = frontier.job(job.id)
assert job.status == 'FINISHED'
assert len(job.starts_and_stops) == 2
assert job.starts_and_stops[1]['start']
assert job.starts_and_stops[1]['stop']
assert job.starts_and_stops[1]['stop'] > job.starts_and_stops[0]['start']
assert site.status == 'FINISHED'
assert len(site.starts_and_stops) == 2
assert site.starts_and_stops[1]['start']
assert site.starts_and_stops[1]['stop']
assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[0]['start']
# resuming a job == resuming all of its sites
frontier.resume_job(job)
site = list(frontier.job_sites(job.id))[0]
assert job.status == 'ACTIVE'
assert len(job.starts_and_stops) == 3
assert job.starts_and_stops[2]['start']
assert job.starts_and_stops[2]['stop'] is None
assert site.status == 'ACTIVE'
assert len(site.starts_and_stops) == 3
assert site.starts_and_stops[2]['start']
assert site.starts_and_stops[2]['stop'] is None
frontier.finished(site, 'FINISHED')
job = frontier.job(job.id)
assert job.status == 'FINISHED'
assert len(job.starts_and_stops) == 3
assert job.starts_and_stops[2]['start']
assert job.starts_and_stops[2]['stop']
assert job.starts_and_stops[2]['stop'] > job.starts_and_stops[0]['start']
assert site.status == 'FINISHED'
assert len(site.starts_and_stops) == 3
assert site.starts_and_stops[2]['start']
assert site.starts_and_stops[2]['stop']
assert site.starts_and_stops[2]['stop'] > site.starts_and_stops[0]['start']
def test_time_limit():
# vagrant brozzler-worker isn't configured to look at the "ignoreme" db
r = rethinkstuff.Rethinker('localhost', db='ignoreme')
frontier = brozzler.RethinkDbFrontier(r)
site = brozzler.Site(seed='http://example.com/', time_limit=99999)
brozzler.new_site(frontier, site)
site = frontier.site(site.id) # get it back from the db
assert site.status == 'ACTIVE'
assert len(site.starts_and_stops) == 1
assert site.starts_and_stops[0]['start']
assert site.starts_and_stops[0]['stop'] is None
frontier.finished(site, 'FINISHED')
assert site.status == 'FINISHED'
assert len(site.starts_and_stops) == 1
assert site.starts_and_stops[0]['start']
assert site.starts_and_stops[0]['stop']
assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']
frontier.resume_site(site)
assert site.status == 'ACTIVE'
assert len(site.starts_and_stops) == 2
assert site.starts_and_stops[1]['start']
assert site.starts_and_stops[1]['stop'] is None
# time limit not reached yet
frontier._enforce_time_limit(site)
assert site.status == 'ACTIVE'
assert len(site.starts_and_stops) == 2
assert site.starts_and_stops[1]['start']
assert site.starts_and_stops[1]['stop'] is None
site.time_limit = 0.1
site.claimed = True
frontier.update_site(site)
time.sleep(0.1)
frontier._enforce_time_limit(site)
assert site.status == 'FINISHED_TIME_LIMIT'
assert not site.claimed
assert len(site.starts_and_stops) == 2
assert site.starts_and_stops[1]['start']
assert site.starts_and_stops[1]['stop']
assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[0]['start']
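
These tests assume rethinkdb reachable on localhost; a sketch of running just
this file (assuming pytest is installed)::

    $ py.test tests/test_frontier.py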