mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
175 lines
6.1 KiB
Python
175 lines
6.1 KiB
Python
#!/usr/bin/env python
|
|
'''
test_frontier.py - fairly narrow tests of frontier management, requires
rethinkdb running on localhost

Copyright (C) 2017 Internet Archive

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
|
|
|
|
import brozzler
|
|
import logging
|
|
import argparse
|
|
import doublethink
|
|
import time
|
|
|
|
# Configure logging through brozzler's CLI helper; the only setting it is
# handed here is log_level, set to INFO.
args = argparse.Namespace(log_level=logging.INFO)
brozzler.cli.configure_logging(args)
|
|
|
|
def test_rethinkdb_up():
    '''Sanity check that rethinkdb answers queries and looks normal.'''
    # 'rethinkdb' is the server's built-in database
    rethinker = doublethink.Rethinker(db='rethinkdb')
    table_names = rethinker.table_list().run()
    assert len(table_names) > 10
|
|
|
|
def test_resume_job():
    '''
    Tests that the right stuff gets twiddled in rethinkdb when we "start" and
    "finish" crawling a job. Doesn't actually crawl anything.

    Creates a job with a single seed, then goes through three start/finish
    cycles: the initial one from `brozzler.new_job`, one restarted with
    `frontier.resume_site`, and one restarted with `frontier.resume_job`.
    After every transition both the job and its site are checked for the
    expected `status` and `starts_and_stops` entries.
    '''
    def assert_running(thing, n):
        # `thing` (job or site) is ACTIVE and its n'th run is still open
        assert thing.status == 'ACTIVE'
        assert len(thing.starts_and_stops) == n
        current = thing.starts_and_stops[n - 1]
        assert current['start']
        assert current['stop'] is None

    def assert_stopped(thing, n):
        # `thing` (job or site) is FINISHED and its n'th run has been closed
        assert thing.status == 'FINISHED'
        assert len(thing.starts_and_stops) == n
        latest = thing.starts_and_stops[n - 1]
        assert latest['start']
        assert latest['stop']
        # the stop must come after the start of the *same* run, not merely
        # after the start of the very first run
        assert latest['stop'] > latest['start']
        assert latest['stop'] > thing.starts_and_stops[0]['start']

    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker(db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    job_conf = {'seeds': [{'url': 'http://example.com/'}]}
    job = brozzler.new_job(frontier, job_conf)
    sites = list(frontier.job_sites(job.id))
    assert len(sites) == 1
    site = sites[0]

    # first run: started by new_job
    assert_running(job, 1)
    assert_running(site, 1)

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert_stopped(job, 1)
    assert_stopped(site, 1)

    # second run: resume the site directly
    frontier.resume_site(site)
    job.refresh()

    assert_running(job, 2)
    assert_running(site, 2)

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert_stopped(job, 2)
    assert_stopped(site, 2)

    # resuming a job == resuming all of its sites
    frontier.resume_job(job)
    site = list(frontier.job_sites(job.id))[0]

    assert_running(job, 3)
    assert_running(site, 3)

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert_stopped(job, 3)
    assert_stopped(site, 3)
|
|
|
|
def test_time_limit():
    '''
    Tests that `frontier._enforce_time_limit` leaves a site alone while its
    time limit has not been reached, and finishes it once the limit has
    passed.

    The site is created with a very large `time_limit`, finished and resumed
    once, and checked to still be ACTIVE after enforcement. The limit is then
    lowered to 0.1 seconds and the site marked claimed; after sleeping that
    long, enforcement is expected to leave the site with status
    FINISHED_TIME_LIMIT, unclaimed, and with its second run closed.
    '''
    def assert_running(n):
        # site is ACTIVE and its n'th run is still open
        assert site.status == 'ACTIVE'
        assert len(site.starts_and_stops) == n
        current = site.starts_and_stops[n - 1]
        assert current['start']
        assert current['stop'] is None

    def assert_stopped(n, status):
        # site has the given final status and its n'th run has been closed
        assert site.status == status
        assert len(site.starts_and_stops) == n
        latest = site.starts_and_stops[n - 1]
        assert latest['start']
        assert latest['stop']
        # the stop must come after the start of the *same* run, not merely
        # after the start of the very first run
        assert latest['stop'] > latest['start']
        assert latest['stop'] > site.starts_and_stops[0]['start']

    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {'seed':'http://example.com/', 'time_limit':99999})
    brozzler.new_site(frontier, site)

    site.refresh() # get it back from the db
    assert_running(1)

    frontier.finished(site, 'FINISHED')
    assert_stopped(1, 'FINISHED')

    frontier.resume_site(site)
    assert_running(2)

    # time limit not reached yet
    frontier._enforce_time_limit(site)
    assert_running(2)

    # shrink the limit and mark the site claimed, so the test can check
    # that enforcement clears the claim
    site.time_limit = 0.1
    site.claimed = True
    site.save()

    # wait out the 0.1 second limit before enforcing again
    time.sleep(0.1)
    frontier._enforce_time_limit(site)

    assert not site.claimed
    assert_stopped(2, 'FINISHED_TIME_LIMIT')
|