new job setting max_claimed_sites

Puts a cap on the number of sites belonging to a given job that can be brozzled
simultaneously across the cluster. Addresses the problem of a job with many
seeds starving out other jobs. For AITFIVE-1578.
This commit is contained in:
Noah Levitt 2018-03-01 17:17:54 -08:00
parent d7512fbeb6
commit f26d711a89
5 changed files with 165 additions and 42 deletions

View file

@ -863,6 +863,46 @@ def test_claim_site():
# clean up
rr.table('sites').get(claimed_site.id).delete().run()
def test_max_claimed_sites():
    """Check that a job's ``max_claimed_sites`` setting caps claimed sites.

    ``max_claimed_sites`` is a brozzler job setting that puts a cap on the
    number of the job's sites that can be brozzled simultaneously across the
    cluster. The job here has five seeds and a cap of three, so after one
    site has been claimed a request for three more yields only two, and any
    further claim attempt raises ``brozzler.NothingToClaim``.

    Uses the RethinkDB database 'ignoreme' on localhost.
    """
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # clean slate
    rr.table('jobs').delete().run()
    rr.table('sites').delete().run()

    try:
        job_conf = {
            'seeds': [
                {'url': 'http://example.com/1'},
                {'url': 'http://example.com/2'},
                {'url': 'http://example.com/3'},
                {'url': 'http://example.com/4'},
                {'url': 'http://example.com/5'},
            ],
            'max_claimed_sites': 3,
        }

        job = brozzler.new_job(frontier, job_conf)

        assert job.id
        assert job.max_claimed_sites == 3

        # one site per seed
        sites = list(frontier.job_sites(job.id))
        assert len(sites) == 5

        # first claim is under the cap, so it is granted in full
        claimed_sites = frontier.claim_sites(1)
        assert len(claimed_sites) == 1

        # one site is already claimed, so only two of the three requested
        # fit under the cap of three
        claimed_sites = frontier.claim_sites(3)
        assert len(claimed_sites) == 2

        # the cap has been reached, nothing more can be claimed
        with pytest.raises(brozzler.NothingToClaim):
            frontier.claim_sites(3)
    finally:
        # clean slate for the next one; done in a finally clause so that a
        # failed assertion above does not leave this job's rows behind
        rr.table('jobs').delete().run()
        rr.table('sites').delete().run()
def test_choose_warcprox():
rr = doublethink.Rethinker('localhost', db='ignoreme')
svcreg = doublethink.ServiceRegistry(rr)