three-value "brozzled" parameter for frontier.site_pages(); fix thing where every Site got a list of all the seeds from the job; and some more frontier tests to catch these kinds of things

This commit is contained in:
Noah Levitt 2017-03-20 17:28:16 -07:00
parent 0e9f4a0c26
commit eeee523b18
6 changed files with 135 additions and 9 deletions

View file

@ -23,6 +23,7 @@ import logging
import argparse
import doublethink
import time
import datetime
args = argparse.Namespace()
args.log_level = logging.INFO
@ -34,6 +35,119 @@ def test_rethinkdb_up():
tbls = rr.table_list().run()
assert len(tbls) > 10
def test_basics():
rr = doublethink.Rethinker(db='ignoreme')
frontier = brozzler.RethinkDbFrontier(rr)
job_conf = {'seeds': [
{'url': 'http://example.com'}, {'url': 'https://example.org/'}]}
job = brozzler.new_job(frontier, job_conf)
assert job.id
assert job.starts_and_stops
assert job.starts_and_stops[0]['start']
assert job == {
'id': job.id,
'conf': {
'seeds': [
{'url': 'http://example.com'},
{'url': 'https://example.org/'}
]
},
'status': 'ACTIVE',
'starts_and_stops': [
{
'start': job.starts_and_stops[0]['start'],
'stop': None
}
]
}
sites = sorted(list(frontier.job_sites(job.id)), key=lambda x: x.seed)
assert len(sites) == 2
assert sites[0].starts_and_stops[0]['start']
assert sites[1].starts_and_stops[0]['start']
assert sites[0] == {
'claimed': False,
'enable_warcprox_features': False,
'id': sites[0].id,
'job_id': job.id,
'last_claimed': brozzler.EPOCH_UTC,
'last_disclaimed': brozzler.EPOCH_UTC,
'scope': {
'surt': 'http://(com,example,)/'
},
'seed': 'http://example.com',
'starts_and_stops': [
{
'start': sites[0].starts_and_stops[0]['start'],
'stop': None
}
],
'status': 'ACTIVE'
}
assert sites[1] == {
'claimed': False,
'enable_warcprox_features': False,
'id': sites[1].id,
'job_id': job.id,
'last_claimed': brozzler.EPOCH_UTC,
'last_disclaimed': brozzler.EPOCH_UTC,
'scope': {
'surt': 'https://(org,example,)/',
},
'seed': 'https://example.org/',
'starts_and_stops': [
{
'start': sites[1].starts_and_stops[0]['start'],
'stop': None,
},
],
'status': 'ACTIVE',
}
pages = list(frontier.site_pages(sites[0].id))
assert len(pages) == 1
assert pages[0] == {
'brozzle_count': 0,
'claimed': False,
'hops_from_seed': 0,
'hops_off_surt': 0,
'id': brozzler.Page.compute_id(sites[0].id, 'http://example.com'),
'job_id': job.id,
'needs_robots_check': True,
'priority': 1000,
'site_id': sites[0].id,
'url': 'http://example.com',
}
pages = list(frontier.site_pages(sites[1].id))
assert len(pages) == 1
assert pages[0] == {
'brozzle_count': 0,
'claimed': False,
'hops_from_seed': 0,
'hops_off_surt': 0,
'id': brozzler.Page.compute_id(sites[1].id, 'https://example.org/'),
'job_id': job.id,
'needs_robots_check': True,
'priority': 1000,
'site_id': sites[1].id,
'url': 'https://example.org/',
}
# test "brozzled" parameter of frontier.site_pages
assert len(list(frontier.site_pages(sites[1].id))) == 1
assert len(list(frontier.site_pages(sites[1].id, brozzled=True))) == 0
assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 1
pages[0].brozzle_count = 1
pages[0].save()
assert len(list(frontier.site_pages(sites[1].id))) == 1
assert len(list(frontier.site_pages(sites[1].id, brozzled=True))) == 1
assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 0
pages[0].brozzle_count = 32819
pages[0].save()
assert len(list(frontier.site_pages(sites[1].id))) == 1
assert len(list(frontier.site_pages(sites[1].id, brozzled=True))) == 1
assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 0
def test_resume_job():
'''
Tests that the right stuff gets twiddled in rethinkdb when we "start" and