diff --git a/brozzler/frontier.py b/brozzler/frontier.py index 54b8e07..00ad46e 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -101,14 +101,14 @@ class RethinkDbFrontier: index="sites_last_disclaimed") .order_by(index="sites_last_disclaimed") .filter((r.row["claimed"] != True) | ( - r.row["last_claimed"] < r.now() - 2*60*60)) + r.row["last_claimed"] < r.now() - 60*60)) .limit(1) .update( # try to avoid a race condition resulting in multiple # brozzler-workers claiming the same site # see https://github.com/rethinkdb/rethinkdb/issues/3235#issuecomment-60283038 r.branch((r.row["claimed"] != True) | ( - r.row["last_claimed"] < r.now() - 2*60*60), { + r.row["last_claimed"] < r.now() - 60*60), { "claimed": True, "last_claimed_by": worker_id, "last_claimed": doublethink.utcnow()}, {}), return_changes=True)).run() diff --git a/setup.py b/setup.py index a11585d..bc52352 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b11.dev240', + version='1.1b11.dev241', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', diff --git a/tests/test_frontier.py b/tests/test_frontier.py index a8c476b..f4fcec3 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -698,3 +698,39 @@ def test_honor_stop_request(): with pytest.raises(brozzler.CrawlStopped): frontier.honor_stop_request(site) +def test_claim_site(): + rr = doublethink.Rethinker('localhost', db='ignoreme') + frontier = brozzler.RethinkDbFrontier(rr) + + rr.table('sites').delete().run() # clean slate + + with pytest.raises(brozzler.NothingToClaim): + claimed_site = frontier.claim_site(worker_id='test_claim_site') + + site = brozzler.Site(rr, {'seed': 'http://example.org/'}) + brozzler.new_site(frontier, site) + + claimed_site = frontier.claim_site(worker_id='test_claim_site') + assert claimed_site.id == site.id + assert claimed_site.claimed + assert claimed_site.last_claimed >= doublethink.utcnow() - datetime.timedelta(minutes=1) + with pytest.raises(brozzler.NothingToClaim): + claimed_site = frontier.claim_site(worker_id='test_claim_site') + + # site last_claimed less than 1 hour ago still not to be reclaimed + claimed_site.last_claimed = doublethink.utcnow() - datetime.timedelta(minutes=55) + claimed_site.save() + with pytest.raises(brozzler.NothingToClaim): + claimed_site = frontier.claim_site(worker_id='test_claim_site') + + # site last_claimed more than 1 hour ago can be reclaimed + site = claimed_site + claimed_site = None + site.last_claimed = doublethink.utcnow() - datetime.timedelta(minutes=65) + site.save() + claimed_site = frontier.claim_site(worker_id='test_claim_site') + assert claimed_site.id == site.id + + # clean up + rr.table('sites').get(claimed_site.id).delete().run() +