mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
Re-claim sites after 1 hour instead of 2, so that a site does not have to wait as long to be brozzled again when its brozzler-worker has been killed with kill -9.
This commit is contained in:
parent
000d40c4dc
commit
52433ade78
@ -101,14 +101,14 @@ class RethinkDbFrontier:
|
|||||||
index="sites_last_disclaimed")
|
index="sites_last_disclaimed")
|
||||||
.order_by(index="sites_last_disclaimed")
|
.order_by(index="sites_last_disclaimed")
|
||||||
.filter((r.row["claimed"] != True) | (
|
.filter((r.row["claimed"] != True) | (
|
||||||
r.row["last_claimed"] < r.now() - 2*60*60))
|
r.row["last_claimed"] < r.now() - 60*60))
|
||||||
.limit(1)
|
.limit(1)
|
||||||
.update(
|
.update(
|
||||||
# try to avoid a race condition resulting in multiple
|
# try to avoid a race condition resulting in multiple
|
||||||
# brozzler-workers claiming the same site
|
# brozzler-workers claiming the same site
|
||||||
# see https://github.com/rethinkdb/rethinkdb/issues/3235#issuecomment-60283038
|
# see https://github.com/rethinkdb/rethinkdb/issues/3235#issuecomment-60283038
|
||||||
r.branch((r.row["claimed"] != True) | (
|
r.branch((r.row["claimed"] != True) | (
|
||||||
r.row["last_claimed"] < r.now() - 2*60*60), {
|
r.row["last_claimed"] < r.now() - 60*60), {
|
||||||
"claimed": True, "last_claimed_by": worker_id,
|
"claimed": True, "last_claimed_by": worker_id,
|
||||||
"last_claimed": doublethink.utcnow()}, {}),
|
"last_claimed": doublethink.utcnow()}, {}),
|
||||||
return_changes=True)).run()
|
return_changes=True)).run()
|
||||||
|
2
setup.py
2
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b11.dev240',
|
version='1.1b11.dev241',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -698,3 +698,39 @@ def test_honor_stop_request():
|
|||||||
with pytest.raises(brozzler.CrawlStopped):
|
with pytest.raises(brozzler.CrawlStopped):
|
||||||
frontier.honor_stop_request(site)
|
frontier.honor_stop_request(site)
|
||||||
|
|
||||||
|
def test_claim_site():
    """Check claiming and re-claiming of a site through the frontier.

    Connects with doublethink.Rethinker to 'localhost', database 'ignoreme',
    and empties the 'sites' table first. Verifies that:

    * claiming from an empty table raises brozzler.NothingToClaim;
    * a newly added site can be claimed once, and not a second time;
    * a claim dated 55 minutes ago still blocks re-claiming;
    * a claim dated 65 minutes ago allows the site to be claimed again.
    """
    worker_id = 'test_claim_site'
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # clean slate
    rr.table('sites').delete().run()

    # nothing in the table, so nothing to claim
    with pytest.raises(brozzler.NothingToClaim):
        frontier.claim_site(worker_id=worker_id)

    new_site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, new_site)

    # the one site we added is handed out, with a fresh claim timestamp
    claimed = frontier.claim_site(worker_id=worker_id)
    assert claimed.id == new_site.id
    assert claimed.claimed
    assert claimed.last_claimed >= (
        doublethink.utcnow() - datetime.timedelta(minutes=1))

    # already claimed just now, so a second claim attempt must fail
    with pytest.raises(brozzler.NothingToClaim):
        frontier.claim_site(worker_id=worker_id)

    # site last_claimed less than 1 hour ago still not to be reclaimed
    claimed.last_claimed = (
        doublethink.utcnow() - datetime.timedelta(minutes=55))
    claimed.save()
    with pytest.raises(brozzler.NothingToClaim):
        frontier.claim_site(worker_id=worker_id)

    # site last_claimed more than 1 hour ago can be reclaimed
    stale_site = claimed
    stale_site.last_claimed = (
        doublethink.utcnow() - datetime.timedelta(minutes=65))
    stale_site.save()
    reclaimed = frontier.claim_site(worker_id=worker_id)
    assert reclaimed.id == stale_site.id

    # clean up
    rr.table('sites').get(reclaimed.id).delete().run()
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user