mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-21 16:16:28 -04:00
Merge branch 'master' into qa
* master: disable re-claiming of sites that are still marked claimed but were last claimed more than an hour ago, because pages sometimes legitimately take longer than an hour to brozzle; a better solution to this issue is in progress
This commit is contained in:
commit
d04a3f4f2b
@ -102,21 +102,28 @@ class RethinkDbFrontier:
|
||||
["ACTIVE", r.minval], ["ACTIVE", r.maxval],
|
||||
index="sites_last_disclaimed")
|
||||
.order_by(index="sites_last_disclaimed")
|
||||
.filter((r.row["claimed"] != True) | (
|
||||
r.row["last_claimed"] < r.now() - 60*60))
|
||||
.filter(r.row["claimed"] != True)
|
||||
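# XXX re-claiming of sites last claimed more than an hour ago is temporarily disabled; the previous filter is kept commented out below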
|
||||
# .filter((r.row["claimed"] != True) | (
|
||||
# r.row["last_claimed"] < r.now() - 60*60))
|
||||
.limit(1)
|
||||
.update(
|
||||
# try to avoid a race condition resulting in multiple
|
||||
# brozzler-workers claiming the same site
|
||||
# see https://github.com/rethinkdb/rethinkdb/issues/3235#issuecomment-60283038
|
||||
r.branch((r.row["claimed"] != True) | (
|
||||
r.row["last_claimed"] < r.now() - 60*60), {
|
||||
r.branch(r.row["claimed"] != True, {
|
||||
"claimed": True, "last_claimed_by": worker_id,
|
||||
"last_claimed": doublethink.utcnow()}, {}),
|
||||
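# XXX previous condition, which also re-claimed sites last claimed more than an hour ago, is temporarily disabled and kept commented out below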
|
||||
# r.branch((r.row["claimed"] != True) | (
|
||||
# r.row["last_claimed"] < r.now() - 60*60), {
|
||||
# "claimed": True, "last_claimed_by": worker_id,
|
||||
# "last_claimed": doublethink.utcnow()}, {}),
|
||||
return_changes=True)).run()
|
||||
self._vet_result(result, replaced=[0,1], unchanged=[0,1])
|
||||
if result["replaced"] == 1:
|
||||
if result["changes"][0]["old_val"]["claimed"]:
|
||||
# XXX unreachable at the moment: the update above only claims sites where claimed != True
|
||||
self.logger.warn(
|
||||
"re-claimed site that was still marked 'claimed' "
|
||||
"because it was last claimed a long time ago "
|
||||
|
2
setup.py
2
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1b12.dev257',
|
||||
version='1.1b12.dev258',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
|
@ -723,14 +723,17 @@ def test_claim_site():
|
||||
with pytest.raises(brozzler.NothingToClaim):
|
||||
claimed_site = frontier.claim_site(worker_id='test_claim_site')
|
||||
|
||||
# site last_claimed more than 1 hour ago can be reclaimed
|
||||
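### re-claiming after an hour is temporarily disabled, so the re-claim assertions below are commented out and claim_site is expected to raise NothingToClaim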
|
||||
### # site last_claimed more than 1 hour ago can be reclaimed
|
||||
site = claimed_site
|
||||
claimed_site = None
|
||||
site.last_claimed = doublethink.utcnow() - datetime.timedelta(minutes=65)
|
||||
site.save()
|
||||
claimed_site = frontier.claim_site(worker_id='test_claim_site')
|
||||
assert claimed_site.id == site.id
|
||||
### claimed_site = frontier.claim_site(worker_id='test_claim_site')
|
||||
### assert claimed_site.id == site.id
|
||||
with pytest.raises(brozzler.NothingToClaim):
|
||||
claimed_site = frontier.claim_site(worker_id='test_claim_site')
|
||||
|
||||
# clean up
|
||||
rr.table('sites').get(claimed_site.id).delete().run()
|
||||
rr.table('sites').get(site.id).delete().run()
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user