Fix bug where, the first time a site was claimed, another brozzler-worker would claim it anyway (and find no pages to brozzle).

This commit is contained in:
Noah Levitt 2016-04-21 00:21:08 +00:00
parent 416aa064f8
commit 4bbbbcf138
2 changed files with 19 additions and 12 deletions

View file

@ -99,19 +99,23 @@ class RethinkDbFrontier:
.order_by(index="sites_last_disclaimed") .order_by(index="sites_last_disclaimed")
.filter( .filter(
(rethinkdb.row["claimed"] != True) | (rethinkdb.row["claimed"] != True) |
(rethinkdb.row["last_disclaimed"] (rethinkdb.row["last_claimed"]
< rethinkdb.now() - 2*60*60)) < rethinkdb.now() - 2*60*60))
.limit(1) .limit(1)
.update({"claimed":True,"last_claimed_by":worker_id}, .update({
return_changes=True)).run() "claimed": True,
"last_claimed_by": worker_id,
"last_claimed": rethinkstuff.utcnow(),
}, return_changes=True)).run()
self._vet_result(result, replaced=[0,1], unchanged=[0,1]) self._vet_result(result, replaced=[0,1], unchanged=[0,1])
if result["replaced"] == 1: if result["replaced"] == 1:
if result["changes"][0]["old_val"]["claimed"]: if result["changes"][0]["old_val"]["claimed"]:
self.logger.warn( self.logger.warn(
"re-claimed site that was still marked 'claimed' " "re-claimed site that was still marked 'claimed' "
"because it was last disclaimed a long time ago " "because it was last claimed a long time ago "
"at %s", "at %s, and presumably some error stopped it from "
result["changes"][0]["old_val"]["last_disclaimed"]) "being disclaimed",
result["changes"][0]["old_val"]["last_claimed"])
site = brozzler.Site(**result["changes"][0]["new_val"]) site = brozzler.Site(**result["changes"][0]["new_val"])
else: else:
raise brozzler.NothingToClaim raise brozzler.NothingToClaim

View file

@ -12,11 +12,13 @@ _EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=rethinkstuff
class Site(brozzler.BaseDictable): class Site(brozzler.BaseDictable):
logger = logging.getLogger(__module__ + "." + __qualname__) logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self, seed, id=None, job_id=None, scope=None, proxy=None, def __init__(
ignore_robots=False, time_limit=None, extra_headers=None, self, seed, id=None, job_id=None, scope=None, proxy=None,
enable_warcprox_features=False, reached_limit=None, status="ACTIVE", ignore_robots=False, time_limit=None, extra_headers=None,
claimed=False, start_time=None, last_disclaimed=_EPOCH_UTC, enable_warcprox_features=False, reached_limit=None,
last_claimed_by=None): status="ACTIVE", claimed=False, start_time=None,
last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
last_claimed=_EPOCH_UTC):
self.seed = seed self.seed = seed
self.id = id self.id = id
@ -32,6 +34,7 @@ class Site(brozzler.BaseDictable):
self.last_claimed_by = last_claimed_by self.last_claimed_by = last_claimed_by
self.start_time = start_time or rethinkstuff.utcnow() self.start_time = start_time or rethinkstuff.utcnow()
self.last_disclaimed = last_disclaimed self.last_disclaimed = last_disclaimed
self.last_claimed = last_claimed
self.scope = scope or {} self.scope = scope or {}
if not "surt" in self.scope: if not "surt" in self.scope:
@ -44,7 +47,7 @@ class Site(brozzler.BaseDictable):
self.ignore_robots, self.extra_headers, self.reached_limit) self.ignore_robots, self.extra_headers, self.reached_limit)
def __str__(self): def __str__(self):
return "site-%s-%s" % (self.id, self.seed) return "Site-%s-%s" % (self.id, self.seed)
def _to_surt(self, url): def _to_surt(self, url):
hurl = surt.handyurl.parse(url) hurl = surt.handyurl.parse(url)