Fix bug where, the first time a site was claimed, another brozzler-worker would claim it anyway (and find no pages to brozzle)

This commit is contained in:
Noah Levitt 2016-04-21 00:21:08 +00:00
parent 416aa064f8
commit 4bbbbcf138
2 changed files with 19 additions and 12 deletions

View File

@ -99,19 +99,23 @@ class RethinkDbFrontier:
.order_by(index="sites_last_disclaimed")
.filter(
(rethinkdb.row["claimed"] != True) |
(rethinkdb.row["last_disclaimed"]
(rethinkdb.row["last_claimed"]
< rethinkdb.now() - 2*60*60))
.limit(1)
.update({"claimed":True,"last_claimed_by":worker_id},
return_changes=True)).run()
.update({
"claimed": True,
"last_claimed_by": worker_id,
"last_claimed": rethinkstuff.utcnow(),
}, return_changes=True)).run()
self._vet_result(result, replaced=[0,1], unchanged=[0,1])
if result["replaced"] == 1:
if result["changes"][0]["old_val"]["claimed"]:
self.logger.warn(
"re-claimed site that was still marked 'claimed' "
"because it was last disclaimed a long time ago "
"at %s",
result["changes"][0]["old_val"]["last_disclaimed"])
"because it was last claimed a long time ago "
"at %s, and presumably some error stopped it from "
"being disclaimed",
result["changes"][0]["old_val"]["last_claimed"])
site = brozzler.Site(**result["changes"][0]["new_val"])
else:
raise brozzler.NothingToClaim

View File

@ -12,11 +12,13 @@ _EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=rethinkstuff
class Site(brozzler.BaseDictable):
logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self, seed, id=None, job_id=None, scope=None, proxy=None,
ignore_robots=False, time_limit=None, extra_headers=None,
enable_warcprox_features=False, reached_limit=None, status="ACTIVE",
claimed=False, start_time=None, last_disclaimed=_EPOCH_UTC,
last_claimed_by=None):
def __init__(
self, seed, id=None, job_id=None, scope=None, proxy=None,
ignore_robots=False, time_limit=None, extra_headers=None,
enable_warcprox_features=False, reached_limit=None,
status="ACTIVE", claimed=False, start_time=None,
last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
last_claimed=_EPOCH_UTC):
self.seed = seed
self.id = id
@ -32,6 +34,7 @@ class Site(brozzler.BaseDictable):
self.last_claimed_by = last_claimed_by
self.start_time = start_time or rethinkstuff.utcnow()
self.last_disclaimed = last_disclaimed
self.last_claimed = last_claimed
self.scope = scope or {}
if not "surt" in self.scope:
@ -44,7 +47,7 @@ class Site(brozzler.BaseDictable):
self.ignore_robots, self.extra_headers, self.reached_limit)
def __str__(self):
return "site-%s-%s" % (self.id, self.seed)
return "Site-%s-%s" % (self.id, self.seed)
def _to_surt(self, url):
hurl = surt.handyurl.parse(url)