diff --git a/brozzler/frontier.py b/brozzler/frontier.py index 9f06115..a634a58 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -99,19 +99,23 @@ class RethinkDbFrontier: .order_by(index="sites_last_disclaimed") .filter( (rethinkdb.row["claimed"] != True) | - (rethinkdb.row["last_disclaimed"] + (rethinkdb.row["last_claimed"] < rethinkdb.now() - 2*60*60)) .limit(1) - .update({"claimed":True,"last_claimed_by":worker_id}, - return_changes=True)).run() + .update({ + "claimed": True, + "last_claimed_by": worker_id, + "last_claimed": rethinkstuff.utcnow(), + }, return_changes=True)).run() self._vet_result(result, replaced=[0,1], unchanged=[0,1]) if result["replaced"] == 1: if result["changes"][0]["old_val"]["claimed"]: self.logger.warn( "re-claimed site that was still marked 'claimed' " - "because it was last disclaimed a long time ago " - "at %s", - result["changes"][0]["old_val"]["last_disclaimed"]) + "because it was last claimed a long time ago " + "at %s, and presumably some error stopped it from " + "being disclaimed", + result["changes"][0]["old_val"]["last_claimed"]) site = brozzler.Site(**result["changes"][0]["new_val"]) else: raise brozzler.NothingToClaim diff --git a/brozzler/site.py b/brozzler/site.py index b74d0e7..a6ec52d 100644 --- a/brozzler/site.py +++ b/brozzler/site.py @@ -12,11 +12,13 @@ _EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=rethinkstuff class Site(brozzler.BaseDictable): logger = logging.getLogger(__module__ + "." + __qualname__) - def __init__(self, seed, id=None, job_id=None, scope=None, proxy=None, - ignore_robots=False, time_limit=None, extra_headers=None, - enable_warcprox_features=False, reached_limit=None, status="ACTIVE", - claimed=False, start_time=None, last_disclaimed=_EPOCH_UTC, - last_claimed_by=None): + def __init__( + self, seed, id=None, job_id=None, scope=None, proxy=None, + ignore_robots=False, time_limit=None, extra_headers=None, + enable_warcprox_features=False, reached_limit=None, + status="ACTIVE", claimed=False, start_time=None, + last_disclaimed=_EPOCH_UTC, last_claimed_by=None, + last_claimed=_EPOCH_UTC): self.seed = seed self.id = id @@ -32,6 +34,7 @@ class Site(brozzler.BaseDictable): self.last_claimed_by = last_claimed_by self.start_time = start_time or rethinkstuff.utcnow() self.last_disclaimed = last_disclaimed + self.last_claimed = last_claimed self.scope = scope or {} if not "surt" in self.scope: @@ -44,7 +47,7 @@ class Site(brozzler.BaseDictable): self.ignore_robots, self.extra_headers, self.reached_limit) def __str__(self): - return "site-%s-%s" % (self.id, self.seed) + return "Site-%s-%s" % (self.id, self.seed) def _to_surt(self, url): hurl = surt.handyurl.parse(url)