mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 16:49:56 -05:00
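Fix bug where, the first time a site was claimed, another brozzler-worker would claim it anyway (and find no pages to brozzle). The stale-claim check now compares last_claimed, which is set on every claim, instead of last_disclaimed, which still holds its epoch default until the site is first disclaimed.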
This commit is contained in:
parent
416aa064f8
commit
4bbbbcf138
@ -99,19 +99,23 @@ class RethinkDbFrontier:
|
||||
.order_by(index="sites_last_disclaimed")
|
||||
.filter(
|
||||
(rethinkdb.row["claimed"] != True) |
|
||||
(rethinkdb.row["last_disclaimed"]
|
||||
(rethinkdb.row["last_claimed"]
|
||||
< rethinkdb.now() - 2*60*60))
|
||||
.limit(1)
|
||||
.update({"claimed":True,"last_claimed_by":worker_id},
|
||||
return_changes=True)).run()
|
||||
.update({
|
||||
"claimed": True,
|
||||
"last_claimed_by": worker_id,
|
||||
"last_claimed": rethinkstuff.utcnow(),
|
||||
}, return_changes=True)).run()
|
||||
self._vet_result(result, replaced=[0,1], unchanged=[0,1])
|
||||
if result["replaced"] == 1:
|
||||
if result["changes"][0]["old_val"]["claimed"]:
|
||||
self.logger.warn(
|
||||
"re-claimed site that was still marked 'claimed' "
|
||||
"because it was last disclaimed a long time ago "
|
||||
"at %s",
|
||||
result["changes"][0]["old_val"]["last_disclaimed"])
|
||||
"because it was last claimed a long time ago "
|
||||
"at %s, and presumably some error stopped it from "
|
||||
"being disclaimed",
|
||||
result["changes"][0]["old_val"]["last_claimed"])
|
||||
site = brozzler.Site(**result["changes"][0]["new_val"])
|
||||
else:
|
||||
raise brozzler.NothingToClaim
|
||||
|
@ -12,11 +12,13 @@ _EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=rethinkstuff
|
||||
class Site(brozzler.BaseDictable):
|
||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||
|
||||
def __init__(self, seed, id=None, job_id=None, scope=None, proxy=None,
|
||||
def __init__(
|
||||
self, seed, id=None, job_id=None, scope=None, proxy=None,
|
||||
ignore_robots=False, time_limit=None, extra_headers=None,
|
||||
enable_warcprox_features=False, reached_limit=None, status="ACTIVE",
|
||||
claimed=False, start_time=None, last_disclaimed=_EPOCH_UTC,
|
||||
last_claimed_by=None):
|
||||
enable_warcprox_features=False, reached_limit=None,
|
||||
status="ACTIVE", claimed=False, start_time=None,
|
||||
last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
|
||||
last_claimed=_EPOCH_UTC):
|
||||
|
||||
self.seed = seed
|
||||
self.id = id
|
||||
@ -32,6 +34,7 @@ class Site(brozzler.BaseDictable):
|
||||
self.last_claimed_by = last_claimed_by
|
||||
self.start_time = start_time or rethinkstuff.utcnow()
|
||||
self.last_disclaimed = last_disclaimed
|
||||
self.last_claimed = last_claimed
|
||||
|
||||
self.scope = scope or {}
|
||||
if not "surt" in self.scope:
|
||||
@ -44,7 +47,7 @@ class Site(brozzler.BaseDictable):
|
||||
self.ignore_robots, self.extra_headers, self.reached_limit)
|
||||
|
||||
def __str__(self):
|
||||
return "site-%s-%s" % (self.id, self.seed)
|
||||
return "Site-%s-%s" % (self.id, self.seed)
|
||||
|
||||
def _to_surt(self, url):
|
||||
hurl = surt.handyurl.parse(url)
|
||||
|
Loading…
x
Reference in New Issue
Block a user