mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-09-23 22:24:52 -04:00
fix bug where the first time a site was claimed, another brozzler-worker would claim it anyway (and find no pages to brozzle)
This commit is contained in:
parent
416aa064f8
commit
4bbbbcf138
2 changed files with 19 additions and 12 deletions
|
@@ -99,19 +99,23 @@ class RethinkDbFrontier:
|
||||||
.order_by(index="sites_last_disclaimed")
|
.order_by(index="sites_last_disclaimed")
|
||||||
.filter(
|
.filter(
|
||||||
(rethinkdb.row["claimed"] != True) |
|
(rethinkdb.row["claimed"] != True) |
|
||||||
(rethinkdb.row["last_disclaimed"]
|
(rethinkdb.row["last_claimed"]
|
||||||
< rethinkdb.now() - 2*60*60))
|
< rethinkdb.now() - 2*60*60))
|
||||||
.limit(1)
|
.limit(1)
|
||||||
.update({"claimed":True,"last_claimed_by":worker_id},
|
.update({
|
||||||
return_changes=True)).run()
|
"claimed": True,
|
||||||
|
"last_claimed_by": worker_id,
|
||||||
|
"last_claimed": rethinkstuff.utcnow(),
|
||||||
|
}, return_changes=True)).run()
|
||||||
self._vet_result(result, replaced=[0,1], unchanged=[0,1])
|
self._vet_result(result, replaced=[0,1], unchanged=[0,1])
|
||||||
if result["replaced"] == 1:
|
if result["replaced"] == 1:
|
||||||
if result["changes"][0]["old_val"]["claimed"]:
|
if result["changes"][0]["old_val"]["claimed"]:
|
||||||
self.logger.warn(
|
self.logger.warn(
|
||||||
"re-claimed site that was still marked 'claimed' "
|
"re-claimed site that was still marked 'claimed' "
|
||||||
"because it was last disclaimed a long time ago "
|
"because it was last claimed a long time ago "
|
||||||
"at %s",
|
"at %s, and presumably some error stopped it from "
|
||||||
result["changes"][0]["old_val"]["last_disclaimed"])
|
"being disclaimed",
|
||||||
|
result["changes"][0]["old_val"]["last_claimed"])
|
||||||
site = brozzler.Site(**result["changes"][0]["new_val"])
|
site = brozzler.Site(**result["changes"][0]["new_val"])
|
||||||
else:
|
else:
|
||||||
raise brozzler.NothingToClaim
|
raise brozzler.NothingToClaim
|
||||||
|
|
|
@@ -12,11 +12,13 @@ _EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=rethinkstuff
|
||||||
class Site(brozzler.BaseDictable):
|
class Site(brozzler.BaseDictable):
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||||
|
|
||||||
def __init__(self, seed, id=None, job_id=None, scope=None, proxy=None,
|
def __init__(
|
||||||
ignore_robots=False, time_limit=None, extra_headers=None,
|
self, seed, id=None, job_id=None, scope=None, proxy=None,
|
||||||
enable_warcprox_features=False, reached_limit=None, status="ACTIVE",
|
ignore_robots=False, time_limit=None, extra_headers=None,
|
||||||
claimed=False, start_time=None, last_disclaimed=_EPOCH_UTC,
|
enable_warcprox_features=False, reached_limit=None,
|
||||||
last_claimed_by=None):
|
status="ACTIVE", claimed=False, start_time=None,
|
||||||
|
last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
|
||||||
|
last_claimed=_EPOCH_UTC):
|
||||||
|
|
||||||
self.seed = seed
|
self.seed = seed
|
||||||
self.id = id
|
self.id = id
|
||||||
|
@@ -32,6 +34,7 @@ class Site(brozzler.BaseDictable):
|
||||||
self.last_claimed_by = last_claimed_by
|
self.last_claimed_by = last_claimed_by
|
||||||
self.start_time = start_time or rethinkstuff.utcnow()
|
self.start_time = start_time or rethinkstuff.utcnow()
|
||||||
self.last_disclaimed = last_disclaimed
|
self.last_disclaimed = last_disclaimed
|
||||||
|
self.last_claimed = last_claimed
|
||||||
|
|
||||||
self.scope = scope or {}
|
self.scope = scope or {}
|
||||||
if not "surt" in self.scope:
|
if not "surt" in self.scope:
|
||||||
|
@@ -44,7 +47,7 @@ class Site(brozzler.BaseDictable):
|
||||||
self.ignore_robots, self.extra_headers, self.reached_limit)
|
self.ignore_robots, self.extra_headers, self.reached_limit)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return "site-%s-%s" % (self.id, self.seed)
|
return "Site-%s-%s" % (self.id, self.seed)
|
||||||
|
|
||||||
def _to_surt(self, url):
|
def _to_surt(self, url):
|
||||||
hurl = surt.handyurl.parse(url)
|
hurl = surt.handyurl.parse(url)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue