fix for incorrect reliable to dead state transition

This commit is contained in:
Christien Rioux 2024-06-28 16:38:26 +00:00
parent 4f9e19642c
commit 8aa5c8c5bb

View File

@ -24,7 +24,7 @@ const UNRELIABLE_PING_SPAN_SECS: u32 = 60;
const UNRELIABLE_PING_INTERVAL_SECS: u32 = 5;
/// How many times do we try to ping a never-reached node before we call it dead
const NEVER_REACHED_PING_COUNT: u32 = 3;
const NEVER_SEEN_PING_COUNT: u32 = 3;
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub(crate) enum BucketEntryDeadReason {
@ -726,8 +726,8 @@ impl BucketEntryInner {
None => Some(BucketEntryUnreliableReason::NotSeenConsecutively),
// If we have seen the node consistently for longer than UNRELIABLE_PING_SPAN_SECS then it is reliable
Some(ts) => {
let is_reliable = cur_ts.saturating_sub(ts) >= TimestampDuration::new(UNRELIABLE_PING_SPAN_SECS as u64 * 1000000u64);
if is_reliable {
let seen_consecutively = cur_ts.saturating_sub(ts) >= TimestampDuration::new(UNRELIABLE_PING_SPAN_SECS as u64 * 1000000u64);
if seen_consecutively {
None
} else {
Some(BucketEntryUnreliableReason::InUnreliablePingSpan)
@ -738,7 +738,7 @@ impl BucketEntryInner {
pub(super) fn check_dead(&self, cur_ts: Timestamp) -> Option<BucketEntryDeadReason> {
// If we have failed to send NEVER_REACHED_PING_COUNT times in a row, the node is dead
if self.peer_stats.rpc_stats.failed_to_send >= NEVER_REACHED_PING_COUNT {
if self.peer_stats.rpc_stats.failed_to_send >= NEVER_SEEN_PING_COUNT {
return Some(BucketEntryDeadReason::FailedToSend);
}
@ -746,8 +746,8 @@ impl BucketEntryInner {
// a node is not dead if we haven't heard from it yet,
// but we give it NEVER_REACHED_PING_COUNT chances to ping before we say it's dead
None => {
let is_dead = self.peer_stats.rpc_stats.recent_lost_answers >= NEVER_REACHED_PING_COUNT;
if is_dead {
let no_answers = self.peer_stats.rpc_stats.recent_lost_answers >= NEVER_SEEN_PING_COUNT;
if no_answers {
Some(BucketEntryDeadReason::TooManyLostAnswers)
} else {
None
@ -755,9 +755,11 @@ impl BucketEntryInner {
}
// return dead if we have not heard from the node at all for the duration of the unreliable ping span
// and we have tried to reach it and failed the entire time of unreliable ping span
Some(ts) => {
let is_dead = cur_ts.saturating_sub(ts) >= TimestampDuration::new(UNRELIABLE_PING_SPAN_SECS as u64 * 1000000u64);
if is_dead {
let not_seen = cur_ts.saturating_sub(ts) >= TimestampDuration::new(UNRELIABLE_PING_SPAN_SECS as u64 * 1000000u64);
let no_answers = self.peer_stats.rpc_stats.recent_lost_answers >= (UNRELIABLE_PING_SPAN_SECS / UNRELIABLE_PING_INTERVAL_SECS);
if not_seen && no_answers {
Some(BucketEntryDeadReason::NoPingResponse)
} else {
None
@ -824,7 +826,7 @@ impl BucketEntryInner {
}
BucketEntryState::Unreliable => {
// If we are in an unreliable state, we need a ping every UNRELIABLE_PING_INTERVAL_SECS seconds
self.needs_constant_ping(cur_ts, TimestampDuration::new(UNRELIABLE_PING_INTERVAL_SECS as u64 * 1000000u64))
self.needs_constant_ping(cur_ts, TimestampDuration::new(UNRELIABLE_PING_INTERVAL_SECS as u64 * 1_000_000u64))
}
BucketEntryState::Dead => {
error!("Should not be asking this for dead nodes");