fix for incorrect reliable to dead state transition

This commit is contained in:
Christien Rioux 2024-06-28 16:38:26 +00:00
parent 4f9e19642c
commit 8aa5c8c5bb

View file

@ -24,7 +24,7 @@ const UNRELIABLE_PING_SPAN_SECS: u32 = 60;
const UNRELIABLE_PING_INTERVAL_SECS: u32 = 5; const UNRELIABLE_PING_INTERVAL_SECS: u32 = 5;
/// How many times do we try to ping a never-reached node before we call it dead /// How many times do we try to ping a never-reached node before we call it dead
const NEVER_REACHED_PING_COUNT: u32 = 3; const NEVER_SEEN_PING_COUNT: u32 = 3;
#[derive(Debug, Copy, Clone, PartialEq, Eq)] #[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub(crate) enum BucketEntryDeadReason { pub(crate) enum BucketEntryDeadReason {
@ -726,8 +726,8 @@ impl BucketEntryInner {
None => Some(BucketEntryUnreliableReason::NotSeenConsecutively), None => Some(BucketEntryUnreliableReason::NotSeenConsecutively),
// If we have seen the node consistently for longer than UNRELIABLE_PING_SPAN_SECS then it is reliable // If we have seen the node consistently for longer than UNRELIABLE_PING_SPAN_SECS then it is reliable
Some(ts) => { Some(ts) => {
let is_reliable = cur_ts.saturating_sub(ts) >= TimestampDuration::new(UNRELIABLE_PING_SPAN_SECS as u64 * 1000000u64); let seen_consecutively = cur_ts.saturating_sub(ts) >= TimestampDuration::new(UNRELIABLE_PING_SPAN_SECS as u64 * 1000000u64);
if is_reliable { if seen_consecutively {
None None
} else { } else {
Some(BucketEntryUnreliableReason::InUnreliablePingSpan) Some(BucketEntryUnreliableReason::InUnreliablePingSpan)
@ -738,7 +738,7 @@ impl BucketEntryInner {
pub(super) fn check_dead(&self, cur_ts: Timestamp) -> Option<BucketEntryDeadReason> { pub(super) fn check_dead(&self, cur_ts: Timestamp) -> Option<BucketEntryDeadReason> {
// If we have failed to send NEVER_REACHED_PING_COUNT times in a row, the node is dead // If we have failed to send NEVER_REACHED_PING_COUNT times in a row, the node is dead
if self.peer_stats.rpc_stats.failed_to_send >= NEVER_REACHED_PING_COUNT { if self.peer_stats.rpc_stats.failed_to_send >= NEVER_SEEN_PING_COUNT {
return Some(BucketEntryDeadReason::FailedToSend); return Some(BucketEntryDeadReason::FailedToSend);
} }
@ -746,8 +746,8 @@ impl BucketEntryInner {
// a node is not dead if we haven't heard from it yet, // a node is not dead if we haven't heard from it yet,
// but we give it NEVER_REACHED_PING_COUNT chances to ping before we say it's dead // but we give it NEVER_REACHED_PING_COUNT chances to ping before we say it's dead
None => { None => {
let is_dead = self.peer_stats.rpc_stats.recent_lost_answers >= NEVER_REACHED_PING_COUNT; let no_answers = self.peer_stats.rpc_stats.recent_lost_answers >= NEVER_SEEN_PING_COUNT;
if is_dead { if no_answers {
Some(BucketEntryDeadReason::TooManyLostAnswers) Some(BucketEntryDeadReason::TooManyLostAnswers)
} else { } else {
None None
@ -755,9 +755,11 @@ impl BucketEntryInner {
} }
// return dead if we have not heard from the node at all for the duration of the unreliable ping span // return dead if we have not heard from the node at all for the duration of the unreliable ping span
// and we have tried to reach it and failed the entire time of unreliable ping span
Some(ts) => { Some(ts) => {
let is_dead = cur_ts.saturating_sub(ts) >= TimestampDuration::new(UNRELIABLE_PING_SPAN_SECS as u64 * 1000000u64); let not_seen = cur_ts.saturating_sub(ts) >= TimestampDuration::new(UNRELIABLE_PING_SPAN_SECS as u64 * 1000000u64);
if is_dead { let no_answers = self.peer_stats.rpc_stats.recent_lost_answers >= (UNRELIABLE_PING_SPAN_SECS / UNRELIABLE_PING_INTERVAL_SECS);
if not_seen && no_answers {
Some(BucketEntryDeadReason::NoPingResponse) Some(BucketEntryDeadReason::NoPingResponse)
} else { } else {
None None
@ -824,7 +826,7 @@ impl BucketEntryInner {
} }
BucketEntryState::Unreliable => { BucketEntryState::Unreliable => {
// If we are in an unreliable state, we need a ping every UNRELIABLE_PING_INTERVAL_SECS seconds // If we are in an unreliable state, we need a ping every UNRELIABLE_PING_INTERVAL_SECS seconds
self.needs_constant_ping(cur_ts, TimestampDuration::new(UNRELIABLE_PING_INTERVAL_SECS as u64 * 1000000u64)) self.needs_constant_ping(cur_ts, TimestampDuration::new(UNRELIABLE_PING_INTERVAL_SECS as u64 * 1_000_000u64))
} }
BucketEntryState::Dead => { BucketEntryState::Dead => {
error!("Should not be asking this for dead nodes"); error!("Should not be asking this for dead nodes");