fix for incorrect reliable to dead state transition

2025-08-08 14:42:36 -04:00 · 2024-06-28 16:38:26 +00:00 · 2024-06-28 16:38:26 +00:00 · 8aa5c8c5bb
commit 8aa5c8c5bb
parent 4f9e19642c
1 changed files with 11 additions and 9 deletions
--- a/veilid-core/src/routing_table/bucket_entry.rs
+++ b/veilid-core/src/routing_table/bucket_entry.rs
@ -24,7 +24,7 @@ const UNRELIABLE_PING_SPAN_SECS: u32 = 60;
 const UNRELIABLE_PING_INTERVAL_SECS: u32 = 5;

 /// How many times do we try to ping a never-reached node before we call it dead
-const NEVER_REACHED_PING_COUNT: u32 = 3;
+const NEVER_SEEN_PING_COUNT: u32 = 3;

 #[derive(Debug, Copy, Clone, PartialEq, Eq)]
 pub(crate) enum BucketEntryDeadReason {
@ -726,8 +726,8 @@ impl BucketEntryInner {
            None => Some(BucketEntryUnreliableReason::NotSeenConsecutively),
            // If we have seen the node consistently for longer than UNRELIABLE_PING_SPAN_SECS then it is reliable
            Some(ts) => {
-                let is_reliable = cur_ts.saturating_sub(ts) >= TimestampDuration::new(UNRELIABLE_PING_SPAN_SECS as u64 * 1000000u64);
-                if is_reliable {
+                let seen_consecutively = cur_ts.saturating_sub(ts) >= TimestampDuration::new(UNRELIABLE_PING_SPAN_SECS as u64 * 1000000u64);
+                if seen_consecutively {
                    None
                } else {
                    Some(BucketEntryUnreliableReason::InUnreliablePingSpan)
@ -738,7 +738,7 @@ impl BucketEntryInner {
    pub(super) fn check_dead(&self, cur_ts: Timestamp) -> Option<BucketEntryDeadReason> {
        
        // If we have failed to send NEVER_REACHED_PING_COUNT times in a row, the node is dead
-        if self.peer_stats.rpc_stats.failed_to_send >= NEVER_REACHED_PING_COUNT {
+        if self.peer_stats.rpc_stats.failed_to_send >= NEVER_SEEN_PING_COUNT {
            return Some(BucketEntryDeadReason::FailedToSend);
        }

@ -746,8 +746,8 @@ impl BucketEntryInner {
            // a node is not dead if we haven't heard from it yet,
            // but we give it NEVER_REACHED_PING_COUNT chances to ping before we say it's dead
            None => {
-                let is_dead = self.peer_stats.rpc_stats.recent_lost_answers >= NEVER_REACHED_PING_COUNT;
-                if is_dead {
+                let no_answers = self.peer_stats.rpc_stats.recent_lost_answers >= NEVER_SEEN_PING_COUNT;
+                if no_answers {
                    Some(BucketEntryDeadReason::TooManyLostAnswers)
                } else {
                    None
@ -755,9 +755,11 @@ impl BucketEntryInner {
            }
            
            // return dead if we have not heard from the node at all for the duration of the unreliable ping span
+            // and we have tried to reach it and failed the entire time of unreliable ping span
            Some(ts) => {
-                let is_dead = cur_ts.saturating_sub(ts) >= TimestampDuration::new(UNRELIABLE_PING_SPAN_SECS as u64 * 1000000u64);
-                if is_dead {
+                let not_seen = cur_ts.saturating_sub(ts) >= TimestampDuration::new(UNRELIABLE_PING_SPAN_SECS as u64 * 1000000u64);
+                let no_answers = self.peer_stats.rpc_stats.recent_lost_answers >= (UNRELIABLE_PING_SPAN_SECS / UNRELIABLE_PING_INTERVAL_SECS);
+                if not_seen && no_answers {
                    Some(BucketEntryDeadReason::NoPingResponse)
                } else {
                    None
@ -824,7 +826,7 @@ impl BucketEntryInner {
            }
            BucketEntryState::Unreliable => {
                // If we are in an unreliable state, we need a ping every UNRELIABLE_PING_INTERVAL_SECS seconds
-                self.needs_constant_ping(cur_ts, TimestampDuration::new(UNRELIABLE_PING_INTERVAL_SECS as u64 * 1000000u64))
+                self.needs_constant_ping(cur_ts, TimestampDuration::new(UNRELIABLE_PING_INTERVAL_SECS as u64 * 1_000_000u64))
            }
            BucketEntryState::Dead => {
                error!("Should not be asking this for dead nodes");