diff --git a/veilid-core/src/network_manager/address_filter.rs b/veilid-core/src/network_manager/address_filter.rs index f2b3c3dd..7cd281b9 100644 --- a/veilid-core/src/network_manager/address_filter.rs +++ b/veilid-core/src/network_manager/address_filter.rs @@ -27,9 +27,9 @@ struct AddressFilterInner { conn_count_by_ip6_prefix: BTreeMap, conn_timestamps_by_ip4: BTreeMap>, conn_timestamps_by_ip6_prefix: BTreeMap>, - punishments_by_ip4: BTreeMap, - punishments_by_ip6_prefix: BTreeMap, - punishments_by_node_id: BTreeMap, + punishments_by_ip4: BTreeMap, + punishments_by_ip6_prefix: BTreeMap, + punishments_by_node_id: BTreeMap, dial_info_failures: BTreeMap, } @@ -194,7 +194,7 @@ impl AddressFilter { inner.punishments_by_node_id.remove(&key); // make the entry alive again if it's still here if let Ok(Some(nr)) = self.unlocked_inner.routing_table.lookup_node_ref(key) { - nr.operate_mut(|_rti, e| e.set_punished(false)); + nr.operate_mut(|_rti, e| e.set_punished(None)); } } } @@ -314,10 +314,10 @@ impl AddressFilter { self.is_node_id_punished_inner(&inner, node_id) } - pub fn punish_node_id(&self, node_id: TypedKey) { + pub fn punish_node_id(&self, node_id: TypedKey, punishment_reason: PunishmentReason) { if let Ok(Some(nr)) = self.unlocked_inner.routing_table.lookup_node_ref(node_id) { // make the entry dead if it's punished - nr.operate_mut(|_rti, e| e.set_punished(true)); + nr.operate_mut(|_rti, e| e.set_punished(Some(punishment_reason))); } let ts = get_aligned_timestamp(); @@ -327,7 +327,7 @@ impl AddressFilter { log_net!(debug ">>> PUNISHMENT TABLE FULL: {}", node_id); return; } - log_net!(debug ">>> PUNISHED: {}", node_id); + log_net!(debug ">>> PUNISHED: {} for {:?}", node_id, punishment_reason); inner .punishments_by_node_id .entry(node_id) diff --git a/veilid-core/src/network_manager/types/mod.rs b/veilid-core/src/network_manager/types/mod.rs index 56aba08d..275ab640 100644 --- a/veilid-core/src/network_manager/types/mod.rs +++ 
b/veilid-core/src/network_manager/types/mod.rs @@ -8,6 +8,7 @@ mod low_level_protocol_type; mod network_class; mod peer_address; mod protocol_type; +mod punishment; mod relay_kind; mod signal_info; mod socket_address; @@ -24,6 +25,7 @@ pub use low_level_protocol_type::*; pub use network_class::*; pub use peer_address::*; pub use protocol_type::*; +pub use punishment::*; pub use relay_kind::*; pub use signal_info::*; pub use socket_address::*; diff --git a/veilid-core/src/routing_table/bucket.rs b/veilid-core/src/routing_table/bucket.rs index 2f367aa3..63a3d3d5 100644 --- a/veilid-core/src/routing_table/bucket.rs +++ b/veilid-core/src/routing_table/bucket.rs @@ -142,8 +142,8 @@ impl Bucket { } a.1.with_inner(|ea| { b.1.with_inner(|eb| { - let astate = state_ordering(ea.state(cur_ts)); - let bstate = state_ordering(eb.state(cur_ts)); + let astate = state_ordering(ea.state_reason(cur_ts)); + let bstate = state_ordering(eb.state_reason(cur_ts)); // first kick dead nodes, then unreliable nodes if astate < bstate { return core::cmp::Ordering::Less; diff --git a/veilid-core/src/routing_table/bucket_entry.rs b/veilid-core/src/routing_table/bucket_entry.rs index 5497f81f..5a3218d0 100644 --- a/veilid-core/src/routing_table/bucket_entry.rs +++ b/veilid-core/src/routing_table/bucket_entry.rs @@ -26,15 +26,56 @@ const UNRELIABLE_PING_INTERVAL_SECS: u32 = 5; /// How many times do we try to ping a never-reached node before we call it dead const NEVER_REACHED_PING_COUNT: u32 = 3; -// Do not change order here, it will mess up other sorts +// Bucket entry state reasons +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +enum BucketEntryPunishedReason { + Punished, +} -#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +enum BucketEntryDeadReason { + FailedToSend, + TooManyLostAnswers, + NoPingResponse, +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +enum BucketEntryUnreliableReason { + FailedToSend, + LostAnswers, + 
NotSeenConsecutively, + InUnreliablePingSpan, +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub(crate) enum BucketEntryStateReason { + Punished(BucketEntryPunishedReason), + Dead(BucketEntryDeadReason), + Unreliable(BucketEntryUnreliableReason), + Reliable, +} + // Do not change order here, it will mess up other sorts -#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] pub(crate) enum BucketEntryState { + Punished, Dead, Unreliable, Reliable, } +impl From<BucketEntryStateReason> for BucketEntryState { + fn from(value: BucketEntryStateReason) -> Self + { + match value { + BucketEntryStateReason::Punished(_) => BucketEntryState::Punished, + BucketEntryStateReason::Dead(_) => BucketEntryState::Dead, + BucketEntryStateReason::Unreliable(_) => BucketEntryState::Unreliable, + BucketEntryStateReason::Reliable => BucketEntryState::Reliable, + } + } +} + + #[derive(Debug, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] pub(crate) struct LastFlowKey(ProtocolType, AddressType); @@ -92,7 +133,7 @@ pub(crate) struct BucketEntryInner { transfer_stats_accounting: TransferStatsAccounting, /// If the entry is being punished and should be considered dead #[serde(skip)] - is_punished: bool, + is_punished: Option<PunishmentReason>, /// Tracking identifier for NodeRef debugging #[cfg(feature = "tracking")] #[serde(skip)] @@ -202,7 +243,7 @@ impl BucketEntryInner { // Less is more reliable then faster pub fn cmp_fastest_reliable(cur_ts: Timestamp, e1: &Self, e2: &Self) -> std::cmp::Ordering { // Reverse compare so most reliable is at front - let ret = e2.state(cur_ts).cmp(&e1.state(cur_ts)); + let ret = BucketEntryState::from(e2.state_reason(cur_ts)).cmp(&BucketEntryState::from(e1.state_reason(cur_ts))); if ret != std::cmp::Ordering::Equal { return ret; } @@ -224,7 +265,7 @@ impl BucketEntryInner { // Less is more reliable then older pub fn cmp_oldest_reliable(cur_ts: Timestamp, e1: &Self, e2: &Self) -> std::cmp::Ordering { // Reverse compare so most reliable is at front - let ret = e2.state(cur_ts).cmp(&e1.state(cur_ts)); + let ret = BucketEntryState::from(e2.state_reason(cur_ts)).cmp(&BucketEntryState::from(e1.state_reason(cur_ts))); if ret != std::cmp::Ordering::Equal {
return ret; } @@ -558,19 +599,17 @@ impl BucketEntryInner { self.envelope_support.iter().rev().find(|x| VALID_ENVELOPE_VERSIONS.contains(x)).copied() } - pub fn state(&self, cur_ts: Timestamp) -> BucketEntryState { - if self.is_punished { - return BucketEntryState::Dead; - } - if self.check_reliable(cur_ts) { - BucketEntryState::Reliable - } else if self.check_dead(cur_ts) { - BucketEntryState::Dead + pub fn state_reason(&self, cur_ts: Timestamp) -> BucketEntryStateReason { + if self.is_punished.is_some() { + return BucketEntryStateReason::Punished(BucketEntryPunishedReason::Punished); + } + if let Some(dead_reason) = self.check_dead(cur_ts) { + BucketEntryStateReason::Dead(dead_reason) + } else if let Some(unreliable_reason) = self.check_unreliable(cur_ts) { + BucketEntryStateReason::Unreliable(unreliable_reason) } else { - BucketEntryState::Unreliable + BucketEntryStateReason::Reliable } } - pub fn set_punished(&mut self, punished: bool) { + pub fn set_punished(&mut self, punished: Option<PunishmentReason>) { self.is_punished = punished; if self.is_punished.is_some() { self.clear_last_flows(); @@ -650,40 +689,58 @@ impl BucketEntryInner { } ///// state machine handling - pub(super) fn check_reliable(&self, cur_ts: Timestamp) -> bool { + pub(super) fn check_unreliable(&self, cur_ts: Timestamp) -> Option<BucketEntryUnreliableReason> { // If we have had any failures to send, this is not reliable if self.peer_stats.rpc_stats.failed_to_send > 0 { - return false; + return Some(BucketEntryUnreliableReason::FailedToSend); } // If we have had any lost answers recently, this is not reliable if self.peer_stats.rpc_stats.recent_lost_answers > 0 { - return false; + return Some(BucketEntryUnreliableReason::LostAnswers); } match self.peer_stats.rpc_stats.first_consecutive_seen_ts { // If we have not seen seen a node consecutively, it can't be reliable - None => false, + None => Some(BucketEntryUnreliableReason::NotSeenConsecutively), // If we have seen the node consistently for longer than UNRELIABLE_PING_SPAN_SECS then it is reliable Some(ts) => { - cur_ts.saturating_sub(ts) >= TimestampDuration::new(UNRELIABLE_PING_SPAN_SECS as u64 * 1000000u64) + let
is_reliable = cur_ts.saturating_sub(ts) >= TimestampDuration::new(UNRELIABLE_PING_SPAN_SECS as u64 * 1000000u64); + if is_reliable { + None + } else { + Some(BucketEntryUnreliableReason::InUnreliablePingSpan) + } } } } - pub(super) fn check_dead(&self, cur_ts: Timestamp) -> bool { + pub(super) fn check_dead(&self, cur_ts: Timestamp) -> Option<BucketEntryDeadReason> { + // If we have failed to send NEVER_REACHED_PING_COUNT times in a row, the node is dead if self.peer_stats.rpc_stats.failed_to_send >= NEVER_REACHED_PING_COUNT { - return true; + return Some(BucketEntryDeadReason::FailedToSend); } match self.peer_stats.rpc_stats.last_seen_ts { // a node is not dead if we haven't heard from it yet, // but we give it NEVER_REACHED_PING_COUNT chances to ping before we say it's dead - None => self.peer_stats.rpc_stats.recent_lost_answers >= NEVER_REACHED_PING_COUNT, + None => { + let is_dead = self.peer_stats.rpc_stats.recent_lost_answers >= NEVER_REACHED_PING_COUNT; + if is_dead { + Some(BucketEntryDeadReason::TooManyLostAnswers) + } else { + None + } + } // return dead if we have not heard from the node at all for the duration of the unreliable ping span Some(ts) => { - cur_ts.saturating_sub(ts) >= TimestampDuration::new(UNRELIABLE_PING_SPAN_SECS as u64 * 1000000u64) + let is_dead = cur_ts.saturating_sub(ts) >= TimestampDuration::new(UNRELIABLE_PING_SPAN_SECS as u64 * 1000000u64); + if is_dead { + Some(BucketEntryDeadReason::NoPingResponse) + } else { + None + } } } } @@ -712,7 +769,7 @@ impl BucketEntryInner { // Check if this node needs a ping right now to validate it is still reachable pub(super) fn needs_ping(&self, cur_ts: Timestamp) -> bool { // See which ping pattern we are to use - let state = self.state(cur_ts); + let state = BucketEntryState::from(self.state_reason(cur_ts)); match state { BucketEntryState::Reliable => { @@ -798,7 +855,7 @@ impl BucketEntryInner { format!( "state: {:?}, first_consecutive_seen_ts: {}, last_seen_ts: {}", - self.state(cur_ts), + self.state_reason(cur_ts),
first_consecutive_seen_ts, last_seen_ts_str ) diff --git a/veilid-core/src/routing_table/debug.rs b/veilid-core/src/routing_table/debug.rs index 3691d279..b58969e6 100644 --- a/veilid-core/src/routing_table/debug.rs +++ b/veilid-core/src/routing_table/debug.rs @@ -134,7 +134,7 @@ impl RoutingTable { let cap_match = e.1.with(inner, |_rti, e| { e.has_all_capabilities(RoutingDomain::PublicInternet, &capabilities) }); - let state = e.1.with(inner, |_rti, e| e.state(cur_ts)); + let state = e.1.with(inner, |_rti, e| BucketEntryState::from(e.state_reason(cur_ts))); state >= min_state && cap_match }) .collect(); @@ -142,7 +142,7 @@ impl RoutingTable { if !filtered_entries.is_empty() { out += &format!("{} Bucket #{}:\n", ck, b); for e in filtered_entries { - let state = e.1.with(inner, |_rti, e| e.state(cur_ts)); + let state = e.1.with(inner, |_rti, e| e.state_reason(cur_ts)); out += &format!( " {} [{}] {} [{}]\n", e.0.encode(), @@ -210,7 +210,7 @@ impl RoutingTable { while c < COLS { let mut cnt = 0; for e in inner.buckets[ck][b].entries() { - if e.1.with(inner, |_rti, e| e.state(cur_ts) >= min_state) { + if e.1 + .with(inner, |_rti, e| BucketEntryState::from(e.state_reason(cur_ts)) >= min_state) + { cnt += 1; } } diff --git a/veilid-core/src/routing_table/node_ref.rs b/veilid-core/src/routing_table/node_ref.rs index 5abc3dd0..5165db4d 100644 --- a/veilid-core/src/routing_table/node_ref.rs +++ b/veilid-core/src/routing_table/node_ref.rs @@ -130,7 +130,7 @@ pub(crate) trait NodeRefBase: Sized { self.operate(|_rti, e| e.best_envelope_version()) } fn state(&self, cur_ts: Timestamp) -> BucketEntryState { - self.operate(|_rti, e| e.state(cur_ts)) + self.operate(|_rti, e| e.state_reason(cur_ts).into()) } fn peer_stats(&self) -> PeerStats { self.operate(|_rti, e| e.peer_stats().clone()) diff --git a/veilid-core/src/routing_table/routing_table_inner.rs b/veilid-core/src/routing_table/routing_table_inner.rs index bca042f7..d912aa95 100644 --- a/veilid-core/src/routing_table/routing_table_inner.rs +++
b/veilid-core/src/routing_table/routing_table_inner.rs @@ -479,7 +479,7 @@ impl RoutingTableInner { mut f: F, ) -> Option { for entry in &self.all_entries { - if entry.with_inner(|e| e.state(cur_ts) >= min_state) { + if entry.with_inner(|e| BucketEntryState::from(e.state_reason(cur_ts)) >= min_state) { if let Some(out) = f(self, entry) { return Some(out); } @@ -498,7 +498,7 @@ ) -> Option { let mut entries = Vec::with_capacity(self.all_entries.len()); for entry in self.all_entries.iter() { - if entry.with_inner(|e| e.state(cur_ts) >= min_state) { + if entry.with_inner(|e| BucketEntryState::from(e.state_reason(cur_ts)) >= min_state) { entries.push(entry); } } @@ -879,7 +879,7 @@ impl RoutingTableInner { let cur_ts = get_aligned_timestamp(); for entry in self.all_entries.iter() { - match entry.with_inner(|e| e.state(cur_ts)) { + match entry.with_inner(|e| BucketEntryState::from(e.state_reason(cur_ts))) { BucketEntryState::Reliable => { reliable_entry_count += 1; } @@ -1070,7 +1070,7 @@ impl RoutingTableInner { move |_rti: &RoutingTableInner, v: Option>| { if let Some(entry) = &v { // always filter out dead nodes - !entry.with_inner(|e| e.state(cur_ts) == BucketEntryState::Dead) + !entry.with_inner(|e| BucketEntryState::from(e.state_reason(cur_ts)) == BucketEntryState::Dead) } else { // always filter out self peer, as it is irrelevant to the 'fastest nodes' search false } @@ -1106,8 +1106,8 @@ impl RoutingTableInner { let be = b_entry.as_ref().unwrap(); ae.with_inner(|ae| { be.with_inner(|be| { - let ra = ae.check_reliable(cur_ts); - let rb = be.check_reliable(cur_ts); + let ra = ae.check_unreliable(cur_ts).is_none(); + let rb = be.check_unreliable(cur_ts).is_none(); if ra != rb { if ra { return core::cmp::Ordering::Less; } @@ -1189,10 +1189,10 @@ impl RoutingTableInner { // reliable nodes come first, pessimistically treating our own node as unreliable let ra = a_entry .as_ref() - .map_or(false, |x| x.with_inner(|x| x.check_reliable(cur_ts))); + .map_or(false, |x| x.with_inner(|x| x.check_unreliable(cur_ts).is_none())); let rb = b_entry .as_ref() - .map_or(false, |x|
x.with_inner(|x| x.check_unreliable(cur_ts).is_none())); + .map_or(false, |x| x.with_inner(|x| x.check_unreliable(cur_ts).is_none())); if ra != rb { if ra { return core::cmp::Ordering::Less; } diff --git a/veilid-core/src/routing_table/tasks/kick_buckets.rs b/veilid-core/src/routing_table/tasks/kick_buckets.rs index dd58472e..39628492 100644 --- a/veilid-core/src/routing_table/tasks/kick_buckets.rs +++ b/veilid-core/src/routing_table/tasks/kick_buckets.rs @@ -52,7 +52,7 @@ impl RoutingTable { continue; } - let state = entry.with(&inner, |_rti, e| e.state(cur_ts)); + let state = entry.with(&inner, |_rti, e| BucketEntryState::from(e.state_reason(cur_ts))); match state { BucketEntryState::Dead => { // Do nothing with dead entries