From e0c20ed4c909c2d9b2b508141dbdfde8a67a2110 Mon Sep 17 00:00:00 2001 From: Christien Rioux Date: Mon, 5 Aug 2024 16:10:19 -0500 Subject: [PATCH] improve error handling for network errors make relay requirements more aggressive --- .../network_manager/native/protocol/tcp.rs | 2 +- .../src/network_manager/native/protocol/ws.rs | 6 ++++- .../src/network_manager/network_connection.rs | 8 +++---- .../routing_table/tasks/relay_management.rs | 6 +++++ veilid-core/src/rpc_processor/mod.rs | 22 +++++++++++++++---- veilid-core/src/veilid_api/types/stats.rs | 2 +- veilid-tools/src/network_result.rs | 2 ++ 7 files changed, 37 insertions(+), 11 deletions(-) diff --git a/veilid-core/src/network_manager/native/protocol/tcp.rs b/veilid-core/src/network_manager/native/protocol/tcp.rs index 0acbb97d..c2919c77 100644 --- a/veilid-core/src/network_manager/native/protocol/tcp.rs +++ b/veilid-core/src/network_manager/native/protocol/tcp.rs @@ -151,7 +151,7 @@ impl RawTcpProtocolHandler { ps, )); - log_net!(debug "Connection accepted from: {} (TCP)", socket_addr); + log_net!("Connection accepted from: {} (TCP)", socket_addr); Ok(Some(conn)) } diff --git a/veilid-core/src/network_manager/native/protocol/ws.rs b/veilid-core/src/network_manager/native/protocol/ws.rs index 471fde5f..6ef18bec 100644 --- a/veilid-core/src/network_manager/native/protocol/ws.rs +++ b/veilid-core/src/network_manager/native/protocol/ws.rs @@ -286,7 +286,11 @@ impl WebsocketProtocolHandler { ws_stream, )); - log_net!(debug "Connection accepted from: {} ({})", socket_addr, if self.arc.tls { "WSS" } else { "WS" }); + log_net!( + "Connection accepted from: {} ({})", + socket_addr, + if self.arc.tls { "WSS" } else { "WS" } + ); Ok(Some(conn)) } diff --git a/veilid-core/src/network_manager/network_connection.rs b/veilid-core/src/network_manager/network_connection.rs index ca63d891..b55795e8 100644 --- a/veilid-core/src/network_manager/network_connection.rs +++ b/veilid-core/src/network_manager/network_connection.rs @@ -279,7 +279,7 @@ impl NetworkConnection { ) -> SendPinBoxFuture<()> { Box::pin(async move { log_net!( - "== Starting process_connection loop for id={}, {:?}", connection_id, + "Starting process_connection loop for id={}, {:?}", connection_id, flow ); @@ -294,7 +294,7 @@ impl NetworkConnection { let new_timer = || { sleep(connection_manager.connection_inactivity_timeout_ms()).then(|_| async { // timeout - log_net!("== Connection timeout on {:?}", flow); + log_net!("Connection timeout on {:?}", flow); RecvLoopAction::Timeout }) }; @@ -354,7 +354,7 @@ impl NetworkConnection { // Check for connection close if v.is_no_connection() { - log_net!(debug "Connection closed from: {} ({})", peer_address.socket_addr(), peer_address.protocol_type()); + log_net!("Connection closed from: {} ({})", peer_address.socket_addr(), peer_address.protocol_type()); return RecvLoopAction::Finish; } @@ -428,7 +428,7 @@ impl NetworkConnection { } log_net!( - "== Connection loop finished flow={:?}", + "Connection loop finished flow={:?}", flow ); diff --git a/veilid-core/src/routing_table/tasks/relay_management.rs b/veilid-core/src/routing_table/tasks/relay_management.rs index 1577d8bd..e9e9d647 100644 --- a/veilid-core/src/routing_table/tasks/relay_management.rs +++ b/veilid-core/src/routing_table/tasks/relay_management.rs @@ -172,6 +172,12 @@ impl RoutingTable { return false; }; + // Exclude any nodes that have 'failed to send' state indicating a + // connection drop or inability to reach the node + if e.peer_stats().rpc_stats.failed_to_send > 0 { + return false; + } + // Until we have a way of reducing a SignedRelayedNodeInfo to a SignedDirectNodeInfo // See https://gitlab.com/veilid/veilid/-/issues/381 // We should consider nodes with allocated relays as disqualified from being a relay themselves diff --git a/veilid-core/src/rpc_processor/mod.rs b/veilid-core/src/rpc_processor/mod.rs index 63316e49..48d39af4 100644 --- a/veilid-core/src/rpc_processor/mod.rs +++ b/veilid-core/src/rpc_processor/mod.rs @@ -924,7 +924,7 @@ impl RPCProcessor { remote_private_route: Option, ) { let wants_answer = matches!(rpc_kind, RPCKind::Question); - + // Record for node if this was not sent via a route if safety_route.is_none() && remote_private_route.is_none() { node_ref.stats_failed_to_send(send_ts, wants_answer); @@ -1500,7 +1500,7 @@ impl RPCProcessor { }, // Ignored messages that should be dropped RPCError::Ignore(_) | RPCError::Network(_) | RPCError::TryAgain(_) => { - log_rpc!(debug "Dropping RPC Operation: {}", e); + log_rpc!("Dropping RPC Operation: {}", e); }, // Internal errors that deserve louder logging RPCError::Unimplemented(_) | RPCError::Internal(_) => { @@ -1582,7 +1582,7 @@ impl RPCProcessor { Ok(v) => v, Err(e) => { // Debug on error - log_rpc!(debug "Dropping RPC operation: {}", e); + log_rpc!(debug "Dropping routed RPC: {}", e); // XXX: Punish routes that send routed undecodable crap // address_filter.punish_route_id(xxx, PunishmentReason::FailedToDecodeRoutedMessage); @@ -1666,7 +1666,21 @@ impl RPCProcessor { if let Err(e) = self.unlocked_inner .waiting_rpc_table .complete_op_waiter(op_id, msg) { - log_rpc!(debug "Operation id {} did not complete: {}", op_id, e); + match e { + RPCError::Unimplemented(_) | + RPCError::Internal(_) => { + log_rpc!(error "Could not complete rpc operation: id = {}: {}", op_id, e); + }, + RPCError::InvalidFormat(_) | + RPCError::Protocol(_) | + RPCError::Network(_) | + RPCError::TryAgain(_) => { + log_rpc!(debug "Could not complete rpc operation: id = {}: {}", op_id, e); + }, + RPCError::Ignore(_) => { + log_rpc!("Answer late: id = {}", op_id); + }, + }; // Don't throw an error here because it's okay if the original operation timed out } Ok(NetworkResult::value(())) diff --git a/veilid-core/src/veilid_api/types/stats.rs b/veilid-core/src/veilid_api/types/stats.rs index 5a6b27a8..c5d4ea55 100644 --- a/veilid-core/src/veilid_api/types/stats.rs +++ b/veilid-core/src/veilid_api/types/stats.rs @@ -34,7 +34,7 @@ pub struct RPCStats { pub last_seen_ts: Option, // when the peer was last seen for any reason, including when we first attempted to reach out to it pub first_consecutive_seen_ts: Option, // the timestamp of the first consecutive proof-of-life for this node (an answer or received question) pub recent_lost_answers: u32, // number of answers that have been lost since we lost reliability - pub failed_to_send: u32, // number of messages that have failed to send since we last successfully sent one + pub failed_to_send: u32, // number of messages that have failed to send or connections dropped since we last successfully sent one } #[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize, JsonSchema)] diff --git a/veilid-tools/src/network_result.rs b/veilid-tools/src/network_result.rs index faf63260..9fe1698a 100644 --- a/veilid-tools/src/network_result.rs +++ b/veilid-tools/src/network_result.rs @@ -32,6 +32,7 @@ impl IoNetworkResultExt for io::Result { // Err(e) => match e.kind() { // io::ErrorKind::TimedOut => Ok(NetworkResult::Timeout), // io::ErrorKind::UnexpectedEof + // | io::ErrorKind::NotConnected // | io::ErrorKind::BrokenPipe // | io::ErrorKind::ConnectionAborted // | io::ErrorKind::ConnectionRefused @@ -52,6 +53,7 @@ impl IoNetworkResultExt for io::Result { match e.kind() { io::ErrorKind::TimedOut => Ok(NetworkResult::Timeout), io::ErrorKind::UnexpectedEof + | io::ErrorKind::NotConnected | io::ErrorKind::BrokenPipe | io::ErrorKind::ConnectionAborted | io::ErrorKind::ConnectionRefused