Tick lag

mirror of https://gitlab.com/veilid/veilid.git
parent 34c82ec128, commit 0ca19d6207
22 changed files with 650 additions and 367 deletions
@@ -38,6 +38,7 @@
 - `RecordKey`s are now validated on both server side and client side of DHT RPC operations, closes [#299](https://gitlab.com/veilid/veilid/-/issues/299)
 - Revert punishment for FailedToVerifySenderPeerInfo, with a better peer info filter, fixes [#470](https://gitlab.com/veilid/veilid/-/issues/470)
 - Update keyring-manager to eliminate licensing issue
+- Added 'tick lag' detection to check for missed watch updates

 - veilid-python:
   - Correction of type hints
@@ -28,11 +28,11 @@ fn format_ts(ts: &json::JsonValue) -> String {
         return "---".to_owned();
     }
     let ts = json_str_u64(ts);
-    let secs = timestamp_to_secs(ts);
+    let secs = timestamp_duration_to_secs(ts);
     if secs >= 1.0 {
-        format!("{:.2}s", timestamp_to_secs(ts))
+        format!("{:.2}s", secs)
     } else {
-        format!("{:.2}ms", timestamp_to_secs(ts) * 1000.0)
+        format!("{:.2}ms", secs * 1000.0)
     }
 }
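The fix in this hunk is twofold: the value being formatted is a duration rather than an absolute timestamp, so it now goes through the duration-aware helper, and the conversion is computed once into `secs` instead of three times. A minimal sketch of the resulting behavior, with the helper body copied from the tools.rs hunk later in this commit (the formatter function here is a stand-in for illustration, not the actual veilid-cli code):

// Stand-in conversion helper, same downshifting semantics as the one
// this commit adds to tools.rs
fn timestamp_duration_to_secs(dur_us: u64) -> f64 {
    // Downshift precision until the value fits in a u32, losing least
    // significant bits rather than losing accuracy at large magnitudes
    let mut mul: f64 = 1.0 / 1_000_000.0;
    let mut usec = dur_us;
    while usec > (u32::MAX as u64) {
        usec >>= 1;
        mul *= 2.0;
    }
    f64::from(usec as u32) * mul
}

// Format a microsecond duration the way the fixed formatter does:
// convert once, then branch on magnitude
fn format_duration_us(dur_us: u64) -> String {
    let secs = timestamp_duration_to_secs(dur_us);
    if secs >= 1.0 {
        format!("{:.2}s", secs)
    } else {
        format!("{:.2}ms", secs * 1000.0)
    }
}

fn main() {
    assert_eq!(format_duration_us(2_500_000), "2.50s");
    assert_eq!(format_duration_us(250_000), "250.00ms");
}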
@@ -3,6 +3,10 @@ use routing_table::RoutingTableHealth;

 impl_veilid_log_facility!("attach");

+const TICK_INTERVAL_MSEC: u32 = 1000;
+const ATTACHMENT_MAINTAINER_INTERVAL_MSEC: u32 = 1000;
+const BIND_WAIT_DELAY_MSEC: u32 = 10000;
+
 #[derive(Debug, Clone)]
 pub struct AttachmentManagerStartupContext {
     pub startup_lock: Arc<StartupLock>,
@@ -20,21 +24,56 @@ impl Default for AttachmentManagerStartupContext {
     }
 }

-#[derive(Debug)]
-struct AttachmentManagerInner {
-    last_attachment_state: AttachmentState,
-    last_routing_table_health: Option<Arc<RoutingTableHealth>>,
-    maintain_peers: bool,
-    started_ts: Timestamp,
-    attach_ts: Option<Timestamp>,
-    attachment_maintainer_jh: Option<MustJoinHandle<()>>,
+/// Event sent every second while veilid-core is initialized
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
+pub struct TickEvent {
+    pub last_tick_ts: Option<Timestamp>,
+    pub cur_tick_ts: Timestamp,
+}
+
+struct AttachmentManagerInner {
+    attachment_state: AttachmentState,
+    last_routing_table_health: Option<Arc<RoutingTableHealth>>,
+    maintain_peers: bool,
+    attach_enabled: bool,
+    started_ts: Timestamp,
+    attach_ts: Option<Timestamp>,
+    last_tick_ts: Option<Timestamp>,
+    tick_future: Option<PinBoxFutureStatic<()>>,
+    eventual_termination: Option<EventualValue<()>>,
+}
+
+impl fmt::Debug for AttachmentManagerInner {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("AttachmentManagerInner")
+            .field("attachment_state", &self.attachment_state)
+            .field("last_routing_table_health", &self.last_routing_table_health)
+            .field("maintain_peers", &self.maintain_peers)
+            .field("attach_enabled", &self.attach_enabled)
+            .field("started_ts", &self.started_ts)
+            .field("attach_ts", &self.attach_ts)
+            .field("last_tick_ts", &self.last_tick_ts)
+            .field("eventual_termination", &self.eventual_termination)
+            .finish()
+    }
 }

-#[derive(Debug)]
 pub struct AttachmentManager {
     registry: VeilidComponentRegistry,
     inner: Mutex<AttachmentManagerInner>,
     startup_context: AttachmentManagerStartupContext,
+    attachment_maintainer_task: TickTask<EyreReport>,
+}
+
+impl fmt::Debug for AttachmentManager {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("AttachmentManager")
+            // .field("registry", &self.registry)
+            .field("inner", &self.inner)
+            .field("startup_context", &self.startup_context)
+            // .field("attachment_maintainer_task", &self.attachment_maintainer_task)
+            .finish()
+    }
 }

 impl_veilid_component!(AttachmentManager);
@@ -42,12 +81,15 @@ impl_veilid_component!(AttachmentManager);
 impl AttachmentManager {
     fn new_inner() -> AttachmentManagerInner {
         AttachmentManagerInner {
-            last_attachment_state: AttachmentState::Detached,
+            attachment_state: AttachmentState::Detached,
             last_routing_table_health: None,
             maintain_peers: false,
+            attach_enabled: false,
             started_ts: Timestamp::now(),
             attach_ts: None,
-            attachment_maintainer_jh: None,
+            last_tick_ts: None,
+            tick_future: None,
+            eventual_termination: None,
         }
     }
     pub fn new(
@@ -58,18 +100,20 @@ impl AttachmentManager {
             registry,
             inner: Mutex::new(Self::new_inner()),
             startup_context,
+            attachment_maintainer_task: TickTask::new_ms(
+                "attachment_maintainer_task",
+                ATTACHMENT_MAINTAINER_INTERVAL_MSEC,
+            ),
         }
     }

     pub fn is_attached(&self) -> bool {
-        let s = self.inner.lock().last_attachment_state;
-        !matches!(s, AttachmentState::Detached | AttachmentState::Detaching)
+        self.inner.lock().attachment_state.is_attached()
     }

     #[allow(dead_code)]
     pub fn is_detached(&self) -> bool {
-        let s = self.inner.lock().last_attachment_state;
-        matches!(s, AttachmentState::Detached)
+        self.inner.lock().attachment_state.is_detached()
     }

     #[allow(dead_code)]
@@ -77,128 +121,242 @@ impl AttachmentManager {
         self.inner.lock().attach_ts
     }

-    fn translate_routing_table_health(
-        health: &RoutingTableHealth,
-        config: &VeilidConfigRoutingTable,
-    ) -> AttachmentState {
-        if health.reliable_entry_count
-            >= TryInto::<usize>::try_into(config.limit_over_attached).unwrap()
-        {
-            return AttachmentState::OverAttached;
-        }
-        if health.reliable_entry_count
-            >= TryInto::<usize>::try_into(config.limit_fully_attached).unwrap()
-        {
-            return AttachmentState::FullyAttached;
-        }
-        if health.reliable_entry_count
-            >= TryInto::<usize>::try_into(config.limit_attached_strong).unwrap()
-        {
-            return AttachmentState::AttachedStrong;
-        }
-        if health.reliable_entry_count
-            >= TryInto::<usize>::try_into(config.limit_attached_good).unwrap()
-        {
-            return AttachmentState::AttachedGood;
-        }
-        if health.reliable_entry_count
-            >= TryInto::<usize>::try_into(config.limit_attached_weak).unwrap()
-            || health.unreliable_entry_count
-                >= TryInto::<usize>::try_into(config.limit_attached_weak).unwrap()
-        {
-            return AttachmentState::AttachedWeak;
-        }
-        AttachmentState::Attaching
-    }
-
-    /// Update attachment and network readiness state
-    /// and possibly send a VeilidUpdate::Attachment.
-    fn update_attachment(&self) {
-        // update the routing table health
-        let routing_table = self.network_manager().routing_table();
-        let health = routing_table.get_routing_table_health();
-        let opt_update = {
-            let mut inner = self.inner.lock();
-
-            // Check if the routing table health is different
-            if let Some(last_routing_table_health) = &inner.last_routing_table_health {
-                // If things are the same, just return
-                if last_routing_table_health.as_ref() == &health {
-                    return;
-                }
-            }
-
-            // Swap in new health numbers
-            let opt_previous_health = inner.last_routing_table_health.take();
-            inner.last_routing_table_health = Some(Arc::new(health.clone()));
-
-            // Calculate new attachment state
-            let config = self.config();
-            let routing_table_config = &config.get().network.routing_table;
-            let previous_attachment_state = inner.last_attachment_state;
-            inner.last_attachment_state =
-                AttachmentManager::translate_routing_table_health(&health, routing_table_config);
-
-            // Send update if one of:
-            // * the attachment state has changed
-            // * routing domain readiness has changed
-            // * this is our first routing table health check
-            let send_update = previous_attachment_state != inner.last_attachment_state
-                || opt_previous_health
-                    .map(|x| {
-                        x.public_internet_ready != health.public_internet_ready
-                            || x.local_network_ready != health.local_network_ready
-                    })
-                    .unwrap_or(true);
-            if send_update {
-                Some(Self::get_veilid_state_inner(&inner))
-            } else {
-                None
-            }
-        };
-
-        // Send the update outside of the lock
-        if let Some(update) = opt_update {
-            (self.update_callback())(VeilidUpdate::Attachment(update));
-        }
-    }
-
-    fn update_attaching_detaching_state(&self, state: AttachmentState) {
-        let uptime;
-        let attached_uptime;
-        {
-            let mut inner = self.inner.lock();
-
-            // Clear routing table health so when we start measuring it we start from scratch
-            inner.last_routing_table_health = None;
-
-            // Set attachment state directly
-            inner.last_attachment_state = state;
-
-            // Set timestamps
-            if state == AttachmentState::Attaching {
-                inner.attach_ts = Some(Timestamp::now());
-            } else if state == AttachmentState::Detached {
-                inner.attach_ts = None;
-            } else if state == AttachmentState::Detaching {
-                // ok
-            } else {
-                unreachable!("don't use this for attached states, use update_attachment()");
-            }
-
-            let now = Timestamp::now();
-            uptime = now - inner.started_ts;
-            attached_uptime = inner.attach_ts.map(|ts| now - ts);
-        };
-
-        // Send update
-        (self.update_callback())(VeilidUpdate::Attachment(Box::new(VeilidStateAttachment {
-            state,
-            public_internet_ready: false,
-            local_network_ready: false,
-            uptime,
-            attached_uptime,
-        })))
+    #[instrument(level = "debug", skip_all, err)]
+    pub async fn init_async(&self) -> EyreResult<()> {
+        let guard = self.startup_context.startup_lock.startup()?;
+        guard.success();
+        Ok(())
+    }
+
+    #[instrument(level = "debug", skip_all, err)]
+    pub async fn post_init_async(&self) -> EyreResult<()> {
+        let registry = self.registry();
+
+        veilid_log!(self debug "starting attachment maintainer task");
+        impl_setup_task!(
+            self,
+            Self,
+            attachment_maintainer_task,
+            attachment_maintainer_task_routine
+        );
+
+        // Create top level tick interval
+        let tick_future = interval(
+            "attachment maintainer tick",
+            TICK_INTERVAL_MSEC,
+            move || {
+                let registry = registry.clone();
+                async move {
+                    let this = registry.attachment_manager();
+                    if let Err(e) = this.tick().await {
+                        veilid_log!(this warn "attachment maintainer tick failed: {}", e);
+                    }
+                }
+            },
+        );
+
+        {
+            let mut inner = self.inner.lock();
+            inner.tick_future = Some(tick_future);
+
+            // Enable attachment now
+            inner.attach_enabled = true;
+        }
+
+        Ok(())
+    }
+
+    #[instrument(level = "debug", skip_all)]
+    pub async fn pre_terminate_async(&self) {
+        {
+            let mut inner = self.inner.lock();
+            // Disable attachment now
+            // Will cause attachment maintainer to drive the state toward 'Detached'
+            inner.attach_enabled = false;
+        }
+
+        // Wait for detached state
+        while !matches!(
+            self.inner.lock().attachment_state,
+            AttachmentState::Detached
+        ) {
+            sleep(500).await;
+        }
+
+        // Stop ticker
+        let tick_future = self.inner.lock().tick_future.take();
+        if let Some(tick_future) = tick_future {
+            tick_future.await;
+        }
+
+        // Stop background operations
+        veilid_log!(self debug "stopping attachment maintainer task");
+        if let Err(e) = self.attachment_maintainer_task.stop().await {
+            veilid_log!(self warn "attachment_maintainer not stopped: {}", e);
+        }
+    }
+
+    #[instrument(level = "debug", skip_all)]
+    pub async fn terminate_async(&self) {
+        let guard = self
+            .startup_context
+            .startup_lock
+            .shutdown()
+            .await
+            .expect("should be initialized");
+
+        // Shutdown successful
+        guard.success();
+    }
+
+    #[instrument(level = "trace", skip_all)]
+    pub async fn attach(&self) -> bool {
+        let Ok(_guard) = self.startup_context.startup_lock.enter() else {
+            return false;
+        };
+
+        let mut inner = self.inner.lock();
+        // If attaching is disabled (because we are terminating)
+        // then just return now
+        if !inner.attach_enabled {
+            return false;
+        }
+        let previous = inner.maintain_peers;
+        inner.maintain_peers = true;
+
+        previous != inner.maintain_peers
+    }
+
+    #[instrument(level = "trace", skip_all)]
+    pub async fn detach(&self) -> bool {
+        let Ok(_guard) = self.startup_context.startup_lock.enter() else {
+            return false;
+        };
+
+        {
+            let mut inner = self.inner.lock();
+            let previous = inner.maintain_peers;
+            if !previous {
+                // Already detached or detaching
+                return false;
+            }
+            // Wants to be detached
+            inner.maintain_peers = false;
+        }
+
+        true
+    }
+
+    /////////////////////////////////////////////////////////////////////////////
+
+    async fn tick(&self) -> EyreResult<()> {
+        let cur_tick_ts = Timestamp::now();
+        let last_tick_ts = {
+            let mut inner = self.inner.lock();
+            let last_tick_ts = inner.last_tick_ts;
+            inner.last_tick_ts = Some(cur_tick_ts);
+            last_tick_ts
+        };
+
+        // Log if we're seeing missed ticks
+        if let Some(lag) = last_tick_ts.map(|x| cur_tick_ts.saturating_sub(x)) {
+            if lag > TimestampDuration::new_ms(2 * (TICK_INTERVAL_MSEC as u64)) {
+                veilid_log!(self debug "tick lag: {}", lag);
+            }
+        }
+
+        // Tick our own ticktask for the attachment maintainer state machine
+        self.attachment_maintainer_task.tick().await?;
+
+        // Send a 'tick' event for the rest of the system to get ticks
+        let event_bus = self.event_bus();
+        event_bus.post(TickEvent {
+            last_tick_ts,
+            cur_tick_ts,
+        })?;
+
+        Ok(())
+    }
+
+    // Manage attachment state
+    #[instrument(level = "trace", target = "stor", skip_all, err)]
+    async fn attachment_maintainer_task_routine(
+        &self,
+        _stop_token: StopToken,
+        _last_ts: Timestamp,
+        _cur_ts: Timestamp,
+    ) -> EyreResult<()> {
+        let (state, maintain_peers, attach_enabled) = {
+            let inner = self.inner.lock();
+            (
+                inner.attachment_state,
+                inner.maintain_peers,
+                inner.attach_enabled,
+            )
+        };
+
+        let next_state = match state {
+            AttachmentState::Detached => {
+                if maintain_peers && attach_enabled {
+                    veilid_log!(self debug "attachment starting");
+
+                    match self.startup().await {
+                        Err(err) => {
+                            error!("attachment startup failed: {}", err);
+                            None
+                        }
+                        Ok(StartupDisposition::BindRetry) => {
+                            veilid_log!(self info "waiting for network to bind...");
+                            sleep(BIND_WAIT_DELAY_MSEC).await;
+                            None
+                        }
+                        Ok(StartupDisposition::Success) => {
+                            veilid_log!(self debug "started maintaining peers");
+
+                            self.update_non_attached_state(AttachmentState::Attaching);
+                            Some(AttachmentState::Attaching)
+                        }
+                    }
+                } else {
+                    None
+                }
+            }
+            AttachmentState::Attaching
+            | AttachmentState::AttachedWeak
+            | AttachmentState::AttachedGood
+            | AttachmentState::AttachedStrong
+            | AttachmentState::FullyAttached
+            | AttachmentState::OverAttached => {
+                if maintain_peers && attach_enabled {
+                    let network_manager = self.network_manager();
+                    if network_manager.network_needs_restart() {
+                        veilid_log!(self info "Restarting network");
+                        self.update_non_attached_state(AttachmentState::Detaching);
+                        Some(AttachmentState::Detaching)
+                    } else {
+                        self.update_attached_state(state)
+                    }
+                } else {
+                    veilid_log!(self debug "stopped maintaining peers");
+                    Some(AttachmentState::Detaching)
+                }
+            }
+            AttachmentState::Detaching => {
+                veilid_log!(self debug "shutting down attachment");
+                self.shutdown().await;
+
+                self.update_non_attached_state(AttachmentState::Detached);
+                Some(AttachmentState::Detached)
+            }
+        };
+
+        // Transition to next state
+        if let Some(next_state) = next_state {
+            let mut inner = self.inner.lock();
+            inner.attachment_state = next_state;
+        }
+
+        Ok(())
     }

     async fn startup(&self) -> EyreResult<StartupDisposition> {
@@ -257,174 +415,136 @@ impl AttachmentManager {
         network_manager.send_network_update();
     }

-    async fn tick(&self) -> EyreResult<()> {
-        // Run the network manager tick
-        let network_manager = self.network_manager();
-        network_manager.tick().await?;
-
-        // Run the routing table tick
-        let routing_table = self.routing_table();
-        routing_table.tick().await?;
-
-        Ok(())
-    }
-
-    #[instrument(parent = None, level = "debug", skip_all)]
-    async fn attachment_maintainer(&self) {
-        veilid_log!(self debug "attachment starting");
-        self.update_attaching_detaching_state(AttachmentState::Attaching);
-
-        let network_manager = self.network_manager();
-
-        let mut restart;
-        let mut restart_delay;
-        while self.inner.lock().maintain_peers {
-            restart = false;
-            restart_delay = 1;
-
-            match self.startup().await {
-                Err(err) => {
-                    error!("attachment startup failed: {}", err);
-                    restart = true;
-                }
-                Ok(StartupDisposition::BindRetry) => {
-                    veilid_log!(self info "waiting for network to bind...");
-                    restart = true;
-                    restart_delay = 10;
-                }
-                Ok(StartupDisposition::Success) => {
-                    veilid_log!(self debug "started maintaining peers");
-
-                    while self.inner.lock().maintain_peers {
-                        // tick network manager
-                        let next_tick_ts = get_timestamp() + 1_000_000u64;
-                        if let Err(err) = self.tick().await {
-                            error!("Error in attachment tick: {}", err);
-                            self.inner.lock().maintain_peers = false;
-                            restart = true;
-                            break;
-                        }
-
-                        // see if we need to restart the network
-                        if network_manager.network_needs_restart() {
-                            veilid_log!(self info "Restarting network");
-                            restart = true;
-                            break;
-                        }
-
-                        // Update attachment and network readiness state
-                        // and possibly send a VeilidUpdate::Attachment
-                        self.update_attachment();
-
-                        // sleep should be at the end in case maintain_peers changes state
-                        let wait_duration = next_tick_ts
-                            .saturating_sub(get_timestamp())
-                            .clamp(0, 1_000_000u64);
-                        sleep((wait_duration / 1_000) as u32).await;
-                    }
-                    veilid_log!(self debug "stopped maintaining peers");
-
-                    if !restart {
-                        self.update_attaching_detaching_state(AttachmentState::Detaching);
-                        veilid_log!(self debug "attachment stopping");
-                    }
-
-                    veilid_log!(self debug "shutting down attachment");
-                    self.shutdown().await;
-                }
-            }
-
-            if !restart {
-                break;
-            }
-
-            veilid_log!(self debug "completely restarting attachment");
-
-            // chill out for a second first, give network stack time to settle out
-            for _ in 0..restart_delay {
-                if !self.inner.lock().maintain_peers {
-                    break;
-                }
-                sleep(1000).await;
-            }
-        }
-
-        self.update_attaching_detaching_state(AttachmentState::Detached);
-        veilid_log!(self debug "attachment stopped");
-    }
-
-    #[instrument(level = "debug", skip_all, err)]
-    pub async fn init_async(&self) -> EyreResult<()> {
-        let guard = self.startup_context.startup_lock.startup()?;
-        guard.success();
-        Ok(())
-    }
-
-    #[instrument(level = "debug", skip_all, err)]
-    pub async fn post_init_async(&self) -> EyreResult<()> {
-        Ok(())
-    }
-
-    #[instrument(level = "debug", skip_all)]
-    pub async fn pre_terminate_async(&self) {
-        // Ensure we detached
-        self.detach().await;
-    }
-
-    #[instrument(level = "debug", skip_all)]
-    pub async fn terminate_async(&self) {
-        let guard = self
-            .startup_context
-            .startup_lock
-            .shutdown()
-            .await
-            .expect("should be initialized");
-
-        // Shutdown successful
-        guard.success();
-    }
-
-    #[instrument(level = "trace", skip_all)]
-    pub async fn attach(&self) -> bool {
-        let Ok(_guard) = self.startup_context.startup_lock.enter() else {
-            return false;
-        };
-        // Create long-running connection maintenance routine
-        let mut inner = self.inner.lock();
-        if inner.attachment_maintainer_jh.is_some() {
-            return false;
-        }
-        inner.maintain_peers = true;
-        let registry = self.registry();
-        inner.attachment_maintainer_jh = Some(spawn("attachment maintainer", async move {
-            let this = registry.attachment_manager();
-            this.attachment_maintainer().await;
-        }));
-
-        true
-    }
-
-    #[instrument(level = "trace", skip_all)]
-    pub async fn detach(&self) -> bool {
-        let Ok(_guard) = self.startup_context.startup_lock.enter() else {
-            return false;
-        };
-
-        let attachment_maintainer_jh = {
-            let mut inner = self.inner.lock();
-            let attachment_maintainer_jh = inner.attachment_maintainer_jh.take();
-            if attachment_maintainer_jh.is_some() {
-                // Terminate long-running connection maintenance routine
-                inner.maintain_peers = false;
-            }
-            attachment_maintainer_jh
-        };
-        if let Some(jh) = attachment_maintainer_jh {
-            jh.await;
-            true
-        } else {
-            false
-        }
+    fn translate_routing_table_health(
+        health: &RoutingTableHealth,
+        config: &VeilidConfigRoutingTable,
+    ) -> AttachmentState {
+        if health.reliable_entry_count
+            >= TryInto::<usize>::try_into(config.limit_over_attached).unwrap()
+        {
+            return AttachmentState::OverAttached;
+        }
+        if health.reliable_entry_count
+            >= TryInto::<usize>::try_into(config.limit_fully_attached).unwrap()
+        {
+            return AttachmentState::FullyAttached;
+        }
+        if health.reliable_entry_count
+            >= TryInto::<usize>::try_into(config.limit_attached_strong).unwrap()
+        {
+            return AttachmentState::AttachedStrong;
+        }
+        if health.reliable_entry_count
+            >= TryInto::<usize>::try_into(config.limit_attached_good).unwrap()
+        {
+            return AttachmentState::AttachedGood;
+        }
+        if health.reliable_entry_count
+            >= TryInto::<usize>::try_into(config.limit_attached_weak).unwrap()
+            || health.unreliable_entry_count
+                >= TryInto::<usize>::try_into(config.limit_attached_weak).unwrap()
+        {
+            return AttachmentState::AttachedWeak;
+        }
+        AttachmentState::Attaching
+    }
+
+    /// Update attachment and network readiness state
+    /// and possibly send a VeilidUpdate::Attachment.
+    fn update_attached_state(
+        &self,
+        current_attachment_state: AttachmentState,
+    ) -> Option<AttachmentState> {
+        // update the routing table health
+        let routing_table = self.network_manager().routing_table();
+        let health = routing_table.get_routing_table_health();
+        let (opt_update, opt_next_attachment_state) = {
+            let mut inner = self.inner.lock();
+
+            // Check if the routing table health is different
+            if let Some(last_routing_table_health) = &inner.last_routing_table_health {
+                // If things are the same, just return
+                if last_routing_table_health.as_ref() == &health {
+                    return None;
+                }
+            }
+
+            // Swap in new health numbers
+            let opt_previous_health = inner.last_routing_table_health.take();
+            inner.last_routing_table_health = Some(Arc::new(health.clone()));
+
+            // Calculate new attachment state
+            let config = self.config();
+            let routing_table_config = &config.get().network.routing_table;
+            let next_attachment_state =
+                AttachmentManager::translate_routing_table_health(&health, routing_table_config);
+
+            // Send update if one of:
+            // * the attachment state has changed
+            // * routing domain readiness has changed
+            // * this is our first routing table health check
+            let send_update = current_attachment_state != next_attachment_state
+                || opt_previous_health
+                    .map(|x| {
+                        x.public_internet_ready != health.public_internet_ready
+                            || x.local_network_ready != health.local_network_ready
+                    })
+                    .unwrap_or(true);
+            let opt_update = if send_update {
+                Some(Self::get_veilid_state_inner(&inner))
+            } else {
+                None
+            };
+            let opt_next_attachment_state = if current_attachment_state != next_attachment_state {
+                Some(next_attachment_state)
+            } else {
+                None
+            };
+
+            (opt_update, opt_next_attachment_state)
+        };
+
+        // Send the update outside of the lock
+        if let Some(update) = opt_update {
+            (self.update_callback())(VeilidUpdate::Attachment(update));
+        }
+
+        opt_next_attachment_state
+    }
+
+    fn update_non_attached_state(&self, current_attachment_state: AttachmentState) {
+        let uptime;
+        let attached_uptime;
+        {
+            let mut inner = self.inner.lock();
+
+            // Clear routing table health so when we start measuring it we start from scratch
+            inner.last_routing_table_health = None;
+
+            // Set timestamps
+            if current_attachment_state == AttachmentState::Attaching {
+                inner.attach_ts = Some(Timestamp::now());
+            } else if current_attachment_state == AttachmentState::Detached {
+                inner.attach_ts = None;
+            } else if current_attachment_state == AttachmentState::Detaching {
+                // ok
+            } else {
+                unreachable!("don't use this for attached states, use update_attached_state()");
+            }
+
+            let now = Timestamp::now();
+            uptime = now - inner.started_ts;
+            attached_uptime = inner.attach_ts.map(|ts| now - ts);
+        };
+
+        // Send update
+        (self.update_callback())(VeilidUpdate::Attachment(Box::new(VeilidStateAttachment {
+            state: current_attachment_state,
+            public_internet_ready: false,
+            local_network_ready: false,
+            uptime,
+            attached_uptime,
+        })))
     }

     fn get_veilid_state_inner(inner: &AttachmentManagerInner) -> Box<VeilidStateAttachment> {
@@ -433,7 +553,7 @@ impl AttachmentManager {
         let attached_uptime = inner.attach_ts.map(|ts| now - ts);

         Box::new(VeilidStateAttachment {
-            state: inner.last_attachment_state,
+            state: inner.attachment_state,
             public_internet_ready: inner
                 .last_routing_table_health
                 .as_ref()
@@ -456,7 +576,7 @@ impl AttachmentManager {

     #[expect(dead_code)]
     pub fn get_attachment_state(&self) -> AttachmentState {
-        self.inner.lock().last_attachment_state
+        self.inner.lock().attachment_state
     }

     #[expect(dead_code)]
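The new tick() above is where the commit gets its name: every second it compares the current tick timestamp to the previous one and logs when the gap exceeds twice the tick interval, which means at least one whole tick was missed (process suspension, executor starvation, machine sleep). A self-contained sketch of the same idea, using std::time instead of veilid-core's Timestamp and TimestampDuration types (the threshold mirrors the diff; everything else is illustrative):

use std::time::{Duration, Instant};

const TICK_INTERVAL: Duration = Duration::from_millis(1000);

struct Ticker {
    last_tick: Option<Instant>,
}

impl Ticker {
    fn tick(&mut self) {
        let now = Instant::now();
        // Swap in the new timestamp and get the previous one back
        if let Some(last) = self.last_tick.replace(now) {
            let lag = now.duration_since(last);
            // A tick arriving more than 2x the interval late means at
            // least one tick was missed, so downstream state (like DHT
            // watch updates) may be stale
            if lag > 2 * TICK_INTERVAL {
                eprintln!("tick lag: {:?}", lag);
            }
        }
        // ... drive per-tick work and publish a TickEvent here ...
    }
}

fn main() {
    let mut t = Ticker { last_tick: None };
    t.tick();
    std::thread::sleep(Duration::from_millis(10));
    t.tick(); // 10ms gap is well under the 2s threshold: no lag reported
}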
@@ -2,7 +2,7 @@ use crate::attachment_manager::{AttachmentManager, AttachmentManagerStartupContext};
 use crate::crypto::Crypto;
 use crate::logging::*;
 use crate::network_manager::{NetworkManager, NetworkManagerStartupContext};
-use crate::routing_table::RoutingTable;
+use crate::routing_table::{RoutingTable, RoutingTableStartupContext};
 use crate::rpc_processor::{RPCProcessor, RPCProcessorStartupContext};
 use crate::storage_manager::StorageManager;
 use crate::veilid_api::*;
@@ -81,8 +81,8 @@ impl VeilidCoreContext {
         registry.register(TableStore::new);
         #[cfg(feature = "unstable-blockstore")]
         registry.register(BlockStore::new);
+        registry.register_with_context(RoutingTable::new, RoutingTableStartupContext::default());
         registry.register(StorageManager::new);
-        registry.register(RoutingTable::new);
         registry
             .register_with_context(NetworkManager::new, NetworkManagerStartupContext::default());
         registry.register_with_context(RPCProcessor::new, RPCProcessorStartupContext::default());
@@ -110,13 +110,13 @@ pub fn veilid_version() -> (u32, u32, u32) {
     )
 }

-#[cfg(all(not(docsrs), not(doc)))]
+#[cfg(not(docsrs))]
 include!(env!("BOSION_PATH"));

 /// Return the features that were enabled when veilid-core was built.
 #[must_use]
 pub fn veilid_features() -> Vec<String> {
-    if cfg!(any(docsrs, doc)) {
+    if cfg!(docsrs) {
         vec!["default".to_string()]
     } else {
         let features = Bosion::CRATE_FEATURES.to_vec();
@@ -196,6 +196,7 @@ struct NetworkManagerInner {
     client_allowlist: LruCache<NodeId, ClientAllowlistEntry>,
     node_contact_method_cache: NodeContactMethodCache,
     address_check: Option<AddressCheck>,
+    tick_subscription: Option<EventBusSubscription>,
     peer_info_change_subscription: Option<EventBusSubscription>,
     socket_address_change_subscription: Option<EventBusSubscription>,
@@ -256,6 +257,7 @@ impl NetworkManager {
             client_allowlist: LruCache::new_unbounded(),
             node_contact_method_cache: NodeContactMethodCache::new(),
             address_check: None,
+            tick_subscription: None,
             peer_info_change_subscription: None,
             socket_address_change_subscription: None,
             txt_lookup_cache: LruCache::new(TXT_LOOKUP_CACHE_SIZE),
@@ -394,7 +396,13 @@ impl NetworkManager {
     }

     #[expect(clippy::unused_async)]
-    async fn pre_terminate_async(&self) {}
+    async fn pre_terminate_async(&self) {
+        // Ensure things have shut down
+        assert!(
+            self.startup_context.startup_lock.is_shut_down(),
+            "should have shut down by now"
+        );
+    }

     #[instrument(level = "debug", skip_all)]
     async fn terminate_async(&self) {}
@@ -423,19 +431,11 @@ impl NetworkManager {
         // Startup relay workers
         self.startup_relay_workers()?;

-        // Register event handlers
-        let peer_info_change_subscription =
-            impl_subscribe_event_bus!(self, Self, peer_info_change_event_handler);
-
-        let socket_address_change_subscription =
-            impl_subscribe_event_bus!(self, Self, socket_address_change_event_handler);
-
         // Set up address filter
         {
             let mut inner = self.inner.lock();
             let address_check = AddressCheck::new(net.clone());
             inner.address_check = Some(address_check);
-            inner.peer_info_change_subscription = Some(peer_info_change_subscription);
-            inner.socket_address_change_subscription = Some(socket_address_change_subscription);
         }

         // Start network components
@@ -449,6 +449,22 @@ impl NetworkManager {

         receipt_manager.startup()?;

+        // Register event handlers
+        let tick_subscription = impl_subscribe_event_bus_async!(self, Self, tick_event_handler);
+
+        let peer_info_change_subscription =
+            impl_subscribe_event_bus!(self, Self, peer_info_change_event_handler);
+
+        let socket_address_change_subscription =
+            impl_subscribe_event_bus!(self, Self, socket_address_change_event_handler);
+
+        {
+            let mut inner = self.inner.lock();
+            inner.tick_subscription = Some(tick_subscription);
+            inner.peer_info_change_subscription = Some(peer_info_change_subscription);
+            inner.socket_address_change_subscription = Some(socket_address_change_subscription);
+        }
+
         veilid_log!(self trace "NetworkManager::internal_startup end");

         Ok(StartupDisposition::Success)
@@ -479,13 +495,15 @@ impl NetworkManager {
         // Shutdown event bus subscriptions and address check
         {
             let mut inner = self.inner.lock();
+            if let Some(sub) = inner.tick_subscription.take() {
+                self.event_bus().unsubscribe(sub);
+            }
             if let Some(sub) = inner.socket_address_change_subscription.take() {
                 self.event_bus().unsubscribe(sub);
             }
             if let Some(sub) = inner.peer_info_change_subscription.take() {
                 self.event_bus().unsubscribe(sub);
             }
-            inner.address_check = None;
         }

         // Shutdown relay workers
@@ -498,6 +516,12 @@ impl NetworkManager {
         let components = self.components.read().clone();
         if let Some(components) = components {
             components.net.shutdown().await;
+
+            {
+                let mut inner = self.inner.lock();
+                inner.address_check = None;
+            }
+
             components.receipt_manager.shutdown().await;
             components.connection_manager.shutdown().await;
         }
@@ -1093,7 +1117,7 @@ impl NetworkManager {
         if tsbehind.as_u64() != 0 && (ts > ets && ts.saturating_sub(ets) > tsbehind) {
             veilid_log!(self debug
                 "Timestamp behind: {}ms ({})",
-                timestamp_to_secs(ts.saturating_sub(ets).as_u64()) * 1000f64,
+                timestamp_duration_to_secs(ts.saturating_sub(ets).as_u64()) * 1000f64,
                 flow.remote()
             );
             return Ok(false);
@@ -1103,7 +1127,7 @@ impl NetworkManager {
         if tsahead.as_u64() != 0 && (ts < ets && ets.saturating_sub(ts) > tsahead) {
             veilid_log!(self debug
                 "Timestamp ahead: {}ms ({})",
-                timestamp_to_secs(ets.saturating_sub(ts).as_u64()) * 1000f64,
+                timestamp_duration_to_secs(ets.saturating_sub(ts).as_u64()) * 1000f64,
                 flow.remote()
             );
             return Ok(false);
@@ -139,12 +139,12 @@ impl Network {

     if let Err(e) = set_tcp_stream_linger(&tcp_stream, Some(core::time::Duration::from_secs(0)))
     {
-        veilid_log!(self debug "Couldn't set TCP linger: {}", e);
+        veilid_log!(self debug "Couldn't set TCP linger: {} on {:?} -> {:?}", e, peer_addr, local_addr);
         return;
     }

     if let Err(e) = tcp_stream.set_nodelay(true) {
-        veilid_log!(self debug "Couldn't set TCP nodelay: {}", e);
+        veilid_log!(self debug "Couldn't set TCP nodelay: {} on {:?} -> {:?}", e, peer_addr, local_addr);
         return;
     }
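These log lines report failures while configuring accepted TCP streams; the added peer/local addresses make the messages actionable. A sketch of the two socket options involved, assuming the socket2 crate as a stand-in for veilid's set_tcp_stream_linger helper (the configure function and its names are illustrative):

use std::net::{TcpListener, TcpStream};
use std::time::Duration;

fn configure(stream: &TcpStream) {
    let peer_addr = stream.peer_addr().ok();
    let local_addr = stream.local_addr().ok();

    // SO_LINGER(0): close() resets the connection immediately instead of
    // lingering to flush unsent data
    if let Err(e) = socket2::SockRef::from(stream).set_linger(Some(Duration::from_secs(0))) {
        eprintln!("Couldn't set TCP linger: {} on {:?} -> {:?}", e, peer_addr, local_addr);
        return;
    }
    // TCP_NODELAY: disable Nagle's algorithm so small writes go out
    // without batching delay
    if let Err(e) = stream.set_nodelay(true) {
        eprintln!("Couldn't set TCP nodelay: {} on {:?} -> {:?}", e, peer_addr, local_addr);
    }
}

fn main() -> std::io::Result<()> {
    let listener = TcpListener::bind("127.0.0.1:0")?;
    let _client = TcpStream::connect(listener.local_addr()?)?;
    let (accepted, _) = listener.accept()?;
    configure(&accepted);
    Ok(())
}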
@@ -1,6 +1,6 @@
 pub mod rolling_transfers;

 use super::*;
+use crate::attachment_manager::TickEvent;

 impl NetworkManager {
     pub fn setup_tasks(&self) {
@@ -28,8 +28,14 @@ impl NetworkManager {
         }
     }

+    pub async fn tick_event_handler(&self, _evt: Arc<TickEvent>) {
+        if let Err(e) = self.tick().await {
+            error!("Error in network manager tick: {}", e);
+        }
+    }
+
     #[instrument(level = "trace", name = "NetworkManager::tick", skip_all, err)]
-    pub async fn tick(&self) -> EyreResult<()> {
+    async fn tick(&self) -> EyreResult<()> {
         let net = self.net();
         let receipt_manager = self.receipt_manager();
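With this change the network manager no longer exposes a public tick() for the attachment manager to call directly; instead it subscribes a handler to the TickEvent that the attachment manager posts each second, and the routing table and storage manager do the same. The fan-out pattern in miniature, with a toy synchronous bus standing in for veilid-core's EventBus and its impl_subscribe_event_bus_async! macro (all names here are illustrative):

use std::sync::Arc;

#[derive(Debug, Clone, Copy)]
struct TickEvent {
    cur_tick_us: u64,
}

type Handler = Box<dyn Fn(Arc<TickEvent>)>;

#[derive(Default)]
struct EventBus {
    handlers: Vec<Handler>,
}

impl EventBus {
    fn subscribe(&mut self, h: Handler) {
        self.handlers.push(h);
    }
    // One post() call drives every subscribed component's tick
    fn post(&self, evt: TickEvent) {
        let evt = Arc::new(evt);
        for h in &self.handlers {
            h(evt.clone());
        }
    }
}

fn main() {
    let mut bus = EventBus::default();
    bus.subscribe(Box::new(|e| println!("network manager tick at {:?}", e)));
    bus.subscribe(Box::new(|e| println!("routing table tick at {:?}", e)));
    bus.post(TickEvent { cur_tick_us: 1_000_000 });
}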
@@ -1094,7 +1094,9 @@ impl BucketEntryInner {
         {
             format!(
                 "{}s ago",
-                timestamp_to_secs(cur_ts.saturating_sub(first_consecutive_seen_ts).as_u64())
+                timestamp_duration_to_secs(
+                    cur_ts.saturating_sub(first_consecutive_seen_ts).as_u64()
+                )
             )
         } else {
             "never".to_owned()
@@ -1102,7 +1104,7 @@ impl BucketEntryInner {
         let last_seen_ts_str = if let Some(last_seen_ts) = self.peer_stats.rpc_stats.last_seen_ts {
             format!(
                 "{}s ago",
-                timestamp_to_secs(cur_ts.saturating_sub(last_seen_ts).as_u64())
+                timestamp_duration_to_secs(cur_ts.saturating_sub(last_seen_ts).as_u64())
             )
         } else {
             "never".to_owned()
@@ -102,11 +102,31 @@ pub struct RecentPeersEntry {
     pub last_connection: Flow,
 }

+#[derive(Debug, Clone)]
+pub struct RoutingTableStartupContext {
+    pub startup_lock: Arc<StartupLock>,
+}
+impl RoutingTableStartupContext {
+    pub fn new() -> Self {
+        Self {
+            startup_lock: Arc::new(StartupLock::new()),
+        }
+    }
+}
+impl Default for RoutingTableStartupContext {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 #[must_use]
 pub(crate) struct RoutingTable {
     registry: VeilidComponentRegistry,
     inner: RwLock<RoutingTableInner>,

+    // Startup context
+    startup_context: RoutingTableStartupContext,
+
     /// Node Ids
     node_ids: RwLock<NodeIdGroup>,
     /// Route spec store
@@ -141,6 +161,8 @@ pub(crate) struct RoutingTable {
     relay_management_task: TickTask<EyreReport>,
     /// Background process to keep private routes up
     private_route_management_task: TickTask<EyreReport>,
+    /// Tick subscription
+    tick_subscription: Mutex<Option<EventBusSubscription>>,
 }

 impl fmt::Debug for RoutingTable {
@@ -155,7 +177,10 @@ impl fmt::Debug for RoutingTable {
 impl_veilid_component!(RoutingTable);

 impl RoutingTable {
-    pub fn new(registry: VeilidComponentRegistry) -> Self {
+    pub fn new(
+        registry: VeilidComponentRegistry,
+        startup_context: RoutingTableStartupContext,
+    ) -> Self {
         let config = registry.config();
         let c = config.get();
         let inner = RwLock::new(RoutingTableInner::new(registry.clone()));
@@ -163,6 +188,7 @@ impl RoutingTable {
         let this = Self {
             registry,
             inner,
+            startup_context,
             node_ids: RwLock::new(NodeIdGroup::new()),
             route_spec_store,
             kick_queue: Mutex::new(BTreeSet::default()),
@@ -207,6 +233,7 @@ impl RoutingTable {
                 "private_route_management_task",
                 PRIVATE_ROUTE_MANAGEMENT_INTERVAL_SECS,
             ),
+            tick_subscription: Mutex::new(None),
         };

         this.setup_tasks();
@@ -267,6 +294,14 @@ impl RoutingTable {

     #[expect(clippy::unused_async)]
     pub(crate) async fn startup(&self) -> EyreResult<()> {
+        let guard = self.startup_context.startup_lock.startup()?;
+
+        // Register event handlers
+        let tick_subscription = impl_subscribe_event_bus_async!(self, Self, tick_event_handler);
+
+        *self.tick_subscription.lock() = Some(tick_subscription);
+
+        guard.success();
         Ok(())
     }

@@ -275,6 +310,17 @@ impl RoutingTable {
         veilid_log!(self debug "stopping routing table tasks");
         self.cancel_tasks().await;

+        let guard = self
+            .startup_context
+            .startup_lock
+            .shutdown()
+            .await
+            .expect("should be started up");
+
+        if let Some(sub) = self.tick_subscription.lock().take() {
+            self.event_bus().unsubscribe(sub);
+        }
+
         // Unpublish peer info
         veilid_log!(self debug "unpublishing peer info");
         {
@@ -283,10 +329,18 @@ impl RoutingTable {
                 inner.unpublish_peer_info(routing_domain);
             }
         }
+
+        guard.success();
     }

     #[expect(clippy::unused_async)]
-    async fn pre_terminate_async(&self) {}
+    async fn pre_terminate_async(&self) {
+        // Ensure things have shut down
+        assert!(
+            self.startup_context.startup_lock.is_shut_down(),
+            "should have shut down by now"
+        );
+    }

     /// Called to shut down the routing table
     async fn terminate_async(&self) {
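The routing table now participates in the same StartupLock protocol already used by the network manager: startup() acquires and commits the lock, shutdown() releases it, and pre_terminate_async() asserts it is already shut down. A simplified, synchronous sketch of that state machine (veilid-core's actual StartupLock is guard-based and async; this stand-in only shows the protocol):

use std::sync::atomic::{AtomicBool, Ordering};

#[derive(Default)]
struct StartupLock {
    started: AtomicBool,
}

impl StartupLock {
    fn startup(&self) -> Result<(), ()> {
        // Fail if already started; otherwise mark as started
        if self.started.swap(true, Ordering::SeqCst) {
            return Err(());
        }
        Ok(())
    }
    fn enter(&self) -> Result<(), ()> {
        // Operations may only proceed while started
        if self.started.load(Ordering::SeqCst) { Ok(()) } else { Err(()) }
    }
    fn shutdown(&self) {
        self.started.store(false, Ordering::SeqCst);
    }
    fn is_shut_down(&self) -> bool {
        !self.started.load(Ordering::SeqCst)
    }
}

fn main() {
    let lock = StartupLock::default();
    assert!(lock.enter().is_err()); // not started yet
    lock.startup().unwrap();
    assert!(lock.enter().is_ok()); // attach()/detach() style entry
    lock.shutdown();
    assert!(lock.is_shut_down()); // what pre_terminate_async asserts
}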
@@ -8,6 +8,8 @@ pub mod private_route_management;
 pub mod relay_management;
 pub mod update_statistics;

+use crate::attachment_manager::TickEvent;
+
 use super::*;

 impl_veilid_log_facility!("rtab");
@@ -112,6 +114,12 @@ impl RoutingTable {
         );
     }

+    pub async fn tick_event_handler(&self, _evt: Arc<TickEvent>) {
+        if let Err(e) = self.tick().await {
+            error!("Error in routing table tick: {}", e);
+        }
+    }
+
     /// Ticks about once per second
     /// to run tick tasks which may run at slower tick rates as configured
     #[instrument(level = "trace", name = "RoutingTable::tick", skip_all, err)]
@@ -18,9 +18,9 @@ pub mod mock_registry {
         registry.register(TableStore::new);
         registry.register(Crypto::new);
         registry.register(StorageManager::new);
-        registry.register(RoutingTable::new);
-        let startup_context = NetworkManagerStartupContext::default();
-        registry.register_with_context(NetworkManager::new, startup_context);
+        registry.register_with_context(RoutingTable::new, RoutingTableStartupContext::default());
+        registry
+            .register_with_context(NetworkManager::new, NetworkManagerStartupContext::default());

         registry.init().await.expect("should init");
         registry.post_init().await.expect("should post init");
@@ -12,6 +12,8 @@ mod tasks;
 mod types;
 mod watch_value;

+use crate::attachment_manager::TickEvent;
+
 use super::*;

 use hashlink::LinkedHashMap;
@@ -59,6 +61,8 @@ const CHECK_WATCHED_RECORDS_INTERVAL_SECS: u32 = 1;
 const REHYDRATE_RECORDS_INTERVAL_SECS: u32 = 1;
 /// Number of rehydration requests to process in parallel
 const REHYDRATE_BATCH_SIZE: usize = 16;
+/// Maximum 'offline lag' before we decide to poll for changed watches
+const CHANGE_INSPECT_LAG_SECS: u32 = 2;
 /// Table store table for storage manager metadata
 const STORAGE_MANAGER_METADATA: &str = "storage_manager_metadata";
 /// Storage manager metadata key name for offline subkey write persistence
@@ -99,10 +103,10 @@ struct StorageManagerInner {
     pub outbound_watch_manager: OutboundWatchManager,
     /// Storage manager metadata that is persistent, including copy of offline subkey writes
     pub metadata_db: Option<TableDB>,
-    /// Background processing task (not part of attachment manager tick tree so it happens when detached too)
-    pub tick_future: Option<PinBoxFutureStatic<()>>,
-    /// PeerInfo subscription
-    peer_info_change_subscription: Option<EventBusSubscription>,
+    /// Peer info change subscription
+    pub peer_info_change_subscription: Option<EventBusSubscription>,
+    /// Tick subscription
+    pub tick_subscription: Option<EventBusSubscription>,
 }

 impl fmt::Debug for StorageManagerInner {
@@ -121,7 +125,7 @@ impl fmt::Debug for StorageManagerInner {
                 &self.peer_info_change_subscription,
             )
             //.field("metadata_db", &self.metadata_db)
-            //.field("tick_future", &self.tick_future)
+            //.field("tick_subscription", &self.tick_subscription)
             .finish()
     }
 }
@@ -322,25 +326,16 @@ impl StorageManager {
         // Register event handlers
         let peer_info_change_subscription =
             impl_subscribe_event_bus_async!(self, Self, peer_info_change_event_handler);
+        let tick_subscription = impl_subscribe_event_bus_async!(self, Self, tick_event_handler);

         let mut inner = self.inner.lock().await;

         // Resolve outbound watch manager noderefs
-        inner.outbound_watch_manager.prepare(self.routing_table());
+        inner.outbound_watch_manager.prepare(&self.routing_table());

-        // Schedule tick
-        let registry = self.registry();
-        let tick_future = interval("storage manager tick", 1000, move || {
-            let registry = registry.clone();
-            async move {
-                let this = registry.storage_manager();
-                if let Err(e) = this.tick().await {
-                    veilid_log!(this warn "storage manager tick failed: {}", e);
-                }
-            }
-        });
-        inner.tick_future = Some(tick_future);
         inner.peer_info_change_subscription = Some(peer_info_change_subscription);
+        inner.tick_subscription = Some(tick_subscription);

         Ok(())
     }
@@ -350,14 +345,12 @@ impl StorageManager {
         // Stop background operations
         {
             let mut inner = self.inner.lock().await;
-            // Stop ticker
-            let tick_future = inner.tick_future.take();
-            if let Some(f) = tick_future {
-                f.await;
-            }
             if let Some(sub) = inner.peer_info_change_subscription.take() {
                 self.event_bus().unsubscribe(sub);
             }
+            if let Some(sub) = inner.tick_subscription.take() {
+                self.event_bus().unsubscribe(sub);
+            }
         }

         // Cancel all tasks associated with the tick future
@@ -2249,6 +2242,13 @@ impl StorageManager {
         }
     }

+    async fn tick_event_handler(&self, evt: Arc<TickEvent>) {
+        let lag = evt.last_tick_ts.map(|x| evt.cur_tick_ts.saturating_sub(x));
+        if let Err(e) = self.tick(lag).await {
+            error!("Error in storage manager tick: {}", e);
+        }
+    }
+
     pub async fn get_encryption_key_for_opaque_record_key(
         &self,
         opaque_record_key: &OpaqueRecordKey,
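This is where tick lag turns into the behavior the changelog advertises: the storage manager's handler derives the lag from the TickEvent timestamps and passes it into tick(), which (per the tasks hunk below) re-inspects all watches whenever the lag exceeds CHANGE_INSPECT_LAG_SECS, since value-change notifications may have been missed while ticks were not being delivered. The decision in isolation, using microsecond timestamps as veilid-core does (a sketch, not the actual method):

const CHANGE_INSPECT_LAG_SECS: u64 = 2;

// Returns true when the gap between ticks is long enough that watched
// records should be polled for changes missed while "offline"
fn should_change_inspect(last_tick_us: Option<u64>, cur_tick_us: u64) -> bool {
    match last_tick_us {
        Some(last) => cur_tick_us.saturating_sub(last) > CHANGE_INSPECT_LAG_SECS * 1_000_000,
        // First tick ever: nothing was missed
        None => false,
    }
}

fn main() {
    // Normal 1s cadence: no inspection needed
    assert!(!should_change_inspect(Some(1_000_000), 2_000_000));
    // 5s gap (machine slept, executor stalled): inspect all watches
    assert!(should_change_inspect(Some(1_000_000), 6_000_000));
}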
@@ -90,7 +90,7 @@ impl OutboundWatchManager {
         }
     }

-    pub fn prepare(&mut self, routing_table: VeilidComponentGuard<'_, RoutingTable>) {
+    pub fn prepare(&mut self, routing_table: &RoutingTable) {
         for (pnk, pns) in &mut self.per_node_states {
             pns.watch_node_ref = match routing_table.lookup_node_ref(pnk.node_id.clone()) {
                 Ok(v) => v,
@@ -204,7 +204,13 @@ impl OutboundWatchManager {
     }

     /// Set a record up to be inspected for changed subkeys
-    pub fn enqueue_change_inspect(&mut self, record_key: RecordKey, subkeys: ValueSubkeyRangeSet) {
+    pub fn enqueue_change_inspect(
+        &mut self,
+        storage_manager: &StorageManager,
+        record_key: RecordKey,
+        subkeys: ValueSubkeyRangeSet,
+    ) {
+        veilid_log!(storage_manager debug "change inspect: record_key={} subkeys={}", record_key, subkeys);
         self.needs_change_inspection
             .entry(record_key)
             .and_modify(|x| *x = x.union(&subkeys))
@@ -18,9 +18,11 @@ impl StorageManager {
             flush_record_stores_task,
             flush_record_stores_task_routine
         );

+        // Set save metadata task
+        veilid_log!(self debug "starting save metadata task");
+        impl_setup_task!(self, Self, save_metadata_task, save_metadata_task_routine);
+
         // Set offline subkey writes tick task
         veilid_log!(self debug "starting offline subkey writes task");
         impl_setup_task!(
@@ -68,7 +70,7 @@ impl StorageManager {
     }

     #[instrument(parent = None, level = "trace", target = "stor", name = "StorageManager::tick", skip_all, err)]
-    pub async fn tick(&self) -> EyreResult<()> {
+    pub async fn tick(&self, lag: Option<TimestampDuration>) -> EyreResult<()> {
         // Run the flush stores task
         self.flush_record_stores_task.tick().await?;
@@ -96,6 +98,14 @@ impl StorageManager {
             // Send value changed notifications
             self.send_value_changes_task.tick().await?;
         }

+        // Change inspection
+        if let Some(lag) = lag {
+            if lag > TimestampDuration::new_secs(CHANGE_INSPECT_LAG_SECS) {
+                self.change_inspect_all_watches().await;
+            }
+        }
+
         Ok(())
     }
@@ -121,6 +131,10 @@ impl StorageManager {
         if let Err(e) = self.offline_subkey_writes_task.stop().await {
             veilid_log!(self warn "offline_subkey_writes_task not stopped: {}", e);
         }
+        veilid_log!(self debug "stopping save metadata task");
+        if let Err(e) = self.save_metadata_task.stop().await {
+            veilid_log!(self warn "save_metadata_task not stopped: {}", e);
+        }
         veilid_log!(self debug "stopping record rehydration task");
         if let Err(e) = self.rehydrate_records_task.stop().await {
             veilid_log!(self warn "rehydrate_records_task not stopped: {}", e);
@@ -813,6 +813,8 @@ impl StorageManager {
         }

         // Update watch state
+        let did_add_nodes = !added_nodes.is_empty();
+
         state.edit(&inner.outbound_watch_manager.per_node_states, |editor| {
             editor.set_params(desired.clone());
             editor.retain_nodes(|x| !remove_nodes.contains(x));
@@ -821,10 +823,12 @@ impl StorageManager {

         // Watch was reconciled, now kick off an inspect to
         // ensure that any changes online are immediately reported to the app
-        if opt_old_state_params != Some(desired) {
+        // If the watch parameters changed, or we added new nodes to the watch state
+        // then we should inspect and see if anything changed
+        if opt_old_state_params != Some(desired) || did_add_nodes {
             inner
                 .outbound_watch_manager
-                .enqueue_change_inspect(record_key, watch_subkeys);
+                .enqueue_change_inspect(self, record_key, watch_subkeys);
         }
     }
@@ -1057,9 +1061,13 @@ impl StorageManager {
         inbound_node_id: NodeId,
         watch_id: u64,
     ) -> VeilidAPIResult<NetworkResult<()>> {
-        let encryption_key = self
+        let Ok(encryption_key) = self
             .get_encryption_key_for_opaque_record_key(&opaque_record_key)
-            .await?;
+            .await
+        else {
+            // value change received for unopened key
+            return Ok(NetworkResult::value(()));
+        };
         let record_key = RecordKey::new(
             opaque_record_key.kind(),
             BareRecordKey::new(opaque_record_key.value(), encryption_key),
@@ -1299,9 +1307,11 @@ impl StorageManager {
             // inspect the range to see what changed

             // Queue this up for inspection
-            inner
-                .outbound_watch_manager
-                .enqueue_change_inspect(record_key, reportable_subkeys);
+            inner.outbound_watch_manager.enqueue_change_inspect(
+                self,
+                record_key,
+                reportable_subkeys,
+            );
         }

         Ok(NetworkResult::value(()))
@@ -1329,9 +1339,11 @@ impl StorageManager {
             veilid_log!(self debug "change inspecting {} watches", change_inspects.len());

             for change_inspect in change_inspects {
-                inner
-                    .outbound_watch_manager
-                    .enqueue_change_inspect(change_inspect.0, change_inspect.1);
+                inner.outbound_watch_manager.enqueue_change_inspect(
+                    self,
+                    change_inspect.0,
+                    change_inspect.1,
+                );
             }
         }
     }
@@ -22,12 +22,11 @@ pub fn format_opt_ts(ts: Option<TimestampDuration>) -> String {
     let Some(ts) = ts else {
         return "---".to_owned();
     };
-    let ts = ts.as_u64();
-    let secs = timestamp_to_secs(ts);
+    let secs = ts.seconds_f64();
     if secs >= 1.0 {
-        format!("{:.2}s", timestamp_to_secs(ts))
+        format!("{:.2}s", secs)
     } else {
-        format!("{:.2}ms", timestamp_to_secs(ts) * 1000.0)
+        format!("{:.2}ms", secs * 1000.0)
     }
 }
@@ -54,7 +54,7 @@ macro_rules! aligned_u64_type {
             Self(v)
         }
         #[must_use]
-        pub fn as_u64(self) -> u64 {
+        pub const fn as_u64(self) -> u64 {
             self.0
         }
     }
@@ -18,4 +18,26 @@ impl TimestampDuration {
     pub const fn new_ms(ms: u64) -> Self {
         TimestampDuration::new(ms * 1_000u64)
     }

+    pub fn seconds_u32(&self) -> Result<u32, String> {
+        u32::try_from(self.as_u64() / 1_000_000u64)
+            .map_err(|e| format!("could not convert to seconds: {}", e))
+    }
+
+    pub fn millis_u32(&self) -> Result<u32, String> {
+        u32::try_from(self.as_u64() / 1_000u64)
+            .map_err(|e| format!("could not convert to milliseconds: {}", e))
+    }
+
+    #[must_use]
+    pub fn seconds_f64(&self) -> f64 {
+        // Downshift precision until it fits, lose least significant bits
+        let mut mul: f64 = 1.0f64 / 1_000_000.0f64;
+        let mut usec = self.0;
+        while usec > (u32::MAX as u64) {
+            usec >>= 1;
+            mul *= 2.0f64;
+        }
+        f64::from(usec as u32) * mul
+    }
 }
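seconds_f64 deliberately avoids a plain `self.0 as f64 / 1e6`: u64 microsecond counts above 2^53 cannot be represented exactly in an f64 mantissa, so the loop sheds least-significant bits until the value fits in a u32 and compensates in the multiplier, keeping full relative precision at any magnitude. The same function as a free-standing sketch with a check of both regimes:

fn seconds_f64(us: u64) -> f64 {
    // Downshift precision until it fits, lose least significant bits
    let mut mul: f64 = 1.0 / 1_000_000.0;
    let mut usec = us;
    while usec > (u32::MAX as u64) {
        usec >>= 1;
        mul *= 2.0;
    }
    f64::from(usec as u32) * mul
}

fn main() {
    // Small values convert exactly, down to the microsecond
    assert_eq!(seconds_f64(1_500_000), 1.5);
    // Huge values (u64::MAX microseconds is ~584,942 years) still come
    // out within f64 relative precision of the true quotient
    let big = u64::MAX;
    let secs = seconds_f64(big);
    assert!((secs - big as f64 / 1e6).abs() / secs < 1e-9);
}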
@@ -534,7 +534,7 @@ pub fn test_tools() {
     for x in 0..1024 {
         let cur_us = x as u64 * 1000000u64;
         if retry_falloff_log(last_us, cur_us, 10_000_000u64, 6_000_000_000u64, 2.0f64) {
-            info!("  retry at {} secs", timestamp_to_secs(cur_us));
+            info!("  retry at {} secs", timestamp_duration_to_secs(cur_us));
             last_us = cur_us;
         }
     }
@@ -19,7 +19,7 @@ pub struct TickTask<E: Send + 'static> {
     running: Arc<AtomicBool>,
 }

-impl<E: Send + 'static> TickTask<E> {
+impl<E: Send + fmt::Debug + 'static> TickTask<E> {
     #[must_use]
     pub fn new_us(name: &str, tick_period_us: u64) -> Self {
         Self {
@@ -179,9 +179,14 @@ impl<E: Send + 'static> TickTask<E> {
                 *stop_source_guard = Some(stop_source);
                 Ok(true)
             }
             Err(()) => {
                 // If we get this, it's because we are joining the singlefuture already
                 // Don't bother running but this is not an error in this case
                 Ok(false)
             }
             // All other conditions should not be reachable
-            _ => {
-                unreachable!();
+            x => {
+                panic!("should not have gotten this result: {:?}", x);
             }
         }
     }
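Replacing the bare unreachable!() with a panic that prints the offending value is what forces the new fmt::Debug bound on TickTask's type parameter. A minimal reproduction of the pattern (toy function, not TickTask's actual signature):

use std::fmt;

fn check_result<E: fmt::Debug>(res: Result<bool, E>) -> bool {
    match res {
        Ok(v) => v,
        // Printing the unexpected value with {:?} is what requires the
        // E: fmt::Debug bound, mirroring the bound added to the impl block
        x => panic!("should not have gotten this result: {:?}", x),
    }
}

fn main() {
    assert!(check_result::<()>(Ok(true)));
}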
@@ -160,12 +160,19 @@ pub fn prepend_slash(s: String) -> String {
 }

 #[must_use]
-pub fn timestamp_to_secs(ts: u64) -> f64 {
-    ts as f64 / 1000000.0f64
+pub fn timestamp_duration_to_secs(dur: u64) -> f64 {
+    // Downshift precision until it fits, lose least significant bits
+    let mut mul: f64 = 1.0f64 / 1_000_000.0f64;
+    let mut usec = dur;
+    while usec > (u32::MAX as u64) {
+        usec >>= 1;
+        mul *= 2.0f64;
+    }
+    f64::from(usec as u32) * mul
 }

 #[must_use]
-pub fn secs_to_timestamp(secs: f64) -> u64 {
+pub fn secs_to_timestamp_duration(secs: f64) -> u64 {
     (secs * 1000000.0f64) as u64
 }

@@ -196,7 +203,10 @@ pub fn retry_falloff_log(
         true
     } else {
         // Exponential falloff between 'interval_start_us' and 'interval_max_us' microseconds
-        last_us <= secs_to_timestamp(timestamp_to_secs(cur_us) / interval_multiplier_us)
+        last_us
+            <= secs_to_timestamp_duration(
+                timestamp_duration_to_secs(cur_us) / interval_multiplier_us,
+            )
     }
 }