startup lock

Christien Rioux 2024-07-19 13:00:15 -04:00
parent 20e5df6564
commit 1b34239eb8
23 changed files with 415 additions and 5 deletions

Cargo.lock (generated)
View File

@@ -6034,7 +6034,6 @@ version = "0.3.3"
dependencies = [
"argon2",
"async-io 1.13.0",
"async-lock 2.8.0",
"async-std",
"async-std-resolver",
"async-tls",

View File

@@ -191,7 +191,6 @@ async_executors = { version = "0.7.0", default-features = false, features = [
"bindgen",
"timer",
] }
async-lock = "2.8.0"
wasm-bindgen = "0.2.92"
js-sys = "0.3.69"
wasm-bindgen-futures = "0.4.42"

View File

@@ -1132,7 +1132,13 @@ impl NetworkManager {
source_noderef.merge_filter(NodeRefFilter::new().with_routing_domain(routing_domain));
// Pass message to RPC system
rpc.enqueue_direct_message(envelope, source_noderef, flow, routing_domain, body)?;
if let Err(e) =
rpc.enqueue_direct_message(envelope, source_noderef, flow, routing_domain, body)
{
// Couldn't enqueue, but not the sender's fault
log_net!(debug "failed to enqueue direct message: {}", e);
return Ok(false);
}
// Inform caller that we dealt with the envelope locally
Ok(true)
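
With the startup lock in place, enqueueing can fail benignly while the RPC system is stopping, so the failure is now logged and reported as "not handled" instead of being propagated as the sender's fault. The same pattern in isolation, as a minimal sketch (the enqueue helper and String error type are stand-ins, not from this commit):

fn enqueue(_body: Vec<u8>) -> Result<(), String> {
    Ok(())
}

fn handle_envelope(body: Vec<u8>) -> Result<bool, String> {
    if let Err(e) = enqueue(body) {
        // Local failure: log it and report "not handled locally" rather
        // than returning an error that would penalize the remote peer
        eprintln!("failed to enqueue direct message: {}", e);
        return Ok(false);
    }
    // Envelope was dealt with locally
    Ok(true)
}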

View File

@@ -292,6 +292,7 @@ struct RPCProcessorUnlockedInner {
update_callback: UpdateCallback,
waiting_rpc_table: OperationWaiter<RPCMessage, Option<QuestionContext>>,
waiting_app_call_table: OperationWaiter<Vec<u8>, ()>,
startup_lock: StartupLock,
}
#[derive(Clone)]
@@ -345,6 +346,7 @@ impl RPCProcessor {
update_callback,
waiting_rpc_table: OperationWaiter::new(),
waiting_app_call_table: OperationWaiter::new(),
startup_lock: StartupLock::new(),
}
}
pub fn new(network_manager: NetworkManager, update_callback: UpdateCallback) -> Self {
@@ -377,6 +379,7 @@ impl RPCProcessor {
#[instrument(level = "debug", skip_all, err)]
pub async fn startup(&self) -> EyreResult<()> {
log_rpc!(debug "startup rpc processor");
let guard = self.unlocked_inner.startup_lock.startup()?;
{
let mut inner = self.inner.lock();
@@ -406,12 +409,17 @@
.set_rpc_processor(Some(self.clone()))
.await;
guard.success();
Ok(())
}
#[instrument(level = "debug", skip_all)]
pub async fn shutdown(&self) {
log_rpc!(debug "starting rpc processor shutdown");
let Ok(guard) = self.unlocked_inner.startup_lock.shutdown().await else {
log_rpc!(debug "rpc processor already shut down");
return;
};
// Stop storage manager from using us
self.storage_manager.set_rpc_processor(None).await;
@@ -437,6 +445,7 @@ impl RPCProcessor {
// Release the rpc processor
*self.inner.lock() = Self::new_inner();
guard.success();
log_rpc!(debug "finished rpc processor shutdown");
}
@@ -539,6 +548,8 @@ impl RPCProcessor {
) -> SendPinBoxFuture<Result<Option<NodeRef>, RPCError>> {
let this = self.clone();
Box::pin(async move {
let _guard = this
.unlocked_inner
.startup_lock
.enter()
.map_err(RPCError::map_try_again("not started up"))?;
let routing_table = this.routing_table();
// First see if we have the node in our routing table already
@@ -1699,6 +1710,8 @@ impl RPCProcessor {
routing_domain: RoutingDomain,
body: Vec<u8>,
) -> EyreResult<()> {
let _guard = self
.unlocked_inner
.startup_lock
.enter()
.map_err(RPCError::map_try_again("not started up"))?;
let header = RPCMessageHeader {
detail: RPCMessageHeaderDetail::Direct(RPCMessageHeaderDetailDirect {
envelope,
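
Taken together, these hunks thread a single StartupLock through the processor lifecycle: startup() and shutdown() hold the exclusive guard and commit the state flip with success(), while each operation takes a shared enter() guard that shutdown() must wait out. A condensed sketch of the pattern (the Service type is illustrative; StartupLock and its error types are the ones this commit adds):

struct Service {
    startup_lock: StartupLock,
}

impl Service {
    async fn startup(&self) -> Result<(), StartupLockAlreadyStartedError> {
        let guard = self.startup_lock.startup()?;
        // ... fallible initialization; an early `?` return drops the guard
        // without success(), leaving the lock in the stopped state ...
        guard.success(); // commit the transition to 'started'
        Ok(())
    }

    async fn operation(&self) -> Result<(), StartupLockNotStartedError> {
        // Shared guard held for the operation's duration; shutdown() waits for it
        let _guard = self.startup_lock.enter()?;
        // ... do work ...
        Ok(())
    }

    async fn shutdown(&self) {
        // Resolves only after every outstanding enter() guard has dropped
        let Ok(guard) = self.startup_lock.shutdown().await else {
            return; // already shut down
        };
        // ... teardown ...
        guard.success(); // commit the transition to 'stopped'
    }
}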

View File

@@ -9,6 +9,12 @@ impl RPCProcessor {
dest: Destination,
message: Vec<u8>,
) -> RPCNetworkResult<Answer<Vec<u8>>> {
let _guard = self
.unlocked_inner
.startup_lock
.enter()
.map_err(RPCError::map_try_again("not started up"))?;
let debug_string = format!("AppCall(message(len)={}) => {}", message.len(), dest);
let app_call_q = RPCOperationAppCallQ::new(message)?;
@@ -152,6 +158,11 @@ impl RPCProcessor {
call_id: OperationId,
message: Vec<u8>,
) -> Result<(), RPCError> {
let _guard = self
.unlocked_inner
.startup_lock
.enter()
.map_err(RPCError::map_try_again("not started up"))?;
self.unlocked_inner
.waiting_app_call_table
.complete_op_waiter(call_id, message)
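
This sender and the ones in the files that follow all take the same guard first, and map_err adapts the lock's typed error into the RPC layer's error space. The shape of that adapter, as a sketch (MyRpcError and this map_try_again are illustrative reconstructions; the diff shows the call sites but not RPCError's definition):

#[derive(Debug)]
enum MyRpcError {
    // Transient failure: the caller may retry once the processor is started
    TryAgain(String),
}

fn map_try_again<E: std::fmt::Display>(context: &'static str) -> impl FnOnce(E) -> MyRpcError {
    move |e| MyRpcError::TryAgain(format!("{}: {}", context, e))
}

fn enter_rpc(lock: &StartupLock) -> Result<(), MyRpcError> {
    let _guard = lock.enter().map_err(map_try_again("not started up"))?;
    // ... build and send the RPC while the shared guard is held ...
    Ok(())
}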

View File

@@ -9,6 +9,12 @@ impl RPCProcessor {
dest: Destination,
message: Vec<u8>,
) -> RPCNetworkResult<()> {
let _guard = self
.unlocked_inner
.startup_lock
.enter()
.map_err(RPCError::map_try_again("not started up"))?;
let app_message = RPCOperationAppMessage::new(message)?;
let statement = RPCStatement::new(RPCStatementDetail::AppMessage(Box::new(app_message)));

View File

@@ -14,6 +14,12 @@ impl RPCProcessor {
node_id: TypedKey,
capabilities: Vec<Capability>,
) -> RPCNetworkResult<Answer<Vec<PeerInfo>>> {
let _guard = self
.unlocked_inner
.startup_lock
.enter()
.map_err(RPCError::map_try_again("not started up"))?;
// Ensure destination never has a private route
if matches!(
dest,

View File

@@ -30,6 +30,12 @@ impl RPCProcessor {
subkey: ValueSubkey,
last_descriptor: Option<SignedValueDescriptor>,
) -> RPCNetworkResult<Answer<GetValueAnswer>> {
let _guard = self
.unlocked_inner
.startup_lock
.enter()
.map_err(RPCError::map_try_again("not started up"))?;
// Ensure destination never has a private route
// and get the target noderef so we can validate the response
let Some(target) = dest.node() else {

View File

@@ -32,6 +32,12 @@ impl RPCProcessor {
subkeys: ValueSubkeyRangeSet,
last_descriptor: Option<SignedValueDescriptor>,
) -> RPCNetworkResult<Answer<InspectValueAnswer>> {
let _guard = self
.unlocked_inner
.startup_lock
.enter()
.map_err(RPCError::map_try_again("not started up"))?;
// Ensure destination never has a private route
// and get the target noderef so we can validate the response
let Some(target) = dest.node() else {

View File

@@ -9,6 +9,12 @@ impl RPCProcessor {
dest: Destination,
receipt: D,
) -> RPCNetworkResult<()> {
let _guard = self
.unlocked_inner
.startup_lock
.enter()
.map_err(RPCError::map_try_again("not started up"))?;
let receipt = receipt.as_ref().to_vec();
let return_receipt = RPCOperationReturnReceipt::new(receipt)?;

View File

@@ -34,6 +34,12 @@ impl RPCProcessor {
descriptor: SignedValueDescriptor,
send_descriptor: bool,
) -> RPCNetworkResult<Answer<SetValueAnswer>> {
let _guard = self
.unlocked_inner
.startup_lock
.enter()
.map_err(RPCError::map_try_again("not started up"))?;
// Ensure destination never has a private route
// and get the target noderef so we can validate the response
let Some(target) = dest.node() else {

View File

@@ -9,6 +9,12 @@ impl RPCProcessor {
dest: Destination,
signal_info: SignalInfo,
) -> RPCNetworkResult<()> {
let _guard = self
.unlocked_inner
.startup_lock
.enter()
.map_err(RPCError::map_try_again("not started up"))?;
// Ensure destination never has a private route
if matches!(
dest,

View File

@@ -20,6 +20,12 @@ impl RPCProcessor {
self,
dest: Destination,
) -> RPCNetworkResult<Answer<Option<SenderInfo>>> {
let _guard = self
.unlocked_inner
.startup_lock
.enter()
.map_err(RPCError::map_try_again("not started up"))?;
// Determine routing domain and node status to send
let (opt_target_nr, routing_domain, node_status) = if let Some(UnsafeRoutingInfo {
opt_node,

View File

@@ -10,6 +10,12 @@ impl RPCProcessor {
dial_info: DialInfo,
redirect: bool,
) -> Result<bool, RPCError> {
let _guard = self
.unlocked_inner
.startup_lock
.enter()
.map_err(RPCError::map_try_again("not started up"))?;
let network_manager = self.network_manager();
let receipt_time = ms_to_us(self.unlocked_inner.validate_dial_info_receipt_time_ms);

View File

@@ -13,6 +13,12 @@ impl RPCProcessor {
watch_id: u64,
value: Option<SignedValueData>,
) -> RPCNetworkResult<()> {
let _guard = self
.unlocked_inner
.startup_lock
.enter()
.map_err(RPCError::map_try_again("not started up"))?;
// Ensure destination is never using a safety route
if matches!(dest.get_safety_selection(), SafetySelection::Safe(_)) {
return Err(RPCError::internal(

View File

@@ -32,6 +32,12 @@ impl RPCProcessor {
watcher: KeyPair,
watch_id: Option<u64>,
) -> RPCNetworkResult<Answer<WatchValueAnswer>> {
let _guard = self
.unlocked_inner
.startup_lock
.enter()
.map_err(RPCError::map_try_again("not started up"))?;
// Ensure destination never has a private route
// and get the target noderef so we can validate the response
let Some(target) = dest.node() else {

View File

@@ -49,6 +49,7 @@ pub mod single_shot_eventual;
pub mod sleep;
pub mod spawn;
pub mod split_url;
pub mod startup_lock;
pub mod tick_task;
pub mod timeout;
pub mod timeout_or;
@@ -124,8 +125,17 @@ cfg_if! {
pub use async_lock::MutexGuard as AsyncMutexGuard;
#[doc(no_inline)]
pub use async_lock::MutexGuardArc as AsyncMutexGuardArc;
#[doc(no_inline)]
pub use async_lock::RwLock as AsyncRwLock;
#[doc(no_inline)]
pub use async_lock::RwLockReadGuard as AsyncRwLockReadGuard;
#[doc(no_inline)]
pub use async_lock::RwLockWriteGuard as AsyncRwLockWriteGuard;
#[doc(no_inline)]
pub use async_executors::JoinHandle as LowLevelJoinHandle;
} else {
cfg_if! {
if #[cfg(feature="rt-async-std")] {
@@ -135,8 +145,17 @@
pub use async_std::sync::MutexGuard as AsyncMutexGuard;
#[doc(no_inline)]
pub use async_std::sync::MutexGuardArc as AsyncMutexGuardArc;
#[doc(no_inline)]
pub use async_std::sync::RwLock as AsyncRwLock;
#[doc(no_inline)]
pub use async_std::sync::RwLockReadGuard as AsyncRwLockReadGuard;
#[doc(no_inline)]
pub use async_std::sync::RwLockWriteGuard as AsyncRwLockWriteGuard;
#[doc(no_inline)]
pub use async_std::task::JoinHandle as LowLevelJoinHandle;
} else if #[cfg(feature="rt-tokio")] {
#[doc(no_inline)]
pub use tokio::sync::Mutex as AsyncMutex;
@@ -144,6 +163,15 @@
pub use tokio::sync::MutexGuard as AsyncMutexGuard;
#[doc(no_inline)]
pub use tokio::sync::OwnedMutexGuard as AsyncMutexGuardArc;
#[doc(no_inline)]
pub use tokio::sync::RwLock as AsyncRwLock;
#[doc(no_inline)]
pub use tokio::sync::RwLockReadGuard as AsyncRwLockReadGuard;
#[doc(no_inline)]
pub use tokio::sync::RwLockWriteGuard as AsyncRwLockWriteGuard;
#[doc(no_inline)]
pub use tokio::task::JoinHandle as LowLevelJoinHandle;
} else {
@@ -202,6 +230,8 @@ pub use spawn::*;
#[doc(inline)]
pub use split_url::*;
#[doc(inline)]
pub use startup_lock::*;
#[doc(inline)]
pub use tick_task::*;
#[doc(inline)]
pub use timeout::*;
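
The new aliases complete the async RwLock family alongside the existing mutex aliases, so StartupLock can be written once against AsyncRwLock and compile under async-lock on wasm, async-std, or tokio. A small sketch of the shared surface the aliases assume (read()/write() have this shape on all three runtimes):

async fn demo_rwlock() {
    let flag: AsyncRwLock<bool> = AsyncRwLock::new(false);
    {
        // Exclusive access: no readers or writers while this guard lives
        let mut w: AsyncRwLockWriteGuard<'_, bool> = flag.write().await;
        *w = true;
    }
    // Shared access: any number of readers may hold guards concurrently
    let r: AsyncRwLockReadGuard<'_, bool> = flag.read().await;
    assert!(*r);
}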

View File

@@ -0,0 +1,120 @@
use super::*;
#[derive(ThisError, Debug, Copy, Clone, PartialEq, Eq)]
#[error("Already started")]
pub struct StartupLockAlreadyStartedError;
#[derive(ThisError, Debug, Copy, Clone, PartialEq, Eq)]
#[error("Already shut down")]
pub struct StartupLockAlreadyShutDownError;
#[derive(ThisError, Debug, Copy, Clone, PartialEq, Eq)]
#[error("Not started")]
pub struct StartupLockNotStartedError;
/// RAII-style lock for startup and shutdown operations.
/// One must call 'success()' on this lock to report a successful startup or shutdown.
/// Dropping this lock without calling 'success()' first indicates a failed
/// startup or shutdown operation.
#[derive(Debug)]
pub struct StartupLockGuard<'a> {
guard: AsyncRwLockWriteGuard<'a, bool>,
success_value: bool,
}
impl<'a> StartupLockGuard<'a> {
/// Call this function at the end of a successful startup or shutdown
/// operation to switch the state of the StartupLock.
pub fn success(mut self) {
*self.guard = self.success_value;
}
}
/// RAII-style lock for entry operations on a started-up region of code.
#[derive(Debug)]
pub struct StartupLockEnterGuard<'a> {
_guard: AsyncRwLockReadGuard<'a, bool>,
}
/// Synchronization mechanism that tracks the startup and shutdown of a region of code.
/// Guarantees that some code can only be started up once and shut down only if it is
/// already started.
/// Also tracks whether the code is in use and will wait for all 'entered' code to finish
/// before shutting down. Once a shutdown is requested, future calls to 'enter' will
/// fail, ensuring that nothing is 'entered' at the time of shutdown. This allows an
/// asynchronous shutdown to wait for operations to finish before proceeding.
#[derive(Debug)]
pub struct StartupLock {
rwlock: AsyncRwLock<bool>,
}
impl StartupLock {
pub fn new() -> Self {
Self {
rwlock: AsyncRwLock::new(false),
}
}
/// Start up if not already started up.
/// One must call 'success()' on the returned startup lock guard if startup was successful,
/// otherwise the startup lock will not shift to the 'started' state.
pub fn startup(&self) -> Result<StartupLockGuard, StartupLockAlreadyStartedError> {
let guard = asyncrwlock_try_write!(self.rwlock).ok_or(StartupLockAlreadyStartedError)?;
if *guard {
return Err(StartupLockAlreadyStartedError);
}
Ok(StartupLockGuard {
guard,
success_value: true,
})
}
/// Check if this StartupLock is currently in a started state.
/// Returns false if the state is in transition.
pub fn is_started(&self) -> bool {
let Some(guard) = asyncrwlock_try_read!(self.rwlock) else {
return false;
};
*guard
}
/// Check if this StartupLock is currently in a shut-down state.
/// Returns false if the state is in transition.
pub fn is_shut_down(&self) -> bool {
let Some(guard) = asyncrwlock_try_read!(self.rwlock) else {
return false;
};
!*guard
}
/// Wait for all 'entered' operations to finish before shutting down.
/// One must call 'success()' on the returned startup lock guard if shutdown was successful,
/// otherwise the startup lock will not shift to the 'stopped' state.
pub async fn shutdown(&self) -> Result<StartupLockGuard, StartupLockAlreadyShutDownError> {
let guard = self.rwlock.write().await;
if !*guard {
return Err(StartupLockAlreadyShutDownError);
}
Ok(StartupLockGuard {
guard,
success_value: false,
})
}
/// Enter an operation in a started-up module.
/// If this module has not yet started up, or is in the process of starting up or
/// shutting down, this will fail.
pub fn enter(&self) -> Result<StartupLockEnterGuard, StartupLockNotStartedError> {
let guard = asyncrwlock_try_read!(self.rwlock).ok_or(StartupLockNotStartedError)?;
if !*guard {
return Err(StartupLockNotStartedError);
}
Ok(StartupLockEnterGuard { _guard: guard })
}
}
impl Default for StartupLock {
fn default() -> Self {
Self::new()
}
}
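
Because the state only flips inside success(), an early return during startup rolls the lock back automatically, which is what the "Startup fail case" in the tests below exercises. A sketch of that failure path (do_init is a hypothetical fallible initializer):

fn try_start(lock: &StartupLock) -> bool {
    let Ok(guard) = lock.startup() else {
        return false; // already started, or a transition holds the lock
    };
    if do_init().is_err() {
        // `guard` drops here without success(), so the lock stays in the
        // stopped state and a later startup() attempt can run again
        return false;
    }
    guard.success();
    true
}

fn do_init() -> Result<(), String> {
    Err("simulated init failure".to_string())
}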

View File

@@ -1,5 +1,6 @@ pub mod test_async_tag_lock;
pub mod test_async_tag_lock;
pub mod test_host_interface;
pub mod test_startup_lock;
#[allow(dead_code)]
pub static DEFAULT_LOG_IGNORE_LIST: [&str; 21] = [

View File

@@ -0,0 +1,121 @@
use crate::*;
pub async fn test_startup_shutdown() {
info!("test_startup_shutdown");
let lock = StartupLock::new();
// Normal case
{
let guard = lock.startup().expect("should startup");
guard.success();
}
assert!(lock.is_started());
assert!(!lock.is_shut_down());
{
let guard = lock.shutdown().await.expect("should shutdown");
guard.success();
}
assert!(!lock.is_started());
assert!(lock.is_shut_down());
// Startup fail case
{
lock.startup().expect("should startup");
// Don't call success()
}
assert!(!lock.is_started());
{
lock.shutdown().await.expect_err("should not shutdown");
}
assert!(!lock.is_started());
// Shutdown fail case
{
let guard = lock.startup().expect("should startup");
guard.success();
}
assert!(lock.is_started());
{
lock.shutdown().await.expect("should shutdown");
// Don't call success()
}
assert!(lock.is_started());
{
let guard = lock.shutdown().await.expect("should shutdown");
guard.success();
}
assert!(!lock.is_started());
}
pub async fn test_contention() {
info!("test_contention");
let lock = Arc::new(StartupLock::new());
let val = Arc::new(AtomicBool::new(false));
{
let guard = lock.startup().expect("should startup");
guard.success();
}
assert!(lock.is_started());
let lock2 = lock.clone();
let val2 = val.clone();
let jh = spawn(async move {
let _guard = lock2.enter().expect("should enter");
sleep(2000).await;
val2.store(true, Ordering::Release);
});
sleep(1000).await;
{
let guard = lock.shutdown().await.expect("should shutdown");
assert!(
val.load(Ordering::Acquire),
"should have waited for enter to exit"
);
guard.success();
}
assert!(!lock.is_started());
jh.await;
}
pub async fn test_bad_enter() {
info!("test_bad_enter");
let lock = Arc::new(StartupLock::new());
lock.enter()
.expect_err("should not enter when not started up");
{
let guard = lock.startup().expect("should startup");
guard.success();
}
assert!(lock.is_started());
assert!(!lock.is_shut_down());
let lock2 = lock.clone();
let jh = spawn(async move {
let guard = lock2.shutdown().await.expect("should shutdown");
sleep(2000).await;
guard.success();
});
sleep(1000).await;
assert!(!lock.is_started());
assert!(!lock.is_shut_down());
lock.enter()
.expect_err("should not enter when shutting down");
jh.await;
assert!(!lock.is_started());
assert!(lock.is_shut_down());
lock.enter().expect_err("should not enter when shut down");
}
pub async fn test_all() {
test_startup_shutdown().await;
test_contention().await;
test_bad_enter().await;
}

View File

@@ -14,6 +14,8 @@ use super::*;
pub async fn run_all_tests() {
info!("TEST: exec_test_host_interface");
test_host_interface::test_all().await;
info!("TEST: exec_test_startup_lock");
test_startup_lock::test_all().await;
info!("TEST: exec_test_network_interfaces");
test_network_interfaces::test_all().await;
info!("TEST: exec_test_async_peek_stream");
@@ -87,6 +89,15 @@ cfg_if! {
});
}
#[test]
#[serial]
fn run_test_startup_lock() {
setup();
block_on(async {
test_startup_lock::test_all().await;
});
}
#[test]
#[serial]
fn run_test_network_interfaces() {

View File

@@ -54,6 +54,20 @@ cfg_if::cfg_if! {
$x.clone().try_lock_owned().ok()
};
}
#[macro_export]
macro_rules! asyncrwlock_try_read {
($x:expr) => {
$x.try_read().ok()
};
}
#[macro_export]
macro_rules! asyncrwlock_try_write {
($x:expr) => {
$x.try_write().ok()
};
}
} else {
#[macro_export]
macro_rules! asyncmutex_try_lock {
@@ -73,6 +87,18 @@
$x.try_lock_arc()
};
}
#[macro_export]
macro_rules! asyncrwlock_try_read {
($x:expr) => {
$x.try_read()
};
}
#[macro_export]
macro_rules! asyncrwlock_try_write {
($x:expr) => {
$x.try_write()
};
}
}
}
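
The two macro arms exist because the runtimes disagree on the non-blocking API: tokio's try_read()/try_write() return Result, while async-lock and async-std return Option, so the tokio arm appends .ok() to normalize. Call sites are then runtime-independent, as in this sketch:

fn peek(flag: &AsyncRwLock<bool>) -> Option<bool> {
    // None means a writer currently holds (or is queued for) the lock
    let guard = asyncrwlock_try_read!(flag)?;
    Some(*guard)
}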

View File

@@ -44,3 +44,10 @@ async fn run_test_async_tag_lock() {
test_async_tag_lock::test_all().await;
}
#[wasm_bindgen_test]
async fn run_test_startup_lock() {
setup();
test_startup_lock::test_all().await;
}