diff --git a/io-engine/src/bdev/nexus/nexus_persistence.rs b/io-engine/src/bdev/nexus/nexus_persistence.rs index 7fefa8918..5f0ee6eb4 100644 --- a/io-engine/src/bdev/nexus/nexus_persistence.rs +++ b/io-engine/src/bdev/nexus/nexus_persistence.rs @@ -1,7 +1,6 @@ use super::{IoMode, Nexus, NexusChild}; use crate::{persistent_store::PersistentStore, sleep::mayastor_sleep}; use serde::{Deserialize, Serialize}; -use std::time::Duration; use super::Error; @@ -215,6 +214,8 @@ impl<'n> Nexus<'n> { }; let mut retry = PersistentStore::retries(); + let interval = PersistentStore::interval(); + let mut log = true; loop { let Err(err) = PersistentStore::put(&key, &info.inner).await else { trace!(?key, "{self:?}: the state was saved successfully"); @@ -223,20 +224,26 @@ impl<'n> Nexus<'n> { retry -= 1; if retry == 0 { + error!( + "{self:?}: failed to persist nexus information: {err}, giving up..." + ); return Err(Error::SaveStateFailed { source: err, name: self.name.clone(), }); } - error!( - "{self:?}: failed to persist nexus information, \ - will retry ({retry} left): {err}" - ); + if log { + error!( + "{self:?}: failed to persist nexus information, \ + will retry silently ({retry} left): {err}..." + ); + log = false; + } // Allow some time for the connection to the persistent // store to be re-established before retrying the operation. - if mayastor_sleep(Duration::from_secs(1)).await.is_err() { + if mayastor_sleep(interval).await.is_err() { error!("{self:?}: failed to wait for sleep"); } } diff --git a/io-engine/src/bin/io-engine.rs b/io-engine/src/bin/io-engine.rs index 882fe45bf..7fc1d41ba 100644 --- a/io-engine/src/bin/io-engine.rs +++ b/io-engine/src/bin/io-engine.rs @@ -61,6 +61,7 @@ fn start_tokio_runtime(args: &MayastorCliArgs) { let ps_endpoint = args.ps_endpoint.clone(); let ps_timeout = args.ps_timeout; let ps_retries = args.ps_retries; + let ps_interval = args.ps_interval; let reactor_freeze_detection = args.reactor_freeze_detection; let reactor_freeze_timeout = args.reactor_freeze_timeout; @@ -100,6 +101,7 @@ fn start_tokio_runtime(args: &MayastorCliArgs) { .with_endpoint(endpoint) .with_timeout(ps_timeout) .with_retries(ps_retries) + .with_interval(ps_interval) .connect() .await; } diff --git a/io-engine/src/core/env.rs b/io-engine/src/core/env.rs index 704d7d738..28908e29b 100644 --- a/io-engine/src/core/env.rs +++ b/io-engine/src/core/env.rs @@ -172,14 +172,21 @@ pub struct MayastorCliArgs { pub ps_endpoint: Option, #[clap( long = "ps-timeout", - default_value = "10s", + default_value = "15s", value_parser = parse_ps_timeout, )] /// Persistent store timeout. pub ps_timeout: Duration, - #[clap(long = "ps-retries", default_value = "30")] + #[clap(long = "ps-retries", default_value = "100")] /// Persistent store operation retries. pub ps_retries: u8, + #[clap( + long = "ps-interval", + default_value = "2s", + value_parser = parse_ps_timeout, + )] + /// Persistent store interval between retries. + pub ps_interval: Duration, #[clap(long = "bdev-pool-size", default_value = "65535")] /// Number of entries in memory pool for bdev I/O contexts pub bdev_io_ctx_pool_size: u64, @@ -260,6 +267,7 @@ impl Default for MayastorCliArgs { ps_endpoint: None, ps_timeout: Duration::from_secs(10), ps_retries: 30, + ps_interval: Duration::from_secs(2), node_name: None, env_context: None, reactor_mask: "0x1".into(), @@ -348,6 +356,7 @@ pub struct MayastorEnvironment { ps_endpoint: Option, ps_timeout: Duration, ps_retries: u8, + ps_interval: Duration, mayastor_config: Option, ptpl_dir: Option, pool_config: Option, @@ -396,6 +405,7 @@ impl Default for MayastorEnvironment { ps_endpoint: None, ps_timeout: Duration::from_secs(10), ps_retries: 30, + ps_interval: Duration::from_secs(1), mayastor_config: None, ptpl_dir: None, pool_config: None, @@ -1015,6 +1025,7 @@ impl MayastorEnvironment { let ps_endpoint = self.ps_endpoint.clone(); let ps_timeout = self.ps_timeout; let ps_retries = self.ps_retries; + let ps_interval = self.ps_interval; let grpc_endpoint = self.grpc_endpoint; let rpc_addr = self.rpc_addr.clone(); let api_versions = self.api_versions.clone(); @@ -1029,6 +1040,7 @@ impl MayastorEnvironment { .with_endpoint(&ps_endpoint) .with_timeout(ps_timeout) .with_retries(ps_retries) + .with_interval(ps_interval) .connect() .await; } diff --git a/io-engine/src/persistent_store.rs b/io-engine/src/persistent_store.rs index a86dfa550..92d32e5b6 100644 --- a/io-engine/src/persistent_store.rs +++ b/io-engine/src/persistent_store.rs @@ -37,6 +37,8 @@ pub struct PersistentStoreBuilder { timeout: Duration, /// Number of operation retries. retries: u8, + /// Interval duration. + interval: Duration, } impl Default for PersistentStoreBuilder { @@ -53,6 +55,7 @@ impl PersistentStoreBuilder { default_port: 2379, timeout: Duration::from_secs(1), retries: 5, + interval: Duration::from_secs(1), } } @@ -84,6 +87,12 @@ impl PersistentStoreBuilder { self } + /// Sets interval between operation retries. + pub fn with_interval(mut self, interval: Duration) -> Self { + self.interval = interval; + self + } + /// Consumes `PersistentStoreBuilder` instance and initialises the /// persistent store. If the supplied endpoint is 'None', the store is /// uninitalised and unavailable for use. @@ -102,6 +111,8 @@ pub struct PersistentStore { timeout: Duration, /// Number of operation retries. retries: u8, + /// Operation interval. + interval: Duration, } /// Persistent store global instance. @@ -120,6 +131,7 @@ impl PersistentStore { let timeout = bld.timeout; let retries = bld.retries; + let interval = bld.interval; let store = Self::connect_to_backing_store(&endpoint.clone()).await; info!( @@ -133,6 +145,7 @@ impl PersistentStore { endpoint, timeout, retries, + interval, }) }); } @@ -303,6 +316,11 @@ impl PersistentStore { Self::instance().lock().timeout } + /// Gets the operation interval. + pub fn interval() -> Duration { + Self::instance().lock().interval + } + /// Gets the number of operation retries. pub fn retries() -> u8 { Self::instance().lock().retries diff --git a/io-engine/tests/nexus_child_retire.rs b/io-engine/tests/nexus_child_retire.rs index 5f7507298..6a86e37b6 100644 --- a/io-engine/tests/nexus_child_retire.rs +++ b/io-engine/tests/nexus_child_retire.rs @@ -64,6 +64,7 @@ fn get_ms() -> &'static MayastorTest<'static> { MAYASTOR.get_or_init(|| { MayastorTest::new(MayastorCliArgs { enable_io_all_thrd_nexus_channels: true, + ps_interval: Duration::from_secs(1), ..Default::default() }) }) @@ -549,6 +550,7 @@ async fn init_ms_etcd_test() -> ComposeTest { .with_endpoint(ETCD_ENDPOINT) .with_timeout(Duration::from_secs(1)) .with_retries(5) + .with_interval(Duration::from_secs(1)) .connect() .await;