Skip to content

Commit

Permalink
Merge #1564
Browse files Browse the repository at this point in the history
1564: fix(pstor): increase persistence timeouts r=tiagolobocastro a=tiagolobocastro

The pstor might might unavailable for some time, for example during upgrade. We should have sufficient timeouts to cope with this. Also pstor can get slow at times as it's writing to disk, so we also should have a larger timeout on store.

todo: should we still carry on trying to persist on separate task after we fail the nexus?
todo: what happens when we try to shutdown nexus first and pstor fails?

We'll revisit this post-release.

Co-authored-by: Tiago Castro <tiagolobocastro@gmail.com>
  • Loading branch information
mayastor-bors and tiagolobocastro committed Dec 15, 2023
2 parents 51b47df + ac4a5f3 commit 874a585
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 8 deletions.
19 changes: 13 additions & 6 deletions io-engine/src/bdev/nexus/nexus_persistence.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
use super::{IoMode, Nexus, NexusChild};
use crate::{persistent_store::PersistentStore, sleep::mayastor_sleep};
use serde::{Deserialize, Serialize};
use std::time::Duration;

use super::Error;

Expand Down Expand Up @@ -215,6 +214,8 @@ impl<'n> Nexus<'n> {
};

let mut retry = PersistentStore::retries();
let interval = PersistentStore::interval();
let mut log = true;
loop {
let Err(err) = PersistentStore::put(&key, &info.inner).await else {
trace!(?key, "{self:?}: the state was saved successfully");
Expand All @@ -223,20 +224,26 @@ impl<'n> Nexus<'n> {

retry -= 1;
if retry == 0 {
error!(
"{self:?}: failed to persist nexus information: {err}, giving up..."
);
return Err(Error::SaveStateFailed {
source: err,
name: self.name.clone(),
});
}

error!(
"{self:?}: failed to persist nexus information, \
will retry ({retry} left): {err}"
);
if log {
error!(
"{self:?}: failed to persist nexus information, \
will retry silently ({retry} left): {err}..."
);
log = false;
}

// Allow some time for the connection to the persistent
// store to be re-established before retrying the operation.
if mayastor_sleep(Duration::from_secs(1)).await.is_err() {
if mayastor_sleep(interval).await.is_err() {
error!("{self:?}: failed to wait for sleep");
}
}
Expand Down
2 changes: 2 additions & 0 deletions io-engine/src/bin/io-engine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ fn start_tokio_runtime(args: &MayastorCliArgs) {
let ps_endpoint = args.ps_endpoint.clone();
let ps_timeout = args.ps_timeout;
let ps_retries = args.ps_retries;
let ps_interval = args.ps_interval;

let reactor_freeze_detection = args.reactor_freeze_detection;
let reactor_freeze_timeout = args.reactor_freeze_timeout;
Expand Down Expand Up @@ -100,6 +101,7 @@ fn start_tokio_runtime(args: &MayastorCliArgs) {
.with_endpoint(endpoint)
.with_timeout(ps_timeout)
.with_retries(ps_retries)
.with_interval(ps_interval)
.connect()
.await;
}
Expand Down
16 changes: 14 additions & 2 deletions io-engine/src/core/env.rs
Original file line number Diff line number Diff line change
Expand Up @@ -172,14 +172,21 @@ pub struct MayastorCliArgs {
pub ps_endpoint: Option<String>,
#[clap(
long = "ps-timeout",
default_value = "10s",
default_value = "15s",
value_parser = parse_ps_timeout,
)]
/// Persistent store timeout.
pub ps_timeout: Duration,
#[clap(long = "ps-retries", default_value = "30")]
#[clap(long = "ps-retries", default_value = "100")]
/// Persistent store operation retries.
pub ps_retries: u8,
#[clap(
long = "ps-interval",
default_value = "2s",
value_parser = parse_ps_timeout,
)]
/// Persistent store interval between retries.
pub ps_interval: Duration,
#[clap(long = "bdev-pool-size", default_value = "65535")]
/// Number of entries in memory pool for bdev I/O contexts
pub bdev_io_ctx_pool_size: u64,
Expand Down Expand Up @@ -260,6 +267,7 @@ impl Default for MayastorCliArgs {
ps_endpoint: None,
ps_timeout: Duration::from_secs(10),
ps_retries: 30,
ps_interval: Duration::from_secs(2),
node_name: None,
env_context: None,
reactor_mask: "0x1".into(),
Expand Down Expand Up @@ -348,6 +356,7 @@ pub struct MayastorEnvironment {
ps_endpoint: Option<String>,
ps_timeout: Duration,
ps_retries: u8,
ps_interval: Duration,
mayastor_config: Option<String>,
ptpl_dir: Option<String>,
pool_config: Option<String>,
Expand Down Expand Up @@ -396,6 +405,7 @@ impl Default for MayastorEnvironment {
ps_endpoint: None,
ps_timeout: Duration::from_secs(10),
ps_retries: 30,
ps_interval: Duration::from_secs(1),
mayastor_config: None,
ptpl_dir: None,
pool_config: None,
Expand Down Expand Up @@ -1015,6 +1025,7 @@ impl MayastorEnvironment {
let ps_endpoint = self.ps_endpoint.clone();
let ps_timeout = self.ps_timeout;
let ps_retries = self.ps_retries;
let ps_interval = self.ps_interval;
let grpc_endpoint = self.grpc_endpoint;
let rpc_addr = self.rpc_addr.clone();
let api_versions = self.api_versions.clone();
Expand All @@ -1029,6 +1040,7 @@ impl MayastorEnvironment {
.with_endpoint(&ps_endpoint)
.with_timeout(ps_timeout)
.with_retries(ps_retries)
.with_interval(ps_interval)
.connect()
.await;
}
Expand Down
18 changes: 18 additions & 0 deletions io-engine/src/persistent_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ pub struct PersistentStoreBuilder {
timeout: Duration,
/// Number of operation retries.
retries: u8,
/// Interval duration.
interval: Duration,
}

impl Default for PersistentStoreBuilder {
Expand All @@ -53,6 +55,7 @@ impl PersistentStoreBuilder {
default_port: 2379,
timeout: Duration::from_secs(1),
retries: 5,
interval: Duration::from_secs(1),
}
}

Expand Down Expand Up @@ -84,6 +87,12 @@ impl PersistentStoreBuilder {
self
}

/// Sets interval between operation retries.
pub fn with_interval(mut self, interval: Duration) -> Self {
self.interval = interval;
self
}

/// Consumes `PersistentStoreBuilder` instance and initialises the
/// persistent store. If the supplied endpoint is 'None', the store is
/// uninitalised and unavailable for use.
Expand All @@ -102,6 +111,8 @@ pub struct PersistentStore {
timeout: Duration,
/// Number of operation retries.
retries: u8,
/// Operation interval.
interval: Duration,
}

/// Persistent store global instance.
Expand All @@ -120,6 +131,7 @@ impl PersistentStore {

let timeout = bld.timeout;
let retries = bld.retries;
let interval = bld.interval;
let store = Self::connect_to_backing_store(&endpoint.clone()).await;

info!(
Expand All @@ -133,6 +145,7 @@ impl PersistentStore {
endpoint,
timeout,
retries,
interval,
})
});
}
Expand Down Expand Up @@ -303,6 +316,11 @@ impl PersistentStore {
Self::instance().lock().timeout
}

/// Gets the operation interval.
pub fn interval() -> Duration {
Self::instance().lock().interval
}

/// Gets the number of operation retries.
pub fn retries() -> u8 {
Self::instance().lock().retries
Expand Down
2 changes: 2 additions & 0 deletions io-engine/tests/nexus_child_retire.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ fn get_ms() -> &'static MayastorTest<'static> {
MAYASTOR.get_or_init(|| {
MayastorTest::new(MayastorCliArgs {
enable_io_all_thrd_nexus_channels: true,
ps_interval: Duration::from_secs(1),
..Default::default()
})
})
Expand Down Expand Up @@ -549,6 +550,7 @@ async fn init_ms_etcd_test() -> ComposeTest {
.with_endpoint(ETCD_ENDPOINT)
.with_timeout(Duration::from_secs(1))
.with_retries(5)
.with_interval(Duration::from_secs(1))
.connect()
.await;

Expand Down

0 comments on commit 874a585

Please sign in to comment.