conductor: add heartbeat monitor for background workers #1023

Open · wants to merge 2 commits into main
85 changes: 85 additions & 0 deletions conductor/src/heartbeat_monitor.rs
@@ -0,0 +1,85 @@
use std::sync::{
atomic::{AtomicU64, Ordering},
Arc,
};
use std::time::{Duration, SystemTime, UNIX_EPOCH};

fn current_timestamp() -> u64 {
SystemTime::now()
.duration_since(UNIX_EPOCH)
.expect("now() is not later than UNIX_EPOCH")
.as_secs()
}

#[derive(Clone)]
pub struct HeartbeatMonitor {
shared_heartbeat: Arc<AtomicU64>,
update_interval: Duration,
}

#[derive(Clone)]
pub struct HeartbeatUpdater {
shared_heartbeat: Arc<AtomicU64>,
Member Author:
Went with a lock-free approach rather than something like Arc<RwLock<Instant>>, as the issue with workers getting stuck might stem from lock contention, so adding more lock contention probably wouldn't help
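For comparison, a minimal sketch of the lock-based alternative mentioned above (hypothetical LockedHeartbeat type, not part of this PR): every updater and every health check would contend for the same lock.

```rust
use std::sync::{Arc, RwLock};
use std::time::{Duration, Instant};

// Hypothetical lock-based variant (not in this PR), shown only to contrast
// with the AtomicU64 approach above.
#[derive(Clone)]
struct LockedHeartbeat {
    last_update: Arc<RwLock<Instant>>,
}

impl LockedHeartbeat {
    fn update(&self) {
        // Blocks while any health check holds the read lock.
        *self.last_update.write().unwrap() = Instant::now();
    }

    fn is_active(&self, max_age: Duration) -> bool {
        // Blocks while an updater holds the write lock.
        self.last_update.read().unwrap().elapsed() < max_age
    }
}
```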

}

/// Initializes and returns both a [`HeartbeatMonitor`] and [`HeartbeatUpdater`].
pub fn start(expected_update_interval: Duration) -> (HeartbeatMonitor, HeartbeatUpdater) {
let heartbeat = Arc::new(AtomicU64::new(current_timestamp()));

let heartbeat_monitor = HeartbeatMonitor {
shared_heartbeat: heartbeat.clone(),
update_interval: expected_update_interval,
};
let heartbeat_updater = HeartbeatUpdater {
shared_heartbeat: heartbeat,
};

(heartbeat_monitor, heartbeat_updater)
}

impl HeartbeatMonitor {
/// Checks whether the heartbeat is still active.
///
/// Returns `true` if the heartbeat has been updated within twice the expected update interval, and `false` otherwise.
pub fn is_heartbeat_active(&self) -> bool {
let last_update = self.shared_heartbeat.load(Ordering::Relaxed);
let current_time = current_timestamp();

if current_time >= last_update {
let elapsed = Duration::from_secs(current_time - last_update);
elapsed < self.update_interval * 2
Member:
what is the * 2 part for?

Member Author:
To check whether there's been an update within twice the expected update interval

Member:
I figured that, but why 2x? Maybe that should just be part of the update interval config?

Keep in mind there is healthcheck config on the Kubernetes side too, like how many consecutive failed requests will restart the pod.

Member Author:
Maybe we could replace self.update_interval with timeout_interval and then just use elapsed < self.timeout_interval, what do you think?
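A rough sketch of that suggestion, assuming the HeartbeatMonitor field is renamed to timeout_interval and reusing the file's existing imports and current_timestamp helper (illustrative, not part of this diff):

```rust
// Hypothetical refactor: the caller configures the timeout directly instead
// of the monitor deriving it as update_interval * 2.
pub fn start(timeout_interval: Duration) -> (HeartbeatMonitor, HeartbeatUpdater) {
    let heartbeat = Arc::new(AtomicU64::new(current_timestamp()));
    let monitor = HeartbeatMonitor {
        shared_heartbeat: heartbeat.clone(),
        timeout_interval,
    };
    let updater = HeartbeatUpdater {
        shared_heartbeat: heartbeat,
    };
    (monitor, updater)
}

impl HeartbeatMonitor {
    pub fn is_heartbeat_active(&self) -> bool {
        let last_update = self.shared_heartbeat.load(Ordering::Relaxed);
        // checked_sub returns None if the clock went backwards; treat that as stale.
        current_timestamp()
            .checked_sub(last_update)
            .map(|secs| Duration::from_secs(secs) < self.timeout_interval)
            .unwrap_or(false)
    }
}
```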

} else {
// System time went backwards (e.g. clock drift); treat the heartbeat as stale
false
}
}
}

impl HeartbeatUpdater {
pub fn update_heartbeat(&self) {
self.shared_heartbeat
.store(current_timestamp(), Ordering::Relaxed);
}
}
Member Author:
I think it's better to keep this as something manually updated rather than being updated by yet another background thread
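For context, a minimal usage sketch of that pattern (the worker function name is made up, not part of this diff): the worker itself bumps the heartbeat once per loop iteration, so the monitor goes stale if the loop stops making progress, which is what run_metrics_reporter does further down.

```rust
use std::time::Duration;

use conductor::heartbeat_monitor::HeartbeatUpdater;

// Illustrative worker (name is hypothetical): the loop bumps the heartbeat
// once per tick, so the monitor reports it as stale if the loop ever hangs.
async fn example_worker(updater: HeartbeatUpdater) {
    let mut tick = tokio::time::interval(Duration::from_secs(60));
    loop {
        tick.tick().await;
        updater.update_heartbeat();
        // ... do one unit of work ...
    }
}
```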


#[cfg(test)]
mod tests {
use std::time::Duration;

use crate::heartbeat_monitor;

#[tokio::test]
async fn check_heartbeat_monitor() {
let (monitor, updater) = heartbeat_monitor::start(Duration::from_secs(1));

// Is alive since there's been an update in the last second
assert!(monitor.is_heartbeat_active());

tokio::time::sleep(Duration::from_secs(4)).await;

assert!(!monitor.is_heartbeat_active());
updater.update_heartbeat();

assert!(monitor.is_heartbeat_active());
}
}
1 change: 1 addition & 0 deletions conductor/src/lib.rs
@@ -3,6 +3,7 @@ pub mod cloud;
pub mod errors;
pub mod extensions;
pub mod gcp;
pub mod heartbeat_monitor;
pub mod metrics;
pub mod monitoring;
pub mod routes;
7 changes: 6 additions & 1 deletion conductor/src/main.rs
@@ -1,6 +1,7 @@
use actix_web::{web, App, HttpServer};
use actix_web_opentelemetry::{PrometheusMetricsHandler, RequestTracing};
use conductor::errors::ConductorError;
use conductor::heartbeat_monitor;
use conductor::monitoring::CustomMetrics;
use conductor::{
cloud::CloudProvider, create_cloudformation, create_gcp_storage_workload_identity_binding,
@@ -28,6 +29,7 @@ use sqlx::error::Error;
use sqlx::postgres::PgPoolOptions;
use std::env;
use std::sync::{Arc, Mutex};
use std::time::Duration;
use std::{thread, time};
use types::{CRUDevent, Event};

@@ -695,6 +697,8 @@ async fn main() -> std::io::Result<()> {
let status_reporter_enabled = from_env_default("WATCHER_ENABLED", "true");
let metrics_reported_enabled = from_env_default("METRICS_REPORTER_ENABLED", "false");

let (heartbeat_monitor, heartbeat_updater) = heartbeat_monitor::start(Duration::from_secs(60));

if conductor_enabled != "false" {
info!("Starting conductor");
background_threads_locked.push(tokio::spawn({
@@ -759,7 +763,7 @@
let custom_metrics_copy = custom_metrics.clone();
background_threads_locked.push(tokio::spawn(async move {
let custom_metrics = &custom_metrics_copy;
if let Err(err) = run_metrics_reporter().await {
if let Err(err) = run_metrics_reporter(heartbeat_updater.clone()).await {
custom_metrics
.conductor_errors
.add(&opentelemetry::Context::current(), 1, &[]);
@@ -783,6 +787,7 @@
App::new()
.app_data(web::Data::new(custom_metrics.clone()))
.app_data(web::Data::new(background_threads.clone()))
.app_data(web::Data::new(heartbeat_monitor.clone()))
.wrap(RequestTracing::new())
.route(
"/metrics",
4 changes: 3 additions & 1 deletion conductor/src/metrics_reporter.rs
@@ -1,4 +1,5 @@
use anyhow::{bail, Context, Result};
use conductor::heartbeat_monitor::HeartbeatUpdater;
use conductor::metrics::dataplane_metrics::split_data_plane_metrics;
use conductor::metrics::{dataplane_metrics::DataPlaneMetrics, prometheus::Metrics};
use log::{error, info};
@@ -28,7 +29,7 @@ fn load_metric_queries() -> Result<MetricQueries> {
serde_yaml::from_str(METRICS_FILE).map_err(Into::into)
}

pub async fn run_metrics_reporter() -> Result<()> {
pub async fn run_metrics_reporter(heartbeat_updater: HeartbeatUpdater) -> Result<()> {
let client = Client::new().await;

let MetricQueries { metrics } = load_metric_queries()?;
@@ -55,6 +56,7 @@ pub async fn run_metrics_reporter() -> Result<()> {

loop {
sync_interval.tick().await;
heartbeat_updater.update_heartbeat();

let now = Instant::now();
for metric in &metrics {
14 changes: 13 additions & 1 deletion conductor/src/routes/health.rs
@@ -1,9 +1,15 @@
use actix_web::{get, web, HttpResponse, Responder};
use std::sync::{Arc, Mutex};
use std::{
ops::Not,
sync::{Arc, Mutex},
};

use crate::heartbeat_monitor::HeartbeatMonitor;

#[get("/lively")]
pub async fn background_threads_running(
background_threads: web::Data<Arc<Mutex<Vec<tokio::task::JoinHandle<()>>>>>,
heartbeat_monitor: web::Data<HeartbeatMonitor>,
) -> impl Responder {
let background_threads = match background_threads.lock() {
Ok(threads) => threads,
@@ -12,6 +18,12 @@ pub async fn background_threads_running(
.body("Failed to check if background tasks are running.")
}
};

if heartbeat_monitor.is_heartbeat_active().not() {
return HttpResponse::InternalServerError()
.body("One or more background tasks are not responding.");
}

for thread in background_threads.iter() {
if thread.is_finished() {
return HttpResponse::InternalServerError()