//! Execution Timeout Monitor //! //! This module monitors executions in SCHEDULED status and fails them if they //! don't transition to RUNNING within a configured timeout period. //! //! This prevents executions from being stuck indefinitely when workers: //! - Stop or crash after being selected //! - Fail to consume messages from their queues //! - Are partitioned from the network use anyhow::Result; use attune_common::{ models::{enums::ExecutionStatus, Execution}, mq::{MessageEnvelope, MessageType, Publisher}, repositories::execution::SELECT_COLUMNS as EXECUTION_COLUMNS, }; use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use serde_json::Value as JsonValue; use sqlx::PgPool; use std::sync::Arc; use std::time::Duration; use tokio::time::interval; use tracing::{debug, error, info, warn}; /// Configuration for timeout monitor #[derive(Debug, Clone)] pub struct TimeoutMonitorConfig { /// How long an execution can remain in SCHEDULED status before timing out pub scheduled_timeout: Duration, /// How often to check for stale executions pub check_interval: Duration, /// Whether to enable the timeout monitor pub enabled: bool, } impl Default for TimeoutMonitorConfig { fn default() -> Self { Self { scheduled_timeout: Duration::from_secs(300), // 5 minutes check_interval: Duration::from_secs(60), // 1 minute enabled: true, } } } /// Payload for execution completion messages #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ExecutionCompletedPayload { pub execution_id: i64, pub status: ExecutionStatus, pub result: Option, } /// Monitors scheduled executions and fails those that timeout pub struct ExecutionTimeoutMonitor { pool: PgPool, publisher: Arc, config: TimeoutMonitorConfig, } impl ExecutionTimeoutMonitor { /// Create a new timeout monitor pub fn new(pool: PgPool, publisher: Arc, config: TimeoutMonitorConfig) -> Self { Self { pool, publisher, config, } } /// Start the timeout monitor loop pub async fn start(self: Arc) -> Result<()> { if !self.config.enabled { info!("Execution timeout monitor is disabled"); return Ok(()); } info!( "Starting execution timeout monitor (timeout: {}s, check interval: {}s)", self.config.scheduled_timeout.as_secs(), self.config.check_interval.as_secs() ); let mut check_interval = interval(self.config.check_interval); loop { check_interval.tick().await; if let Err(e) = self.check_stale_executions().await { error!("Error checking stale executions: {}", e); // Continue running despite errors } } } /// Check for executions stuck in SCHEDULED status async fn check_stale_executions(&self) -> Result<()> { let cutoff = self.calculate_cutoff_time(); debug!( "Checking for executions scheduled before {}", cutoff.format("%Y-%m-%d %H:%M:%S UTC") ); // Find executions stuck in SCHEDULED status let sql = format!( "SELECT {EXECUTION_COLUMNS} FROM execution \ WHERE status = $1 AND updated < $2 \ ORDER BY updated ASC LIMIT 100" ); let stale_executions = sqlx::query_as::<_, Execution>(&sql) .bind(ExecutionStatus::Scheduled) .bind(cutoff) .fetch_all(&self.pool) .await?; if stale_executions.is_empty() { debug!("No stale scheduled executions found"); return Ok(()); } warn!( "Found {} stale scheduled executions (older than {}s)", stale_executions.len(), self.config.scheduled_timeout.as_secs() ); for execution in stale_executions { let age_seconds = (Utc::now() - execution.updated).num_seconds(); warn!( "Execution {} has been scheduled for {} seconds (timeout: {}s), marking as failed", execution.id, age_seconds, self.config.scheduled_timeout.as_secs() ); if let Err(e) = self.fail_execution(&execution, age_seconds).await { error!("Failed to fail execution {}: {}", execution.id, e); // Continue processing other executions } } Ok(()) } /// Calculate the cutoff time for stale executions fn calculate_cutoff_time(&self) -> DateTime { let timeout_duration = chrono::Duration::from_std(self.config.scheduled_timeout) .expect("Invalid timeout duration"); Utc::now() - timeout_duration } /// Mark an execution as failed due to timeout async fn fail_execution(&self, execution: &Execution, age_seconds: i64) -> Result<()> { let execution_id = execution.id; let error_message = format!( "Execution timeout: worker did not pick up task within {} seconds (scheduled for {} seconds)", self.config.scheduled_timeout.as_secs(), age_seconds ); info!( "Failing execution {} due to timeout: {}", execution_id, error_message ); // Create failure result let result = serde_json::json!({ "error": error_message, "failed_by": "execution_timeout_monitor", "timeout_seconds": self.config.scheduled_timeout.as_secs(), "age_seconds": age_seconds, "original_status": "scheduled" }); // Update execution status in database sqlx::query( "UPDATE execution SET status = $1, result = $2, updated = NOW() WHERE id = $3", ) .bind(ExecutionStatus::Failed) .bind(&result) .bind(execution_id) .execute(&self.pool) .await?; info!("Execution {} marked as failed in database", execution_id); // Publish completion notification self.publish_completion_notification(execution_id, result) .await?; info!( "Published completion notification for execution {}", execution_id ); Ok(()) } /// Publish execution completion notification async fn publish_completion_notification( &self, execution_id: i64, result: JsonValue, ) -> Result<()> { let payload = ExecutionCompletedPayload { execution_id, status: ExecutionStatus::Failed, result: Some(result), }; let envelope = MessageEnvelope::new(MessageType::ExecutionCompleted, payload) .with_source("execution_timeout_monitor"); // Publish to main executions exchange self.publisher.publish_envelope(&envelope).await?; Ok(()) } /// Get current configuration #[allow(dead_code)] pub fn config(&self) -> &TimeoutMonitorConfig { &self.config } } #[cfg(test)] mod tests { use super::*; use chrono::Duration as ChronoDuration; fn create_test_config() -> TimeoutMonitorConfig { TimeoutMonitorConfig { scheduled_timeout: Duration::from_secs(60), // 1 minute for tests check_interval: Duration::from_secs(1), // 1 second for tests enabled: true, } } #[test] fn test_config_defaults() { let config = TimeoutMonitorConfig::default(); assert_eq!(config.scheduled_timeout.as_secs(), 300); assert_eq!(config.check_interval.as_secs(), 60); assert!(config.enabled); } #[test] fn test_cutoff_calculation() { // Test that cutoff is calculated as now - scheduled_timeout let config = create_test_config(); // scheduled_timeout = 60s let before = Utc::now() - ChronoDuration::seconds(60); // calculate_cutoff uses Utc::now() internally, so we compute expected bounds let timeout_duration = chrono::Duration::from_std(config.scheduled_timeout).expect("Invalid timeout duration"); let cutoff = Utc::now() - timeout_duration; let after = Utc::now() - ChronoDuration::seconds(60); // cutoff should be between before and after (both ~60s ago) let diff_before = (cutoff - before).num_seconds().abs(); let diff_after = (cutoff - after).num_seconds().abs(); assert!( diff_before <= 1, "Cutoff time should be ~60s ago (before check)" ); assert!( diff_after <= 1, "Cutoff time should be ~60s ago (after check)" ); } #[test] fn test_disabled_config() { let mut config = create_test_config(); config.enabled = false; // Verify the config is properly set to disabled assert!(!config.enabled); assert_eq!(config.scheduled_timeout.as_secs(), 60); assert_eq!(config.check_interval.as_secs(), 1); } }