//! Execution Scheduler - Routes executions to available workers //! //! This module is responsible for: //! - Listening for ExecutionRequested messages //! - Selecting appropriate workers for executions //! - Queuing executions to worker-specific queues //! - Updating execution status to Scheduled //! - Handling worker unavailability and retries use anyhow::Result; use attune_common::{ models::{enums::ExecutionStatus, Action, Execution}, mq::{Consumer, ExecutionRequestedPayload, MessageEnvelope, MessageType, Publisher}, repositories::{ action::ActionRepository, execution::ExecutionRepository, runtime::{RuntimeRepository, WorkerRepository}, FindById, FindByRef, Update, }, }; use chrono::Utc; use serde::{Deserialize, Serialize}; use serde_json::Value as JsonValue; use sqlx::PgPool; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; use std::time::Duration; use tracing::{debug, error, info, warn}; /// Payload for execution scheduled messages #[derive(Debug, Clone, Serialize, Deserialize)] struct ExecutionScheduledPayload { execution_id: i64, worker_id: i64, action_ref: String, config: Option, } /// Execution scheduler that routes executions to workers pub struct ExecutionScheduler { pool: PgPool, publisher: Arc, consumer: Arc, /// Round-robin counter for distributing executions across workers round_robin_counter: AtomicUsize, } /// Default heartbeat interval in seconds (should match worker config default) const DEFAULT_HEARTBEAT_INTERVAL: u64 = 30; /// Maximum age multiplier for heartbeat staleness check /// Workers are considered stale if heartbeat is older than HEARTBEAT_INTERVAL * HEARTBEAT_STALENESS_MULTIPLIER const HEARTBEAT_STALENESS_MULTIPLIER: u64 = 3; impl ExecutionScheduler { /// Create a new execution scheduler pub fn new(pool: PgPool, publisher: Arc, consumer: Arc) -> Self { Self { pool, publisher, consumer, round_robin_counter: AtomicUsize::new(0), } } /// Start processing execution requested messages pub async fn start(&self) -> Result<()> { info!("Starting execution scheduler"); let pool = self.pool.clone(); let publisher = self.publisher.clone(); // Share the counter with the handler closure via Arc. // We wrap &self's AtomicUsize in a new Arc by copying the // current value so the closure is 'static. let counter = Arc::new(AtomicUsize::new( self.round_robin_counter.load(Ordering::Relaxed), )); // Use the handler pattern to consume messages self.consumer .consume_with_handler( move |envelope: MessageEnvelope| { let pool = pool.clone(); let publisher = publisher.clone(); let counter = counter.clone(); async move { if let Err(e) = Self::process_execution_requested( &pool, &publisher, &counter, &envelope, ) .await { error!("Error scheduling execution: {}", e); // Return error to trigger nack with requeue return Err(format!("Failed to schedule execution: {}", e).into()); } Ok(()) } }, ) .await?; Ok(()) } /// Process an execution requested message async fn process_execution_requested( pool: &PgPool, publisher: &Publisher, round_robin_counter: &AtomicUsize, envelope: &MessageEnvelope, ) -> Result<()> { debug!("Processing execution requested message: {:?}", envelope); let execution_id = envelope.payload.execution_id; info!("Scheduling execution: {}", execution_id); // Fetch execution from database let mut execution = ExecutionRepository::find_by_id(pool, execution_id) .await? .ok_or_else(|| anyhow::anyhow!("Execution not found: {}", execution_id))?; // Fetch action to determine runtime requirements let action = Self::get_action_for_execution(pool, &execution).await?; // Select appropriate worker (round-robin among compatible workers) let worker = Self::select_worker(pool, &action, round_robin_counter).await?; info!( "Selected worker {} for execution {}", worker.id, execution_id ); // Update execution status to scheduled let execution_config = execution.config.clone(); execution.status = ExecutionStatus::Scheduled; ExecutionRepository::update(pool, execution.id, execution.into()).await?; // Publish message to worker-specific queue Self::queue_to_worker( publisher, &execution_id, &worker.id, &envelope.payload.action_ref, &execution_config, &action, ) .await?; info!( "Execution {} scheduled to worker {}", execution_id, worker.id ); Ok(()) } /// Get the action associated with an execution async fn get_action_for_execution(pool: &PgPool, execution: &Execution) -> Result { // Try to get action by ID first if let Some(action_id) = execution.action { if let Some(action) = ActionRepository::find_by_id(pool, action_id).await? { return Ok(action); } } // Fall back to action_ref ActionRepository::find_by_ref(pool, &execution.action_ref) .await? .ok_or_else(|| anyhow::anyhow!("Action not found for execution: {}", execution.id)) } /// Select an appropriate worker for the execution /// /// Uses round-robin selection among compatible, active, and healthy workers /// to distribute load evenly across the worker pool. async fn select_worker( pool: &PgPool, action: &Action, round_robin_counter: &AtomicUsize, ) -> Result { // Get runtime requirements for the action let runtime = if let Some(runtime_id) = action.runtime { RuntimeRepository::find_by_id(pool, runtime_id).await? } else { None }; // Find available action workers (role = 'action') let workers = WorkerRepository::find_action_workers(pool).await?; if workers.is_empty() { return Err(anyhow::anyhow!("No action workers available")); } // Filter workers by runtime compatibility if runtime is specified let compatible_workers: Vec<_> = if let Some(ref runtime) = runtime { workers .into_iter() .filter(|w| Self::worker_supports_runtime(w, &runtime.name)) .collect() } else { workers }; if compatible_workers.is_empty() { let runtime_name = runtime.as_ref().map(|r| r.name.as_str()).unwrap_or("any"); return Err(anyhow::anyhow!( "No compatible workers found for action: {} (requires runtime: {})", action.r#ref, runtime_name )); } // Filter by worker status (only active workers) let active_workers: Vec<_> = compatible_workers .into_iter() .filter(|w| w.status == Some(attune_common::models::enums::WorkerStatus::Active)) .collect(); if active_workers.is_empty() { return Err(anyhow::anyhow!("No active workers available")); } // Filter by heartbeat freshness (only workers with recent heartbeats) let fresh_workers: Vec<_> = active_workers .into_iter() .filter(|w| Self::is_worker_heartbeat_fresh(w)) .collect(); if fresh_workers.is_empty() { warn!("No workers with fresh heartbeats available. All active workers have stale heartbeats."); return Err(anyhow::anyhow!( "No workers with fresh heartbeats available (heartbeat older than {} seconds)", DEFAULT_HEARTBEAT_INTERVAL * HEARTBEAT_STALENESS_MULTIPLIER )); } // Round-robin selection: distribute executions evenly across workers. // Each call increments the counter and picks the next worker in the list. let count = round_robin_counter.fetch_add(1, Ordering::Relaxed); let index = count % fresh_workers.len(); let selected = fresh_workers .into_iter() .nth(index) .expect("Worker list should not be empty"); info!( "Selected worker {} (id={}) via round-robin (index {} of available workers)", selected.name, selected.id, index ); Ok(selected) } /// Check if a worker supports a given runtime /// /// This checks the worker's capabilities.runtimes array for the runtime name. /// Falls back to checking the deprecated runtime column if capabilities are not set. fn worker_supports_runtime(worker: &attune_common::models::Worker, runtime_name: &str) -> bool { // First, try to parse capabilities and check runtimes array if let Some(ref capabilities) = worker.capabilities { if let Some(runtimes) = capabilities.get("runtimes") { if let Some(runtime_array) = runtimes.as_array() { // Check if any runtime in the array matches (case-insensitive) for runtime_value in runtime_array { if let Some(runtime_str) = runtime_value.as_str() { if runtime_str.eq_ignore_ascii_case(runtime_name) { debug!( "Worker {} supports runtime '{}' via capabilities", worker.name, runtime_name ); return true; } } } } } } // Fallback: check deprecated runtime column // This is kept for backward compatibility but should be removed in the future if worker.runtime.is_some() { debug!( "Worker {} using deprecated runtime column for matching", worker.name ); // Note: This fallback is incomplete because we'd need to look up the runtime name // from the ID, which would require an async call. Since we're moving to capabilities, // we'll just return false here and require workers to set capabilities properly. } debug!( "Worker {} does not support runtime '{}'", worker.name, runtime_name ); false } /// Check if a worker's heartbeat is fresh enough to schedule work /// /// A worker is considered fresh if its last heartbeat is within /// HEARTBEAT_STALENESS_MULTIPLIER * HEARTBEAT_INTERVAL seconds. fn is_worker_heartbeat_fresh(worker: &attune_common::models::Worker) -> bool { let Some(last_heartbeat) = worker.last_heartbeat else { warn!( "Worker {} has no heartbeat recorded, considering stale", worker.name ); return false; }; let now = Utc::now(); let age = now.signed_duration_since(last_heartbeat); let max_age = Duration::from_secs(DEFAULT_HEARTBEAT_INTERVAL * HEARTBEAT_STALENESS_MULTIPLIER); let is_fresh = age.to_std().unwrap_or(Duration::MAX) <= max_age; if !is_fresh { warn!( "Worker {} heartbeat is stale: last seen {} seconds ago (max: {} seconds)", worker.name, age.num_seconds(), max_age.as_secs() ); } else { debug!( "Worker {} heartbeat is fresh: last seen {} seconds ago", worker.name, age.num_seconds() ); } is_fresh } /// Queue execution to a specific worker async fn queue_to_worker( publisher: &Publisher, execution_id: &i64, worker_id: &i64, action_ref: &str, config: &Option, _action: &Action, ) -> Result<()> { debug!("Queuing execution {} to worker {}", execution_id, worker_id); // Create payload for worker let payload = ExecutionScheduledPayload { execution_id: *execution_id, worker_id: *worker_id, action_ref: action_ref.to_string(), config: config.clone(), }; let envelope = MessageEnvelope::new(MessageType::ExecutionRequested, payload).with_source("executor"); // Publish to worker-specific queue with routing key let routing_key = format!("execution.dispatch.worker.{}", worker_id); let exchange = "attune.executions"; publisher .publish_envelope_with_routing(&envelope, exchange, &routing_key) .await?; info!( "Published execution.scheduled message to worker {} (routing key: {})", worker_id, routing_key ); Ok(()) } } #[cfg(test)] mod tests { use super::*; use attune_common::models::{Worker, WorkerRole, WorkerStatus, WorkerType}; use chrono::{Duration as ChronoDuration, Utc}; fn create_test_worker(name: &str, heartbeat_offset_secs: i64) -> Worker { let last_heartbeat = if heartbeat_offset_secs == 0 { None } else { Some(Utc::now() - ChronoDuration::seconds(heartbeat_offset_secs)) }; Worker { id: 1, name: name.to_string(), worker_type: WorkerType::Local, worker_role: WorkerRole::Action, runtime: None, host: Some("localhost".to_string()), port: Some(8080), status: Some(WorkerStatus::Active), capabilities: Some(serde_json::json!({ "runtimes": ["shell", "python"] })), meta: None, last_heartbeat, created: Utc::now(), updated: Utc::now(), } } #[test] fn test_heartbeat_freshness_with_recent_heartbeat() { // Worker with heartbeat 30 seconds ago (within limit) let worker = create_test_worker("test-worker", 30); assert!( ExecutionScheduler::is_worker_heartbeat_fresh(&worker), "Worker with 30s old heartbeat should be considered fresh" ); } #[test] fn test_heartbeat_freshness_with_stale_heartbeat() { // Worker with heartbeat 100 seconds ago (beyond 3x30s = 90s limit) let worker = create_test_worker("test-worker", 100); assert!( !ExecutionScheduler::is_worker_heartbeat_fresh(&worker), "Worker with 100s old heartbeat should be considered stale" ); } #[test] fn test_heartbeat_freshness_at_boundary() { // Worker with heartbeat exactly at the 90 second boundary let worker = create_test_worker("test-worker", 90); assert!( !ExecutionScheduler::is_worker_heartbeat_fresh(&worker), "Worker with 90s old heartbeat should be considered stale (at boundary)" ); } #[test] fn test_heartbeat_freshness_with_no_heartbeat() { // Worker with no heartbeat recorded let worker = create_test_worker("test-worker", 0); assert!( !ExecutionScheduler::is_worker_heartbeat_fresh(&worker), "Worker with no heartbeat should be considered stale" ); } #[test] fn test_heartbeat_freshness_with_very_recent() { // Worker with heartbeat 5 seconds ago let worker = create_test_worker("test-worker", 5); assert!( ExecutionScheduler::is_worker_heartbeat_fresh(&worker), "Worker with 5s old heartbeat should be considered fresh" ); } #[test] fn test_scheduler_creation() { // This is a placeholder test // Real tests will require database and message queue setup assert!(true); } }