467 lines
16 KiB
Rust
467 lines
16 KiB
Rust
//! Execution Scheduler - Routes executions to available workers
|
|
//!
|
|
//! This module is responsible for:
|
|
//! - Listening for ExecutionRequested messages
|
|
//! - Selecting appropriate workers for executions
|
|
//! - Queuing executions to worker-specific queues
|
|
//! - Updating execution status to Scheduled
|
|
//! - Handling worker unavailability and retries
|
|
|
|
use anyhow::Result;
|
|
use attune_common::{
|
|
models::{enums::ExecutionStatus, Action, Execution},
|
|
mq::{Consumer, ExecutionRequestedPayload, MessageEnvelope, MessageType, Publisher},
|
|
repositories::{
|
|
action::ActionRepository,
|
|
execution::ExecutionRepository,
|
|
runtime::{RuntimeRepository, WorkerRepository},
|
|
FindById, FindByRef, Update,
|
|
},
|
|
};
|
|
use chrono::Utc;
|
|
use serde::{Deserialize, Serialize};
|
|
use serde_json::Value as JsonValue;
|
|
use sqlx::PgPool;
|
|
use std::sync::atomic::{AtomicUsize, Ordering};
|
|
use std::sync::Arc;
|
|
use std::time::Duration;
|
|
use tracing::{debug, error, info, warn};
|
|
|
|
/// Payload for execution scheduled messages
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
struct ExecutionScheduledPayload {
|
|
execution_id: i64,
|
|
worker_id: i64,
|
|
action_ref: String,
|
|
config: Option<JsonValue>,
|
|
}
|
|
|
|
/// Execution scheduler that routes executions to workers
|
|
pub struct ExecutionScheduler {
|
|
pool: PgPool,
|
|
publisher: Arc<Publisher>,
|
|
consumer: Arc<Consumer>,
|
|
/// Round-robin counter for distributing executions across workers
|
|
round_robin_counter: AtomicUsize,
|
|
}
|
|
|
|
/// Default heartbeat interval in seconds (should match worker config default)
|
|
const DEFAULT_HEARTBEAT_INTERVAL: u64 = 30;
|
|
|
|
/// Maximum age multiplier for heartbeat staleness check
|
|
/// Workers are considered stale if heartbeat is older than HEARTBEAT_INTERVAL * HEARTBEAT_STALENESS_MULTIPLIER
|
|
const HEARTBEAT_STALENESS_MULTIPLIER: u64 = 3;
|
|
|
|
impl ExecutionScheduler {
|
|
/// Create a new execution scheduler
|
|
pub fn new(pool: PgPool, publisher: Arc<Publisher>, consumer: Arc<Consumer>) -> Self {
|
|
Self {
|
|
pool,
|
|
publisher,
|
|
consumer,
|
|
round_robin_counter: AtomicUsize::new(0),
|
|
}
|
|
}
|
|
|
|
/// Start processing execution requested messages
|
|
pub async fn start(&self) -> Result<()> {
|
|
info!("Starting execution scheduler");
|
|
|
|
let pool = self.pool.clone();
|
|
let publisher = self.publisher.clone();
|
|
// Share the counter with the handler closure via Arc.
|
|
// We wrap &self's AtomicUsize in a new Arc<AtomicUsize> by copying the
|
|
// current value so the closure is 'static.
|
|
let counter = Arc::new(AtomicUsize::new(
|
|
self.round_robin_counter.load(Ordering::Relaxed),
|
|
));
|
|
|
|
// Use the handler pattern to consume messages
|
|
self.consumer
|
|
.consume_with_handler(
|
|
move |envelope: MessageEnvelope<ExecutionRequestedPayload>| {
|
|
let pool = pool.clone();
|
|
let publisher = publisher.clone();
|
|
let counter = counter.clone();
|
|
|
|
async move {
|
|
if let Err(e) = Self::process_execution_requested(
|
|
&pool, &publisher, &counter, &envelope,
|
|
)
|
|
.await
|
|
{
|
|
error!("Error scheduling execution: {}", e);
|
|
// Return error to trigger nack with requeue
|
|
return Err(format!("Failed to schedule execution: {}", e).into());
|
|
}
|
|
Ok(())
|
|
}
|
|
},
|
|
)
|
|
.await?;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Process an execution requested message
|
|
async fn process_execution_requested(
|
|
pool: &PgPool,
|
|
publisher: &Publisher,
|
|
round_robin_counter: &AtomicUsize,
|
|
envelope: &MessageEnvelope<ExecutionRequestedPayload>,
|
|
) -> Result<()> {
|
|
debug!("Processing execution requested message: {:?}", envelope);
|
|
|
|
let execution_id = envelope.payload.execution_id;
|
|
|
|
info!("Scheduling execution: {}", execution_id);
|
|
|
|
// Fetch execution from database
|
|
let mut execution = ExecutionRepository::find_by_id(pool, execution_id)
|
|
.await?
|
|
.ok_or_else(|| anyhow::anyhow!("Execution not found: {}", execution_id))?;
|
|
|
|
// Fetch action to determine runtime requirements
|
|
let action = Self::get_action_for_execution(pool, &execution).await?;
|
|
|
|
// Select appropriate worker (round-robin among compatible workers)
|
|
let worker = Self::select_worker(pool, &action, round_robin_counter).await?;
|
|
|
|
info!(
|
|
"Selected worker {} for execution {}",
|
|
worker.id, execution_id
|
|
);
|
|
|
|
// Update execution status to scheduled
|
|
let execution_config = execution.config.clone();
|
|
execution.status = ExecutionStatus::Scheduled;
|
|
ExecutionRepository::update(pool, execution.id, execution.into()).await?;
|
|
|
|
// Publish message to worker-specific queue
|
|
Self::queue_to_worker(
|
|
publisher,
|
|
&execution_id,
|
|
&worker.id,
|
|
&envelope.payload.action_ref,
|
|
&execution_config,
|
|
&action,
|
|
)
|
|
.await?;
|
|
|
|
info!(
|
|
"Execution {} scheduled to worker {}",
|
|
execution_id, worker.id
|
|
);
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Get the action associated with an execution
|
|
async fn get_action_for_execution(pool: &PgPool, execution: &Execution) -> Result<Action> {
|
|
// Try to get action by ID first
|
|
if let Some(action_id) = execution.action {
|
|
if let Some(action) = ActionRepository::find_by_id(pool, action_id).await? {
|
|
return Ok(action);
|
|
}
|
|
}
|
|
|
|
// Fall back to action_ref
|
|
ActionRepository::find_by_ref(pool, &execution.action_ref)
|
|
.await?
|
|
.ok_or_else(|| anyhow::anyhow!("Action not found for execution: {}", execution.id))
|
|
}
|
|
|
|
/// Select an appropriate worker for the execution
|
|
///
|
|
/// Uses round-robin selection among compatible, active, and healthy workers
|
|
/// to distribute load evenly across the worker pool.
|
|
async fn select_worker(
|
|
pool: &PgPool,
|
|
action: &Action,
|
|
round_robin_counter: &AtomicUsize,
|
|
) -> Result<attune_common::models::Worker> {
|
|
// Get runtime requirements for the action
|
|
let runtime = if let Some(runtime_id) = action.runtime {
|
|
RuntimeRepository::find_by_id(pool, runtime_id).await?
|
|
} else {
|
|
None
|
|
};
|
|
|
|
// Find available action workers (role = 'action')
|
|
let workers = WorkerRepository::find_action_workers(pool).await?;
|
|
|
|
if workers.is_empty() {
|
|
return Err(anyhow::anyhow!("No action workers available"));
|
|
}
|
|
|
|
// Filter workers by runtime compatibility if runtime is specified
|
|
let compatible_workers: Vec<_> = if let Some(ref runtime) = runtime {
|
|
workers
|
|
.into_iter()
|
|
.filter(|w| Self::worker_supports_runtime(w, &runtime.name))
|
|
.collect()
|
|
} else {
|
|
workers
|
|
};
|
|
|
|
if compatible_workers.is_empty() {
|
|
let runtime_name = runtime.as_ref().map(|r| r.name.as_str()).unwrap_or("any");
|
|
return Err(anyhow::anyhow!(
|
|
"No compatible workers found for action: {} (requires runtime: {})",
|
|
action.r#ref,
|
|
runtime_name
|
|
));
|
|
}
|
|
|
|
// Filter by worker status (only active workers)
|
|
let active_workers: Vec<_> = compatible_workers
|
|
.into_iter()
|
|
.filter(|w| w.status == Some(attune_common::models::enums::WorkerStatus::Active))
|
|
.collect();
|
|
|
|
if active_workers.is_empty() {
|
|
return Err(anyhow::anyhow!("No active workers available"));
|
|
}
|
|
|
|
// Filter by heartbeat freshness (only workers with recent heartbeats)
|
|
let fresh_workers: Vec<_> = active_workers
|
|
.into_iter()
|
|
.filter(|w| Self::is_worker_heartbeat_fresh(w))
|
|
.collect();
|
|
|
|
if fresh_workers.is_empty() {
|
|
warn!("No workers with fresh heartbeats available. All active workers have stale heartbeats.");
|
|
return Err(anyhow::anyhow!(
|
|
"No workers with fresh heartbeats available (heartbeat older than {} seconds)",
|
|
DEFAULT_HEARTBEAT_INTERVAL * HEARTBEAT_STALENESS_MULTIPLIER
|
|
));
|
|
}
|
|
|
|
// Round-robin selection: distribute executions evenly across workers.
|
|
// Each call increments the counter and picks the next worker in the list.
|
|
let count = round_robin_counter.fetch_add(1, Ordering::Relaxed);
|
|
let index = count % fresh_workers.len();
|
|
let selected = fresh_workers
|
|
.into_iter()
|
|
.nth(index)
|
|
.expect("Worker list should not be empty");
|
|
|
|
info!(
|
|
"Selected worker {} (id={}) via round-robin (index {} of available workers)",
|
|
selected.name, selected.id, index
|
|
);
|
|
|
|
Ok(selected)
|
|
}
|
|
|
|
/// Check if a worker supports a given runtime
|
|
///
|
|
/// This checks the worker's capabilities.runtimes array for the runtime name.
|
|
/// Falls back to checking the deprecated runtime column if capabilities are not set.
|
|
fn worker_supports_runtime(worker: &attune_common::models::Worker, runtime_name: &str) -> bool {
|
|
// First, try to parse capabilities and check runtimes array
|
|
if let Some(ref capabilities) = worker.capabilities {
|
|
if let Some(runtimes) = capabilities.get("runtimes") {
|
|
if let Some(runtime_array) = runtimes.as_array() {
|
|
// Check if any runtime in the array matches (case-insensitive)
|
|
for runtime_value in runtime_array {
|
|
if let Some(runtime_str) = runtime_value.as_str() {
|
|
if runtime_str.eq_ignore_ascii_case(runtime_name) {
|
|
debug!(
|
|
"Worker {} supports runtime '{}' via capabilities",
|
|
worker.name, runtime_name
|
|
);
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Fallback: check deprecated runtime column
|
|
// This is kept for backward compatibility but should be removed in the future
|
|
if worker.runtime.is_some() {
|
|
debug!(
|
|
"Worker {} using deprecated runtime column for matching",
|
|
worker.name
|
|
);
|
|
// Note: This fallback is incomplete because we'd need to look up the runtime name
|
|
// from the ID, which would require an async call. Since we're moving to capabilities,
|
|
// we'll just return false here and require workers to set capabilities properly.
|
|
}
|
|
|
|
debug!(
|
|
"Worker {} does not support runtime '{}'",
|
|
worker.name, runtime_name
|
|
);
|
|
false
|
|
}
|
|
|
|
/// Check if a worker's heartbeat is fresh enough to schedule work
|
|
///
|
|
/// A worker is considered fresh if its last heartbeat is within
|
|
/// HEARTBEAT_STALENESS_MULTIPLIER * HEARTBEAT_INTERVAL seconds.
|
|
fn is_worker_heartbeat_fresh(worker: &attune_common::models::Worker) -> bool {
|
|
let Some(last_heartbeat) = worker.last_heartbeat else {
|
|
warn!(
|
|
"Worker {} has no heartbeat recorded, considering stale",
|
|
worker.name
|
|
);
|
|
return false;
|
|
};
|
|
|
|
let now = Utc::now();
|
|
let age = now.signed_duration_since(last_heartbeat);
|
|
let max_age =
|
|
Duration::from_secs(DEFAULT_HEARTBEAT_INTERVAL * HEARTBEAT_STALENESS_MULTIPLIER);
|
|
|
|
let is_fresh = age.to_std().unwrap_or(Duration::MAX) <= max_age;
|
|
|
|
if !is_fresh {
|
|
warn!(
|
|
"Worker {} heartbeat is stale: last seen {} seconds ago (max: {} seconds)",
|
|
worker.name,
|
|
age.num_seconds(),
|
|
max_age.as_secs()
|
|
);
|
|
} else {
|
|
debug!(
|
|
"Worker {} heartbeat is fresh: last seen {} seconds ago",
|
|
worker.name,
|
|
age.num_seconds()
|
|
);
|
|
}
|
|
|
|
is_fresh
|
|
}
|
|
|
|
/// Queue execution to a specific worker
|
|
async fn queue_to_worker(
|
|
publisher: &Publisher,
|
|
execution_id: &i64,
|
|
worker_id: &i64,
|
|
action_ref: &str,
|
|
config: &Option<JsonValue>,
|
|
_action: &Action,
|
|
) -> Result<()> {
|
|
debug!("Queuing execution {} to worker {}", execution_id, worker_id);
|
|
|
|
// Create payload for worker
|
|
let payload = ExecutionScheduledPayload {
|
|
execution_id: *execution_id,
|
|
worker_id: *worker_id,
|
|
action_ref: action_ref.to_string(),
|
|
config: config.clone(),
|
|
};
|
|
|
|
let envelope =
|
|
MessageEnvelope::new(MessageType::ExecutionRequested, payload).with_source("executor");
|
|
|
|
// Publish to worker-specific queue with routing key
|
|
let routing_key = format!("execution.dispatch.worker.{}", worker_id);
|
|
let exchange = "attune.executions";
|
|
|
|
publisher
|
|
.publish_envelope_with_routing(&envelope, exchange, &routing_key)
|
|
.await?;
|
|
|
|
info!(
|
|
"Published execution.scheduled message to worker {} (routing key: {})",
|
|
worker_id, routing_key
|
|
);
|
|
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::*;
|
|
use attune_common::models::{Worker, WorkerRole, WorkerStatus, WorkerType};
|
|
use chrono::{Duration as ChronoDuration, Utc};
|
|
|
|
fn create_test_worker(name: &str, heartbeat_offset_secs: i64) -> Worker {
|
|
let last_heartbeat = if heartbeat_offset_secs == 0 {
|
|
None
|
|
} else {
|
|
Some(Utc::now() - ChronoDuration::seconds(heartbeat_offset_secs))
|
|
};
|
|
|
|
Worker {
|
|
id: 1,
|
|
name: name.to_string(),
|
|
worker_type: WorkerType::Local,
|
|
worker_role: WorkerRole::Action,
|
|
runtime: None,
|
|
host: Some("localhost".to_string()),
|
|
port: Some(8080),
|
|
status: Some(WorkerStatus::Active),
|
|
capabilities: Some(serde_json::json!({
|
|
"runtimes": ["shell", "python"]
|
|
})),
|
|
meta: None,
|
|
last_heartbeat,
|
|
created: Utc::now(),
|
|
updated: Utc::now(),
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_heartbeat_freshness_with_recent_heartbeat() {
|
|
// Worker with heartbeat 30 seconds ago (within limit)
|
|
let worker = create_test_worker("test-worker", 30);
|
|
assert!(
|
|
ExecutionScheduler::is_worker_heartbeat_fresh(&worker),
|
|
"Worker with 30s old heartbeat should be considered fresh"
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_heartbeat_freshness_with_stale_heartbeat() {
|
|
// Worker with heartbeat 100 seconds ago (beyond 3x30s = 90s limit)
|
|
let worker = create_test_worker("test-worker", 100);
|
|
assert!(
|
|
!ExecutionScheduler::is_worker_heartbeat_fresh(&worker),
|
|
"Worker with 100s old heartbeat should be considered stale"
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_heartbeat_freshness_at_boundary() {
|
|
// Worker with heartbeat exactly at the 90 second boundary
|
|
let worker = create_test_worker("test-worker", 90);
|
|
assert!(
|
|
!ExecutionScheduler::is_worker_heartbeat_fresh(&worker),
|
|
"Worker with 90s old heartbeat should be considered stale (at boundary)"
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_heartbeat_freshness_with_no_heartbeat() {
|
|
// Worker with no heartbeat recorded
|
|
let worker = create_test_worker("test-worker", 0);
|
|
assert!(
|
|
!ExecutionScheduler::is_worker_heartbeat_fresh(&worker),
|
|
"Worker with no heartbeat should be considered stale"
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_heartbeat_freshness_with_very_recent() {
|
|
// Worker with heartbeat 5 seconds ago
|
|
let worker = create_test_worker("test-worker", 5);
|
|
assert!(
|
|
ExecutionScheduler::is_worker_heartbeat_fresh(&worker),
|
|
"Worker with 5s old heartbeat should be considered fresh"
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_scheduler_creation() {
|
|
// This is a placeholder test
|
|
// Real tests will require database and message queue setup
|
|
assert!(true);
|
|
}
|
|
}
|