// attune/crates/executor/src/service.rs

//! Executor Service - Core orchestration and execution management
//!
//! The ExecutorService is the central component that:
//! - Processes enforcement messages from triggered rules
//! - Schedules executions to workers
//! - Manages execution lifecycle and state transitions
//! - Enforces execution policies (rate limiting, concurrency)
//! - Orchestrates workflows (parent-child executions)
//! - Handles human-in-the-loop inquiries
use anyhow::Result;
use attune_common::{
config::Config,
db::Database,
mq::{Connection, Consumer, MessageQueueConfig, Publisher},
};
use sqlx::PgPool;
use std::sync::Arc;
use tokio::task::JoinHandle;
use tracing::{error, info, warn};
use crate::completion_listener::CompletionListener;
use crate::dead_letter_handler::{create_dlq_consumer_config, DeadLetterHandler};
use crate::enforcement_processor::EnforcementProcessor;
use crate::event_processor::EventProcessor;
use crate::execution_manager::ExecutionManager;
use crate::inquiry_handler::InquiryHandler;
use crate::policy_enforcer::PolicyEnforcer;
use crate::queue_manager::{ExecutionQueueManager, QueueConfig};
use crate::scheduler::ExecutionScheduler;
use crate::timeout_monitor::{ExecutionTimeoutMonitor, TimeoutMonitorConfig};
/// Main executor service that orchestrates execution processing
///
/// A cheaply cloneable handle: all state lives behind a shared
/// `Arc<ExecutorServiceInner>`, so every clone refers to the same
/// running service (same pool, MQ connection, and shutdown channel).
#[derive(Clone)]
pub struct ExecutorService {
    /// Shared internal state
    inner: Arc<ExecutorServiceInner>,
}
/// Internal state for the executor service
///
/// Held behind an `Arc` by [`ExecutorService`] so the public handle is
/// cheap to clone; individual fields are cloned out to spawned tasks.
struct ExecutorServiceInner {
    /// Database connection pool
    pool: PgPool,
    /// Configuration
    config: Arc<Config>,
    /// Message queue connection
    mq_connection: Arc<Connection>,
    /// Publisher for sending messages
    publisher: Arc<Publisher>,
    /// Queue name for consumers (kept for backward compatibility; not
    /// read anywhere at the moment, hence the dead_code allowance)
    #[allow(dead_code)]
    queue_name: String,
    /// Message queue configuration
    mq_config: Arc<MessageQueueConfig>,
    /// Policy enforcer for execution policies
    policy_enforcer: Arc<PolicyEnforcer>,
    /// Queue manager for FIFO execution ordering
    queue_manager: Arc<ExecutionQueueManager>,
    /// Service shutdown signal
    shutdown_tx: tokio::sync::broadcast::Sender<()>,
}
impl ExecutorService {
/// Create a new executor service
///
/// Bootstraps everything the processors share: connects to the database
/// and the message queue, provisions the MQ topology (common exchanges
/// plus executor-specific queues and bindings), and constructs the
/// publisher, queue manager, and policy enforcer.
///
/// # Errors
/// Fails if the database or MQ connection cannot be established, if
/// `config.message_queue` is absent, or if the publisher cannot be
/// created. MQ-topology setup failures are only logged as warnings,
/// since they are expected when the topology was already declared.
pub async fn new(config: Config) -> Result<Self> {
    info!("Initializing Executor Service");

    // Initialize database
    let db = Database::new(&config.database).await?;
    let pool = db.pool().clone();
    info!("Database connection established");

    // Message queue configuration is mandatory for the executor.
    let mq_url = config
        .message_queue
        .as_ref()
        .map(|mq| mq.url.as_str())
        .ok_or_else(|| anyhow::anyhow!("Message queue configuration is required"))?;

    // Initialize message queue connection
    let mq_connection = Connection::connect(mq_url).await?;
    info!("Message queue connection established");

    // Setup common message queue infrastructure (exchanges and DLX).
    // Best-effort: failures are logged but not fatal, since another
    // service instance may have declared the topology already.
    let mq_config = MessageQueueConfig::default();
    match mq_connection.setup_common_infrastructure(&mq_config).await {
        Ok(_) => info!("Common message queue infrastructure setup completed"),
        Err(e) => {
            warn!(
                "Failed to setup common MQ infrastructure (may already exist): {}",
                e
            );
        }
    }

    // Setup executor-specific queues and bindings (same best-effort policy)
    match mq_connection
        .setup_executor_infrastructure(&mq_config)
        .await
    {
        Ok(_) => info!("Executor message queue infrastructure setup completed"),
        Err(e) => {
            warn!(
                "Failed to setup executor MQ infrastructure (may already exist): {}",
                e
            );
        }
    }

    // Get queue names from MqConfig
    let enforcements_queue = mq_config.rabbitmq.queues.enforcements.name.clone();
    let execution_requests_queue = mq_config.rabbitmq.queues.execution_requests.name.clone();
    let execution_status_queue = mq_config.rabbitmq.queues.execution_status.name.clone();
    let exchange_name = mq_config.rabbitmq.exchanges.executions.name.clone();

    // Initialize message queue publisher (publish confirms enabled)
    let publisher = Publisher::new(
        &mq_connection,
        attune_common::mq::PublisherConfig {
            confirm_publish: true,
            timeout_secs: 30,
            exchange: exchange_name,
        },
    )
    .await?;
    info!("Message queue publisher initialized");
    info!(
        "Queue names - Enforcements: {}, Execution Requests: {}, Execution Status: {}",
        enforcements_queue, execution_requests_queue, execution_status_queue
    );

    // Create shutdown channel (capacity 1: a single shutdown event suffices)
    let (shutdown_tx, _) = tokio::sync::broadcast::channel(1);

    // Initialize queue manager with default configuration and database pool
    let queue_config = QueueConfig::default();
    let queue_manager = Arc::new(ExecutionQueueManager::with_db_pool(
        queue_config,
        pool.clone(),
    ));
    info!("Queue manager initialized with database persistence");

    // Initialize policy enforcer with queue manager
    let policy_enforcer = Arc::new(PolicyEnforcer::with_queue_manager(
        pool.clone(),
        queue_manager.clone(),
    ));
    info!("Policy enforcer initialized with queue manager");

    let inner = ExecutorServiceInner {
        pool,
        config: Arc::new(config),
        mq_connection: Arc::new(mq_connection),
        publisher: Arc::new(publisher),
        queue_name: execution_requests_queue.clone(), // Keep for backward compatibility
        policy_enforcer,
        queue_manager,
        shutdown_tx,
        mq_config: Arc::new(mq_config),
    };
    Ok(Self {
        inner: Arc::new(inner),
    })
}
/// Start the executor service
pub async fn start(&self) -> Result<()> {
info!("Starting Executor Service");
// Spawn message consumers
let mut handles: Vec<JoinHandle<Result<()>>> = Vec::new();
// Start event processor with its own consumer
info!("Starting event processor...");
let events_queue = self.inner.mq_config.rabbitmq.queues.events.name.clone();
let event_consumer = Consumer::new(
&self.inner.mq_connection,
attune_common::mq::ConsumerConfig {
queue: events_queue,
tag: "executor.event".to_string(),
prefetch_count: 10,
auto_ack: false,
exclusive: false,
},
)
.await?;
let event_processor = EventProcessor::new(
self.inner.pool.clone(),
self.inner.publisher.clone(),
Arc::new(event_consumer),
);
handles.push(tokio::spawn(async move { event_processor.start().await }));
// Start completion listener with its own consumer
info!("Starting completion listener...");
let execution_completed_queue = self
.inner
.mq_config
.rabbitmq
.queues
.execution_completed
.name
.clone();
let completion_consumer = Consumer::new(
&self.inner.mq_connection,
attune_common::mq::ConsumerConfig {
queue: execution_completed_queue,
tag: "executor.completion".to_string(),
prefetch_count: 10,
auto_ack: false,
exclusive: false,
},
)
.await?;
let completion_listener = CompletionListener::new(
self.inner.pool.clone(),
Arc::new(completion_consumer),
self.inner.publisher.clone(),
self.inner.queue_manager.clone(),
);
handles.push(tokio::spawn(
async move { completion_listener.start().await },
));
// Start enforcement processor with its own consumer
info!("Starting enforcement processor...");
let enforcements_queue = self
.inner
.mq_config
.rabbitmq
.queues
.enforcements
.name
.clone();
let enforcement_consumer = Consumer::new(
&self.inner.mq_connection,
attune_common::mq::ConsumerConfig {
queue: enforcements_queue,
tag: "executor.enforcement".to_string(),
prefetch_count: 10,
auto_ack: false,
exclusive: false,
},
)
.await?;
let enforcement_processor = EnforcementProcessor::new(
self.inner.pool.clone(),
self.inner.publisher.clone(),
Arc::new(enforcement_consumer),
self.inner.policy_enforcer.clone(),
self.inner.queue_manager.clone(),
);
handles.push(tokio::spawn(
async move { enforcement_processor.start().await },
));
// Start execution scheduler with its own consumer
info!("Starting execution scheduler...");
let execution_requests_queue = self
.inner
.mq_config
.rabbitmq
.queues
.execution_requests
.name
.clone();
let scheduler_consumer = Consumer::new(
&self.inner.mq_connection,
attune_common::mq::ConsumerConfig {
queue: execution_requests_queue,
tag: "executor.scheduler".to_string(),
prefetch_count: 10,
auto_ack: false,
exclusive: false,
},
)
.await?;
let scheduler = ExecutionScheduler::new(
self.inner.pool.clone(),
self.inner.publisher.clone(),
Arc::new(scheduler_consumer),
);
handles.push(tokio::spawn(async move { scheduler.start().await }));
// Start execution manager with its own consumer
info!("Starting execution manager...");
let execution_status_queue = self
.inner
.mq_config
.rabbitmq
.queues
.execution_status
.name
.clone();
let manager_consumer = Consumer::new(
&self.inner.mq_connection,
attune_common::mq::ConsumerConfig {
queue: execution_status_queue,
tag: "executor.manager".to_string(),
prefetch_count: 10,
auto_ack: false,
exclusive: false,
},
)
.await?;
let execution_manager = ExecutionManager::new(
self.inner.pool.clone(),
self.inner.publisher.clone(),
Arc::new(manager_consumer),
);
handles.push(tokio::spawn(async move { execution_manager.start().await }));
// Start inquiry handler with its own consumer
info!("Starting inquiry handler...");
let inquiry_response_queue = self
.inner
.mq_config
.rabbitmq
.queues
.inquiry_responses
.name
.clone();
let inquiry_consumer = Consumer::new(
&self.inner.mq_connection,
attune_common::mq::ConsumerConfig {
queue: inquiry_response_queue,
tag: "executor.inquiry".to_string(),
prefetch_count: 10,
auto_ack: false,
exclusive: false,
},
)
.await?;
let inquiry_handler = InquiryHandler::new(
self.inner.pool.clone(),
self.inner.publisher.clone(),
Arc::new(inquiry_consumer),
);
handles.push(tokio::spawn(async move { inquiry_handler.start().await }));
// Start inquiry timeout checker
info!("Starting inquiry timeout checker...");
let timeout_pool = self.inner.pool.clone();
handles.push(tokio::spawn(async move {
InquiryHandler::timeout_check_loop(timeout_pool, 60).await;
Ok(())
}));
// Start worker heartbeat monitor
info!("Starting worker heartbeat monitor...");
let worker_pool = self.inner.pool.clone();
handles.push(tokio::spawn(async move {
Self::worker_heartbeat_monitor_loop(worker_pool, 60).await;
Ok(())
}));
// Start execution timeout monitor
info!("Starting execution timeout monitor...");
let timeout_config = TimeoutMonitorConfig {
scheduled_timeout: std::time::Duration::from_secs(
self.inner
.config
.executor
.as_ref()
.and_then(|e| e.scheduled_timeout)
.unwrap_or(300), // Default: 5 minutes
),
check_interval: std::time::Duration::from_secs(
self.inner
.config
.executor
.as_ref()
.and_then(|e| e.timeout_check_interval)
.unwrap_or(60), // Default: 1 minute
),
enabled: self
.inner
.config
.executor
.as_ref()
.and_then(|e| e.enable_timeout_monitor)
.unwrap_or(true), // Default: enabled
};
let timeout_monitor = Arc::new(ExecutionTimeoutMonitor::new(
self.inner.pool.clone(),
self.inner.publisher.clone(),
timeout_config,
));
handles.push(tokio::spawn(async move { timeout_monitor.start().await }));
// Start dead letter handler (if DLQ is enabled)
if self.inner.mq_config.rabbitmq.dead_letter.enabled {
info!("Starting dead letter handler...");
let dlq_name = format!(
"{}.queue",
self.inner.mq_config.rabbitmq.dead_letter.exchange
);
let dlq_consumer = Consumer::new(
&self.inner.mq_connection,
create_dlq_consumer_config(&dlq_name, "executor.dlq"),
)
.await?;
let dlq_handler = Arc::new(
DeadLetterHandler::new(Arc::new(self.inner.pool.clone()), dlq_consumer)
.await
.map_err(|e| anyhow::anyhow!("Failed to create DLQ handler: {}", e))?,
);
handles.push(tokio::spawn(async move {
dlq_handler
.start()
.await
.map_err(|e| anyhow::anyhow!("DLQ handler error: {}", e))
}));
} else {
info!("Dead letter queue is disabled, skipping DLQ handler");
}
info!("Executor Service started successfully");
info!("All processors are listening for messages...");
// Wait for shutdown signal
let mut shutdown_rx = self.inner.shutdown_tx.subscribe();
tokio::select! {
_ = shutdown_rx.recv() => {
info!("Shutdown signal received");
}
result = Self::wait_for_tasks(handles) => {
match result {
Ok(_) => info!("All tasks completed"),
Err(e) => error!("Task error: {}", e),
}
}
}
Ok(())
}
/// Stop the executor service
///
/// Broadcasts the shutdown signal, then tears down the message-queue
/// connection (which closes the publisher and all consumers) and the
/// database pool.
pub async fn stop(&self) -> Result<()> {
    info!("Stopping Executor Service");
    let inner = &self.inner;
    // A send error only means there are no live receivers — ignore it.
    inner.shutdown_tx.send(()).ok();
    // Close message queue connection (will close publisher and consumer)
    inner.mq_connection.close().await?;
    // Close database connections
    inner.pool.close().await;
    info!("Executor Service stopped");
    Ok(())
}
/// Worker heartbeat monitor loop
///
/// Periodically checks for stale workers and marks them as inactive
async fn worker_heartbeat_monitor_loop(pool: PgPool, interval_secs: u64) {
use attune_common::models::enums::WorkerStatus;
use attune_common::repositories::{
runtime::{UpdateWorkerInput, WorkerRepository},
Update,
};
use chrono::Utc;
use std::time::Duration;
let check_interval = Duration::from_secs(interval_secs);
// Heartbeat staleness threshold: 3x the expected interval (90 seconds)
// NOTE: These constants MUST match DEFAULT_HEARTBEAT_INTERVAL and
// HEARTBEAT_STALENESS_MULTIPLIER in scheduler.rs to ensure consistency
const HEARTBEAT_INTERVAL: u64 = 30;
const STALENESS_MULTIPLIER: u64 = 3;
let max_age_secs = HEARTBEAT_INTERVAL * STALENESS_MULTIPLIER;
info!(
"Worker heartbeat monitor started (check interval: {}s, staleness threshold: {}s)",
interval_secs, max_age_secs
);
loop {
tokio::time::sleep(check_interval).await;
// Get all active workers
match WorkerRepository::find_by_status(&pool, WorkerStatus::Active).await {
Ok(workers) => {
let now = Utc::now();
let mut deactivated_count = 0;
for worker in workers {
// Check if worker has a heartbeat
let Some(last_heartbeat) = worker.last_heartbeat else {
warn!(
"Worker {} (ID: {}) has no heartbeat, marking as inactive",
worker.name, worker.id
);
if let Err(e) = WorkerRepository::update(
&pool,
worker.id,
UpdateWorkerInput {
status: Some(WorkerStatus::Inactive),
..Default::default()
},
)
.await
{
error!(
"Failed to deactivate worker {} (no heartbeat): {}",
worker.name, e
);
} else {
deactivated_count += 1;
}
continue;
};
// Check if heartbeat is stale
let age = now.signed_duration_since(last_heartbeat);
let age_secs = age.num_seconds();
if age_secs > max_age_secs as i64 {
warn!(
"Worker {} (ID: {}) heartbeat is stale ({}s old), marking as inactive",
worker.name, worker.id, age_secs
);
if let Err(e) = WorkerRepository::update(
&pool,
worker.id,
UpdateWorkerInput {
status: Some(WorkerStatus::Inactive),
..Default::default()
},
)
.await
{
error!(
"Failed to deactivate worker {} (stale heartbeat): {}",
worker.name, e
);
} else {
deactivated_count += 1;
}
}
}
if deactivated_count > 0 {
info!(
"Deactivated {} worker(s) with stale heartbeats",
deactivated_count
);
}
}
Err(e) => {
error!("Failed to query active workers for heartbeat check: {}", e);
}
}
}
}
/// Wait for all tasks to complete
async fn wait_for_tasks(handles: Vec<JoinHandle<Result<()>>>) -> Result<()> {
for handle in handles {
if let Err(e) = handle.await {
error!("Task panicked: {}", e);
}
}
Ok(())
}
/// Get database pool reference
// Currently unused within this crate; kept as a public accessor.
#[allow(dead_code)]
pub fn pool(&self) -> &PgPool {
    &self.inner.pool
}

/// Get config reference
// Currently unused within this crate; kept as a public accessor.
#[allow(dead_code)]
pub fn config(&self) -> &Config {
    &self.inner.config
}

/// Get publisher reference
// Currently unused within this crate; kept as a public accessor.
#[allow(dead_code)]
pub fn publisher(&self) -> &Publisher {
    &self.inner.publisher
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: the service can be constructed from the loaded config.
    /// Ignored by default because it needs live backing services.
    #[tokio::test]
    #[ignore] // Requires database and RabbitMQ
    async fn test_service_creation() {
        let config = Config::load().expect("Failed to load config");
        let result = ExecutorService::new(config).await;
        assert!(result.is_ok());
    }
}