more internal polish, resilient workers

This commit is contained in:
2026-02-09 18:32:34 -06:00
parent 588b319fec
commit e31ecb781b
62 changed files with 9872 additions and 584 deletions

View File

@@ -20,6 +20,7 @@ use tokio::task::JoinHandle;
use tracing::{error, info, warn};
use crate::completion_listener::CompletionListener;
use crate::dead_letter_handler::{create_dlq_consumer_config, DeadLetterHandler};
use crate::enforcement_processor::EnforcementProcessor;
use crate::event_processor::EventProcessor;
use crate::execution_manager::ExecutionManager;
@@ -27,6 +28,7 @@ use crate::inquiry_handler::InquiryHandler;
use crate::policy_enforcer::PolicyEnforcer;
use crate::queue_manager::{ExecutionQueueManager, QueueConfig};
use crate::scheduler::ExecutionScheduler;
use crate::timeout_monitor::{ExecutionTimeoutMonitor, TimeoutMonitorConfig};
/// Main executor service that orchestrates execution processing
#[derive(Clone)]
@@ -355,6 +357,75 @@ impl ExecutorService {
Ok(())
}));
// Start worker heartbeat monitor
info!("Starting worker heartbeat monitor...");
let worker_pool = self.inner.pool.clone();
handles.push(tokio::spawn(async move {
Self::worker_heartbeat_monitor_loop(worker_pool, 60).await;
Ok(())
}));
// Start execution timeout monitor
info!("Starting execution timeout monitor...");
let timeout_config = TimeoutMonitorConfig {
scheduled_timeout: std::time::Duration::from_secs(
self.inner
.config
.executor
.as_ref()
.and_then(|e| e.scheduled_timeout)
.unwrap_or(300), // Default: 5 minutes
),
check_interval: std::time::Duration::from_secs(
self.inner
.config
.executor
.as_ref()
.and_then(|e| e.timeout_check_interval)
.unwrap_or(60), // Default: 1 minute
),
enabled: self
.inner
.config
.executor
.as_ref()
.and_then(|e| e.enable_timeout_monitor)
.unwrap_or(true), // Default: enabled
};
let timeout_monitor = Arc::new(ExecutionTimeoutMonitor::new(
self.inner.pool.clone(),
self.inner.publisher.clone(),
timeout_config,
));
handles.push(tokio::spawn(async move { timeout_monitor.start().await }));
// Start dead letter handler (if DLQ is enabled)
if self.inner.mq_config.rabbitmq.dead_letter.enabled {
info!("Starting dead letter handler...");
let dlq_name = format!(
"{}.queue",
self.inner.mq_config.rabbitmq.dead_letter.exchange
);
let dlq_consumer = Consumer::new(
&self.inner.mq_connection,
create_dlq_consumer_config(&dlq_name, "executor.dlq"),
)
.await?;
let dlq_handler = Arc::new(
DeadLetterHandler::new(Arc::new(self.inner.pool.clone()), dlq_consumer)
.await
.map_err(|e| anyhow::anyhow!("Failed to create DLQ handler: {}", e))?,
);
handles.push(tokio::spawn(async move {
dlq_handler
.start()
.await
.map_err(|e| anyhow::anyhow!("DLQ handler error: {}", e))
}));
} else {
info!("Dead letter queue is disabled, skipping DLQ handler");
}
info!("Executor Service started successfully");
info!("All processors are listening for messages...");
@@ -393,6 +464,113 @@ impl ExecutorService {
Ok(())
}
/// Worker heartbeat monitor loop
///
/// Periodically checks for stale workers and marks them as inactive
async fn worker_heartbeat_monitor_loop(pool: PgPool, interval_secs: u64) {
use attune_common::models::enums::WorkerStatus;
use attune_common::repositories::{
runtime::{UpdateWorkerInput, WorkerRepository},
Update,
};
use chrono::Utc;
use std::time::Duration;
let check_interval = Duration::from_secs(interval_secs);
// Heartbeat staleness threshold: 3x the expected interval (90 seconds)
// NOTE: These constants MUST match DEFAULT_HEARTBEAT_INTERVAL and
// HEARTBEAT_STALENESS_MULTIPLIER in scheduler.rs to ensure consistency
const HEARTBEAT_INTERVAL: u64 = 30;
const STALENESS_MULTIPLIER: u64 = 3;
let max_age_secs = HEARTBEAT_INTERVAL * STALENESS_MULTIPLIER;
info!(
"Worker heartbeat monitor started (check interval: {}s, staleness threshold: {}s)",
interval_secs, max_age_secs
);
loop {
tokio::time::sleep(check_interval).await;
// Get all active workers
match WorkerRepository::find_by_status(&pool, WorkerStatus::Active).await {
Ok(workers) => {
let now = Utc::now();
let mut deactivated_count = 0;
for worker in workers {
// Check if worker has a heartbeat
let Some(last_heartbeat) = worker.last_heartbeat else {
warn!(
"Worker {} (ID: {}) has no heartbeat, marking as inactive",
worker.name, worker.id
);
if let Err(e) = WorkerRepository::update(
&pool,
worker.id,
UpdateWorkerInput {
status: Some(WorkerStatus::Inactive),
..Default::default()
},
)
.await
{
error!(
"Failed to deactivate worker {} (no heartbeat): {}",
worker.name, e
);
} else {
deactivated_count += 1;
}
continue;
};
// Check if heartbeat is stale
let age = now.signed_duration_since(last_heartbeat);
let age_secs = age.num_seconds();
if age_secs > max_age_secs as i64 {
warn!(
"Worker {} (ID: {}) heartbeat is stale ({}s old), marking as inactive",
worker.name, worker.id, age_secs
);
if let Err(e) = WorkerRepository::update(
&pool,
worker.id,
UpdateWorkerInput {
status: Some(WorkerStatus::Inactive),
..Default::default()
},
)
.await
{
error!(
"Failed to deactivate worker {} (stale heartbeat): {}",
worker.name, e
);
} else {
deactivated_count += 1;
}
}
}
if deactivated_count > 0 {
info!(
"Deactivated {} worker(s) with stale heartbeats",
deactivated_count
);
}
}
Err(e) => {
error!("Failed to query active workers for heartbeat check: {}", e);
}
}
}
}
/// Wait for all tasks to complete
async fn wait_for_tasks(handles: Vec<JoinHandle<Result<()>>>) -> Result<()> {
for handle in handles {