Files
attune/crates/executor/src/worker_health.rs
David Culbreth 13749409cd
Some checks failed
CI / Rust Blocking Checks (push) Failing after 22s
CI / Web Blocking Checks (push) Failing after 26s
CI / Security Blocking Checks (push) Successful in 9s
CI / Web Advisory Checks (push) Successful in 32s
CI / Security Advisory Checks (push) Has been cancelled
making linters happy
2026-03-04 23:44:45 -06:00

479 lines
15 KiB
Rust

//! Worker Health Probe
//!
//! This module provides proactive health checking for workers.
//! It tracks worker health metrics, detects degraded/unhealthy workers,
//! and provides health-aware worker selection.
//!
//! # Health States
//!
//! - **Healthy:** Worker is responsive and performing well
//! - **Degraded:** Worker is functional but showing signs of issues
//! - **Unhealthy:** Worker should not receive new executions
//!
//! # Health Metrics
//!
//! - Queue depth (from worker self-reporting)
//! - Consecutive failures
//! - Average execution time
//! - Heartbeat freshness
use attune_common::{
error::{Error, Result},
models::{Id, Worker, WorkerStatus},
repositories::{FindById, List, WorkerRepository},
};
use chrono::{DateTime, Duration, Utc};
use serde::{Deserialize, Serialize};
use sqlx::PgPool;
use std::sync::Arc;
use tracing::{debug, info, warn};
/// Worker health state.
///
/// Serialized in lowercase (e.g. `"healthy"`) via serde, matching the
/// strings workers self-report in their `capabilities.health.status` field.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum HealthStatus {
    /// Worker is responsive and performing well; eligible for new work.
    Healthy,
    /// Worker is functional but showing issues; still eligible for work.
    Degraded,
    /// Worker should not receive new tasks.
    Unhealthy,
}
impl HealthStatus {
    /// Returns the lowercase, `'static` name of this status.
    ///
    /// The strings match the serde `rename_all = "lowercase"` serialization,
    /// so serialized values and display output agree.
    pub fn as_str(&self) -> &'static str {
        match self {
            Self::Healthy => "healthy",
            Self::Degraded => "degraded",
            Self::Unhealthy => "unhealthy",
        }
    }
}
impl std::fmt::Display for HealthStatus {
    /// Formats the status as its lowercase name, delegating to
    /// [`HealthStatus::as_str`].
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}
/// Worker health metrics.
///
/// A point-in-time snapshot combining the computed [`HealthStatus`] with the
/// raw counters a worker self-reports in its `capabilities.health` object.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthMetrics {
    /// Current health status
    pub status: HealthStatus,
    /// Last health check time (worker-reported; falls back to the time of
    /// extraction when the worker did not supply one).
    pub last_check: DateTime<Utc>,
    /// Consecutive failures (worker-reported).
    pub consecutive_failures: u32,
    /// Total executions handled (worker-reported).
    pub total_executions: u64,
    /// Failed executions out of `total_executions` (worker-reported).
    pub failed_executions: u64,
    /// Average execution time in milliseconds (worker-reported).
    pub average_execution_time_ms: u64,
    /// Current queue depth (estimated; worker-reported).
    pub queue_depth: u32,
}
impl Default for HealthMetrics {
fn default() -> Self {
Self {
status: HealthStatus::Healthy,
last_check: Utc::now(),
consecutive_failures: 0,
total_executions: 0,
failed_executions: 0,
average_execution_time_ms: 0,
queue_depth: 0,
}
}
}
/// Health probe configuration.
///
/// Thresholds are checked worst-first by `evaluate_health`: a worker at or
/// above any `unhealthy` limit is `Unhealthy`; otherwise at or above any
/// `degraded` limit it is `Degraded`; otherwise `Healthy`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthProbeConfig {
    /// Enable health probing.
    // NOTE(review): this flag is not consulted anywhere in this module —
    // confirm whether a caller elsewhere honors it or probing is meant to
    // be unconditionally on.
    pub enabled: bool,
    /// Heartbeat staleness threshold in seconds; older heartbeats mark the
    /// worker unhealthy.
    pub heartbeat_max_age_secs: u64,
    /// Consecutive failures before marking degraded
    pub degraded_threshold: u32,
    /// Consecutive failures before marking unhealthy
    pub unhealthy_threshold: u32,
    /// Queue depth to consider degraded
    pub queue_depth_degraded: u32,
    /// Queue depth to consider unhealthy
    pub queue_depth_unhealthy: u32,
    /// Failure rate threshold for degraded (0.0 - 1.0)
    pub failure_rate_degraded: f64,
    /// Failure rate threshold for unhealthy (0.0 - 1.0)
    pub failure_rate_unhealthy: f64,
}
impl Default for HealthProbeConfig {
fn default() -> Self {
Self {
enabled: true,
heartbeat_max_age_secs: 30,
degraded_threshold: 3,
unhealthy_threshold: 10,
queue_depth_degraded: 50,
queue_depth_unhealthy: 100,
failure_rate_degraded: 0.3, // 30%
failure_rate_unhealthy: 0.7, // 70%
}
}
}
/// Worker health probe.
///
/// Evaluates worker health from heartbeat freshness and worker-self-reported
/// metrics, and provides health-aware worker listing and selection.
pub struct WorkerHealthProbe {
    /// Database connection pool used for worker repository queries.
    pool: Arc<PgPool>,
    /// Thresholds and limits controlling health classification.
    config: HealthProbeConfig,
}
impl WorkerHealthProbe {
    /// Create a new health probe with an explicit configuration.
    #[allow(dead_code)]
    pub fn new(pool: Arc<PgPool>, config: HealthProbeConfig) -> Self {
        Self { pool, config }
    }

    /// Create a health probe with [`HealthProbeConfig::default`] settings.
    #[allow(dead_code)]
    pub fn with_defaults(pool: Arc<PgPool>) -> Self {
        Self::new(pool, HealthProbeConfig::default())
    }

    /// Check health of a specific worker.
    ///
    /// # Errors
    ///
    /// Returns a not-found error when no worker with `worker_id` exists, or
    /// a repository error if the lookup itself fails.
    #[allow(dead_code)]
    pub async fn check_worker(&self, worker_id: Id) -> Result<HealthMetrics> {
        let worker = WorkerRepository::find_by_id(&*self.pool, worker_id)
            .await?
            .ok_or_else(|| Error::not_found("Worker", "id", worker_id.to_string()))?;
        self.evaluate_health(&worker)
    }

    /// Get all workers currently usable for new work (healthy or degraded,
    /// per [`Self::is_worker_healthy`]).
    #[allow(dead_code)]
    pub async fn get_healthy_workers(&self) -> Result<Vec<Worker>> {
        let workers = WorkerRepository::list(&*self.pool).await?;
        let mut healthy = Vec::new();
        for worker in workers {
            if self.is_worker_healthy(&worker).await {
                healthy.push(worker);
            }
        }
        Ok(healthy)
    }

    /// Get workers sorted by health (healthiest first).
    ///
    /// Ordering is by status (healthy, then degraded, then unhealthy) with
    /// ascending queue depth breaking ties within a status. Workers whose
    /// health cannot be evaluated are logged and skipped.
    #[allow(dead_code)]
    pub async fn get_workers_by_health(&self) -> Result<Vec<(Worker, HealthMetrics)>> {
        let workers = WorkerRepository::list(&*self.pool).await?;
        let mut worker_health = Vec::new();
        for worker in workers {
            match self.evaluate_health(&worker) {
                Ok(metrics) => worker_health.push((worker, metrics)),
                Err(e) => warn!("Failed to evaluate health for worker {}: {}", worker.id, e),
            }
        }
        // Key-based sort: rank the status, then break ties on queue depth.
        // Equivalent to an exhaustive 3x3 status match, with no cases to
        // keep in sync if a new status variant is ever added.
        worker_health.sort_by_key(|(_, metrics)| {
            let status_rank: u8 = match metrics.status {
                HealthStatus::Healthy => 0,
                HealthStatus::Degraded => 1,
                HealthStatus::Unhealthy => 2,
            };
            (status_rank, metrics.queue_depth)
        });
        Ok(worker_health)
    }

    /// Check if a worker is usable (simple boolean check).
    ///
    /// A worker qualifies when its status is `Active`, its heartbeat is
    /// fresh, and its evaluated health is healthy or degraded.
    #[allow(dead_code)]
    pub async fn is_worker_healthy(&self, worker: &Worker) -> bool {
        // Only active workers can take work at all.
        if worker.status != Some(WorkerStatus::Active) {
            return false;
        }
        // Cheap heartbeat gate before the full evaluation.
        if !self.is_heartbeat_fresh(worker) {
            return false;
        }
        // Degraded workers still accept work; only Unhealthy is excluded.
        match self.evaluate_health(worker) {
            Ok(metrics) => matches!(
                metrics.status,
                HealthStatus::Healthy | HealthStatus::Degraded
            ),
            Err(_) => false,
        }
    }

    /// Evaluate worker health from heartbeat freshness and self-reported
    /// metrics.
    ///
    /// A stale heartbeat short-circuits to `Unhealthy`. Otherwise the
    /// consecutive-failure count, queue depth, and lifetime failure rate
    /// are compared against the configured thresholds, worst match winning.
    ///
    /// NOTE(review): `self.config.enabled` is never consulted here or by
    /// any caller in this module — confirm whether probing is meant to be
    /// skippable via configuration.
    fn evaluate_health(&self, worker: &Worker) -> Result<HealthMetrics> {
        // Extract health metrics from capabilities
        let metrics = self.extract_health_metrics(worker);
        // A worker that stopped heartbeating is unhealthy regardless of
        // its last self-reported numbers.
        if !self.is_heartbeat_fresh(worker) {
            return Ok(HealthMetrics {
                status: HealthStatus::Unhealthy,
                ..metrics
            });
        }
        // Lifetime failure rate; zero for a worker with no executions yet
        // (avoids 0/0).
        let failure_rate = if metrics.total_executions > 0 {
            metrics.failed_executions as f64 / metrics.total_executions as f64
        } else {
            0.0
        };
        // Determine health status based on thresholds, checked worst-first
        // so a worker over an unhealthy limit is never merely degraded.
        let status = if metrics.consecutive_failures >= self.config.unhealthy_threshold
            || metrics.queue_depth >= self.config.queue_depth_unhealthy
            || failure_rate >= self.config.failure_rate_unhealthy
        {
            HealthStatus::Unhealthy
        } else if metrics.consecutive_failures >= self.config.degraded_threshold
            || metrics.queue_depth >= self.config.queue_depth_degraded
            || failure_rate >= self.config.failure_rate_degraded
        {
            HealthStatus::Degraded
        } else {
            HealthStatus::Healthy
        };
        debug!(
            "Worker {} health: {:?} (failures: {}, queue: {}, failure_rate: {:.2}%)",
            worker.name,
            status,
            metrics.consecutive_failures,
            metrics.queue_depth,
            failure_rate * 100.0
        );
        Ok(HealthMetrics { status, ..metrics })
    }

    /// Check if the worker's heartbeat is within the configured maximum age.
    ///
    /// Missing or stale heartbeats are logged and treated as not fresh.
    fn is_heartbeat_fresh(&self, worker: &Worker) -> bool {
        let Some(last_heartbeat) = worker.last_heartbeat else {
            warn!("Worker {} has no heartbeat", worker.name);
            return false;
        };
        let age = Utc::now() - last_heartbeat;
        let max_age = Duration::seconds(self.config.heartbeat_max_age_secs as i64);
        if age > max_age {
            warn!(
                "Worker {} heartbeat stale: {} seconds old (max: {})",
                worker.name,
                age.num_seconds(),
                max_age.num_seconds()
            );
            return false;
        }
        true
    }

    /// Extract health metrics from worker capabilities (thin wrapper over
    /// the free function of the same name).
    fn extract_health_metrics(&self, worker: &Worker) -> HealthMetrics {
        extract_health_metrics(worker)
    }
}
/// Extract health metrics from worker capabilities.
///
/// Reads the optional `capabilities.health` JSON object that workers
/// self-report. Missing or malformed fields keep the
/// [`HealthMetrics::default`] values, with `last_check` initialized to now.
///
/// Extracted as a free function so it can be tested without a database pool.
fn extract_health_metrics(worker: &Worker) -> HealthMetrics {
    let mut metrics = HealthMetrics {
        last_check: Utc::now(),
        ..Default::default()
    };
    let Some(capabilities) = &worker.capabilities else {
        return metrics;
    };
    let Some(health_obj) = capabilities.get("health") else {
        return metrics;
    };
    // Extract metrics from health object. Unknown status strings default to
    // healthy, so a newer worker reporting extra states is not penalized.
    if let Some(status_str) = health_obj.get("status").and_then(|v| v.as_str()) {
        metrics.status = match status_str {
            "healthy" => HealthStatus::Healthy,
            "degraded" => HealthStatus::Degraded,
            "unhealthy" => HealthStatus::Unhealthy,
            _ => HealthStatus::Healthy,
        };
    }
    // Non-RFC3339 timestamps are ignored, leaving the "now" default.
    if let Some(last_check_str) = health_obj.get("last_check").and_then(|v| v.as_str()) {
        if let Ok(last_check) = DateTime::parse_from_rfc3339(last_check_str) {
            metrics.last_check = last_check.with_timezone(&Utc);
        }
    }
    // Saturate instead of `as`-casting: a u64 above u32::MAX would silently
    // wrap and could make a badly failing worker look healthy.
    if let Some(failures) = health_obj
        .get("consecutive_failures")
        .and_then(|v| v.as_u64())
    {
        metrics.consecutive_failures = u32::try_from(failures).unwrap_or(u32::MAX);
    }
    if let Some(total) = health_obj.get("total_executions").and_then(|v| v.as_u64()) {
        metrics.total_executions = total;
    }
    if let Some(failed) = health_obj.get("failed_executions").and_then(|v| v.as_u64()) {
        metrics.failed_executions = failed;
    }
    if let Some(avg_time) = health_obj
        .get("average_execution_time_ms")
        .and_then(|v| v.as_u64())
    {
        metrics.average_execution_time_ms = avg_time;
    }
    // Same saturating treatment for queue depth.
    if let Some(depth) = health_obj.get("queue_depth").and_then(|v| v.as_u64()) {
        metrics.queue_depth = u32::try_from(depth).unwrap_or(u32::MAX);
    }
    metrics
}
impl WorkerHealthProbe {
    /// Recommend a worker for execution based on health.
    ///
    /// Scans workers in health order (healthiest first) and returns the
    /// first non-unhealthy worker that supports `runtime_name`, or `None`
    /// when no such worker exists.
    #[allow(dead_code)]
    pub async fn get_best_worker(&self, runtime_name: &str) -> Result<Option<Worker>> {
        let ranked = self.get_workers_by_health().await?;
        // First candidate that is not unhealthy and supports the runtime.
        let candidate = ranked.into_iter().find(|(worker, metrics)| {
            metrics.status != HealthStatus::Unhealthy
                && self.worker_supports_runtime(worker, runtime_name)
        });
        match candidate {
            Some((worker, metrics)) => {
                info!(
                    "Selected worker {} (health: {:?}, queue: {}) for runtime '{}'",
                    worker.name, metrics.status, metrics.queue_depth, runtime_name
                );
                Ok(Some(worker))
            }
            None => {
                warn!("No healthy worker found for runtime '{}'", runtime_name);
                Ok(None)
            }
        }
    }

    /// Check whether the worker advertises `runtime_name` (case-insensitive)
    /// in its `capabilities.runtimes` array.
    fn worker_supports_runtime(&self, worker: &Worker, runtime_name: &str) -> bool {
        worker
            .capabilities
            .as_ref()
            .and_then(|caps| caps.get("runtimes"))
            .and_then(|runtimes| runtimes.as_array())
            .is_some_and(|runtimes| {
                runtimes
                    .iter()
                    .filter_map(|v| v.as_str())
                    .any(|s| s.eq_ignore_ascii_case(runtime_name))
            })
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// `Display` output matches the lowercase serde names for every variant.
    #[test]
    fn test_health_status_display() {
        let cases = [
            (HealthStatus::Healthy, "healthy"),
            (HealthStatus::Degraded, "degraded"),
            (HealthStatus::Unhealthy, "unhealthy"),
        ];
        for (status, expected) in cases {
            assert_eq!(status.to_string(), expected);
        }
    }

    /// Default metrics start healthy with zeroed counters.
    #[test]
    fn test_default_health_metrics() {
        let HealthMetrics {
            status,
            consecutive_failures,
            queue_depth,
            ..
        } = HealthMetrics::default();
        assert_eq!(status, HealthStatus::Healthy);
        assert_eq!(consecutive_failures, 0);
        assert_eq!(queue_depth, 0);
    }

    /// Default configuration carries the documented production thresholds.
    #[test]
    fn test_health_probe_config_defaults() {
        let cfg = HealthProbeConfig::default();
        assert!(cfg.enabled);
        assert_eq!(cfg.heartbeat_max_age_secs, 30);
        assert_eq!(cfg.degraded_threshold, 3);
        assert_eq!(cfg.unhealthy_threshold, 10);
        assert_eq!(cfg.queue_depth_degraded, 50);
        assert_eq!(cfg.queue_depth_unhealthy, 100);
    }

    /// Metrics are pulled out of the `capabilities.health` JSON object.
    #[test]
    fn test_extract_health_metrics() {
        let now = Utc::now();
        let worker = Worker {
            id: 1,
            name: "test-worker".to_string(),
            worker_type: attune_common::models::WorkerType::Container,
            worker_role: attune_common::models::WorkerRole::Action,
            runtime: None,
            host: None,
            port: None,
            status: Some(WorkerStatus::Active),
            capabilities: Some(json!({
                "health": {
                    "status": "degraded",
                    "consecutive_failures": 5,
                    "queue_depth": 25,
                    "total_executions": 100,
                    "failed_executions": 10
                }
            })),
            meta: None,
            last_heartbeat: Some(now),
            created: now,
            updated: now,
        };
        let extracted = extract_health_metrics(&worker);
        assert_eq!(extracted.status, HealthStatus::Degraded);
        assert_eq!(extracted.consecutive_failures, 5);
        assert_eq!(extracted.queue_depth, 25);
        assert_eq!(extracted.total_executions, 100);
        assert_eq!(extracted.failed_executions, 10);
    }
}