more internal polish, resilient workers

This commit is contained in:
2026-02-09 18:32:34 -06:00
parent 588b319fec
commit e31ecb781b
62 changed files with 9872 additions and 584 deletions

View File

@@ -0,0 +1,495 @@
//! Retry Manager
//!
//! This module provides intelligent retry logic for failed executions.
//! It determines whether failures are retriable, manages retry attempts,
//! and implements exponential backoff for retry scheduling.
//!
//! # Retry Strategy
//!
//! - **Retriable Failures:** Worker unavailability, timeouts, transient errors
//! - **Non-Retriable Failures:** Validation errors, missing actions, permission errors
//! - **Backoff:** Exponential with jitter (1s, 2s, 4s, 8s, ...)
//! - **Max Retries:** Configurable per action (default: 0, no retries)
use attune_common::{
error::{Error, Result},
models::{Execution, ExecutionStatus, Id},
repositories::{
execution::{CreateExecutionInput, UpdateExecutionInput},
Create, ExecutionRepository, FindById, Update,
},
};
use chrono::Utc;
use serde::{Deserialize, Serialize};
use serde_json::json;
use sqlx::PgPool;
use std::time::Duration;
use tracing::{debug, info};
/// Retry manager for execution failures.
///
/// Holds the database pool used to read and create executions, plus the
/// [`RetryConfig`] governing whether retries run and how backoff is computed.
pub struct RetryManager {
    /// Database connection pool used for all repository calls
    pool: PgPool,
    /// Retry/backoff configuration
    config: RetryConfig,
}
/// Retry configuration.
///
/// Serde-serializable so it can be loaded from external configuration.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RetryConfig {
    /// Enable automatic retries globally; when `false`, analysis always
    /// reports "do not retry"
    pub enabled: bool,
    /// Base backoff duration in seconds (delay before the first retry,
    /// pre-jitter)
    pub base_backoff_secs: u64,
    /// Maximum backoff duration in seconds; exponential growth is capped here
    pub max_backoff_secs: u64,
    /// Backoff multiplier applied per attempt (the exponential base)
    pub backoff_multiplier: f64,
    /// Jitter fraction in `0.0..=1.0`; the computed delay is scaled by a
    /// random factor in `(1 - jitter)..(1 + jitter)`
    pub jitter_factor: f64,
}
impl Default for RetryConfig {
fn default() -> Self {
Self {
enabled: true,
base_backoff_secs: 1,
max_backoff_secs: 300, // 5 minutes
backoff_multiplier: 2.0,
jitter_factor: 0.2, // 20% jitter
}
}
}
/// Reason a failed execution is (or is not) being retried.
///
/// Serialized as snake_case, matching the strings returned by
/// [`RetryReason::as_str`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum RetryReason {
    /// Worker was unavailable (e.g. queue TTL expired)
    WorkerUnavailable,
    /// Execution timed out in queue
    QueueTimeout,
    /// Worker heartbeat became stale
    WorkerHeartbeatStale,
    /// Transient error in execution (temporary/connection failures)
    TransientError,
    /// Manual retry requested by user
    ManualRetry,
    /// Unknown/other reason; not automatically retried
    Unknown,
}
impl RetryReason {
/// Get string representation
pub fn as_str(&self) -> &'static str {
match self {
Self::WorkerUnavailable => "worker_unavailable",
Self::QueueTimeout => "queue_timeout",
Self::WorkerHeartbeatStale => "worker_heartbeat_stale",
Self::TransientError => "transient_error",
Self::ManualRetry => "manual_retry",
Self::Unknown => "unknown",
}
}
/// Detect retry reason from execution error
pub fn from_error(error: &str) -> Self {
let error_lower = error.to_lowercase();
if error_lower.contains("worker queue ttl expired")
|| error_lower.contains("worker unavailable")
{
Self::WorkerUnavailable
} else if error_lower.contains("timeout") || error_lower.contains("timed out") {
Self::QueueTimeout
} else if error_lower.contains("heartbeat") || error_lower.contains("stale") {
Self::WorkerHeartbeatStale
} else if error_lower.contains("transient")
|| error_lower.contains("temporary")
|| error_lower.contains("connection")
{
Self::TransientError
} else {
Self::Unknown
}
}
}
impl std::fmt::Display for RetryReason {
    /// Formats as the same snake_case token returned by [`RetryReason::as_str`].
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}
/// Result of retry analysis for a single execution.
///
/// Produced by `RetryManager::analyze_execution`; callers use `should_retry`
/// plus `backoff_delay` to schedule a retry.
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct RetryAnalysis {
    /// Whether the execution should be retried
    pub should_retry: bool,
    /// Classified failure reason; `None` when no classification was attempted
    /// (retries disabled, non-failed status, or retry budget exhausted)
    pub reason: Option<RetryReason>,
    /// Suggested backoff delay; only set when `should_retry` is true
    pub backoff_delay: Option<Duration>,
    /// Current retry attempt (0-based)
    pub retry_count: i32,
    /// Maximum retry attempts allowed (0 means retries are disabled for
    /// this action)
    pub max_retries: i32,
}
impl RetryManager {
    /// Create a new retry manager with an explicit configuration.
    #[allow(dead_code)]
    pub fn new(pool: PgPool, config: RetryConfig) -> Self {
        Self { pool, config }
    }

    /// Create a retry manager using [`RetryConfig::default`].
    #[allow(dead_code)]
    pub fn with_defaults(pool: PgPool) -> Self {
        Self::new(pool, RetryConfig::default())
    }

    /// Analyze whether a failed execution should be retried.
    ///
    /// Reads retry metadata (`retry_count`, `max_retries`,
    /// `original_execution`) from the execution's JSON `config`, checks the
    /// global enable flag and the retry budget, classifies the failure, and
    /// computes a jittered backoff delay.
    ///
    /// # Errors
    ///
    /// Propagates repository errors, and returns a not-found error if no
    /// execution exists for `execution_id`.
    #[allow(dead_code)]
    pub async fn analyze_execution(&self, execution_id: Id) -> Result<RetryAnalysis> {
        // Fetch execution; missing id is a hard error, not "no retry".
        let execution = ExecutionRepository::find_by_id(&self.pool, execution_id)
            .await?
            .ok_or_else(|| Error::not_found("Execution", "id", execution_id.to_string()))?;
        // Retries disabled globally: report the stored retry_count but never retry.
        if !self.config.enabled {
            return Ok(RetryAnalysis {
                should_retry: false,
                reason: None,
                backoff_delay: None,
                retry_count: execution
                    .config
                    .as_ref()
                    .and_then(|c| c.get("retry_count"))
                    .and_then(|v| v.as_i64())
                    .unwrap_or(0) as i32,
                max_retries: 0,
            });
        }
        // Only failed executions are retry candidates; anything else gets a
        // zeroed analysis.
        if execution.status != ExecutionStatus::Failed {
            return Ok(RetryAnalysis {
                should_retry: false,
                reason: None,
                backoff_delay: None,
                retry_count: 0,
                max_retries: 0,
            });
        }
        // Retry metadata lives in the execution's JSON config; absent keys
        // default to 0 (i.e. "never retried", "no retries allowed").
        let config = execution.config.as_ref();
        let retry_count = config
            .and_then(|c| c.get("retry_count"))
            .and_then(|v: &serde_json::Value| v.as_i64())
            .unwrap_or(0) as i32;
        let max_retries = config
            .and_then(|c| c.get("max_retries"))
            .and_then(|v: &serde_json::Value| v.as_i64())
            .unwrap_or(0) as i32;
        // Read but deliberately unused here (underscore-prefixed); the
        // original-execution id only matters in create_retry_execution.
        let _original_execution = config
            .and_then(|c| c.get("original_execution"))
            .and_then(|v: &serde_json::Value| v.as_i64());
        // Budget check: max_retries == 0 means retries are disabled per-action.
        if max_retries == 0 || retry_count >= max_retries {
            debug!(
                "Execution {} retry limit reached: {}/{}",
                execution_id, retry_count, max_retries
            );
            return Ok(RetryAnalysis {
                should_retry: false,
                reason: None,
                backoff_delay: None,
                retry_count,
                max_retries,
            });
        }
        // Classify the failure and decide retriability.
        let retry_reason = self.detect_retry_reason(&execution);
        let is_retriable = self.is_failure_retriable(&execution, retry_reason);
        if !is_retriable {
            debug!(
                "Execution {} failure is not retriable: {:?}",
                execution_id, retry_reason
            );
            return Ok(RetryAnalysis {
                should_retry: false,
                reason: Some(retry_reason),
                backoff_delay: None,
                retry_count,
                max_retries,
            });
        }
        // Retriable: compute the suggested delay for this attempt number.
        let backoff_delay = self.calculate_backoff(retry_count);
        info!(
            "Execution {} should be retried: attempt {}/{}, reason: {:?}, delay: {:?}",
            execution_id,
            retry_count + 1,
            max_retries,
            retry_reason,
            backoff_delay
        );
        Ok(RetryAnalysis {
            should_retry: true,
            reason: Some(retry_reason),
            backoff_delay: Some(backoff_delay),
            retry_count,
            max_retries,
        })
    }

    /// Create a retry execution from a failed execution.
    ///
    /// Clones the original's parameters into a new `Requested` execution with
    /// incremented `retry_count` and retry provenance (`retry_of`,
    /// `original_execution`, `retry_reason`, `retry_at`) recorded in its
    /// JSON config.
    ///
    /// # Errors
    ///
    /// Propagates repository errors, and returns a not-found error if no
    /// execution exists for `execution_id`.
    #[allow(dead_code)]
    pub async fn create_retry_execution(
        &self,
        execution_id: Id,
        reason: RetryReason,
    ) -> Result<Execution> {
        // Fetch original execution
        let original = ExecutionRepository::find_by_id(&self.pool, execution_id)
            .await?
            .ok_or_else(|| Error::not_found("Execution", "id", execution_id.to_string()))?;
        // Get retry metadata from the original's JSON config.
        let config = original.config.as_ref();
        let retry_count = config
            .and_then(|c| c.get("retry_count"))
            .and_then(|v: &serde_json::Value| v.as_i64())
            .unwrap_or(0) as i32;
        let max_retries = config
            .and_then(|c| c.get("max_retries"))
            .and_then(|v: &serde_json::Value| v.as_i64())
            .unwrap_or(0) as i32;
        // Keep pointing at the root of the retry chain; a first retry falls
        // back to this execution's own id. (This `unwrap_or(execution_id)`
        // only type-checks if `Id` is an `i64` alias — assumed from context,
        // TODO confirm.)
        let original_execution_id = config
            .and_then(|c| c.get("original_execution"))
            .and_then(|v: &serde_json::Value| v.as_i64())
            .unwrap_or(execution_id);
        // Build the retry config on top of the original's config so any
        // action-specific keys carry over.
        let mut retry_config = original.config.clone().unwrap_or_else(|| json!({}));
        retry_config["retry_count"] = json!(retry_count + 1);
        retry_config["max_retries"] = json!(max_retries);
        retry_config["original_execution"] = json!(original_execution_id);
        retry_config["retry_reason"] = json!(reason.as_str());
        retry_config["retry_of"] = json!(execution_id);
        retry_config["retry_at"] = json!(Utc::now().to_rfc3339());
        // Create new execution (reusing original parameters)
        let retry_execution = CreateExecutionInput {
            action: original.action,
            action_ref: original.action_ref.clone(),
            config: Some(retry_config),
            env_vars: original.env_vars.clone(),
            parent: original.parent,
            enforcement: original.enforcement,
            executor: None, // Will be assigned by scheduler
            status: ExecutionStatus::Requested,
            result: None,
            workflow_task: original.workflow_task.clone(),
        };
        let created = ExecutionRepository::create(&self.pool, retry_execution).await?;
        info!(
            "Created retry execution {} for original {} (attempt {}/{})",
            created.id,
            execution_id,
            retry_count + 1,
            max_retries
        );
        Ok(created)
    }

    /// Detect retry reason from an execution's result payload.
    ///
    /// Checks `result.error` first, then `result.message`; falls back to
    /// `Unknown` when neither string field is present.
    fn detect_retry_reason(&self, execution: &Execution) -> RetryReason {
        if let Some(result) = &execution.result {
            if let Some(error) = result.get("error").and_then(|e| e.as_str()) {
                return RetryReason::from_error(error);
            }
            if let Some(message) = result.get("message").and_then(|m| m.as_str()) {
                return RetryReason::from_error(message);
            }
        }
        RetryReason::Unknown
    }

    /// Check if a classified failure is retriable.
    ///
    /// Only `Unknown` is non-retriable; the `_execution` parameter is
    /// currently unused but kept for future per-execution policy.
    fn is_failure_retriable(&self, _execution: &Execution, reason: RetryReason) -> bool {
        match reason {
            // These are retriable
            RetryReason::WorkerUnavailable => true,
            RetryReason::QueueTimeout => true,
            RetryReason::WorkerHeartbeatStale => true,
            RetryReason::TransientError => true,
            RetryReason::ManualRetry => true,
            // Unknown failures are not automatically retried
            RetryReason::Unknown => false,
        }
    }

    /// Calculate exponential backoff with jitter for a given attempt number.
    ///
    /// Delay is `base * multiplier^retry_count`, capped at `max_backoff_secs`,
    /// then scaled by a uniform random factor in
    /// `(1 - jitter_factor)..(1 + jitter_factor)`. Note the final `as u64`
    /// cast truncates, so sub-second results round down to 0.
    fn calculate_backoff(&self, retry_count: i32) -> Duration {
        let base_secs = self.config.base_backoff_secs as f64;
        let multiplier = self.config.backoff_multiplier;
        let max_secs = self.config.max_backoff_secs as f64;
        let jitter_factor = self.config.jitter_factor;
        // Calculate exponential backoff: base * multiplier^retry_count
        let backoff_secs = base_secs * multiplier.powi(retry_count);
        // Cap at max
        let backoff_secs = backoff_secs.min(max_secs);
        // Add jitter: random value between (1 - jitter) and (1 + jitter)
        let jitter = 1.0 + (rand::random::<f64>() * 2.0 - 1.0) * jitter_factor;
        let backoff_with_jitter = backoff_secs * jitter;
        Duration::from_secs(backoff_with_jitter.max(0.0) as u64)
    }

    /// Update execution with retry metadata.
    ///
    /// Builds a retry-metadata JSON object and merges the execution's existing
    /// config keys into it (new retry keys win; existing keys are only added
    /// via `entry(..).or_insert(..)`).
    ///
    /// NOTE(review): the merged `config` value is never persisted. The
    /// `UpdateExecutionInput` below has no config field and every field passed
    /// is `None`, so the repository update is effectively a no-op and the
    /// merge result is dropped. Confirm whether `UpdateExecutionInput` should
    /// expose a `config` field, or whether this method is vestigial.
    #[allow(dead_code)]
    pub async fn mark_as_retry(
        &self,
        execution_id: Id,
        original_execution_id: Id,
        retry_count: i32,
        reason: RetryReason,
    ) -> Result<()> {
        let mut config = json!({
            "retry_count": retry_count,
            "original_execution": original_execution_id,
            "retry_reason": reason.as_str(),
            "retry_at": Utc::now().to_rfc3339(),
        });
        // Fetch current config and merge (existing keys are preserved,
        // retry keys above take precedence).
        if let Some(execution) = ExecutionRepository::find_by_id(&self.pool, execution_id).await? {
            if let Some(existing_config) = execution.config {
                if let Some(obj) = config.as_object_mut() {
                    if let Some(existing_obj) = existing_config.as_object() {
                        for (k, v) in existing_obj {
                            obj.entry(k).or_insert(v.clone());
                        }
                    }
                }
            }
        }
        ExecutionRepository::update(
            &self.pool,
            execution_id,
            UpdateExecutionInput {
                status: None,
                result: None,
                executor: None,
                workflow_task: None,
            },
        )
        .await?;
        Ok(())
    }
}
/// Check if an error message indicates a retriable failure.
///
/// Case-insensitive substring match against known transient-failure phrases
/// (worker/queue outages, timeouts, stale heartbeats, connection blips).
#[allow(dead_code)]
pub fn is_error_retriable(error_msg: &str) -> bool {
    // Phrases that mark a failure as transient and therefore retriable.
    const RETRIABLE_PATTERNS: [&str; 10] = [
        "worker queue ttl expired",
        "worker unavailable",
        "timeout",
        "timed out",
        "heartbeat",
        "stale",
        "transient",
        "temporary",
        "connection refused",
        "connection reset",
    ];
    let haystack = error_msg.to_lowercase();
    RETRIABLE_PATTERNS.iter().any(|p| haystack.contains(p))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `RetryManager` backed by a pool that never actually connects.
    ///
    /// `PgPool::connect_lazy` only parses the URL and defers all I/O until
    /// first use, so it is safe in unit tests that never hit the database.
    /// The previous `unsafe { std::mem::zeroed() }` was undefined behavior:
    /// `PgPool` contains `Arc`s and other non-zeroable types.
    fn test_manager() -> RetryManager {
        let pool = PgPool::connect_lazy("postgres://localhost/retry_tests")
            .expect("lazy pool construction only validates the URL");
        RetryManager::with_defaults(pool)
    }

    #[test]
    fn test_retry_reason_detection() {
        assert_eq!(
            RetryReason::from_error("Worker queue TTL expired"),
            RetryReason::WorkerUnavailable
        );
        assert_eq!(
            RetryReason::from_error("Execution timed out"),
            RetryReason::QueueTimeout
        );
        assert_eq!(
            RetryReason::from_error("Worker heartbeat is stale"),
            RetryReason::WorkerHeartbeatStale
        );
        assert_eq!(
            RetryReason::from_error("Transient connection error"),
            RetryReason::TransientError
        );
        assert_eq!(
            RetryReason::from_error("Invalid parameter format"),
            RetryReason::Unknown
        );
    }

    #[test]
    fn test_is_error_retriable() {
        assert!(is_error_retriable("Worker queue TTL expired"));
        assert!(is_error_retriable("Execution timed out"));
        assert!(is_error_retriable("Worker heartbeat stale"));
        assert!(is_error_retriable("Transient error"));
        assert!(!is_error_retriable("Invalid parameter"));
        assert!(!is_error_retriable("Permission denied"));
    }

    #[test]
    fn test_backoff_calculation() {
        let manager = test_manager();
        let backoff0 = manager.calculate_backoff(0);
        let backoff1 = manager.calculate_backoff(1);
        let backoff2 = manager.calculate_backoff(2);
        // First attempt: ~1s with ±20% jitter; truncation can yield 0.
        // (u64 is always >= 0, so only the upper bound is meaningful.)
        assert!(backoff0.as_secs() <= 2);
        // Second attempt: ~2s
        assert!(backoff1.as_secs() >= 1 && backoff1.as_secs() <= 3);
        // Third attempt: ~4s
        assert!(backoff2.as_secs() >= 2 && backoff2.as_secs() <= 6);
    }

    #[test]
    fn test_retry_config_defaults() {
        let config = RetryConfig::default();
        assert!(config.enabled);
        assert_eq!(config.base_backoff_secs, 1);
        assert_eq!(config.max_backoff_secs, 300);
        // Compare floats with an epsilon instead of exact equality.
        assert!((config.backoff_multiplier - 2.0).abs() < f64::EPSILON);
        assert!((config.jitter_factor - 0.2).abs() < f64::EPSILON);
    }
}