more internal polish, resilient workers
This commit is contained in:
495
crates/executor/src/retry_manager.rs
Normal file
495
crates/executor/src/retry_manager.rs
Normal file
@@ -0,0 +1,495 @@
|
||||
//! Retry Manager
|
||||
//!
|
||||
//! This module provides intelligent retry logic for failed executions.
|
||||
//! It determines whether failures are retriable, manages retry attempts,
|
||||
//! and implements exponential backoff for retry scheduling.
|
||||
//!
|
||||
//! # Retry Strategy
|
||||
//!
|
||||
//! - **Retriable Failures:** Worker unavailability, timeouts, transient errors
|
||||
//! - **Non-Retriable Failures:** Validation errors, missing actions, permission errors
|
||||
//! - **Backoff:** Exponential with jitter (1s, 2s, 4s, 8s, ...)
|
||||
//! - **Max Retries:** Configurable per action (default: 0, no retries)
|
||||
|
||||
use attune_common::{
|
||||
error::{Error, Result},
|
||||
models::{Execution, ExecutionStatus, Id},
|
||||
repositories::{
|
||||
execution::{CreateExecutionInput, UpdateExecutionInput},
|
||||
Create, ExecutionRepository, FindById, Update,
|
||||
},
|
||||
};
|
||||
use chrono::Utc;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::json;
|
||||
use sqlx::PgPool;
|
||||
use std::time::Duration;
|
||||
use tracing::{debug, info};
|
||||
|
||||
/// Retry manager for execution failures.
///
/// Owns a database pool and a retry policy (`RetryConfig`); used to decide
/// whether failed executions should be retried and to create the
/// corresponding retry executions.
pub struct RetryManager {
    /// Database connection pool used for all execution lookups and writes
    pool: PgPool,
    /// Retry configuration (global enable flag, backoff, jitter)
    config: RetryConfig,
}
|
||||
|
||||
/// Retry configuration.
///
/// Controls whether automatic retries happen at all and how the exponential
/// backoff between attempts is computed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RetryConfig {
    /// Enable automatic retries globally; when false nothing is retried
    pub enabled: bool,
    /// Base backoff duration in seconds (delay before the first retry)
    pub base_backoff_secs: u64,
    /// Maximum backoff duration in seconds (cap on exponential growth)
    pub max_backoff_secs: u64,
    /// Backoff multiplier applied per attempt (e.g. 2.0 doubles each time)
    pub backoff_multiplier: f64,
    /// Fraction of random jitter applied to the delay (0.0 - 1.0)
    pub jitter_factor: f64,
}
|
||||
|
||||
impl Default for RetryConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
enabled: true,
|
||||
base_backoff_secs: 1,
|
||||
max_backoff_secs: 300, // 5 minutes
|
||||
backoff_multiplier: 2.0,
|
||||
jitter_factor: 0.2, // 20% jitter
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Reason for retry
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum RetryReason {
|
||||
/// Worker was unavailable
|
||||
WorkerUnavailable,
|
||||
/// Execution timed out in queue
|
||||
QueueTimeout,
|
||||
/// Worker heartbeat became stale
|
||||
WorkerHeartbeatStale,
|
||||
/// Transient error in execution
|
||||
TransientError,
|
||||
/// Manual retry requested by user
|
||||
ManualRetry,
|
||||
/// Unknown/other reason
|
||||
Unknown,
|
||||
}
|
||||
|
||||
impl RetryReason {
|
||||
/// Get string representation
|
||||
pub fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
Self::WorkerUnavailable => "worker_unavailable",
|
||||
Self::QueueTimeout => "queue_timeout",
|
||||
Self::WorkerHeartbeatStale => "worker_heartbeat_stale",
|
||||
Self::TransientError => "transient_error",
|
||||
Self::ManualRetry => "manual_retry",
|
||||
Self::Unknown => "unknown",
|
||||
}
|
||||
}
|
||||
|
||||
/// Detect retry reason from execution error
|
||||
pub fn from_error(error: &str) -> Self {
|
||||
let error_lower = error.to_lowercase();
|
||||
|
||||
if error_lower.contains("worker queue ttl expired")
|
||||
|| error_lower.contains("worker unavailable")
|
||||
{
|
||||
Self::WorkerUnavailable
|
||||
} else if error_lower.contains("timeout") || error_lower.contains("timed out") {
|
||||
Self::QueueTimeout
|
||||
} else if error_lower.contains("heartbeat") || error_lower.contains("stale") {
|
||||
Self::WorkerHeartbeatStale
|
||||
} else if error_lower.contains("transient")
|
||||
|| error_lower.contains("temporary")
|
||||
|| error_lower.contains("connection")
|
||||
{
|
||||
Self::TransientError
|
||||
} else {
|
||||
Self::Unknown
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for RetryReason {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.as_str())
|
||||
}
|
||||
}
|
||||
|
||||
/// Result of retry analysis produced by `RetryManager::analyze_execution`.
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct RetryAnalysis {
    /// Whether the execution should be retried
    pub should_retry: bool,
    /// Reason classification behind the retry decision; `None` when no
    /// classification was performed (retries disabled, execution not failed,
    /// or retry budget exhausted)
    pub reason: Option<RetryReason>,
    /// Suggested backoff delay before scheduling the retry
    pub backoff_delay: Option<Duration>,
    /// Current retry attempt (0-based count of retries already performed)
    pub retry_count: i32,
    /// Maximum retry attempts allowed for this execution
    pub max_retries: i32,
}
|
||||
|
||||
impl RetryManager {
|
||||
/// Create a new retry manager
|
||||
#[allow(dead_code)]
|
||||
pub fn new(pool: PgPool, config: RetryConfig) -> Self {
|
||||
Self { pool, config }
|
||||
}
|
||||
|
||||
/// Create with default configuration
|
||||
#[allow(dead_code)]
|
||||
pub fn with_defaults(pool: PgPool) -> Self {
|
||||
Self::new(pool, RetryConfig::default())
|
||||
}
|
||||
|
||||
/// Analyze if an execution should be retried
|
||||
#[allow(dead_code)]
|
||||
pub async fn analyze_execution(&self, execution_id: Id) -> Result<RetryAnalysis> {
|
||||
// Fetch execution
|
||||
let execution = ExecutionRepository::find_by_id(&self.pool, execution_id)
|
||||
.await?
|
||||
.ok_or_else(|| Error::not_found("Execution", "id", execution_id.to_string()))?;
|
||||
|
||||
// Check if retries are enabled globally
|
||||
if !self.config.enabled {
|
||||
return Ok(RetryAnalysis {
|
||||
should_retry: false,
|
||||
reason: None,
|
||||
backoff_delay: None,
|
||||
retry_count: execution
|
||||
.config
|
||||
.as_ref()
|
||||
.and_then(|c| c.get("retry_count"))
|
||||
.and_then(|v| v.as_i64())
|
||||
.unwrap_or(0) as i32,
|
||||
max_retries: 0,
|
||||
});
|
||||
}
|
||||
|
||||
// Only retry failed executions
|
||||
if execution.status != ExecutionStatus::Failed {
|
||||
return Ok(RetryAnalysis {
|
||||
should_retry: false,
|
||||
reason: None,
|
||||
backoff_delay: None,
|
||||
retry_count: 0,
|
||||
max_retries: 0,
|
||||
});
|
||||
}
|
||||
|
||||
// Get retry metadata from execution config
|
||||
let config = execution.config.as_ref();
|
||||
let retry_count = config
|
||||
.and_then(|c| c.get("retry_count"))
|
||||
.and_then(|v: &serde_json::Value| v.as_i64())
|
||||
.unwrap_or(0) as i32;
|
||||
let max_retries = config
|
||||
.and_then(|c| c.get("max_retries"))
|
||||
.and_then(|v: &serde_json::Value| v.as_i64())
|
||||
.unwrap_or(0) as i32;
|
||||
let _original_execution = config
|
||||
.and_then(|c| c.get("original_execution"))
|
||||
.and_then(|v: &serde_json::Value| v.as_i64());
|
||||
|
||||
// Check if retries are exhausted
|
||||
if max_retries == 0 || retry_count >= max_retries {
|
||||
debug!(
|
||||
"Execution {} retry limit reached: {}/{}",
|
||||
execution_id, retry_count, max_retries
|
||||
);
|
||||
return Ok(RetryAnalysis {
|
||||
should_retry: false,
|
||||
reason: None,
|
||||
backoff_delay: None,
|
||||
retry_count,
|
||||
max_retries,
|
||||
});
|
||||
}
|
||||
|
||||
// Determine if failure is retriable
|
||||
let retry_reason = self.detect_retry_reason(&execution);
|
||||
let is_retriable = self.is_failure_retriable(&execution, retry_reason);
|
||||
|
||||
if !is_retriable {
|
||||
debug!(
|
||||
"Execution {} failure is not retriable: {:?}",
|
||||
execution_id, retry_reason
|
||||
);
|
||||
return Ok(RetryAnalysis {
|
||||
should_retry: false,
|
||||
reason: Some(retry_reason),
|
||||
backoff_delay: None,
|
||||
retry_count,
|
||||
max_retries,
|
||||
});
|
||||
}
|
||||
|
||||
// Calculate backoff delay
|
||||
let backoff_delay = self.calculate_backoff(retry_count);
|
||||
|
||||
info!(
|
||||
"Execution {} should be retried: attempt {}/{}, reason: {:?}, delay: {:?}",
|
||||
execution_id,
|
||||
retry_count + 1,
|
||||
max_retries,
|
||||
retry_reason,
|
||||
backoff_delay
|
||||
);
|
||||
|
||||
Ok(RetryAnalysis {
|
||||
should_retry: true,
|
||||
reason: Some(retry_reason),
|
||||
backoff_delay: Some(backoff_delay),
|
||||
retry_count,
|
||||
max_retries,
|
||||
})
|
||||
}
|
||||
|
||||
    /// Create a retry execution from a failed execution.
    ///
    /// Clones the original execution's parameters into a brand-new execution
    /// in `Requested` state, with retry bookkeeping recorded in its JSON
    /// config: `retry_count` (incremented), `max_retries`, the id of the
    /// first execution in the chain (`original_execution`), the retry
    /// `reason`, the immediate predecessor (`retry_of`), and an RFC 3339
    /// timestamp (`retry_at`).
    ///
    /// # Errors
    ///
    /// Returns an error if the source execution cannot be found or the
    /// database insert fails.
    #[allow(dead_code)]
    pub async fn create_retry_execution(
        &self,
        execution_id: Id,
        reason: RetryReason,
    ) -> Result<Execution> {
        // Fetch original execution
        let original = ExecutionRepository::find_by_id(&self.pool, execution_id)
            .await?
            .ok_or_else(|| Error::not_found("Execution", "id", execution_id.to_string()))?;

        // Get retry metadata from the original's JSON config; missing or
        // non-integer fields default to 0.
        let config = original.config.as_ref();
        let retry_count = config
            .and_then(|c| c.get("retry_count"))
            .and_then(|v: &serde_json::Value| v.as_i64())
            .unwrap_or(0) as i32;
        let max_retries = config
            .and_then(|c| c.get("max_retries"))
            .and_then(|v: &serde_json::Value| v.as_i64())
            .unwrap_or(0) as i32;
        // Root of the retry chain; falls back to this execution's id when it
        // is the first retry. NOTE(review): `unwrap_or(execution_id)` only
        // type-checks if `Id` is `i64` (the `as_i64()` item type) — confirm
        // against the `Id` alias in attune_common.
        let original_execution_id = config
            .and_then(|c| c.get("original_execution"))
            .and_then(|v: &serde_json::Value| v.as_i64())
            .unwrap_or(execution_id);

        // Create retry config: start from the original config and overwrite
        // the retry bookkeeping fields.
        let mut retry_config = original.config.clone().unwrap_or_else(|| json!({}));
        retry_config["retry_count"] = json!(retry_count + 1);
        retry_config["max_retries"] = json!(max_retries);
        retry_config["original_execution"] = json!(original_execution_id);
        retry_config["retry_reason"] = json!(reason.as_str());
        retry_config["retry_of"] = json!(execution_id);
        retry_config["retry_at"] = json!(Utc::now().to_rfc3339());

        // Create new execution (reusing original parameters)
        let retry_execution = CreateExecutionInput {
            action: original.action,
            action_ref: original.action_ref.clone(),
            config: Some(retry_config),
            env_vars: original.env_vars.clone(),
            parent: original.parent,
            enforcement: original.enforcement,
            executor: None, // Will be assigned by scheduler
            status: ExecutionStatus::Requested,
            result: None,
            workflow_task: original.workflow_task.clone(),
        };

        let created = ExecutionRepository::create(&self.pool, retry_execution).await?;

        info!(
            "Created retry execution {} for original {} (attempt {}/{})",
            created.id,
            execution_id,
            retry_count + 1,
            max_retries
        );

        Ok(created)
    }
|
||||
|
||||
/// Detect retry reason from execution
|
||||
fn detect_retry_reason(&self, execution: &Execution) -> RetryReason {
|
||||
if let Some(result) = &execution.result {
|
||||
if let Some(error) = result.get("error").and_then(|e| e.as_str()) {
|
||||
return RetryReason::from_error(error);
|
||||
}
|
||||
if let Some(message) = result.get("message").and_then(|m| m.as_str()) {
|
||||
return RetryReason::from_error(message);
|
||||
}
|
||||
}
|
||||
RetryReason::Unknown
|
||||
}
|
||||
|
||||
/// Check if failure is retriable
|
||||
fn is_failure_retriable(&self, _execution: &Execution, reason: RetryReason) -> bool {
|
||||
match reason {
|
||||
// These are retriable
|
||||
RetryReason::WorkerUnavailable => true,
|
||||
RetryReason::QueueTimeout => true,
|
||||
RetryReason::WorkerHeartbeatStale => true,
|
||||
RetryReason::TransientError => true,
|
||||
RetryReason::ManualRetry => true,
|
||||
// Unknown failures are not automatically retried
|
||||
RetryReason::Unknown => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate exponential backoff with jitter
|
||||
fn calculate_backoff(&self, retry_count: i32) -> Duration {
|
||||
let base_secs = self.config.base_backoff_secs as f64;
|
||||
let multiplier = self.config.backoff_multiplier;
|
||||
let max_secs = self.config.max_backoff_secs as f64;
|
||||
let jitter_factor = self.config.jitter_factor;
|
||||
|
||||
// Calculate exponential backoff: base * multiplier^retry_count
|
||||
let backoff_secs = base_secs * multiplier.powi(retry_count);
|
||||
|
||||
// Cap at max
|
||||
let backoff_secs = backoff_secs.min(max_secs);
|
||||
|
||||
// Add jitter: random value between (1 - jitter) and (1 + jitter)
|
||||
let jitter = 1.0 + (rand::random::<f64>() * 2.0 - 1.0) * jitter_factor;
|
||||
let backoff_with_jitter = backoff_secs * jitter;
|
||||
|
||||
Duration::from_secs(backoff_with_jitter.max(0.0) as u64)
|
||||
}
|
||||
|
||||
    /// Update execution with retry metadata.
    ///
    /// Builds a JSON config carrying the retry bookkeeping fields
    /// (`retry_count`, `original_execution`, `retry_reason`, `retry_at`) and
    /// merges in the execution's existing config: the new bookkeeping values
    /// take precedence, every other pre-existing key is preserved.
    ///
    /// NOTE(review): the merged `config` value is never written back — the
    /// `UpdateExecutionInput` below sets every field to `None`, so this call
    /// currently persists nothing. Confirm whether `UpdateExecutionInput`
    /// has (or should gain) a `config` field to carry the merged value.
    #[allow(dead_code)]
    pub async fn mark_as_retry(
        &self,
        execution_id: Id,
        original_execution_id: Id,
        retry_count: i32,
        reason: RetryReason,
    ) -> Result<()> {
        let mut config = json!({
            "retry_count": retry_count,
            "original_execution": original_execution_id,
            "retry_reason": reason.as_str(),
            "retry_at": Utc::now().to_rfc3339(),
        });

        // Fetch current config and merge: `or_insert` keeps the bookkeeping
        // keys set above and copies over every other existing key.
        if let Some(execution) = ExecutionRepository::find_by_id(&self.pool, execution_id).await? {
            if let Some(existing_config) = execution.config {
                if let Some(obj) = config.as_object_mut() {
                    if let Some(existing_obj) = existing_config.as_object() {
                        for (k, v) in existing_obj {
                            obj.entry(k).or_insert(v.clone());
                        }
                    }
                }
            }
        }

        ExecutionRepository::update(
            &self.pool,
            execution_id,
            UpdateExecutionInput {
                status: None,
                result: None,
                executor: None,
                workflow_task: None,
            },
        )
        .await?;

        Ok(())
    }
|
||||
}
|
||||
|
||||
/// Check if an error message indicates a retriable failure.
///
/// Matching is case-insensitive substring search over a fixed set of
/// patterns covering worker unavailability, timeouts, stale heartbeats,
/// and transient connection errors.
#[allow(dead_code)]
pub fn is_error_retriable(error_msg: &str) -> bool {
    const RETRIABLE_PATTERNS: [&str; 10] = [
        "worker queue ttl expired",
        "worker unavailable",
        "timeout",
        "timed out",
        "heartbeat",
        "stale",
        "transient",
        "temporary",
        "connection refused",
        "connection reset",
    ];

    let msg = error_msg.to_lowercase();
    RETRIABLE_PATTERNS.iter().any(|pattern| msg.contains(pattern))
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// `from_error` classifies messages into the expected reason buckets.
    #[test]
    fn test_retry_reason_detection() {
        assert_eq!(
            RetryReason::from_error("Worker queue TTL expired"),
            RetryReason::WorkerUnavailable
        );
        assert_eq!(
            RetryReason::from_error("Execution timed out"),
            RetryReason::QueueTimeout
        );
        assert_eq!(
            RetryReason::from_error("Worker heartbeat is stale"),
            RetryReason::WorkerHeartbeatStale
        );
        assert_eq!(
            RetryReason::from_error("Transient connection error"),
            RetryReason::TransientError
        );
        assert_eq!(
            RetryReason::from_error("Invalid parameter format"),
            RetryReason::Unknown
        );
    }

    /// The free-function classifier agrees on retriable vs. permanent errors.
    #[test]
    fn test_is_error_retriable() {
        assert!(is_error_retriable("Worker queue TTL expired"));
        assert!(is_error_retriable("Execution timed out"));
        assert!(is_error_retriable("Worker heartbeat stale"));
        assert!(is_error_retriable("Transient error"));
        assert!(!is_error_retriable("Invalid parameter"));
        assert!(!is_error_retriable("Permission denied"));
    }

    /// Backoff grows roughly exponentially within the jitter bounds.
    #[test]
    fn test_backoff_calculation() {
        // A lazily-connecting pool never opens a real connection, so it is
        // safe to construct here without a database. The previous
        // `unsafe { std::mem::zeroed() }` was undefined behavior: PgPool
        // contains non-nullable pointers (Arc internals), and an all-zero
        // bit pattern is invalid for them.
        let pool = PgPool::connect_lazy("postgres://localhost/unused")
            .expect("lazy pool creation does not touch the network");
        let manager = RetryManager::with_defaults(pool);

        let backoff0 = manager.calculate_backoff(0);
        let backoff1 = manager.calculate_backoff(1);
        let backoff2 = manager.calculate_backoff(2);

        // First attempt: ~1s with +/-20% jitter.
        assert!(backoff0.as_secs() <= 2);
        // Second attempt: ~2s.
        assert!(backoff1.as_secs() >= 1 && backoff1.as_secs() <= 3);
        // Third attempt: ~4s.
        assert!(backoff2.as_secs() >= 2 && backoff2.as_secs() <= 6);
    }

    /// Default config values match the documented policy.
    #[test]
    fn test_retry_config_defaults() {
        let config = RetryConfig::default();
        assert!(config.enabled);
        assert_eq!(config.base_backoff_secs, 1);
        assert_eq!(config.max_backoff_secs, 300);
        assert_eq!(config.backoff_multiplier, 2.0);
        assert_eq!(config.jitter_factor, 0.2);
    }
}
|
||||
Reference in New Issue
Block a user