Files
attune/crates/executor/src/dead_letter_handler.rs
David Culbreth 13749409cd
Some checks failed
CI / Rust Blocking Checks (push) Failing after 22s
CI / Web Blocking Checks (push) Failing after 26s
CI / Security Blocking Checks (push) Successful in 9s
CI / Web Advisory Checks (push) Successful in 32s
CI / Security Advisory Checks (push) Has been cancelled
making linters happy
2026-03-04 23:44:45 -06:00

264 lines
8.5 KiB
Rust

//! Dead Letter Handler
//!
//! This module handles messages that expire from worker queues and are routed to the
//! dead letter queue (DLQ). When a worker fails to process an execution request within
//! the configured TTL (default 5 minutes), the message is moved to the DLQ.
//!
//! The dead letter handler:
//! - Consumes messages from the dead letter queue
//! - Identifies the execution that expired
//! - Marks it as FAILED with appropriate error information
//! - Logs the failure for operational visibility
use attune_common::{
error::Error,
models::ExecutionStatus,
mq::{Consumer, ConsumerConfig, MessageEnvelope, MessageType, MqResult},
repositories::{execution::UpdateExecutionInput, ExecutionRepository, FindById, Update},
};
use chrono::Utc;
use serde_json::json;
use sqlx::PgPool;
use std::sync::Arc;
use tokio::sync::Mutex;
use tracing::{debug, error, info, warn};
/// Dead letter handler for processing expired messages
///
/// Owns the consumer attached to the dead letter queue together with the
/// database pool used to mark the affected executions as failed.
pub struct DeadLetterHandler {
/// Database connection pool
///
/// Shared via `Arc`; `start` clones the handle into the message handler,
/// which passes it on to `handle_execution_requested`.
pool: Arc<PgPool>,
/// Message consumer
///
/// `start` logs its queue name and drives it with `consume_with_handler`.
consumer: Consumer,
/// Running state
///
/// Set to `true` by `start`, reset to `false` when the consume call returns
/// or when `stop` is called, and checked before each message is processed.
running: Arc<Mutex<bool>>,
}
impl DeadLetterHandler {
/// Create a new dead letter handler
pub async fn new(pool: Arc<PgPool>, consumer: Consumer) -> Result<Self, Error> {
Ok(Self {
pool,
consumer,
running: Arc::new(Mutex::new(false)),
})
}
/// Start the dead letter handler
pub async fn start(&self) -> Result<(), Error> {
info!(
"Starting dead letter handler for queue '{}'",
self.consumer.queue()
);
{
let mut running = self.running.lock().await;
if *running {
warn!("Dead letter handler already running");
return Ok(());
}
*running = true;
}
let pool = Arc::clone(&self.pool);
let running = Arc::clone(&self.running);
// Start consuming messages
let consumer_result = self
.consumer
.consume_with_handler(move |envelope: MessageEnvelope<serde_json::Value>| {
let pool = Arc::clone(&pool);
let running = Arc::clone(&running);
async move {
// Check if we should continue processing
{
let is_running = running.lock().await;
if !*is_running {
info!("Dead letter handler stopping, rejecting message");
return Err(attune_common::mq::MqError::Consume(
"Handler is shutting down".to_string(),
));
}
}
info!(
"Processing dead letter message {} of type {:?}",
envelope.message_id, envelope.message_type
);
match envelope.message_type {
MessageType::ExecutionRequested => {
handle_execution_requested(&pool, &envelope).await
}
_ => {
warn!(
"Received unexpected message type {:?} in DLQ: {}",
envelope.message_type, envelope.message_id
);
// Acknowledge unexpected messages to remove them from queue
Ok(())
}
}
}
})
.await;
{
let mut running = self.running.lock().await;
*running = false;
}
consumer_result.map_err(|e| {
error!("Dead letter handler error: {}", e);
Error::Internal(format!("Dead letter handler failed: {}", e))
})
}
/// Stop the dead letter handler
#[allow(dead_code)]
pub async fn stop(&self) {
info!("Stopping dead letter handler");
let mut running = self.running.lock().await;
*running = false;
}
/// Check if the handler is running
#[allow(dead_code)]
pub async fn is_running(&self) -> bool {
*self.running.lock().await
}
}
/// Fail the execution referenced by an `ExecutionRequested` message that expired
/// in a worker queue.
///
/// The function returns `Ok(())` (so the message is removed from the queue) when
/// the payload has no usable `execution_id`, when the execution is not found, when
/// it is no longer `Scheduled` or `Running`, or when it was updated successfully.
/// On a successful update the execution is set to `Failed` with a JSON result
/// describing the expiry.
///
/// # Errors
///
/// Returns `MqError::Consume` when reading or updating the execution fails, so the
/// message can be nacked and potentially retried.
async fn handle_execution_requested(
    pool: &PgPool,
    envelope: &MessageEnvelope<serde_json::Value>,
) -> MqResult<()> {
    debug!(
        "Handling expired ExecutionRequested message: {}",
        envelope.message_id
    );

    // Extract execution ID from payload. The outer `Option` tells us whether the
    // key exists, the inner one whether its value is an i64.
    let parsed_id = envelope
        .payload
        .get("execution_id")
        .map(|raw| raw.as_i64());
    let execution_id = match parsed_id {
        Some(Some(id)) => id,
        Some(None) => {
            error!("Invalid execution_id in payload: not an i64");
            return Ok(()); // Acknowledge to remove from queue
        }
        None => {
            error!("Missing execution_id in ExecutionRequested payload");
            return Ok(()); // Acknowledge to remove from queue
        }
    };

    info!(
        "Failing execution {} due to worker queue expiration",
        execution_id
    );

    // Fetch current execution state
    let lookup = ExecutionRepository::find_by_id(pool, execution_id).await;
    let execution = match lookup {
        Err(e) => {
            error!("Failed to fetch execution {}: {}", execution_id, e);
            // Return error to nack and potentially retry
            return Err(attune_common::mq::MqError::Consume(format!(
                "Database error: {}",
                e
            )));
        }
        Ok(None) => {
            warn!(
                "Execution {} not found in database, may have been already processed",
                execution_id
            );
            return Ok(()); // Acknowledge to remove from queue
        }
        Ok(Some(found)) => found,
    };

    // Only fail if still in a non-terminal state
    let still_pending = matches!(
        execution.status,
        ExecutionStatus::Scheduled | ExecutionStatus::Running
    );
    if !still_pending {
        info!(
            "Execution {} already in terminal state {:?}, skipping",
            execution_id, execution.status
        );
        return Ok(()); // Acknowledge to remove from queue
    }

    // Get worker info from payload for better error message
    let error_message = match envelope.payload.get("worker_id").and_then(|v| v.as_i64()) {
        Some(wid) => format!(
            "Execution expired in worker queue (worker_id: {}). Worker did not process the execution within the configured TTL. This typically indicates the worker is unavailable or overloaded.",
            wid
        ),
        None => {
            "Execution expired in worker queue. Worker did not process the execution within the configured TTL.".to_string()
        }
    };

    // Update execution to failed
    let failure_details = json!({
        "error": "Worker queue TTL expired",
        "message": error_message,
        "expired_at": Utc::now().to_rfc3339(),
    });
    let update_input = UpdateExecutionInput {
        status: Some(ExecutionStatus::Failed),
        result: Some(failure_details),
        ..Default::default()
    };

    if let Err(e) = ExecutionRepository::update(pool, execution_id, update_input).await {
        error!(
            "Failed to update execution {} to failed state: {}",
            execution_id, e
        );
        // Return error to nack and potentially retry
        return Err(attune_common::mq::MqError::Consume(format!(
            "Failed to update execution: {}",
            e
        )));
    }

    info!(
        "Successfully failed execution {} due to worker queue expiration",
        execution_id
    );
    Ok(())
}
/// Build the [`ConsumerConfig`] used to consume a dead letter queue.
///
/// `dlq_name` becomes the queue name and `consumer_tag` the consumer tag. The
/// remaining settings are fixed: a prefetch count of 10, manual acknowledgement,
/// and non-exclusive access.
pub fn create_dlq_consumer_config(dlq_name: &str, consumer_tag: &str) -> ConsumerConfig {
    let queue = dlq_name.to_owned();
    let tag = consumer_tag.to_owned();

    ConsumerConfig {
        queue,
        tag,
        prefetch_count: 10,
        // Manual ack for reliability
        auto_ack: false,
        exclusive: false,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The DLQ consumer config carries the given names and the fixed defaults.
    #[test]
    fn test_create_dlq_consumer_config() {
        let ConsumerConfig {
            queue,
            tag,
            prefetch_count,
            auto_ack,
            exclusive,
        } = create_dlq_consumer_config("attune.dlx.queue", "dlq-handler");

        assert_eq!(queue, "attune.dlx.queue");
        assert_eq!(tag, "dlq-handler");
        assert_eq!(prefetch_count, 10);
        assert!(!auto_ack);
        assert!(!exclusive);
    }
}