Some checks failed
CI / Rust Blocking Checks (push) Failing after 22s
CI / Web Blocking Checks (push) Failing after 26s
CI / Security Blocking Checks (push) Successful in 9s
CI / Web Advisory Checks (push) Successful in 32s
CI / Security Advisory Checks (push) Has been cancelled
264 lines
8.5 KiB
Rust
264 lines
8.5 KiB
Rust
//! Dead Letter Handler
//!
//! This module handles messages that expire from worker queues and are routed to the
//! dead letter queue (DLQ). When a worker fails to process an execution request within
//! the configured TTL (default 5 minutes), the message is moved to the DLQ.
//!
//! The dead letter handler:
//! - Consumes messages from the dead letter queue
//! - Identifies the execution that expired
//! - Marks it as FAILED with appropriate error information
//! - Logs the failure for operational visibility
|
|
use attune_common::{
|
|
error::Error,
|
|
models::ExecutionStatus,
|
|
mq::{Consumer, ConsumerConfig, MessageEnvelope, MessageType, MqResult},
|
|
repositories::{execution::UpdateExecutionInput, ExecutionRepository, FindById, Update},
|
|
};
|
|
use chrono::Utc;
|
|
use serde_json::json;
|
|
use sqlx::PgPool;
|
|
use std::sync::Arc;
|
|
use tokio::sync::Mutex;
|
|
use tracing::{debug, error, info, warn};
|
|
|
|
/// Dead letter handler for processing expired messages.
///
/// Owns a [`Consumer`] bound to the dead letter queue and a database pool used
/// to mark expired executions as failed. The `running` flag is shared (via
/// `Arc`) with the consume-loop closure so `stop` can signal shutdown.
pub struct DeadLetterHandler {
    /// Database connection pool used to look up and fail expired executions.
    pool: Arc<PgPool>,
    /// Message consumer attached to the dead letter queue.
    consumer: Consumer,
    /// Running state; `true` while the consume loop is active. Uses a
    /// `tokio::sync::Mutex` because it is locked inside async handlers.
    running: Arc<Mutex<bool>>,
}
|
|
|
|
impl DeadLetterHandler {
    /// Create a new dead letter handler.
    ///
    /// `pool` is used to update execution rows; `consumer` should already be
    /// configured for the dead letter queue (see [`create_dlq_consumer_config`]).
    /// Currently infallible, but keeps a `Result` signature so setup work can
    /// be added later without breaking callers.
    pub async fn new(pool: Arc<PgPool>, consumer: Consumer) -> Result<Self, Error> {
        Ok(Self {
            pool,
            consumer,
            running: Arc::new(Mutex::new(false)),
        })
    }

    /// Start the dead letter handler.
    ///
    /// Runs the consume loop to completion, so callers typically spawn this on
    /// its own task. If the handler is already running, logs a warning and
    /// returns `Ok(())` without starting a second loop.
    pub async fn start(&self) -> Result<(), Error> {
        info!(
            "Starting dead letter handler for queue '{}'",
            self.consumer.queue()
        );

        // Set the running flag inside a short scope so the lock is released
        // before the long-lived consume loop begins.
        {
            let mut running = self.running.lock().await;
            if *running {
                warn!("Dead letter handler already running");
                return Ok(());
            }
            *running = true;
        }

        // Clone the Arcs so the handler closure can own its own copies.
        let pool = Arc::clone(&self.pool);
        let running = Arc::clone(&self.running);

        // Start consuming messages; this future resolves only when the
        // consumer stops (shutdown or error).
        let consumer_result = self
            .consumer
            .consume_with_handler(move |envelope: MessageEnvelope<serde_json::Value>| {
                // Per-message clones: the closure may be invoked many times
                // and each invocation's async block needs owned handles.
                let pool = Arc::clone(&pool);
                let running = Arc::clone(&running);

                async move {
                    // Check if we should continue processing. Returning an
                    // error rejects the message rather than acking it, so it
                    // is not silently dropped during shutdown.
                    {
                        let is_running = running.lock().await;
                        if !*is_running {
                            info!("Dead letter handler stopping, rejecting message");
                            return Err(attune_common::mq::MqError::Consume(
                                "Handler is shutting down".to_string(),
                            ));
                        }
                    }

                    info!(
                        "Processing dead letter message {} of type {:?}",
                        envelope.message_id, envelope.message_type
                    );

                    // Only ExecutionRequested messages are expected in the DLQ;
                    // anything else is logged and acked so it doesn't loop.
                    match envelope.message_type {
                        MessageType::ExecutionRequested => {
                            handle_execution_requested(&pool, &envelope).await
                        }
                        _ => {
                            warn!(
                                "Received unexpected message type {:?} in DLQ: {}",
                                envelope.message_type, envelope.message_id
                            );
                            // Acknowledge unexpected messages to remove them from queue
                            Ok(())
                        }
                    }
                }
            })
            .await;

        // The consume loop has exited; clear the flag so start() can be
        // called again.
        {
            let mut running = self.running.lock().await;
            *running = false;
        }

        consumer_result.map_err(|e| {
            error!("Dead letter handler error: {}", e);
            Error::Internal(format!("Dead letter handler failed: {}", e))
        })
    }

    /// Stop the dead letter handler.
    ///
    /// Clears the running flag; the consume closure observes it and starts
    /// rejecting messages, which ends the loop driven by [`Self::start`].
    #[allow(dead_code)]
    pub async fn stop(&self) {
        info!("Stopping dead letter handler");
        let mut running = self.running.lock().await;
        *running = false;
    }

    /// Check if the handler is running.
    #[allow(dead_code)]
    pub async fn is_running(&self) -> bool {
        *self.running.lock().await
    }
}
|
|
|
|
/// Handle an execution request that expired in a worker queue
|
|
async fn handle_execution_requested(
|
|
pool: &PgPool,
|
|
envelope: &MessageEnvelope<serde_json::Value>,
|
|
) -> MqResult<()> {
|
|
debug!(
|
|
"Handling expired ExecutionRequested message: {}",
|
|
envelope.message_id
|
|
);
|
|
|
|
// Extract execution ID from payload
|
|
let execution_id = match envelope.payload.get("execution_id") {
|
|
Some(id) => match id.as_i64() {
|
|
Some(id) => id,
|
|
None => {
|
|
error!("Invalid execution_id in payload: not an i64");
|
|
return Ok(()); // Acknowledge to remove from queue
|
|
}
|
|
},
|
|
None => {
|
|
error!("Missing execution_id in ExecutionRequested payload");
|
|
return Ok(()); // Acknowledge to remove from queue
|
|
}
|
|
};
|
|
|
|
info!(
|
|
"Failing execution {} due to worker queue expiration",
|
|
execution_id
|
|
);
|
|
|
|
// Fetch current execution state
|
|
let execution = match ExecutionRepository::find_by_id(pool, execution_id).await {
|
|
Ok(Some(exec)) => exec,
|
|
Ok(None) => {
|
|
warn!(
|
|
"Execution {} not found in database, may have been already processed",
|
|
execution_id
|
|
);
|
|
return Ok(()); // Acknowledge to remove from queue
|
|
}
|
|
Err(e) => {
|
|
error!("Failed to fetch execution {}: {}", execution_id, e);
|
|
// Return error to nack and potentially retry
|
|
return Err(attune_common::mq::MqError::Consume(format!(
|
|
"Database error: {}",
|
|
e
|
|
)));
|
|
}
|
|
};
|
|
|
|
// Only fail if still in a non-terminal state
|
|
if !matches!(
|
|
execution.status,
|
|
ExecutionStatus::Scheduled | ExecutionStatus::Running
|
|
) {
|
|
info!(
|
|
"Execution {} already in terminal state {:?}, skipping",
|
|
execution_id, execution.status
|
|
);
|
|
return Ok(()); // Acknowledge to remove from queue
|
|
}
|
|
|
|
// Get worker info from payload for better error message
|
|
let worker_id = envelope.payload.get("worker_id").and_then(|v| v.as_i64());
|
|
|
|
let error_message = if let Some(wid) = worker_id {
|
|
format!(
|
|
"Execution expired in worker queue (worker_id: {}). Worker did not process the execution within the configured TTL. This typically indicates the worker is unavailable or overloaded.",
|
|
wid
|
|
)
|
|
} else {
|
|
"Execution expired in worker queue. Worker did not process the execution within the configured TTL.".to_string()
|
|
};
|
|
|
|
// Update execution to failed
|
|
let update_input = UpdateExecutionInput {
|
|
status: Some(ExecutionStatus::Failed),
|
|
result: Some(json!({
|
|
"error": "Worker queue TTL expired",
|
|
"message": error_message,
|
|
"expired_at": Utc::now().to_rfc3339(),
|
|
})),
|
|
..Default::default()
|
|
};
|
|
|
|
match ExecutionRepository::update(pool, execution_id, update_input).await {
|
|
Ok(_) => {
|
|
info!(
|
|
"Successfully failed execution {} due to worker queue expiration",
|
|
execution_id
|
|
);
|
|
Ok(())
|
|
}
|
|
Err(e) => {
|
|
error!(
|
|
"Failed to update execution {} to failed state: {}",
|
|
execution_id, e
|
|
);
|
|
// Return error to nack and potentially retry
|
|
Err(attune_common::mq::MqError::Consume(format!(
|
|
"Failed to update execution: {}",
|
|
e
|
|
)))
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Create a dead letter consumer configuration
|
|
pub fn create_dlq_consumer_config(dlq_name: &str, consumer_tag: &str) -> ConsumerConfig {
|
|
ConsumerConfig {
|
|
queue: dlq_name.to_string(),
|
|
tag: consumer_tag.to_string(),
|
|
prefetch_count: 10,
|
|
auto_ack: false, // Manual ack for reliability
|
|
exclusive: false,
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// The DLQ consumer config should carry the supplied queue/tag names and
    /// the manual-ack, non-exclusive defaults with a prefetch of 10.
    #[test]
    fn test_create_dlq_consumer_config() {
        let cfg = create_dlq_consumer_config("attune.dlx.queue", "dlq-handler");

        assert_eq!(cfg.queue, "attune.dlx.queue");
        assert_eq!(cfg.tag, "dlq-handler");
        assert_eq!(cfg.prefetch_count, 10);
        assert!(!cfg.auto_ack);
        assert!(!cfg.exclusive);
    }
}
|