ha executor

2026-04-02 17:15:59 -05:00
parent 8e91440f23
commit f93e9229d2
25 changed files with 2736 additions and 422 deletions
--- a/crates/executor/src/completion_listener.rs
+++ b/crates/executor/src/completion_listener.rs
@@ -157,7 +157,11 @@ impl CompletionListener {
                        "Failed to advance workflow for execution {}: {}",
                        execution_id, e
                    );
-                    // Continue processing — don't fail the entire completion
+                    if let Some(mq_err) = Self::retryable_mq_error(&e) {
+                        return Err(mq_err.into());
+                    }
+                    // Non-retryable workflow advancement errors are logged but
+                    // do not fail the entire completion processing path.
                }
            }

--- a/crates/executor/src/dead_letter_handler.rs
+++ b/crates/executor/src/dead_letter_handler.rs
@@ -14,7 +14,7 @@ use attune_common::{
    error::Error,
    models::ExecutionStatus,
    mq::{Consumer, ConsumerConfig, MessageEnvelope, MessageType, MqResult},
-    repositories::{execution::UpdateExecutionInput, ExecutionRepository, FindById, Update},
+    repositories::{execution::UpdateExecutionInput, ExecutionRepository, FindById},
 };
 use chrono::Utc;
 use serde_json::json;
@@ -179,13 +179,12 @@ async fn handle_execution_requested(
        }
    };

-    // Only fail if still in a non-terminal state
-    if !matches!(
-        execution.status,
-        ExecutionStatus::Scheduled | ExecutionStatus::Running
-    ) {
+    // Only scheduled executions are still legitimately owned by the scheduler.
+    // If the execution already moved to running or a terminal state, this DLQ
+    // delivery is stale and must not overwrite newer state.
+    if execution.status != ExecutionStatus::Scheduled {
        info!(
-            "Execution {} already in terminal state {:?}, skipping",
+            "Execution {} already left Scheduled state ({:?}), skipping stale DLQ handling",
            execution_id, execution.status
        );
        return Ok(()); // Acknowledge to remove from queue
@@ -193,6 +192,12 @@ async fn handle_execution_requested(

    // Get worker info from payload for better error message
    let worker_id = envelope.payload.get("worker_id").and_then(|v| v.as_i64());
+    let scheduled_attempt_updated_at = envelope
+        .payload
+        .get("scheduled_attempt_updated_at")
+        .and_then(|v| v.as_str())
+        .and_then(|s| chrono::DateTime::parse_from_rfc3339(s).ok())
+        .map(|dt| dt.with_timezone(&Utc));

    let error_message = if let Some(wid) = worker_id {
        format!(
@@ -214,24 +219,85 @@ async fn handle_execution_requested(
        ..Default::default()
    };

-    match ExecutionRepository::update(pool, execution_id, update_input).await {
-        Ok(_) => {
-            info!(
-                "Successfully failed execution {} due to worker queue expiration",
-                execution_id
-            );
-            Ok(())
+    if let Some(timestamp) = scheduled_attempt_updated_at {
+        // Guard on both status and the exact updated_at from when the execution was
+        // scheduled — prevents overwriting state that changed after this DLQ message
+        // was enqueued.
+        match ExecutionRepository::update_if_status_and_updated_at(
+            pool,
+            execution_id,
+            ExecutionStatus::Scheduled,
+            timestamp,
+            update_input,
+        )
+        .await
+        {
+            Ok(Some(_)) => {
+                info!(
+                    "Successfully failed execution {} due to worker queue expiration",
+                    execution_id
+                );
+                Ok(())
+            }
+            Ok(None) => {
+                info!(
+                    "Skipping DLQ failure for execution {} because it already left Scheduled state",
+                    execution_id
+                );
+                Ok(())
+            }
+            Err(e) => {
+                error!(
+                    "Failed to update execution {} to failed state: {}",
+                    execution_id, e
+                );
+                Err(attune_common::mq::MqError::Consume(format!(
+                    "Failed to update execution: {}",
+                    e
+                )))
+            }
        }
-        Err(e) => {
-            error!(
-                "Failed to update execution {} to failed state: {}",
-                execution_id, e
-            );
-            // Return error to nack and potentially retry
-            Err(attune_common::mq::MqError::Consume(format!(
-                "Failed to update execution: {}",
-                e
-            )))
+    } else {
+        // Fallback for DLQ messages that predate the scheduled_attempt_updated_at
+        // field. Use a status-only guard — same safety guarantee as the original code
+        // (never overwrites terminal or running state).
+        warn!(
+            "DLQ message for execution {} lacks scheduled_attempt_updated_at; \
+             falling back to status-only guard",
+            execution_id
+        );
+        match ExecutionRepository::update_if_status(
+            pool,
+            execution_id,
+            ExecutionStatus::Scheduled,
+            update_input,
+        )
+        .await
+        {
+            Ok(Some(_)) => {
+                info!(
+                    "Successfully failed execution {} due to worker queue expiration (status-only guard)",
+                    execution_id
+                );
+                Ok(())
+            }
+            Ok(None) => {
+                info!(
+                    "Skipping DLQ failure for execution {} because it already left Scheduled state",
+                    execution_id
+                );
+                Ok(())
+            }
+            Err(e) => {
+                error!(
+                    "Failed to update execution {} to failed state: {}",
+                    execution_id, e
+                );
+                Err(attune_common::mq::MqError::Consume(format!(
+                    "Failed to update execution: {}",
+                    e
+                )))
+            }
        }
    }
 }
--- a/crates/executor/src/enforcement_processor.rs
+++ b/crates/executor/src/enforcement_processor.rs
@@ -19,7 +19,7 @@ use attune_common::{
        event::{EnforcementRepository, EventRepository, UpdateEnforcementInput},
        execution::{CreateExecutionInput, ExecutionRepository},
        rule::RuleRepository,
-        Create, FindById,
+        FindById,
    },
 };

@@ -116,6 +116,14 @@ impl EnforcementProcessor {
            .await?
            .ok_or_else(|| anyhow::anyhow!("Enforcement not found: {}", enforcement_id))?;

+        if enforcement.status != EnforcementStatus::Created {
+            debug!(
+                "Enforcement {} already left Created state ({:?}), skipping duplicate processing",
+                enforcement_id, enforcement.status
+            );
+            return Ok(());
+        }
+
        // Fetch associated rule
        let rule = RuleRepository::find_by_id(
            pool,
@@ -135,7 +143,7 @@ impl EnforcementProcessor {

        // Evaluate whether to create execution
        if Self::should_create_execution(&enforcement, &rule, event.as_ref())? {
-            Self::create_execution(
+            let execution_created = Self::create_execution(
                pool,
                publisher,
                policy_enforcer,
@@ -145,10 +153,10 @@ impl EnforcementProcessor {
            )
            .await?;

-            // Update enforcement status to Processed after successful execution creation
-            EnforcementRepository::update_loaded(
+            let updated = EnforcementRepository::update_loaded_if_status(
                pool,
                &enforcement,
+                EnforcementStatus::Created,
                UpdateEnforcementInput {
                    status: Some(EnforcementStatus::Processed),
                    payload: None,
@@ -157,17 +165,27 @@ impl EnforcementProcessor {
            )
            .await?;

-            debug!("Updated enforcement {} status to Processed", enforcement_id);
+            if updated.is_some() {
+                debug!(
+                    "Updated enforcement {} status to Processed after {} execution path",
+                    enforcement_id,
+                    if execution_created {
+                        "new"
+                    } else {
+                        "idempotent"
+                    }
+                );
+            }
        } else {
            info!(
                "Skipping execution creation for enforcement: {}",
                enforcement_id
            );

-            // Update enforcement status to Disabled since it was not actionable
-            EnforcementRepository::update_loaded(
+            let updated = EnforcementRepository::update_loaded_if_status(
                pool,
                &enforcement,
+                EnforcementStatus::Created,
                UpdateEnforcementInput {
                    status: Some(EnforcementStatus::Disabled),
                    payload: None,
@@ -176,10 +194,12 @@ impl EnforcementProcessor {
            )
            .await?;

-            debug!(
-                "Updated enforcement {} status to Disabled (skipped)",
-                enforcement_id
-            );
+            if updated.is_some() {
+                debug!(
+                    "Updated enforcement {} status to Disabled (skipped)",
+                    enforcement_id
+                );
+            }
        }

        Ok(())
@@ -234,7 +254,7 @@ impl EnforcementProcessor {
        _queue_manager: &ExecutionQueueManager,
        enforcement: &Enforcement,
        rule: &Rule,
-    ) -> Result<()> {
+    ) -> Result<bool> {
        // Extract action ID — should_create_execution already verified it's Some,
        // but guard defensively here as well.
        let action_id = match rule.action {
@@ -275,44 +295,60 @@ impl EnforcementProcessor {
            workflow_task: None, // Non-workflow execution
        };

-        let execution = ExecutionRepository::create(pool, execution_input).await?;
+        let execution_result = ExecutionRepository::create_top_level_for_enforcement_if_absent(
+            pool,
+            execution_input,
+            enforcement.id,
+        )
+        .await?;
+        let execution = execution_result.execution;

-        info!(
-            "Created execution: {} for enforcement: {}",
-            execution.id, enforcement.id
-        );
+        if execution_result.created {
+            info!(
+                "Created execution: {} for enforcement: {}",
+                execution.id, enforcement.id
+            );
+        } else {
+            info!(
+                "Reusing execution: {} for enforcement: {}",
+                execution.id, enforcement.id
+            );
+        }

-        // Publish ExecutionRequested message
-        let payload = ExecutionRequestedPayload {
-            execution_id: execution.id,
-            action_id: Some(action_id),
-            action_ref: action_ref.clone(),
-            parent_id: None,
-            enforcement_id: Some(enforcement.id),
-            config: enforcement.config.clone(),
-        };
+        if execution_result.created
+            || execution.status == attune_common::models::enums::ExecutionStatus::Requested
+        {
+            let payload = ExecutionRequestedPayload {
+                execution_id: execution.id,
+                action_id: Some(action_id),
+                action_ref: action_ref.clone(),
+                parent_id: None,
+                enforcement_id: Some(enforcement.id),
+                config: execution.config.clone(),
+            };

-        let envelope =
-            MessageEnvelope::new(attune_common::mq::MessageType::ExecutionRequested, payload)
-                .with_source("executor");
+            let envelope =
+                MessageEnvelope::new(attune_common::mq::MessageType::ExecutionRequested, payload)
+                    .with_source("executor");

-        // Publish to execution requests queue with routing key
-        let routing_key = "execution.requested";
-        let exchange = "attune.executions";
+            // Publish to execution requests queue with routing key
+            let routing_key = "execution.requested";
+            let exchange = "attune.executions";

-        publisher
-            .publish_envelope_with_routing(&envelope, exchange, routing_key)
-            .await?;
+            publisher
+                .publish_envelope_with_routing(&envelope, exchange, routing_key)
+                .await?;

-        info!(
-            "Published execution.requested message for execution: {} (enforcement: {}, action: {})",
-            execution.id, enforcement.id, action_id
-        );
+            info!(
+                "Published execution.requested message for execution: {} (enforcement: {}, action: {})",
+                execution.id, enforcement.id, action_id
+            );
+        }

        // NOTE: Queue slot will be released when worker publishes execution.completed
        // and CompletionListener calls queue_manager.notify_completion(action_id)

-        Ok(())
+        Ok(execution_result.created)
    }
 }

--- a/crates/executor/src/event_processor.rs
+++ b/crates/executor/src/event_processor.rs
@@ -19,7 +19,7 @@ use attune_common::{
        event::{CreateEnforcementInput, EnforcementRepository, EventRepository},
        pack::PackRepository,
        rule::RuleRepository,
-        Create, FindById, List,
+        FindById, List,
    },
    template_resolver::{resolve_templates, TemplateContext},
 };
@@ -206,32 +206,43 @@ impl EventProcessor {
            conditions: rule.conditions.clone(),
        };

-        let enforcement = EnforcementRepository::create(pool, create_input).await?;
+        let enforcement_result =
+            EnforcementRepository::create_or_get_by_rule_event(pool, create_input).await?;
+        let enforcement = enforcement_result.enforcement;

-        info!(
-            "Enforcement {} created for rule {} (event: {})",
-            enforcement.id, rule.r#ref, event.id
-        );
+        if enforcement_result.created {
+            info!(
+                "Enforcement {} created for rule {} (event: {})",
+                enforcement.id, rule.r#ref, event.id
+            );
+        } else {
+            info!(
+                "Reusing enforcement {} for rule {} (event: {})",
+                enforcement.id, rule.r#ref, event.id
+            );
+        }

-        // Publish EnforcementCreated message
-        let enforcement_payload = EnforcementCreatedPayload {
-            enforcement_id: enforcement.id,
-            rule_id: Some(rule.id),
-            rule_ref: rule.r#ref.clone(),
-            event_id: Some(event.id),
-            trigger_ref: event.trigger_ref.clone(),
-            payload: payload.clone(),
-        };
+        if enforcement_result.created || enforcement.status == EnforcementStatus::Created {
+            let enforcement_payload = EnforcementCreatedPayload {
+                enforcement_id: enforcement.id,
+                rule_id: Some(rule.id),
+                rule_ref: rule.r#ref.clone(),
+                event_id: Some(event.id),
+                trigger_ref: event.trigger_ref.clone(),
+                payload: payload.clone(),
+            };

-        let envelope = MessageEnvelope::new(MessageType::EnforcementCreated, enforcement_payload)
-            .with_source("event-processor");
+            let envelope =
+                MessageEnvelope::new(MessageType::EnforcementCreated, enforcement_payload)
+                    .with_source("event-processor");

-        publisher.publish_envelope(&envelope).await?;
+            publisher.publish_envelope(&envelope).await?;

-        debug!(
-            "Published EnforcementCreated message for enforcement {}",
-            enforcement.id
-        );
+            debug!(
+                "Published EnforcementCreated message for enforcement {}",
+                enforcement.id
+            );
+        }

        Ok(())
    }
--- a/crates/executor/src/inquiry_handler.rs
+++ b/crates/executor/src/inquiry_handler.rs
@@ -9,13 +9,14 @@

 use anyhow::Result;
 use attune_common::{
+    error::Error as AttuneError,
    models::{enums::InquiryStatus, inquiry::Inquiry, Execution, Id},
    mq::{
        Consumer, InquiryCreatedPayload, InquiryRespondedPayload, MessageEnvelope, MessageType,
        Publisher,
    },
    repositories::{
-        execution::{ExecutionRepository, UpdateExecutionInput},
+        execution::{ExecutionRepository, UpdateExecutionInput, SELECT_COLUMNS},
        inquiry::{CreateInquiryInput, InquiryRepository},
        Create, FindById, Update,
    },
@@ -28,6 +29,8 @@ use tracing::{debug, error, info, warn};

 /// Special key in action result to indicate an inquiry should be created
 pub const INQUIRY_RESULT_KEY: &str = "__inquiry";
+const INQUIRY_ID_RESULT_KEY: &str = "__inquiry_id";
+const INQUIRY_CREATED_PUBLISHED_RESULT_KEY: &str = "__inquiry_created_published";

 /// Structure for inquiry data in action results
 #[derive(Debug, Clone, serde::Deserialize)]
@@ -104,69 +107,198 @@ impl InquiryHandler {
        let inquiry_request: InquiryRequest = serde_json::from_value(inquiry_value.clone())?;
        Ok(inquiry_request)
    }
+}

+/// Returns true when `e` represents a PostgreSQL unique constraint violation (code 23505).
+fn is_db_unique_violation(e: &AttuneError) -> bool {
+    if let AttuneError::Database(sqlx_err) = e {
+        return sqlx_err
+            .as_database_error()
+            .and_then(|db| db.code())
+            .as_deref()
+            == Some("23505");
+    }
+    false
+}
+
+impl InquiryHandler {
    /// Create an inquiry for an execution and pause it
    pub async fn create_inquiry_from_result(
        pool: &PgPool,
        publisher: &Publisher,
        execution_id: Id,
-        result: &JsonValue,
+        _result: &JsonValue,
    ) -> Result<Inquiry> {
        info!("Creating inquiry for execution {}", execution_id);

-        // Extract inquiry request
-        let inquiry_request = Self::extract_inquiry_request(result)?;
+        let mut tx = pool.begin().await?;
+        let execution = sqlx::query_as::<_, Execution>(&format!(
+            "SELECT {SELECT_COLUMNS} FROM execution WHERE id = $1 FOR UPDATE"
+        ))
+        .bind(execution_id)
+        .fetch_one(&mut *tx)
+        .await?;

-        // Calculate timeout if specified
+        let mut result = execution
+            .result
+            .clone()
+            .ok_or_else(|| anyhow::anyhow!("Execution {} has no result", execution_id))?;
+        let inquiry_request = Self::extract_inquiry_request(&result)?;
        let timeout_at = inquiry_request
            .timeout_seconds
            .map(|seconds| Utc::now() + chrono::Duration::seconds(seconds));

-        // Create inquiry in database
-        let inquiry_input = CreateInquiryInput {
-            execution: execution_id,
-            prompt: inquiry_request.prompt.clone(),
-            response_schema: inquiry_request.response_schema.clone(),
-            assigned_to: inquiry_request.assigned_to,
-            status: InquiryStatus::Pending,
-            response: None,
-            timeout_at,
+        let existing_inquiry_id = result
+            .get(INQUIRY_ID_RESULT_KEY)
+            .and_then(|value| value.as_i64());
+        let published = result
+            .get(INQUIRY_CREATED_PUBLISHED_RESULT_KEY)
+            .and_then(|value| value.as_bool())
+            .unwrap_or(false);
+
+        let (inquiry, should_publish) = if let Some(inquiry_id) = existing_inquiry_id {
+            let inquiry = InquiryRepository::find_by_id(&mut *tx, inquiry_id)
+                .await?
+                .ok_or_else(|| {
+                    anyhow::anyhow!(
+                        "Inquiry {} referenced by execution {} result not found",
+                        inquiry_id,
+                        execution_id
+                    )
+                })?;
+            let should_publish = !published && inquiry.status == InquiryStatus::Pending;
+            (inquiry, should_publish)
+        } else {
+            let create_result = InquiryRepository::create(
+                &mut *tx,
+                CreateInquiryInput {
+                    execution: execution_id,
+                    prompt: inquiry_request.prompt.clone(),
+                    response_schema: inquiry_request.response_schema.clone(),
+                    assigned_to: inquiry_request.assigned_to,
+                    status: InquiryStatus::Pending,
+                    response: None,
+                    timeout_at,
+                },
+            )
+            .await;
+
+            let inquiry = match create_result {
+                Ok(inq) => inq,
+                Err(e) => {
+                    // Unique constraint violation (23505): another replica already
+                    // created the inquiry for this execution. Treat as idempotent
+                    // success — drop the aborted transaction and return the existing row.
+                    if is_db_unique_violation(&e) {
+                        info!(
+                            "Inquiry for execution {} already created by another replica \
+                             (unique constraint 23505); treating as idempotent",
+                            execution_id
+                        );
+                        // tx is in an aborted state; dropping it issues ROLLBACK.
+                        drop(tx);
+                        let inquiries =
+                            InquiryRepository::find_by_execution(pool, execution_id).await?;
+                        let existing = inquiries.into_iter().next().ok_or_else(|| {
+                            anyhow::anyhow!(
+                                "Inquiry for execution {} not found after unique constraint violation",
+                                execution_id
+                            )
+                        })?;
+                        return Ok(existing);
+                    }
+                    return Err(e.into());
+                }
+            };
+
+            Self::set_inquiry_result_metadata(&mut result, inquiry.id, false)?;
+            ExecutionRepository::update(
+                &mut *tx,
+                execution_id,
+                UpdateExecutionInput {
+                    result: Some(result),
+                    ..Default::default()
+                },
+            )
+            .await?;
+
+            (inquiry, true)
        };

-        let inquiry = InquiryRepository::create(pool, inquiry_input).await?;
+        tx.commit().await?;

-        info!(
-            "Created inquiry {} for execution {}",
-            inquiry.id, execution_id
-        );
+        if should_publish {
+            let payload = InquiryCreatedPayload {
+                inquiry_id: inquiry.id,
+                execution_id,
+                prompt: inquiry_request.prompt,
+                response_schema: inquiry_request.response_schema,
+                assigned_to: inquiry_request.assigned_to,
+                timeout_at,
+            };

-        // Update execution status to paused/waiting
-        // Note: We use a special status or keep it as "running" with inquiry tracking
-        // For now, we'll keep status as-is and track via inquiry relationship
+            let envelope =
+                MessageEnvelope::new(MessageType::InquiryCreated, payload).with_source("executor");

-        // Publish InquiryCreated message
-        let payload = InquiryCreatedPayload {
-            inquiry_id: inquiry.id,
-            execution_id,
-            prompt: inquiry_request.prompt,
-            response_schema: inquiry_request.response_schema,
-            assigned_to: inquiry_request.assigned_to,
-            timeout_at,
-        };
+            publisher.publish_envelope(&envelope).await?;
+            Self::mark_inquiry_created_published(pool, execution_id).await?;

-        let envelope =
-            MessageEnvelope::new(MessageType::InquiryCreated, payload).with_source("executor");
-
-        publisher.publish_envelope(&envelope).await?;
-
-        debug!(
-            "Published InquiryCreated message for inquiry {}",
-            inquiry.id
-        );
+            debug!(
+                "Published InquiryCreated message for inquiry {}",
+                inquiry.id
+            );
+        }

        Ok(inquiry)
    }

+    fn set_inquiry_result_metadata(
+        result: &mut JsonValue,
+        inquiry_id: Id,
+        published: bool,
+    ) -> Result<()> {
+        let obj = result
+            .as_object_mut()
+            .ok_or_else(|| anyhow::anyhow!("execution result is not a JSON object"))?;
+
+        obj.insert(
+            INQUIRY_ID_RESULT_KEY.to_string(),
+            JsonValue::Number(inquiry_id.into()),
+        );
+        obj.insert(
+            INQUIRY_CREATED_PUBLISHED_RESULT_KEY.to_string(),
+            JsonValue::Bool(published),
+        );
+        Ok(())
+    }
+
+    async fn mark_inquiry_created_published(pool: &PgPool, execution_id: Id) -> Result<()> {
+        let execution = ExecutionRepository::find_by_id(pool, execution_id)
+            .await?
+            .ok_or_else(|| anyhow::anyhow!("Execution {} not found", execution_id))?;
+        let mut result = execution
+            .result
+            .clone()
+            .ok_or_else(|| anyhow::anyhow!("Execution {} has no result", execution_id))?;
+        let inquiry_id = result
+            .get(INQUIRY_ID_RESULT_KEY)
+            .and_then(|value| value.as_i64())
+            .ok_or_else(|| anyhow::anyhow!("Execution {} missing __inquiry_id", execution_id))?;
+
+        Self::set_inquiry_result_metadata(&mut result, inquiry_id, true)?;
+        ExecutionRepository::update(
+            pool,
+            execution_id,
+            UpdateExecutionInput {
+                result: Some(result),
+                ..Default::default()
+            },
+        )
+        .await?;
+
+        Ok(())
+    }
+
    /// Handle an inquiry response message
    async fn handle_inquiry_response(
        pool: &PgPool,
@@ -235,9 +367,13 @@ impl InquiryHandler {
        if let Some(obj) = updated_result.as_object_mut() {
            obj.insert("__inquiry_response".to_string(), response.clone());
            obj.insert(
-                "__inquiry_id".to_string(),
+                INQUIRY_ID_RESULT_KEY.to_string(),
                JsonValue::Number(inquiry.id.into()),
            );
+            obj.insert(
+                INQUIRY_CREATED_PUBLISHED_RESULT_KEY.to_string(),
+                JsonValue::Bool(true),
+            );
        }

        // Update execution with new result
--- a/crates/executor/src/policy_enforcer.rs
+++ b/crates/executor/src/policy_enforcer.rs
@@ -933,8 +933,8 @@ mod tests {
        assert_eq!(enforcer.get_concurrency_limit(2, Some(200)), Some(20));
    }

-    #[test]
-    fn test_build_parameter_group_key_uses_exact_values() {
+    #[tokio::test]
+    async fn test_build_parameter_group_key_uses_exact_values() {
        let pool = sqlx::PgPool::connect_lazy("postgresql://localhost/test").unwrap();
        let enforcer = PolicyEnforcer::new(pool);
        let config = serde_json::json!({
--- a/crates/executor/src/queue_manager.rs
+++ b/crates/executor/src/queue_manager.rs
@@ -23,7 +23,13 @@ use tokio::time::{sleep, Duration, Instant};
 use tracing::{debug, info, warn};

 use attune_common::models::Id;
-use attune_common::repositories::queue_stats::{QueueStatsRepository, UpsertQueueStatsInput};
+use attune_common::repositories::{
+    execution_admission::{
+        AdmissionEnqueueOutcome, AdmissionQueueStats, AdmissionQueuedRemovalOutcome,
+        AdmissionSlotReleaseOutcome, ExecutionAdmissionRepository,
+    },
+    queue_stats::{QueueStatsRepository, UpsertQueueStatsInput},
+};

 /// Configuration for the queue manager
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -51,6 +57,8 @@ impl Default for QueueConfig {
 struct QueueEntry {
    /// Execution or enforcement ID being queued
    execution_id: Id,
+    /// Durable FIFO position for the DB-backed admission path.
+    queue_order: Option<i64>,
    /// When this entry was added to the queue
    enqueued_at: DateTime<Utc>,
 }
@@ -224,6 +232,12 @@ impl ExecutionQueueManager {
        max_concurrent: u32,
        group_key: Option<String>,
    ) -> Result<()> {
+        if self.db_pool.is_some() {
+            return self
+                .enqueue_and_wait_db(action_id, execution_id, max_concurrent, group_key)
+                .await;
+        }
+
        if self.active_execution_keys.contains_key(&execution_id) {
            debug!(
                "Execution {} already owns an active slot, skipping queue wait",
@@ -311,6 +325,7 @@ impl ExecutionQueueManager {
            // Add to queue
            let entry = QueueEntry {
                execution_id,
+                queue_order: None,
                enqueued_at: Utc::now(),
            };

@@ -392,6 +407,24 @@ impl ExecutionQueueManager {
        max_concurrent: u32,
        group_key: Option<String>,
    ) -> Result<SlotEnqueueOutcome> {
+        if let Some(pool) = &self.db_pool {
+            return Ok(
+                match ExecutionAdmissionRepository::enqueue(
+                    pool,
+                    self.config.max_queue_length,
+                    action_id,
+                    execution_id,
+                    max_concurrent,
+                    group_key,
+                )
+                .await?
+                {
+                    AdmissionEnqueueOutcome::Acquired => SlotEnqueueOutcome::Acquired,
+                    AdmissionEnqueueOutcome::Enqueued => SlotEnqueueOutcome::Enqueued,
+                },
+            );
+        }
+
        if self.active_execution_keys.contains_key(&execution_id) {
            debug!(
                "Execution {} already owns an active slot, treating as acquired",
@@ -463,6 +496,7 @@ impl ExecutionQueueManager {

            queue.queue.push_back(QueueEntry {
                execution_id,
+                queue_order: None,
                enqueued_at: Utc::now(),
            });
            queue.total_enqueued += 1;
@@ -480,6 +514,21 @@ impl ExecutionQueueManager {
        max_concurrent: u32,
        group_key: Option<String>,
    ) -> Result<SlotAcquireOutcome> {
+        if let Some(pool) = &self.db_pool {
+            let outcome = ExecutionAdmissionRepository::try_acquire(
+                pool,
+                action_id,
+                execution_id,
+                max_concurrent,
+                group_key,
+            )
+            .await?;
+            return Ok(SlotAcquireOutcome {
+                acquired: outcome.acquired,
+                current_count: outcome.current_count,
+            });
+        }
+
        let queue_key = self.queue_key(action_id, group_key);
        let queue_arc = self
            .get_or_create_queue(queue_key.clone(), max_concurrent)
@@ -530,6 +579,14 @@ impl ExecutionQueueManager {
        &self,
        execution_id: Id,
    ) -> Result<Option<SlotReleaseOutcome>> {
+        if let Some(pool) = &self.db_pool {
+            return Ok(
+                ExecutionAdmissionRepository::release_active_slot(pool, execution_id)
+                    .await?
+                    .map(Self::map_release_outcome),
+            );
+        }
+
        let Some((_, queue_key)) = self.active_execution_keys.remove(&execution_id) else {
            debug!(
                "No active queue slot found for execution {} (queue may have been cleared)",
@@ -610,6 +667,16 @@ impl ExecutionQueueManager {
        execution_id: Id,
        outcome: &SlotReleaseOutcome,
    ) -> Result<()> {
+        if let Some(pool) = &self.db_pool {
+            ExecutionAdmissionRepository::restore_active_slot(
+                pool,
+                execution_id,
+                &Self::to_admission_release_outcome(outcome),
+            )
+            .await?;
+            return Ok(());
+        }
+
        let action_id = outcome.queue_key.action_id;
        let queue_arc = self.get_or_create_queue(outcome.queue_key.clone(), 1).await;
        let mut queue = queue_arc.lock().await;
@@ -630,6 +697,14 @@ impl ExecutionQueueManager {
        &self,
        execution_id: Id,
    ) -> Result<Option<QueuedRemovalOutcome>> {
+        if let Some(pool) = &self.db_pool {
+            return Ok(
+                ExecutionAdmissionRepository::remove_queued_execution(pool, execution_id)
+                    .await?
+                    .map(Self::map_removal_outcome),
+            );
+        }
+
        for entry in self.queues.iter() {
            let queue_key = entry.key().clone();
            let queue_arc = entry.value().clone();
@@ -666,6 +741,15 @@ impl ExecutionQueueManager {
    }

    pub async fn restore_queued_execution(&self, outcome: &QueuedRemovalOutcome) -> Result<()> {
+        if let Some(pool) = &self.db_pool {
+            ExecutionAdmissionRepository::restore_queued_execution(
+                pool,
+                &Self::to_admission_removal_outcome(outcome),
+            )
+            .await?;
+            return Ok(());
+        }
+
        let action_id = outcome.queue_key.action_id;
        let queue_arc = self.get_or_create_queue(outcome.queue_key.clone(), 1).await;
        let mut queue = queue_arc.lock().await;
@@ -709,6 +793,19 @@ impl ExecutionQueueManager {

    /// Get statistics for a specific action's queue
    pub async fn get_queue_stats(&self, action_id: Id) -> Option<QueueStats> {
+        if let Some(pool) = &self.db_pool {
+            return ExecutionAdmissionRepository::get_queue_stats(pool, action_id)
+                .await
+                .map(|stats| stats.map(Self::map_queue_stats))
+                .unwrap_or_else(|err| {
+                    warn!(
+                        "Failed to load shared queue stats for action {}: {}",
+                        action_id, err
+                    );
+                    None
+                });
+        }
+
        let queue_arcs: Vec<Arc<Mutex<ActionQueue>>> = self
            .queues
            .iter()
@@ -757,6 +854,26 @@ impl ExecutionQueueManager {
    /// Get statistics for all queues
    #[allow(dead_code)]
    pub async fn get_all_queue_stats(&self) -> Vec<QueueStats> {
+        if let Some(pool) = &self.db_pool {
+            return QueueStatsRepository::list_all(pool)
+                .await
+                .map(|stats| {
+                    stats
+                        .into_iter()
+                        .map(|stat| QueueStats {
+                            action_id: stat.action_id,
+                            queue_length: stat.queue_length as usize,
+                            active_count: stat.active_count as u32,
+                            max_concurrent: stat.max_concurrent as u32,
+                            oldest_enqueued_at: stat.oldest_enqueued_at,
+                            total_enqueued: stat.total_enqueued as u64,
+                            total_completed: stat.total_completed as u64,
+                        })
+                        .collect()
+                })
+                .unwrap_or_default();
+        }
+
        let mut stats = Vec::new();

        let mut action_ids = std::collections::BTreeSet::new();
@@ -787,6 +904,14 @@ impl ExecutionQueueManager {
    /// * `Ok(false)` - Execution not found in queue
    #[allow(dead_code)]
    pub async fn cancel_execution(&self, action_id: Id, execution_id: Id) -> Result<bool> {
+        if let Some(pool) = &self.db_pool {
+            return Ok(
+                ExecutionAdmissionRepository::remove_queued_execution(pool, execution_id)
+                    .await?
+                    .is_some(),
+            );
+        }
+
        debug!(
            "Attempting to cancel execution {} for action {}",
            execution_id, action_id
@@ -838,12 +963,147 @@ impl ExecutionQueueManager {
    /// Get the number of actions with active queues
    #[allow(dead_code)]
    pub fn active_queue_count(&self) -> usize {
+        if self.db_pool.is_some() {
+            return 0;
+        }
+
        self.queues
            .iter()
            .map(|entry| entry.key().action_id)
            .collect::<std::collections::BTreeSet<_>>()
            .len()
    }
+
+    async fn enqueue_and_wait_db(
+        &self,
+        action_id: Id,
+        execution_id: Id,
+        max_concurrent: u32,
+        group_key: Option<String>,
+    ) -> Result<()> {
+        let pool = self
+            .db_pool
+            .as_ref()
+            .ok_or_else(|| anyhow::anyhow!("database pool required for shared admission"))?;
+
+        match ExecutionAdmissionRepository::enqueue(
+            pool,
+            self.config.max_queue_length,
+            action_id,
+            execution_id,
+            max_concurrent,
+            group_key.clone(),
+        )
+        .await?
+        {
+            AdmissionEnqueueOutcome::Acquired => return Ok(()),
+            AdmissionEnqueueOutcome::Enqueued => {}
+        }
+
+        let deadline = Instant::now() + Duration::from_secs(self.config.queue_timeout_seconds);
+        loop {
+            sleep(Duration::from_millis(10)).await;
+
+            match ExecutionAdmissionRepository::wait_status(pool, execution_id).await? {
+                Some(true) => return Ok(()),
+                Some(false) => {}
+                None => {
+                    return Err(anyhow::anyhow!(
+                        "Queue state for execution {} disappeared while waiting",
+                        execution_id
+                    ));
+                }
+            }
+
+            if Instant::now() < deadline {
+                continue;
+            }
+
+            match ExecutionAdmissionRepository::remove_queued_execution(pool, execution_id).await? {
+                Some(_) => {
+                    return Err(anyhow::anyhow!(
+                        "Queue timeout for execution {}: waited {} seconds",
+                        execution_id,
+                        self.config.queue_timeout_seconds
+                    ));
+                }
+                None => {
+                    if matches!(
+                        ExecutionAdmissionRepository::wait_status(pool, execution_id).await?,
+                        Some(true)
+                    ) {
+                        return Ok(());
+                    }
+
+                    return Err(anyhow::anyhow!(
+                        "Queue timeout for execution {}: waited {} seconds",
+                        execution_id,
+                        self.config.queue_timeout_seconds
+                    ));
+                }
+            }
+        }
+    }
+
+    fn map_release_outcome(outcome: AdmissionSlotReleaseOutcome) -> SlotReleaseOutcome {
+        SlotReleaseOutcome {
+            next_execution_id: outcome.next_execution_id,
+            queue_key: QueueKey {
+                action_id: outcome.action_id,
+                group_key: outcome.group_key,
+            },
+        }
+    }
+
+    fn to_admission_release_outcome(outcome: &SlotReleaseOutcome) -> AdmissionSlotReleaseOutcome {
+        AdmissionSlotReleaseOutcome {
+            action_id: outcome.queue_key.action_id,
+            group_key: outcome.queue_key.group_key.clone(),
+            next_execution_id: outcome.next_execution_id,
+        }
+    }
+
+    fn map_removal_outcome(outcome: AdmissionQueuedRemovalOutcome) -> QueuedRemovalOutcome {
+        QueuedRemovalOutcome {
+            next_execution_id: outcome.next_execution_id,
+            queue_key: QueueKey {
+                action_id: outcome.action_id,
+                group_key: outcome.group_key,
+            },
+            removed_entry: QueueEntry {
+                execution_id: outcome.execution_id,
+                queue_order: Some(outcome.queue_order),
+                enqueued_at: outcome.enqueued_at,
+            },
+            removed_index: outcome.removed_index,
+        }
+    }
+
+    fn to_admission_removal_outcome(
+        outcome: &QueuedRemovalOutcome,
+    ) -> AdmissionQueuedRemovalOutcome {
+        AdmissionQueuedRemovalOutcome {
+            action_id: outcome.queue_key.action_id,
+            group_key: outcome.queue_key.group_key.clone(),
+            next_execution_id: outcome.next_execution_id,
+            execution_id: outcome.removed_entry.execution_id,
+            queue_order: outcome.removed_entry.queue_order.unwrap_or_default(),
+            enqueued_at: outcome.removed_entry.enqueued_at,
+            removed_index: outcome.removed_index,
+        }
+    }
+
+    fn map_queue_stats(stats: AdmissionQueueStats) -> QueueStats {
+        QueueStats {
+            action_id: stats.action_id,
+            queue_length: stats.queue_length,
+            active_count: stats.active_count,
+            max_concurrent: stats.max_concurrent,
+            oldest_enqueued_at: stats.oldest_enqueued_at,
+            total_enqueued: stats.total_enqueued,
+            total_completed: stats.total_completed,
+        }
+    }
 }

 #[cfg(test)]
--- a/crates/executor/src/scheduler.rs
+++ b/crates/executor/src/scheduler.rs
@@ -25,12 +25,12 @@ use attune_common::{
        workflow::{
            CreateWorkflowExecutionInput, WorkflowDefinitionRepository, WorkflowExecutionRepository,
        },
-        Create, FindById, FindByRef, Update,
+        FindById, FindByRef, Update,
    },
    runtime_detection::runtime_aliases_contain,
    workflow::WorkflowDefinition,
 };
-use chrono::Utc;
+use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize};
 use serde_json::Value as JsonValue;
 use sqlx::{PgConnection, PgPool};
@@ -102,6 +102,17 @@ struct ExecutionScheduledPayload {
    worker_id: i64,
    action_ref: String,
    config: Option<JsonValue>,
+    scheduled_attempt_updated_at: DateTime<Utc>,
+}
+
+#[derive(Debug, Clone)]
+struct PendingExecutionRequested {
+    execution_id: i64,
+    action_id: i64,
+    action_ref: String,
+    parent_id: i64,
+    enforcement_id: Option<i64>,
+    config: Option<JsonValue>,
 }

 /// Execution scheduler that routes executions to workers
@@ -509,6 +520,7 @@ impl ExecutionScheduler {
            &worker.id,
            &envelope.payload.action_ref,
            &execution_config,
+            scheduled_execution.updated,
            &action,
        )
        .await
@@ -1021,13 +1033,13 @@ impl ExecutionScheduler {
    #[allow(clippy::too_many_arguments)]
    async fn dispatch_workflow_task_with_conn(
        conn: &mut PgConnection,
-        publisher: &Publisher,
        _round_robin_counter: &AtomicUsize,
        parent_execution: &Execution,
        workflow_execution_id: &i64,
        task_node: &crate::workflow::graph::TaskNode,
        wf_ctx: &WorkflowContext,
        triggered_by: Option<&str>,
+        pending_messages: &mut Vec<PendingExecutionRequested>,
    ) -> Result<()> {
        let action_ref: String = match &task_node.action {
            Some(a) => a.clone(),
@@ -1059,7 +1071,6 @@ impl ExecutionScheduler {
        if let Some(ref with_items_expr) = task_node.with_items {
            return Self::dispatch_with_items_task_with_conn(
                conn,
-                publisher,
                parent_execution,
                workflow_execution_id,
                task_node,
@@ -1068,6 +1079,7 @@ impl ExecutionScheduler {
                with_items_expr,
                wf_ctx,
                triggered_by,
+                pending_messages,
            )
            .await;
        }
@@ -1115,36 +1127,29 @@ impl ExecutionScheduler {
            completed_at: None,
        };

-        let child_execution = if let Some(existing) = ExecutionRepository::find_by_workflow_task(
+        let child_execution_result = ExecutionRepository::create_workflow_task_if_absent_with_conn(
            &mut *conn,
+            CreateExecutionInput {
+                action: Some(task_action.id),
+                action_ref: action_ref.clone(),
+                config: task_config,
+                env_vars: parent_execution.env_vars.clone(),
+                parent: Some(parent_execution.id),
+                enforcement: parent_execution.enforcement,
+                executor: None,
+                worker: None,
+                status: ExecutionStatus::Requested,
+                result: None,
+                workflow_task: Some(workflow_task),
+            },
            *workflow_execution_id,
            &task_node.name,
            None,
        )
-        .await?
-        {
-            existing
-        } else {
-            ExecutionRepository::create(
-                &mut *conn,
-                CreateExecutionInput {
-                    action: Some(task_action.id),
-                    action_ref: action_ref.clone(),
-                    config: task_config,
-                    env_vars: parent_execution.env_vars.clone(),
-                    parent: Some(parent_execution.id),
-                    enforcement: parent_execution.enforcement,
-                    executor: None,
-                    worker: None,
-                    status: ExecutionStatus::Requested,
-                    result: None,
-                    workflow_task: Some(workflow_task),
-                },
-            )
-            .await?
-        };
+        .await?;
+        let child_execution = child_execution_result.execution;

-        if child_execution.status == ExecutionStatus::Requested {
+        if child_execution_result.created {
            info!(
                "Created child execution {} for workflow task '{}' (action '{}', workflow_execution {})",
                child_execution.id, task_node.name, action_ref, workflow_execution_id
@@ -1157,24 +1162,14 @@ impl ExecutionScheduler {
        }

        if child_execution.status == ExecutionStatus::Requested {
-            let payload = ExecutionRequestedPayload {
+            pending_messages.push(PendingExecutionRequested {
                execution_id: child_execution.id,
-                action_id: Some(task_action.id),
+                action_id: task_action.id,
                action_ref: action_ref.clone(),
-                parent_id: Some(parent_execution.id),
+                parent_id: parent_execution.id,
                enforcement_id: parent_execution.enforcement,
                config: child_execution.config.clone(),
-            };
-
-            let envelope = MessageEnvelope::new(MessageType::ExecutionRequested, payload)
-                .with_source("executor-scheduler");
-
-            publisher.publish_envelope(&envelope).await?;
-
-            info!(
-                "Published ExecutionRequested for child execution {} (task '{}')",
-                child_execution.id, task_node.name
-            );
+            });
        }

        Ok(())
@@ -1392,7 +1387,6 @@ impl ExecutionScheduler {
    #[allow(clippy::too_many_arguments)]
    async fn dispatch_with_items_task_with_conn(
        conn: &mut PgConnection,
-        publisher: &Publisher,
        parent_execution: &Execution,
        workflow_execution_id: &i64,
        task_node: &crate::workflow::graph::TaskNode,
@@ -1401,6 +1395,7 @@ impl ExecutionScheduler {
        with_items_expr: &str,
        wf_ctx: &WorkflowContext,
        triggered_by: Option<&str>,
+        pending_messages: &mut Vec<PendingExecutionRequested>,
    ) -> Result<()> {
        let items_value = wf_ctx
            .render_json(&JsonValue::String(with_items_expr.to_string()))
@@ -1511,18 +1506,8 @@ impl ExecutionScheduler {
                completed_at: None,
            };

-            let child_execution = if let Some(existing) =
-                ExecutionRepository::find_by_workflow_task(
-                    &mut *conn,
-                    *workflow_execution_id,
-                    &task_node.name,
-                    Some(index as i32),
-                )
-                .await?
-            {
-                existing
-            } else {
-                ExecutionRepository::create(
+            let child_execution_result =
+                ExecutionRepository::create_workflow_task_if_absent_with_conn(
                    &mut *conn,
                    CreateExecutionInput {
                        action: Some(task_action.id),
@@ -1537,11 +1522,14 @@ impl ExecutionScheduler {
                        result: None,
                        workflow_task: Some(workflow_task),
                    },
+                    *workflow_execution_id,
+                    &task_node.name,
+                    Some(index as i32),
                )
-                .await?
-            };
+                .await?;
+            let child_execution = child_execution_result.execution;

-            if child_execution.status == ExecutionStatus::Requested {
+            if child_execution_result.created {
                info!(
                    "Created with_items child execution {} for task '{}' item {} \
                     (action '{}', workflow_execution {})",
@@ -1566,11 +1554,11 @@ impl ExecutionScheduler {
            if child.status == ExecutionStatus::Requested {
                Self::publish_execution_requested_with_conn(
                    &mut *conn,
-                    publisher,
                    child_id,
                    task_action.id,
                    action_ref,
                    parent_execution,
+                    pending_messages,
                )
                .await?;
            }
@@ -1622,25 +1610,17 @@ impl ExecutionScheduler {
        Ok(())
    }

-    async fn publish_execution_requested_with_conn(
-        conn: &mut PgConnection,
+    async fn publish_execution_requested_payload(
        publisher: &Publisher,
-        execution_id: i64,
-        action_id: i64,
-        action_ref: &str,
-        parent_execution: &Execution,
+        pending: PendingExecutionRequested,
    ) -> Result<()> {
-        let child = ExecutionRepository::find_by_id(&mut *conn, execution_id)
-            .await?
-            .ok_or_else(|| anyhow::anyhow!("Execution {} not found", execution_id))?;
-
        let payload = ExecutionRequestedPayload {
-            execution_id: child.id,
-            action_id: Some(action_id),
-            action_ref: action_ref.to_string(),
-            parent_id: Some(parent_execution.id),
-            enforcement_id: parent_execution.enforcement,
-            config: child.config.clone(),
+            execution_id: pending.execution_id,
+            action_id: Some(pending.action_id),
+            action_ref: pending.action_ref,
+            parent_id: Some(pending.parent_id),
+            enforcement_id: pending.enforcement_id,
+            config: pending.config,
        };

        let envelope = MessageEnvelope::new(MessageType::ExecutionRequested, payload)
@@ -1650,12 +1630,36 @@ impl ExecutionScheduler {

        debug!(
            "Published deferred ExecutionRequested for child execution {}",
-            execution_id
+            envelope.payload.execution_id
        );

        Ok(())
    }

+    async fn publish_execution_requested_with_conn(
+        conn: &mut PgConnection,
+        execution_id: i64,
+        action_id: i64,
+        action_ref: &str,
+        parent_execution: &Execution,
+        pending_messages: &mut Vec<PendingExecutionRequested>,
+    ) -> Result<()> {
+        let child = ExecutionRepository::find_by_id(&mut *conn, execution_id)
+            .await?
+            .ok_or_else(|| anyhow::anyhow!("Execution {} not found", execution_id))?;
+
+        pending_messages.push(PendingExecutionRequested {
+            execution_id: child.id,
+            action_id,
+            action_ref: action_ref.to_string(),
+            parent_id: parent_execution.id,
+            enforcement_id: parent_execution.enforcement,
+            config: child.config.clone(),
+        });
+
+        Ok(())
+    }
+
    /// Publish the next `Requested`-status with_items siblings to fill freed
    /// concurrency slots.
    ///
@@ -1734,11 +1738,11 @@ impl ExecutionScheduler {

    async fn publish_pending_with_items_children_with_conn(
        conn: &mut PgConnection,
-        publisher: &Publisher,
        parent_execution: &Execution,
        workflow_execution_id: i64,
        task_name: &str,
        slots: usize,
+        pending_messages: &mut Vec<PendingExecutionRequested>,
    ) -> Result<usize> {
        if slots == 0 {
            return Ok(0);
@@ -1768,11 +1772,11 @@ impl ExecutionScheduler {

            if let Err(e) = Self::publish_execution_requested_with_conn(
                &mut *conn,
-                publisher,
                *child_id,
                *action_id,
                &child.action_ref,
                parent_execution,
+                pending_messages,
            )
            .await
            {
@@ -1819,12 +1823,35 @@ impl ExecutionScheduler {
            .execute(&mut *lock_conn)
            .await?;

-        let result = Self::advance_workflow_serialized(
-            &mut lock_conn,
-            publisher,
-            round_robin_counter,
-            execution,
-        )
+        let result = async {
+            sqlx::query("BEGIN").execute(&mut *lock_conn).await?;
+
+            let advance_result =
+                Self::advance_workflow_serialized(&mut lock_conn, round_robin_counter, execution)
+                    .await;
+
+            match advance_result {
+                Ok(pending_messages) => {
+                    sqlx::query("COMMIT").execute(&mut *lock_conn).await?;
+
+                    for pending in pending_messages {
+                        Self::publish_execution_requested_payload(publisher, pending).await?;
+                    }
+
+                    Ok(())
+                }
+                Err(err) => {
+                    let rollback_result = sqlx::query("ROLLBACK").execute(&mut *lock_conn).await;
+                    if let Err(rollback_err) = rollback_result {
+                        error!(
+                            "Failed to roll back workflow_execution {} advancement transaction: {}",
+                            workflow_execution_id, rollback_err
+                        );
+                    }
+                    Err(err)
+                }
+            }
+        }
        .await;
        let unlock_result = sqlx::query("SELECT pg_advisory_unlock($1)")
            .bind(workflow_execution_id)
@@ -1838,13 +1865,12 @@ impl ExecutionScheduler {

    async fn advance_workflow_serialized(
        conn: &mut PgConnection,
-        publisher: &Publisher,
        round_robin_counter: &AtomicUsize,
        execution: &Execution,
-    ) -> Result<()> {
+    ) -> Result<Vec<PendingExecutionRequested>> {
        let workflow_task = match &execution.workflow_task {
            Some(wt) => wt,
-            None => return Ok(()), // Not a workflow task, nothing to do
+            None => return Ok(vec![]), // Not a workflow task, nothing to do
        };

        let workflow_execution_id = workflow_task.workflow_execution;
@@ -1867,7 +1893,7 @@ impl ExecutionScheduler {

        // Load the workflow execution record
        let workflow_execution =
-            WorkflowExecutionRepository::find_by_id(&mut *conn, workflow_execution_id)
+            WorkflowExecutionRepository::find_by_id_for_update(&mut *conn, workflow_execution_id)
                .await?
                .ok_or_else(|| {
                    anyhow::anyhow!("Workflow execution {} not found", workflow_execution_id)
@@ -1882,9 +1908,11 @@ impl ExecutionScheduler {
                "Workflow execution {} already in terminal state {:?}, skipping advance",
                workflow_execution_id, workflow_execution.status
            );
-            return Ok(());
+            return Ok(vec![]);
        }

+        let mut pending_messages = Vec::new();
+
        let parent_execution =
            ExecutionRepository::find_by_id(&mut *conn, workflow_execution.execution)
                .await?
@@ -1944,7 +1972,7 @@ impl ExecutionScheduler {
                );
            }

-            return Ok(());
+            return Ok(pending_messages);
        }

        // Load the workflow definition so we can apply param_schema defaults
@@ -2021,11 +2049,11 @@ impl ExecutionScheduler {
            if free_slots > 0 {
                if let Err(e) = Self::publish_pending_with_items_children_with_conn(
                    &mut *conn,
-                    publisher,
                    &parent_for_pending,
                    workflow_execution_id,
                    task_name,
                    free_slots,
+                    &mut pending_messages,
                )
                .await
                {
@@ -2060,7 +2088,7 @@ impl ExecutionScheduler {
                    workflow_task.task_index.unwrap_or(-1),
                    siblings_remaining.len(),
                );
-                return Ok(());
+                return Ok(pending_messages);
            }

            // ---------------------------------------------------------
@@ -2093,7 +2121,7 @@ impl ExecutionScheduler {
                     another advance_workflow call already handled final completion, skipping",
                    task_name,
                );
-                return Ok(());
+                return Ok(pending_messages);
            }

            // All items done — check if any failed
@@ -2280,13 +2308,13 @@ impl ExecutionScheduler {
            if let Some(task_node) = graph.get_task(next_task_name) {
                if let Err(e) = Self::dispatch_workflow_task_with_conn(
                    &mut *conn,
-                    publisher,
                    round_robin_counter,
                    &parent_execution,
                    &workflow_execution_id,
                    task_node,
                    &wf_ctx,
                    Some(task_name), // predecessor that triggered this task
+                    &mut pending_messages,
                )
                .await
                {
@@ -2349,7 +2377,7 @@ impl ExecutionScheduler {
            .await?;
        }

-        Ok(())
+        Ok(pending_messages)
    }

    /// Count child executions that are still in progress for a workflow.
@@ -3139,6 +3167,7 @@ impl ExecutionScheduler {
        worker_id: &i64,
        action_ref: &str,
        config: &Option<JsonValue>,
+        scheduled_attempt_updated_at: DateTime<Utc>,
        _action: &Action,
    ) -> Result<()> {
        debug!("Queuing execution {} to worker {}", execution_id, worker_id);
@@ -3149,6 +3178,7 @@ impl ExecutionScheduler {
            worker_id: *worker_id,
            action_ref: action_ref.to_string(),
            config: config.clone(),
+            scheduled_attempt_updated_at,
        };

        let envelope =
--- a/crates/executor/src/timeout_monitor.rs
+++ b/crates/executor/src/timeout_monitor.rs
@@ -12,7 +12,10 @@ use anyhow::Result;
 use attune_common::{
    models::{enums::ExecutionStatus, Execution},
    mq::{MessageEnvelope, MessageType, Publisher},
-    repositories::execution::SELECT_COLUMNS as EXECUTION_COLUMNS,
+    repositories::{
+        execution::{UpdateExecutionInput, SELECT_COLUMNS as EXECUTION_COLUMNS},
+        ExecutionRepository,
+    },
 };
 use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize};
@@ -178,20 +181,27 @@ impl ExecutionTimeoutMonitor {
            "original_status": "scheduled"
        });

-        // Update execution status in database
-        sqlx::query(
-            "UPDATE execution
-             SET status = $1,
-                 result = $2,
-                 updated = NOW()
-             WHERE id = $3",
+        let updated = ExecutionRepository::update_if_status_and_updated_before(
+            &self.pool,
+            execution_id,
+            ExecutionStatus::Scheduled,
+            self.calculate_cutoff_time(),
+            UpdateExecutionInput {
+                status: Some(ExecutionStatus::Failed),
+                result: Some(result.clone()),
+                ..Default::default()
+            },
        )
-        .bind(ExecutionStatus::Failed)
-        .bind(&result)
-        .bind(execution_id)
-        .execute(&self.pool)
        .await?;

+        if updated.is_none() {
+            debug!(
+                "Skipping timeout failure for execution {} because it already left Scheduled or is no longer stale",
+                execution_id
+            );
+            return Ok(());
+        }
+
        info!("Execution {} marked as failed in database", execution_id);

        // Publish completion notification
--- a/crates/executor/tests/fifo_ordering_integration_test.rs
+++ b/crates/executor/tests/fifo_ordering_integration_test.rs
@@ -912,6 +912,115 @@ async fn test_queue_stats_persistence() {
    cleanup_test_data(&pool, pack_id).await;
 }

+#[tokio::test]
+#[ignore] // Requires database
+async fn test_release_restore_recovers_active_slot_and_next_queue_head() {
+    let pool = setup_db().await;
+    let timestamp = Utc::now().timestamp();
+    let suffix = format!("restore_release_{}", timestamp);
+
+    let pack_id = create_test_pack(&pool, &suffix).await;
+    let pack_ref = format!("fifo_test_pack_{}", suffix);
+    let action_id = create_test_action(&pool, pack_id, &pack_ref, &suffix).await;
+    let action_ref = format!("fifo_test_action_{}", suffix);
+
+    let manager = ExecutionQueueManager::with_db_pool(QueueConfig::default(), pool.clone());
+
+    let first =
+        create_test_execution(&pool, action_id, &action_ref, ExecutionStatus::Requested).await;
+    let second =
+        create_test_execution(&pool, action_id, &action_ref, ExecutionStatus::Requested).await;
+    let third =
+        create_test_execution(&pool, action_id, &action_ref, ExecutionStatus::Requested).await;
+
+    manager.enqueue(action_id, first, 1, None).await.unwrap();
+    manager.enqueue(action_id, second, 1, None).await.unwrap();
+    manager.enqueue(action_id, third, 1, None).await.unwrap();
+
+    let stats = manager.get_queue_stats(action_id).await.unwrap();
+    assert_eq!(stats.active_count, 1);
+    assert_eq!(stats.queue_length, 2);
+
+    let release = manager
+        .release_active_slot(first)
+        .await
+        .unwrap()
+        .expect("first execution should own an active slot");
+    assert_eq!(release.next_execution_id, Some(second));
+
+    let stats = manager.get_queue_stats(action_id).await.unwrap();
+    assert_eq!(stats.active_count, 1);
+    assert_eq!(stats.queue_length, 1);
+
+    manager.restore_active_slot(first, &release).await.unwrap();
+
+    let stats = manager.get_queue_stats(action_id).await.unwrap();
+    assert_eq!(stats.active_count, 1);
+    assert_eq!(stats.queue_length, 2);
+    assert_eq!(stats.total_completed, 0);
+
+    let next = manager
+        .release_active_slot(first)
+        .await
+        .unwrap()
+        .expect("restored execution should still own the active slot");
+    assert_eq!(next.next_execution_id, Some(second));
+
+    cleanup_test_data(&pool, pack_id).await;
+}
+
+#[tokio::test]
+#[ignore] // Requires database
+async fn test_remove_restore_recovers_queued_execution_position() {
+    let pool = setup_db().await;
+    let timestamp = Utc::now().timestamp();
+    let suffix = format!("restore_queue_{}", timestamp);
+
+    let pack_id = create_test_pack(&pool, &suffix).await;
+    let pack_ref = format!("fifo_test_pack_{}", suffix);
+    let action_id = create_test_action(&pool, pack_id, &pack_ref, &suffix).await;
+    let action_ref = format!("fifo_test_action_{}", suffix);
+
+    let manager = ExecutionQueueManager::with_db_pool(QueueConfig::default(), pool.clone());
+
+    let first =
+        create_test_execution(&pool, action_id, &action_ref, ExecutionStatus::Requested).await;
+    let second =
+        create_test_execution(&pool, action_id, &action_ref, ExecutionStatus::Requested).await;
+    let third =
+        create_test_execution(&pool, action_id, &action_ref, ExecutionStatus::Requested).await;
+
+    manager.enqueue(action_id, first, 1, None).await.unwrap();
+    manager.enqueue(action_id, second, 1, None).await.unwrap();
+    manager.enqueue(action_id, third, 1, None).await.unwrap();
+
+    let removal = manager
+        .remove_queued_execution(second)
+        .await
+        .unwrap()
+        .expect("second execution should be queued");
+    assert_eq!(removal.next_execution_id, None);
+
+    let stats = manager.get_queue_stats(action_id).await.unwrap();
+    assert_eq!(stats.active_count, 1);
+    assert_eq!(stats.queue_length, 1);
+
+    manager.restore_queued_execution(&removal).await.unwrap();
+
+    let stats = manager.get_queue_stats(action_id).await.unwrap();
+    assert_eq!(stats.active_count, 1);
+    assert_eq!(stats.queue_length, 2);
+
+    let release = manager
+        .release_active_slot(first)
+        .await
+        .unwrap()
+        .expect("first execution should own the active slot");
+    assert_eq!(release.next_execution_id, Some(second));
+
+    cleanup_test_data(&pool, pack_id).await;
+}
+
 #[tokio::test]
 #[ignore] // Requires database
 async fn test_queue_full_rejection() {