ha executor
Some checks failed
CI / Rustfmt (pull_request) Successful in 19s
CI / Cargo Audit & Deny (pull_request) Successful in 33s
CI / Security Blocking Checks (pull_request) Successful in 5s
CI / Web Blocking Checks (pull_request) Successful in 49s
CI / Web Advisory Checks (pull_request) Successful in 33s
CI / Clippy (pull_request) Has been cancelled
CI / Security Advisory Checks (pull_request) Has been cancelled
CI / Tests (pull_request) Has been cancelled
Some checks failed
CI / Rustfmt (pull_request) Successful in 19s
CI / Cargo Audit & Deny (pull_request) Successful in 33s
CI / Security Blocking Checks (pull_request) Successful in 5s
CI / Web Blocking Checks (pull_request) Successful in 49s
CI / Web Advisory Checks (pull_request) Successful in 33s
CI / Clippy (pull_request) Has been cancelled
CI / Security Advisory Checks (pull_request) Has been cancelled
CI / Tests (pull_request) Has been cancelled
This commit is contained in:
@@ -157,7 +157,11 @@ impl CompletionListener {
|
||||
"Failed to advance workflow for execution {}: {}",
|
||||
execution_id, e
|
||||
);
|
||||
// Continue processing — don't fail the entire completion
|
||||
if let Some(mq_err) = Self::retryable_mq_error(&e) {
|
||||
return Err(mq_err.into());
|
||||
}
|
||||
// Non-retryable workflow advancement errors are logged but
|
||||
// do not fail the entire completion processing path.
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@ use attune_common::{
|
||||
error::Error,
|
||||
models::ExecutionStatus,
|
||||
mq::{Consumer, ConsumerConfig, MessageEnvelope, MessageType, MqResult},
|
||||
repositories::{execution::UpdateExecutionInput, ExecutionRepository, FindById, Update},
|
||||
repositories::{execution::UpdateExecutionInput, ExecutionRepository, FindById},
|
||||
};
|
||||
use chrono::Utc;
|
||||
use serde_json::json;
|
||||
@@ -179,13 +179,12 @@ async fn handle_execution_requested(
|
||||
}
|
||||
};
|
||||
|
||||
// Only fail if still in a non-terminal state
|
||||
if !matches!(
|
||||
execution.status,
|
||||
ExecutionStatus::Scheduled | ExecutionStatus::Running
|
||||
) {
|
||||
// Only scheduled executions are still legitimately owned by the scheduler.
|
||||
// If the execution already moved to running or a terminal state, this DLQ
|
||||
// delivery is stale and must not overwrite newer state.
|
||||
if execution.status != ExecutionStatus::Scheduled {
|
||||
info!(
|
||||
"Execution {} already in terminal state {:?}, skipping",
|
||||
"Execution {} already left Scheduled state ({:?}), skipping stale DLQ handling",
|
||||
execution_id, execution.status
|
||||
);
|
||||
return Ok(()); // Acknowledge to remove from queue
|
||||
@@ -193,6 +192,12 @@ async fn handle_execution_requested(
|
||||
|
||||
// Get worker info from payload for better error message
|
||||
let worker_id = envelope.payload.get("worker_id").and_then(|v| v.as_i64());
|
||||
let scheduled_attempt_updated_at = envelope
|
||||
.payload
|
||||
.get("scheduled_attempt_updated_at")
|
||||
.and_then(|v| v.as_str())
|
||||
.and_then(|s| chrono::DateTime::parse_from_rfc3339(s).ok())
|
||||
.map(|dt| dt.with_timezone(&Utc));
|
||||
|
||||
let error_message = if let Some(wid) = worker_id {
|
||||
format!(
|
||||
@@ -214,24 +219,85 @@ async fn handle_execution_requested(
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
match ExecutionRepository::update(pool, execution_id, update_input).await {
|
||||
Ok(_) => {
|
||||
info!(
|
||||
"Successfully failed execution {} due to worker queue expiration",
|
||||
execution_id
|
||||
);
|
||||
Ok(())
|
||||
if let Some(timestamp) = scheduled_attempt_updated_at {
|
||||
// Guard on both status and the exact updated_at from when the execution was
|
||||
// scheduled — prevents overwriting state that changed after this DLQ message
|
||||
// was enqueued.
|
||||
match ExecutionRepository::update_if_status_and_updated_at(
|
||||
pool,
|
||||
execution_id,
|
||||
ExecutionStatus::Scheduled,
|
||||
timestamp,
|
||||
update_input,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(Some(_)) => {
|
||||
info!(
|
||||
"Successfully failed execution {} due to worker queue expiration",
|
||||
execution_id
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
Ok(None) => {
|
||||
info!(
|
||||
"Skipping DLQ failure for execution {} because it already left Scheduled state",
|
||||
execution_id
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"Failed to update execution {} to failed state: {}",
|
||||
execution_id, e
|
||||
);
|
||||
Err(attune_common::mq::MqError::Consume(format!(
|
||||
"Failed to update execution: {}",
|
||||
e
|
||||
)))
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"Failed to update execution {} to failed state: {}",
|
||||
execution_id, e
|
||||
);
|
||||
// Return error to nack and potentially retry
|
||||
Err(attune_common::mq::MqError::Consume(format!(
|
||||
"Failed to update execution: {}",
|
||||
e
|
||||
)))
|
||||
} else {
|
||||
// Fallback for DLQ messages that predate the scheduled_attempt_updated_at
|
||||
// field. Use a status-only guard — same safety guarantee as the original code
|
||||
// (never overwrites terminal or running state).
|
||||
warn!(
|
||||
"DLQ message for execution {} lacks scheduled_attempt_updated_at; \
|
||||
falling back to status-only guard",
|
||||
execution_id
|
||||
);
|
||||
match ExecutionRepository::update_if_status(
|
||||
pool,
|
||||
execution_id,
|
||||
ExecutionStatus::Scheduled,
|
||||
update_input,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(Some(_)) => {
|
||||
info!(
|
||||
"Successfully failed execution {} due to worker queue expiration (status-only guard)",
|
||||
execution_id
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
Ok(None) => {
|
||||
info!(
|
||||
"Skipping DLQ failure for execution {} because it already left Scheduled state",
|
||||
execution_id
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"Failed to update execution {} to failed state: {}",
|
||||
execution_id, e
|
||||
);
|
||||
Err(attune_common::mq::MqError::Consume(format!(
|
||||
"Failed to update execution: {}",
|
||||
e
|
||||
)))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,7 +19,7 @@ use attune_common::{
|
||||
event::{EnforcementRepository, EventRepository, UpdateEnforcementInput},
|
||||
execution::{CreateExecutionInput, ExecutionRepository},
|
||||
rule::RuleRepository,
|
||||
Create, FindById,
|
||||
FindById,
|
||||
},
|
||||
};
|
||||
|
||||
@@ -116,6 +116,14 @@ impl EnforcementProcessor {
|
||||
.await?
|
||||
.ok_or_else(|| anyhow::anyhow!("Enforcement not found: {}", enforcement_id))?;
|
||||
|
||||
if enforcement.status != EnforcementStatus::Created {
|
||||
debug!(
|
||||
"Enforcement {} already left Created state ({:?}), skipping duplicate processing",
|
||||
enforcement_id, enforcement.status
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Fetch associated rule
|
||||
let rule = RuleRepository::find_by_id(
|
||||
pool,
|
||||
@@ -135,7 +143,7 @@ impl EnforcementProcessor {
|
||||
|
||||
// Evaluate whether to create execution
|
||||
if Self::should_create_execution(&enforcement, &rule, event.as_ref())? {
|
||||
Self::create_execution(
|
||||
let execution_created = Self::create_execution(
|
||||
pool,
|
||||
publisher,
|
||||
policy_enforcer,
|
||||
@@ -145,10 +153,10 @@ impl EnforcementProcessor {
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Update enforcement status to Processed after successful execution creation
|
||||
EnforcementRepository::update_loaded(
|
||||
let updated = EnforcementRepository::update_loaded_if_status(
|
||||
pool,
|
||||
&enforcement,
|
||||
EnforcementStatus::Created,
|
||||
UpdateEnforcementInput {
|
||||
status: Some(EnforcementStatus::Processed),
|
||||
payload: None,
|
||||
@@ -157,17 +165,27 @@ impl EnforcementProcessor {
|
||||
)
|
||||
.await?;
|
||||
|
||||
debug!("Updated enforcement {} status to Processed", enforcement_id);
|
||||
if updated.is_some() {
|
||||
debug!(
|
||||
"Updated enforcement {} status to Processed after {} execution path",
|
||||
enforcement_id,
|
||||
if execution_created {
|
||||
"new"
|
||||
} else {
|
||||
"idempotent"
|
||||
}
|
||||
);
|
||||
}
|
||||
} else {
|
||||
info!(
|
||||
"Skipping execution creation for enforcement: {}",
|
||||
enforcement_id
|
||||
);
|
||||
|
||||
// Update enforcement status to Disabled since it was not actionable
|
||||
EnforcementRepository::update_loaded(
|
||||
let updated = EnforcementRepository::update_loaded_if_status(
|
||||
pool,
|
||||
&enforcement,
|
||||
EnforcementStatus::Created,
|
||||
UpdateEnforcementInput {
|
||||
status: Some(EnforcementStatus::Disabled),
|
||||
payload: None,
|
||||
@@ -176,10 +194,12 @@ impl EnforcementProcessor {
|
||||
)
|
||||
.await?;
|
||||
|
||||
debug!(
|
||||
"Updated enforcement {} status to Disabled (skipped)",
|
||||
enforcement_id
|
||||
);
|
||||
if updated.is_some() {
|
||||
debug!(
|
||||
"Updated enforcement {} status to Disabled (skipped)",
|
||||
enforcement_id
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -234,7 +254,7 @@ impl EnforcementProcessor {
|
||||
_queue_manager: &ExecutionQueueManager,
|
||||
enforcement: &Enforcement,
|
||||
rule: &Rule,
|
||||
) -> Result<()> {
|
||||
) -> Result<bool> {
|
||||
// Extract action ID — should_create_execution already verified it's Some,
|
||||
// but guard defensively here as well.
|
||||
let action_id = match rule.action {
|
||||
@@ -275,44 +295,60 @@ impl EnforcementProcessor {
|
||||
workflow_task: None, // Non-workflow execution
|
||||
};
|
||||
|
||||
let execution = ExecutionRepository::create(pool, execution_input).await?;
|
||||
let execution_result = ExecutionRepository::create_top_level_for_enforcement_if_absent(
|
||||
pool,
|
||||
execution_input,
|
||||
enforcement.id,
|
||||
)
|
||||
.await?;
|
||||
let execution = execution_result.execution;
|
||||
|
||||
info!(
|
||||
"Created execution: {} for enforcement: {}",
|
||||
execution.id, enforcement.id
|
||||
);
|
||||
if execution_result.created {
|
||||
info!(
|
||||
"Created execution: {} for enforcement: {}",
|
||||
execution.id, enforcement.id
|
||||
);
|
||||
} else {
|
||||
info!(
|
||||
"Reusing execution: {} for enforcement: {}",
|
||||
execution.id, enforcement.id
|
||||
);
|
||||
}
|
||||
|
||||
// Publish ExecutionRequested message
|
||||
let payload = ExecutionRequestedPayload {
|
||||
execution_id: execution.id,
|
||||
action_id: Some(action_id),
|
||||
action_ref: action_ref.clone(),
|
||||
parent_id: None,
|
||||
enforcement_id: Some(enforcement.id),
|
||||
config: enforcement.config.clone(),
|
||||
};
|
||||
if execution_result.created
|
||||
|| execution.status == attune_common::models::enums::ExecutionStatus::Requested
|
||||
{
|
||||
let payload = ExecutionRequestedPayload {
|
||||
execution_id: execution.id,
|
||||
action_id: Some(action_id),
|
||||
action_ref: action_ref.clone(),
|
||||
parent_id: None,
|
||||
enforcement_id: Some(enforcement.id),
|
||||
config: execution.config.clone(),
|
||||
};
|
||||
|
||||
let envelope =
|
||||
MessageEnvelope::new(attune_common::mq::MessageType::ExecutionRequested, payload)
|
||||
.with_source("executor");
|
||||
let envelope =
|
||||
MessageEnvelope::new(attune_common::mq::MessageType::ExecutionRequested, payload)
|
||||
.with_source("executor");
|
||||
|
||||
// Publish to execution requests queue with routing key
|
||||
let routing_key = "execution.requested";
|
||||
let exchange = "attune.executions";
|
||||
// Publish to execution requests queue with routing key
|
||||
let routing_key = "execution.requested";
|
||||
let exchange = "attune.executions";
|
||||
|
||||
publisher
|
||||
.publish_envelope_with_routing(&envelope, exchange, routing_key)
|
||||
.await?;
|
||||
publisher
|
||||
.publish_envelope_with_routing(&envelope, exchange, routing_key)
|
||||
.await?;
|
||||
|
||||
info!(
|
||||
"Published execution.requested message for execution: {} (enforcement: {}, action: {})",
|
||||
execution.id, enforcement.id, action_id
|
||||
);
|
||||
info!(
|
||||
"Published execution.requested message for execution: {} (enforcement: {}, action: {})",
|
||||
execution.id, enforcement.id, action_id
|
||||
);
|
||||
}
|
||||
|
||||
// NOTE: Queue slot will be released when worker publishes execution.completed
|
||||
// and CompletionListener calls queue_manager.notify_completion(action_id)
|
||||
|
||||
Ok(())
|
||||
Ok(execution_result.created)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ use attune_common::{
|
||||
event::{CreateEnforcementInput, EnforcementRepository, EventRepository},
|
||||
pack::PackRepository,
|
||||
rule::RuleRepository,
|
||||
Create, FindById, List,
|
||||
FindById, List,
|
||||
},
|
||||
template_resolver::{resolve_templates, TemplateContext},
|
||||
};
|
||||
@@ -206,32 +206,43 @@ impl EventProcessor {
|
||||
conditions: rule.conditions.clone(),
|
||||
};
|
||||
|
||||
let enforcement = EnforcementRepository::create(pool, create_input).await?;
|
||||
let enforcement_result =
|
||||
EnforcementRepository::create_or_get_by_rule_event(pool, create_input).await?;
|
||||
let enforcement = enforcement_result.enforcement;
|
||||
|
||||
info!(
|
||||
"Enforcement {} created for rule {} (event: {})",
|
||||
enforcement.id, rule.r#ref, event.id
|
||||
);
|
||||
if enforcement_result.created {
|
||||
info!(
|
||||
"Enforcement {} created for rule {} (event: {})",
|
||||
enforcement.id, rule.r#ref, event.id
|
||||
);
|
||||
} else {
|
||||
info!(
|
||||
"Reusing enforcement {} for rule {} (event: {})",
|
||||
enforcement.id, rule.r#ref, event.id
|
||||
);
|
||||
}
|
||||
|
||||
// Publish EnforcementCreated message
|
||||
let enforcement_payload = EnforcementCreatedPayload {
|
||||
enforcement_id: enforcement.id,
|
||||
rule_id: Some(rule.id),
|
||||
rule_ref: rule.r#ref.clone(),
|
||||
event_id: Some(event.id),
|
||||
trigger_ref: event.trigger_ref.clone(),
|
||||
payload: payload.clone(),
|
||||
};
|
||||
if enforcement_result.created || enforcement.status == EnforcementStatus::Created {
|
||||
let enforcement_payload = EnforcementCreatedPayload {
|
||||
enforcement_id: enforcement.id,
|
||||
rule_id: Some(rule.id),
|
||||
rule_ref: rule.r#ref.clone(),
|
||||
event_id: Some(event.id),
|
||||
trigger_ref: event.trigger_ref.clone(),
|
||||
payload: payload.clone(),
|
||||
};
|
||||
|
||||
let envelope = MessageEnvelope::new(MessageType::EnforcementCreated, enforcement_payload)
|
||||
.with_source("event-processor");
|
||||
let envelope =
|
||||
MessageEnvelope::new(MessageType::EnforcementCreated, enforcement_payload)
|
||||
.with_source("event-processor");
|
||||
|
||||
publisher.publish_envelope(&envelope).await?;
|
||||
publisher.publish_envelope(&envelope).await?;
|
||||
|
||||
debug!(
|
||||
"Published EnforcementCreated message for enforcement {}",
|
||||
enforcement.id
|
||||
);
|
||||
debug!(
|
||||
"Published EnforcementCreated message for enforcement {}",
|
||||
enforcement.id
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -9,13 +9,14 @@
|
||||
|
||||
use anyhow::Result;
|
||||
use attune_common::{
|
||||
error::Error as AttuneError,
|
||||
models::{enums::InquiryStatus, inquiry::Inquiry, Execution, Id},
|
||||
mq::{
|
||||
Consumer, InquiryCreatedPayload, InquiryRespondedPayload, MessageEnvelope, MessageType,
|
||||
Publisher,
|
||||
},
|
||||
repositories::{
|
||||
execution::{ExecutionRepository, UpdateExecutionInput},
|
||||
execution::{ExecutionRepository, UpdateExecutionInput, SELECT_COLUMNS},
|
||||
inquiry::{CreateInquiryInput, InquiryRepository},
|
||||
Create, FindById, Update,
|
||||
},
|
||||
@@ -28,6 +29,8 @@ use tracing::{debug, error, info, warn};
|
||||
|
||||
/// Special key in action result to indicate an inquiry should be created
|
||||
pub const INQUIRY_RESULT_KEY: &str = "__inquiry";
|
||||
const INQUIRY_ID_RESULT_KEY: &str = "__inquiry_id";
|
||||
const INQUIRY_CREATED_PUBLISHED_RESULT_KEY: &str = "__inquiry_created_published";
|
||||
|
||||
/// Structure for inquiry data in action results
|
||||
#[derive(Debug, Clone, serde::Deserialize)]
|
||||
@@ -104,69 +107,198 @@ impl InquiryHandler {
|
||||
let inquiry_request: InquiryRequest = serde_json::from_value(inquiry_value.clone())?;
|
||||
Ok(inquiry_request)
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true when `e` represents a PostgreSQL unique constraint violation (code 23505).
|
||||
fn is_db_unique_violation(e: &AttuneError) -> bool {
|
||||
if let AttuneError::Database(sqlx_err) = e {
|
||||
return sqlx_err
|
||||
.as_database_error()
|
||||
.and_then(|db| db.code())
|
||||
.as_deref()
|
||||
== Some("23505");
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
impl InquiryHandler {
|
||||
/// Create an inquiry for an execution and pause it
|
||||
pub async fn create_inquiry_from_result(
|
||||
pool: &PgPool,
|
||||
publisher: &Publisher,
|
||||
execution_id: Id,
|
||||
result: &JsonValue,
|
||||
_result: &JsonValue,
|
||||
) -> Result<Inquiry> {
|
||||
info!("Creating inquiry for execution {}", execution_id);
|
||||
|
||||
// Extract inquiry request
|
||||
let inquiry_request = Self::extract_inquiry_request(result)?;
|
||||
let mut tx = pool.begin().await?;
|
||||
let execution = sqlx::query_as::<_, Execution>(&format!(
|
||||
"SELECT {SELECT_COLUMNS} FROM execution WHERE id = $1 FOR UPDATE"
|
||||
))
|
||||
.bind(execution_id)
|
||||
.fetch_one(&mut *tx)
|
||||
.await?;
|
||||
|
||||
// Calculate timeout if specified
|
||||
let mut result = execution
|
||||
.result
|
||||
.clone()
|
||||
.ok_or_else(|| anyhow::anyhow!("Execution {} has no result", execution_id))?;
|
||||
let inquiry_request = Self::extract_inquiry_request(&result)?;
|
||||
let timeout_at = inquiry_request
|
||||
.timeout_seconds
|
||||
.map(|seconds| Utc::now() + chrono::Duration::seconds(seconds));
|
||||
|
||||
// Create inquiry in database
|
||||
let inquiry_input = CreateInquiryInput {
|
||||
execution: execution_id,
|
||||
prompt: inquiry_request.prompt.clone(),
|
||||
response_schema: inquiry_request.response_schema.clone(),
|
||||
assigned_to: inquiry_request.assigned_to,
|
||||
status: InquiryStatus::Pending,
|
||||
response: None,
|
||||
timeout_at,
|
||||
let existing_inquiry_id = result
|
||||
.get(INQUIRY_ID_RESULT_KEY)
|
||||
.and_then(|value| value.as_i64());
|
||||
let published = result
|
||||
.get(INQUIRY_CREATED_PUBLISHED_RESULT_KEY)
|
||||
.and_then(|value| value.as_bool())
|
||||
.unwrap_or(false);
|
||||
|
||||
let (inquiry, should_publish) = if let Some(inquiry_id) = existing_inquiry_id {
|
||||
let inquiry = InquiryRepository::find_by_id(&mut *tx, inquiry_id)
|
||||
.await?
|
||||
.ok_or_else(|| {
|
||||
anyhow::anyhow!(
|
||||
"Inquiry {} referenced by execution {} result not found",
|
||||
inquiry_id,
|
||||
execution_id
|
||||
)
|
||||
})?;
|
||||
let should_publish = !published && inquiry.status == InquiryStatus::Pending;
|
||||
(inquiry, should_publish)
|
||||
} else {
|
||||
let create_result = InquiryRepository::create(
|
||||
&mut *tx,
|
||||
CreateInquiryInput {
|
||||
execution: execution_id,
|
||||
prompt: inquiry_request.prompt.clone(),
|
||||
response_schema: inquiry_request.response_schema.clone(),
|
||||
assigned_to: inquiry_request.assigned_to,
|
||||
status: InquiryStatus::Pending,
|
||||
response: None,
|
||||
timeout_at,
|
||||
},
|
||||
)
|
||||
.await;
|
||||
|
||||
let inquiry = match create_result {
|
||||
Ok(inq) => inq,
|
||||
Err(e) => {
|
||||
// Unique constraint violation (23505): another replica already
|
||||
// created the inquiry for this execution. Treat as idempotent
|
||||
// success — drop the aborted transaction and return the existing row.
|
||||
if is_db_unique_violation(&e) {
|
||||
info!(
|
||||
"Inquiry for execution {} already created by another replica \
|
||||
(unique constraint 23505); treating as idempotent",
|
||||
execution_id
|
||||
);
|
||||
// tx is in an aborted state; dropping it issues ROLLBACK.
|
||||
drop(tx);
|
||||
let inquiries =
|
||||
InquiryRepository::find_by_execution(pool, execution_id).await?;
|
||||
let existing = inquiries.into_iter().next().ok_or_else(|| {
|
||||
anyhow::anyhow!(
|
||||
"Inquiry for execution {} not found after unique constraint violation",
|
||||
execution_id
|
||||
)
|
||||
})?;
|
||||
return Ok(existing);
|
||||
}
|
||||
return Err(e.into());
|
||||
}
|
||||
};
|
||||
|
||||
Self::set_inquiry_result_metadata(&mut result, inquiry.id, false)?;
|
||||
ExecutionRepository::update(
|
||||
&mut *tx,
|
||||
execution_id,
|
||||
UpdateExecutionInput {
|
||||
result: Some(result),
|
||||
..Default::default()
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
(inquiry, true)
|
||||
};
|
||||
|
||||
let inquiry = InquiryRepository::create(pool, inquiry_input).await?;
|
||||
tx.commit().await?;
|
||||
|
||||
info!(
|
||||
"Created inquiry {} for execution {}",
|
||||
inquiry.id, execution_id
|
||||
);
|
||||
if should_publish {
|
||||
let payload = InquiryCreatedPayload {
|
||||
inquiry_id: inquiry.id,
|
||||
execution_id,
|
||||
prompt: inquiry_request.prompt,
|
||||
response_schema: inquiry_request.response_schema,
|
||||
assigned_to: inquiry_request.assigned_to,
|
||||
timeout_at,
|
||||
};
|
||||
|
||||
// Update execution status to paused/waiting
|
||||
// Note: We use a special status or keep it as "running" with inquiry tracking
|
||||
// For now, we'll keep status as-is and track via inquiry relationship
|
||||
let envelope =
|
||||
MessageEnvelope::new(MessageType::InquiryCreated, payload).with_source("executor");
|
||||
|
||||
// Publish InquiryCreated message
|
||||
let payload = InquiryCreatedPayload {
|
||||
inquiry_id: inquiry.id,
|
||||
execution_id,
|
||||
prompt: inquiry_request.prompt,
|
||||
response_schema: inquiry_request.response_schema,
|
||||
assigned_to: inquiry_request.assigned_to,
|
||||
timeout_at,
|
||||
};
|
||||
publisher.publish_envelope(&envelope).await?;
|
||||
Self::mark_inquiry_created_published(pool, execution_id).await?;
|
||||
|
||||
let envelope =
|
||||
MessageEnvelope::new(MessageType::InquiryCreated, payload).with_source("executor");
|
||||
|
||||
publisher.publish_envelope(&envelope).await?;
|
||||
|
||||
debug!(
|
||||
"Published InquiryCreated message for inquiry {}",
|
||||
inquiry.id
|
||||
);
|
||||
debug!(
|
||||
"Published InquiryCreated message for inquiry {}",
|
||||
inquiry.id
|
||||
);
|
||||
}
|
||||
|
||||
Ok(inquiry)
|
||||
}
|
||||
|
||||
fn set_inquiry_result_metadata(
|
||||
result: &mut JsonValue,
|
||||
inquiry_id: Id,
|
||||
published: bool,
|
||||
) -> Result<()> {
|
||||
let obj = result
|
||||
.as_object_mut()
|
||||
.ok_or_else(|| anyhow::anyhow!("execution result is not a JSON object"))?;
|
||||
|
||||
obj.insert(
|
||||
INQUIRY_ID_RESULT_KEY.to_string(),
|
||||
JsonValue::Number(inquiry_id.into()),
|
||||
);
|
||||
obj.insert(
|
||||
INQUIRY_CREATED_PUBLISHED_RESULT_KEY.to_string(),
|
||||
JsonValue::Bool(published),
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn mark_inquiry_created_published(pool: &PgPool, execution_id: Id) -> Result<()> {
|
||||
let execution = ExecutionRepository::find_by_id(pool, execution_id)
|
||||
.await?
|
||||
.ok_or_else(|| anyhow::anyhow!("Execution {} not found", execution_id))?;
|
||||
let mut result = execution
|
||||
.result
|
||||
.clone()
|
||||
.ok_or_else(|| anyhow::anyhow!("Execution {} has no result", execution_id))?;
|
||||
let inquiry_id = result
|
||||
.get(INQUIRY_ID_RESULT_KEY)
|
||||
.and_then(|value| value.as_i64())
|
||||
.ok_or_else(|| anyhow::anyhow!("Execution {} missing __inquiry_id", execution_id))?;
|
||||
|
||||
Self::set_inquiry_result_metadata(&mut result, inquiry_id, true)?;
|
||||
ExecutionRepository::update(
|
||||
pool,
|
||||
execution_id,
|
||||
UpdateExecutionInput {
|
||||
result: Some(result),
|
||||
..Default::default()
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Handle an inquiry response message
|
||||
async fn handle_inquiry_response(
|
||||
pool: &PgPool,
|
||||
@@ -235,9 +367,13 @@ impl InquiryHandler {
|
||||
if let Some(obj) = updated_result.as_object_mut() {
|
||||
obj.insert("__inquiry_response".to_string(), response.clone());
|
||||
obj.insert(
|
||||
"__inquiry_id".to_string(),
|
||||
INQUIRY_ID_RESULT_KEY.to_string(),
|
||||
JsonValue::Number(inquiry.id.into()),
|
||||
);
|
||||
obj.insert(
|
||||
INQUIRY_CREATED_PUBLISHED_RESULT_KEY.to_string(),
|
||||
JsonValue::Bool(true),
|
||||
);
|
||||
}
|
||||
|
||||
// Update execution with new result
|
||||
|
||||
@@ -933,8 +933,8 @@ mod tests {
|
||||
assert_eq!(enforcer.get_concurrency_limit(2, Some(200)), Some(20));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_build_parameter_group_key_uses_exact_values() {
|
||||
#[tokio::test]
|
||||
async fn test_build_parameter_group_key_uses_exact_values() {
|
||||
let pool = sqlx::PgPool::connect_lazy("postgresql://localhost/test").unwrap();
|
||||
let enforcer = PolicyEnforcer::new(pool);
|
||||
let config = serde_json::json!({
|
||||
|
||||
@@ -23,7 +23,13 @@ use tokio::time::{sleep, Duration, Instant};
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
use attune_common::models::Id;
|
||||
use attune_common::repositories::queue_stats::{QueueStatsRepository, UpsertQueueStatsInput};
|
||||
use attune_common::repositories::{
|
||||
execution_admission::{
|
||||
AdmissionEnqueueOutcome, AdmissionQueueStats, AdmissionQueuedRemovalOutcome,
|
||||
AdmissionSlotReleaseOutcome, ExecutionAdmissionRepository,
|
||||
},
|
||||
queue_stats::{QueueStatsRepository, UpsertQueueStatsInput},
|
||||
};
|
||||
|
||||
/// Configuration for the queue manager
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@@ -51,6 +57,8 @@ impl Default for QueueConfig {
|
||||
struct QueueEntry {
|
||||
/// Execution or enforcement ID being queued
|
||||
execution_id: Id,
|
||||
/// Durable FIFO position for the DB-backed admission path.
|
||||
queue_order: Option<i64>,
|
||||
/// When this entry was added to the queue
|
||||
enqueued_at: DateTime<Utc>,
|
||||
}
|
||||
@@ -224,6 +232,12 @@ impl ExecutionQueueManager {
|
||||
max_concurrent: u32,
|
||||
group_key: Option<String>,
|
||||
) -> Result<()> {
|
||||
if self.db_pool.is_some() {
|
||||
return self
|
||||
.enqueue_and_wait_db(action_id, execution_id, max_concurrent, group_key)
|
||||
.await;
|
||||
}
|
||||
|
||||
if self.active_execution_keys.contains_key(&execution_id) {
|
||||
debug!(
|
||||
"Execution {} already owns an active slot, skipping queue wait",
|
||||
@@ -311,6 +325,7 @@ impl ExecutionQueueManager {
|
||||
// Add to queue
|
||||
let entry = QueueEntry {
|
||||
execution_id,
|
||||
queue_order: None,
|
||||
enqueued_at: Utc::now(),
|
||||
};
|
||||
|
||||
@@ -392,6 +407,24 @@ impl ExecutionQueueManager {
|
||||
max_concurrent: u32,
|
||||
group_key: Option<String>,
|
||||
) -> Result<SlotEnqueueOutcome> {
|
||||
if let Some(pool) = &self.db_pool {
|
||||
return Ok(
|
||||
match ExecutionAdmissionRepository::enqueue(
|
||||
pool,
|
||||
self.config.max_queue_length,
|
||||
action_id,
|
||||
execution_id,
|
||||
max_concurrent,
|
||||
group_key,
|
||||
)
|
||||
.await?
|
||||
{
|
||||
AdmissionEnqueueOutcome::Acquired => SlotEnqueueOutcome::Acquired,
|
||||
AdmissionEnqueueOutcome::Enqueued => SlotEnqueueOutcome::Enqueued,
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
if self.active_execution_keys.contains_key(&execution_id) {
|
||||
debug!(
|
||||
"Execution {} already owns an active slot, treating as acquired",
|
||||
@@ -463,6 +496,7 @@ impl ExecutionQueueManager {
|
||||
|
||||
queue.queue.push_back(QueueEntry {
|
||||
execution_id,
|
||||
queue_order: None,
|
||||
enqueued_at: Utc::now(),
|
||||
});
|
||||
queue.total_enqueued += 1;
|
||||
@@ -480,6 +514,21 @@ impl ExecutionQueueManager {
|
||||
max_concurrent: u32,
|
||||
group_key: Option<String>,
|
||||
) -> Result<SlotAcquireOutcome> {
|
||||
if let Some(pool) = &self.db_pool {
|
||||
let outcome = ExecutionAdmissionRepository::try_acquire(
|
||||
pool,
|
||||
action_id,
|
||||
execution_id,
|
||||
max_concurrent,
|
||||
group_key,
|
||||
)
|
||||
.await?;
|
||||
return Ok(SlotAcquireOutcome {
|
||||
acquired: outcome.acquired,
|
||||
current_count: outcome.current_count,
|
||||
});
|
||||
}
|
||||
|
||||
let queue_key = self.queue_key(action_id, group_key);
|
||||
let queue_arc = self
|
||||
.get_or_create_queue(queue_key.clone(), max_concurrent)
|
||||
@@ -530,6 +579,14 @@ impl ExecutionQueueManager {
|
||||
&self,
|
||||
execution_id: Id,
|
||||
) -> Result<Option<SlotReleaseOutcome>> {
|
||||
if let Some(pool) = &self.db_pool {
|
||||
return Ok(
|
||||
ExecutionAdmissionRepository::release_active_slot(pool, execution_id)
|
||||
.await?
|
||||
.map(Self::map_release_outcome),
|
||||
);
|
||||
}
|
||||
|
||||
let Some((_, queue_key)) = self.active_execution_keys.remove(&execution_id) else {
|
||||
debug!(
|
||||
"No active queue slot found for execution {} (queue may have been cleared)",
|
||||
@@ -610,6 +667,16 @@ impl ExecutionQueueManager {
|
||||
execution_id: Id,
|
||||
outcome: &SlotReleaseOutcome,
|
||||
) -> Result<()> {
|
||||
if let Some(pool) = &self.db_pool {
|
||||
ExecutionAdmissionRepository::restore_active_slot(
|
||||
pool,
|
||||
execution_id,
|
||||
&Self::to_admission_release_outcome(outcome),
|
||||
)
|
||||
.await?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let action_id = outcome.queue_key.action_id;
|
||||
let queue_arc = self.get_or_create_queue(outcome.queue_key.clone(), 1).await;
|
||||
let mut queue = queue_arc.lock().await;
|
||||
@@ -630,6 +697,14 @@ impl ExecutionQueueManager {
|
||||
&self,
|
||||
execution_id: Id,
|
||||
) -> Result<Option<QueuedRemovalOutcome>> {
|
||||
if let Some(pool) = &self.db_pool {
|
||||
return Ok(
|
||||
ExecutionAdmissionRepository::remove_queued_execution(pool, execution_id)
|
||||
.await?
|
||||
.map(Self::map_removal_outcome),
|
||||
);
|
||||
}
|
||||
|
||||
for entry in self.queues.iter() {
|
||||
let queue_key = entry.key().clone();
|
||||
let queue_arc = entry.value().clone();
|
||||
@@ -666,6 +741,15 @@ impl ExecutionQueueManager {
|
||||
}
|
||||
|
||||
pub async fn restore_queued_execution(&self, outcome: &QueuedRemovalOutcome) -> Result<()> {
|
||||
if let Some(pool) = &self.db_pool {
|
||||
ExecutionAdmissionRepository::restore_queued_execution(
|
||||
pool,
|
||||
&Self::to_admission_removal_outcome(outcome),
|
||||
)
|
||||
.await?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let action_id = outcome.queue_key.action_id;
|
||||
let queue_arc = self.get_or_create_queue(outcome.queue_key.clone(), 1).await;
|
||||
let mut queue = queue_arc.lock().await;
|
||||
@@ -709,6 +793,19 @@ impl ExecutionQueueManager {
|
||||
|
||||
/// Get statistics for a specific action's queue
|
||||
pub async fn get_queue_stats(&self, action_id: Id) -> Option<QueueStats> {
|
||||
if let Some(pool) = &self.db_pool {
|
||||
return ExecutionAdmissionRepository::get_queue_stats(pool, action_id)
|
||||
.await
|
||||
.map(|stats| stats.map(Self::map_queue_stats))
|
||||
.unwrap_or_else(|err| {
|
||||
warn!(
|
||||
"Failed to load shared queue stats for action {}: {}",
|
||||
action_id, err
|
||||
);
|
||||
None
|
||||
});
|
||||
}
|
||||
|
||||
let queue_arcs: Vec<Arc<Mutex<ActionQueue>>> = self
|
||||
.queues
|
||||
.iter()
|
||||
@@ -757,6 +854,26 @@ impl ExecutionQueueManager {
|
||||
/// Get statistics for all queues
|
||||
#[allow(dead_code)]
|
||||
pub async fn get_all_queue_stats(&self) -> Vec<QueueStats> {
|
||||
if let Some(pool) = &self.db_pool {
|
||||
return QueueStatsRepository::list_all(pool)
|
||||
.await
|
||||
.map(|stats| {
|
||||
stats
|
||||
.into_iter()
|
||||
.map(|stat| QueueStats {
|
||||
action_id: stat.action_id,
|
||||
queue_length: stat.queue_length as usize,
|
||||
active_count: stat.active_count as u32,
|
||||
max_concurrent: stat.max_concurrent as u32,
|
||||
oldest_enqueued_at: stat.oldest_enqueued_at,
|
||||
total_enqueued: stat.total_enqueued as u64,
|
||||
total_completed: stat.total_completed as u64,
|
||||
})
|
||||
.collect()
|
||||
})
|
||||
.unwrap_or_default();
|
||||
}
|
||||
|
||||
let mut stats = Vec::new();
|
||||
|
||||
let mut action_ids = std::collections::BTreeSet::new();
|
||||
@@ -787,6 +904,14 @@ impl ExecutionQueueManager {
|
||||
/// * `Ok(false)` - Execution not found in queue
|
||||
#[allow(dead_code)]
|
||||
pub async fn cancel_execution(&self, action_id: Id, execution_id: Id) -> Result<bool> {
|
||||
if let Some(pool) = &self.db_pool {
|
||||
return Ok(
|
||||
ExecutionAdmissionRepository::remove_queued_execution(pool, execution_id)
|
||||
.await?
|
||||
.is_some(),
|
||||
);
|
||||
}
|
||||
|
||||
debug!(
|
||||
"Attempting to cancel execution {} for action {}",
|
||||
execution_id, action_id
|
||||
@@ -838,12 +963,147 @@ impl ExecutionQueueManager {
|
||||
/// Get the number of actions with active queues
|
||||
#[allow(dead_code)]
|
||||
pub fn active_queue_count(&self) -> usize {
|
||||
if self.db_pool.is_some() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
self.queues
|
||||
.iter()
|
||||
.map(|entry| entry.key().action_id)
|
||||
.collect::<std::collections::BTreeSet<_>>()
|
||||
.len()
|
||||
}
|
||||
|
||||
async fn enqueue_and_wait_db(
|
||||
&self,
|
||||
action_id: Id,
|
||||
execution_id: Id,
|
||||
max_concurrent: u32,
|
||||
group_key: Option<String>,
|
||||
) -> Result<()> {
|
||||
let pool = self
|
||||
.db_pool
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow::anyhow!("database pool required for shared admission"))?;
|
||||
|
||||
match ExecutionAdmissionRepository::enqueue(
|
||||
pool,
|
||||
self.config.max_queue_length,
|
||||
action_id,
|
||||
execution_id,
|
||||
max_concurrent,
|
||||
group_key.clone(),
|
||||
)
|
||||
.await?
|
||||
{
|
||||
AdmissionEnqueueOutcome::Acquired => return Ok(()),
|
||||
AdmissionEnqueueOutcome::Enqueued => {}
|
||||
}
|
||||
|
||||
let deadline = Instant::now() + Duration::from_secs(self.config.queue_timeout_seconds);
|
||||
loop {
|
||||
sleep(Duration::from_millis(10)).await;
|
||||
|
||||
match ExecutionAdmissionRepository::wait_status(pool, execution_id).await? {
|
||||
Some(true) => return Ok(()),
|
||||
Some(false) => {}
|
||||
None => {
|
||||
return Err(anyhow::anyhow!(
|
||||
"Queue state for execution {} disappeared while waiting",
|
||||
execution_id
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
if Instant::now() < deadline {
|
||||
continue;
|
||||
}
|
||||
|
||||
match ExecutionAdmissionRepository::remove_queued_execution(pool, execution_id).await? {
|
||||
Some(_) => {
|
||||
return Err(anyhow::anyhow!(
|
||||
"Queue timeout for execution {}: waited {} seconds",
|
||||
execution_id,
|
||||
self.config.queue_timeout_seconds
|
||||
));
|
||||
}
|
||||
None => {
|
||||
if matches!(
|
||||
ExecutionAdmissionRepository::wait_status(pool, execution_id).await?,
|
||||
Some(true)
|
||||
) {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
return Err(anyhow::anyhow!(
|
||||
"Queue timeout for execution {}: waited {} seconds",
|
||||
execution_id,
|
||||
self.config.queue_timeout_seconds
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn map_release_outcome(outcome: AdmissionSlotReleaseOutcome) -> SlotReleaseOutcome {
|
||||
SlotReleaseOutcome {
|
||||
next_execution_id: outcome.next_execution_id,
|
||||
queue_key: QueueKey {
|
||||
action_id: outcome.action_id,
|
||||
group_key: outcome.group_key,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn to_admission_release_outcome(outcome: &SlotReleaseOutcome) -> AdmissionSlotReleaseOutcome {
|
||||
AdmissionSlotReleaseOutcome {
|
||||
action_id: outcome.queue_key.action_id,
|
||||
group_key: outcome.queue_key.group_key.clone(),
|
||||
next_execution_id: outcome.next_execution_id,
|
||||
}
|
||||
}
|
||||
|
||||
fn map_removal_outcome(outcome: AdmissionQueuedRemovalOutcome) -> QueuedRemovalOutcome {
|
||||
QueuedRemovalOutcome {
|
||||
next_execution_id: outcome.next_execution_id,
|
||||
queue_key: QueueKey {
|
||||
action_id: outcome.action_id,
|
||||
group_key: outcome.group_key,
|
||||
},
|
||||
removed_entry: QueueEntry {
|
||||
execution_id: outcome.execution_id,
|
||||
queue_order: Some(outcome.queue_order),
|
||||
enqueued_at: outcome.enqueued_at,
|
||||
},
|
||||
removed_index: outcome.removed_index,
|
||||
}
|
||||
}
|
||||
|
||||
fn to_admission_removal_outcome(
|
||||
outcome: &QueuedRemovalOutcome,
|
||||
) -> AdmissionQueuedRemovalOutcome {
|
||||
AdmissionQueuedRemovalOutcome {
|
||||
action_id: outcome.queue_key.action_id,
|
||||
group_key: outcome.queue_key.group_key.clone(),
|
||||
next_execution_id: outcome.next_execution_id,
|
||||
execution_id: outcome.removed_entry.execution_id,
|
||||
queue_order: outcome.removed_entry.queue_order.unwrap_or_default(),
|
||||
enqueued_at: outcome.removed_entry.enqueued_at,
|
||||
removed_index: outcome.removed_index,
|
||||
}
|
||||
}
|
||||
|
||||
fn map_queue_stats(stats: AdmissionQueueStats) -> QueueStats {
|
||||
QueueStats {
|
||||
action_id: stats.action_id,
|
||||
queue_length: stats.queue_length,
|
||||
active_count: stats.active_count,
|
||||
max_concurrent: stats.max_concurrent,
|
||||
oldest_enqueued_at: stats.oldest_enqueued_at,
|
||||
total_enqueued: stats.total_enqueued,
|
||||
total_completed: stats.total_completed,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -25,12 +25,12 @@ use attune_common::{
|
||||
workflow::{
|
||||
CreateWorkflowExecutionInput, WorkflowDefinitionRepository, WorkflowExecutionRepository,
|
||||
},
|
||||
Create, FindById, FindByRef, Update,
|
||||
FindById, FindByRef, Update,
|
||||
},
|
||||
runtime_detection::runtime_aliases_contain,
|
||||
workflow::WorkflowDefinition,
|
||||
};
|
||||
use chrono::Utc;
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value as JsonValue;
|
||||
use sqlx::{PgConnection, PgPool};
|
||||
@@ -102,6 +102,17 @@ struct ExecutionScheduledPayload {
|
||||
worker_id: i64,
|
||||
action_ref: String,
|
||||
config: Option<JsonValue>,
|
||||
scheduled_attempt_updated_at: DateTime<Utc>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct PendingExecutionRequested {
|
||||
execution_id: i64,
|
||||
action_id: i64,
|
||||
action_ref: String,
|
||||
parent_id: i64,
|
||||
enforcement_id: Option<i64>,
|
||||
config: Option<JsonValue>,
|
||||
}
|
||||
|
||||
/// Execution scheduler that routes executions to workers
|
||||
@@ -509,6 +520,7 @@ impl ExecutionScheduler {
|
||||
&worker.id,
|
||||
&envelope.payload.action_ref,
|
||||
&execution_config,
|
||||
scheduled_execution.updated,
|
||||
&action,
|
||||
)
|
||||
.await
|
||||
@@ -1021,13 +1033,13 @@ impl ExecutionScheduler {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn dispatch_workflow_task_with_conn(
|
||||
conn: &mut PgConnection,
|
||||
publisher: &Publisher,
|
||||
_round_robin_counter: &AtomicUsize,
|
||||
parent_execution: &Execution,
|
||||
workflow_execution_id: &i64,
|
||||
task_node: &crate::workflow::graph::TaskNode,
|
||||
wf_ctx: &WorkflowContext,
|
||||
triggered_by: Option<&str>,
|
||||
pending_messages: &mut Vec<PendingExecutionRequested>,
|
||||
) -> Result<()> {
|
||||
let action_ref: String = match &task_node.action {
|
||||
Some(a) => a.clone(),
|
||||
@@ -1059,7 +1071,6 @@ impl ExecutionScheduler {
|
||||
if let Some(ref with_items_expr) = task_node.with_items {
|
||||
return Self::dispatch_with_items_task_with_conn(
|
||||
conn,
|
||||
publisher,
|
||||
parent_execution,
|
||||
workflow_execution_id,
|
||||
task_node,
|
||||
@@ -1068,6 +1079,7 @@ impl ExecutionScheduler {
|
||||
with_items_expr,
|
||||
wf_ctx,
|
||||
triggered_by,
|
||||
pending_messages,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
@@ -1115,36 +1127,29 @@ impl ExecutionScheduler {
|
||||
completed_at: None,
|
||||
};
|
||||
|
||||
let child_execution = if let Some(existing) = ExecutionRepository::find_by_workflow_task(
|
||||
let child_execution_result = ExecutionRepository::create_workflow_task_if_absent_with_conn(
|
||||
&mut *conn,
|
||||
CreateExecutionInput {
|
||||
action: Some(task_action.id),
|
||||
action_ref: action_ref.clone(),
|
||||
config: task_config,
|
||||
env_vars: parent_execution.env_vars.clone(),
|
||||
parent: Some(parent_execution.id),
|
||||
enforcement: parent_execution.enforcement,
|
||||
executor: None,
|
||||
worker: None,
|
||||
status: ExecutionStatus::Requested,
|
||||
result: None,
|
||||
workflow_task: Some(workflow_task),
|
||||
},
|
||||
*workflow_execution_id,
|
||||
&task_node.name,
|
||||
None,
|
||||
)
|
||||
.await?
|
||||
{
|
||||
existing
|
||||
} else {
|
||||
ExecutionRepository::create(
|
||||
&mut *conn,
|
||||
CreateExecutionInput {
|
||||
action: Some(task_action.id),
|
||||
action_ref: action_ref.clone(),
|
||||
config: task_config,
|
||||
env_vars: parent_execution.env_vars.clone(),
|
||||
parent: Some(parent_execution.id),
|
||||
enforcement: parent_execution.enforcement,
|
||||
executor: None,
|
||||
worker: None,
|
||||
status: ExecutionStatus::Requested,
|
||||
result: None,
|
||||
workflow_task: Some(workflow_task),
|
||||
},
|
||||
)
|
||||
.await?
|
||||
};
|
||||
.await?;
|
||||
let child_execution = child_execution_result.execution;
|
||||
|
||||
if child_execution.status == ExecutionStatus::Requested {
|
||||
if child_execution_result.created {
|
||||
info!(
|
||||
"Created child execution {} for workflow task '{}' (action '{}', workflow_execution {})",
|
||||
child_execution.id, task_node.name, action_ref, workflow_execution_id
|
||||
@@ -1157,24 +1162,14 @@ impl ExecutionScheduler {
|
||||
}
|
||||
|
||||
if child_execution.status == ExecutionStatus::Requested {
|
||||
let payload = ExecutionRequestedPayload {
|
||||
pending_messages.push(PendingExecutionRequested {
|
||||
execution_id: child_execution.id,
|
||||
action_id: Some(task_action.id),
|
||||
action_id: task_action.id,
|
||||
action_ref: action_ref.clone(),
|
||||
parent_id: Some(parent_execution.id),
|
||||
parent_id: parent_execution.id,
|
||||
enforcement_id: parent_execution.enforcement,
|
||||
config: child_execution.config.clone(),
|
||||
};
|
||||
|
||||
let envelope = MessageEnvelope::new(MessageType::ExecutionRequested, payload)
|
||||
.with_source("executor-scheduler");
|
||||
|
||||
publisher.publish_envelope(&envelope).await?;
|
||||
|
||||
info!(
|
||||
"Published ExecutionRequested for child execution {} (task '{}')",
|
||||
child_execution.id, task_node.name
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -1392,7 +1387,6 @@ impl ExecutionScheduler {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn dispatch_with_items_task_with_conn(
|
||||
conn: &mut PgConnection,
|
||||
publisher: &Publisher,
|
||||
parent_execution: &Execution,
|
||||
workflow_execution_id: &i64,
|
||||
task_node: &crate::workflow::graph::TaskNode,
|
||||
@@ -1401,6 +1395,7 @@ impl ExecutionScheduler {
|
||||
with_items_expr: &str,
|
||||
wf_ctx: &WorkflowContext,
|
||||
triggered_by: Option<&str>,
|
||||
pending_messages: &mut Vec<PendingExecutionRequested>,
|
||||
) -> Result<()> {
|
||||
let items_value = wf_ctx
|
||||
.render_json(&JsonValue::String(with_items_expr.to_string()))
|
||||
@@ -1511,18 +1506,8 @@ impl ExecutionScheduler {
|
||||
completed_at: None,
|
||||
};
|
||||
|
||||
let child_execution = if let Some(existing) =
|
||||
ExecutionRepository::find_by_workflow_task(
|
||||
&mut *conn,
|
||||
*workflow_execution_id,
|
||||
&task_node.name,
|
||||
Some(index as i32),
|
||||
)
|
||||
.await?
|
||||
{
|
||||
existing
|
||||
} else {
|
||||
ExecutionRepository::create(
|
||||
let child_execution_result =
|
||||
ExecutionRepository::create_workflow_task_if_absent_with_conn(
|
||||
&mut *conn,
|
||||
CreateExecutionInput {
|
||||
action: Some(task_action.id),
|
||||
@@ -1537,11 +1522,14 @@ impl ExecutionScheduler {
|
||||
result: None,
|
||||
workflow_task: Some(workflow_task),
|
||||
},
|
||||
*workflow_execution_id,
|
||||
&task_node.name,
|
||||
Some(index as i32),
|
||||
)
|
||||
.await?
|
||||
};
|
||||
.await?;
|
||||
let child_execution = child_execution_result.execution;
|
||||
|
||||
if child_execution.status == ExecutionStatus::Requested {
|
||||
if child_execution_result.created {
|
||||
info!(
|
||||
"Created with_items child execution {} for task '{}' item {} \
|
||||
(action '{}', workflow_execution {})",
|
||||
@@ -1566,11 +1554,11 @@ impl ExecutionScheduler {
|
||||
if child.status == ExecutionStatus::Requested {
|
||||
Self::publish_execution_requested_with_conn(
|
||||
&mut *conn,
|
||||
publisher,
|
||||
child_id,
|
||||
task_action.id,
|
||||
action_ref,
|
||||
parent_execution,
|
||||
pending_messages,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
@@ -1622,25 +1610,17 @@ impl ExecutionScheduler {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn publish_execution_requested_with_conn(
|
||||
conn: &mut PgConnection,
|
||||
async fn publish_execution_requested_payload(
|
||||
publisher: &Publisher,
|
||||
execution_id: i64,
|
||||
action_id: i64,
|
||||
action_ref: &str,
|
||||
parent_execution: &Execution,
|
||||
pending: PendingExecutionRequested,
|
||||
) -> Result<()> {
|
||||
let child = ExecutionRepository::find_by_id(&mut *conn, execution_id)
|
||||
.await?
|
||||
.ok_or_else(|| anyhow::anyhow!("Execution {} not found", execution_id))?;
|
||||
|
||||
let payload = ExecutionRequestedPayload {
|
||||
execution_id: child.id,
|
||||
action_id: Some(action_id),
|
||||
action_ref: action_ref.to_string(),
|
||||
parent_id: Some(parent_execution.id),
|
||||
enforcement_id: parent_execution.enforcement,
|
||||
config: child.config.clone(),
|
||||
execution_id: pending.execution_id,
|
||||
action_id: Some(pending.action_id),
|
||||
action_ref: pending.action_ref,
|
||||
parent_id: Some(pending.parent_id),
|
||||
enforcement_id: pending.enforcement_id,
|
||||
config: pending.config,
|
||||
};
|
||||
|
||||
let envelope = MessageEnvelope::new(MessageType::ExecutionRequested, payload)
|
||||
@@ -1650,12 +1630,36 @@ impl ExecutionScheduler {
|
||||
|
||||
debug!(
|
||||
"Published deferred ExecutionRequested for child execution {}",
|
||||
execution_id
|
||||
envelope.payload.execution_id
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn publish_execution_requested_with_conn(
|
||||
conn: &mut PgConnection,
|
||||
execution_id: i64,
|
||||
action_id: i64,
|
||||
action_ref: &str,
|
||||
parent_execution: &Execution,
|
||||
pending_messages: &mut Vec<PendingExecutionRequested>,
|
||||
) -> Result<()> {
|
||||
let child = ExecutionRepository::find_by_id(&mut *conn, execution_id)
|
||||
.await?
|
||||
.ok_or_else(|| anyhow::anyhow!("Execution {} not found", execution_id))?;
|
||||
|
||||
pending_messages.push(PendingExecutionRequested {
|
||||
execution_id: child.id,
|
||||
action_id,
|
||||
action_ref: action_ref.to_string(),
|
||||
parent_id: parent_execution.id,
|
||||
enforcement_id: parent_execution.enforcement,
|
||||
config: child.config.clone(),
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Publish the next `Requested`-status with_items siblings to fill freed
|
||||
/// concurrency slots.
|
||||
///
|
||||
@@ -1734,11 +1738,11 @@ impl ExecutionScheduler {
|
||||
|
||||
async fn publish_pending_with_items_children_with_conn(
|
||||
conn: &mut PgConnection,
|
||||
publisher: &Publisher,
|
||||
parent_execution: &Execution,
|
||||
workflow_execution_id: i64,
|
||||
task_name: &str,
|
||||
slots: usize,
|
||||
pending_messages: &mut Vec<PendingExecutionRequested>,
|
||||
) -> Result<usize> {
|
||||
if slots == 0 {
|
||||
return Ok(0);
|
||||
@@ -1768,11 +1772,11 @@ impl ExecutionScheduler {
|
||||
|
||||
if let Err(e) = Self::publish_execution_requested_with_conn(
|
||||
&mut *conn,
|
||||
publisher,
|
||||
*child_id,
|
||||
*action_id,
|
||||
&child.action_ref,
|
||||
parent_execution,
|
||||
pending_messages,
|
||||
)
|
||||
.await
|
||||
{
|
||||
@@ -1819,12 +1823,35 @@ impl ExecutionScheduler {
|
||||
.execute(&mut *lock_conn)
|
||||
.await?;
|
||||
|
||||
let result = Self::advance_workflow_serialized(
|
||||
&mut lock_conn,
|
||||
publisher,
|
||||
round_robin_counter,
|
||||
execution,
|
||||
)
|
||||
let result = async {
|
||||
sqlx::query("BEGIN").execute(&mut *lock_conn).await?;
|
||||
|
||||
let advance_result =
|
||||
Self::advance_workflow_serialized(&mut lock_conn, round_robin_counter, execution)
|
||||
.await;
|
||||
|
||||
match advance_result {
|
||||
Ok(pending_messages) => {
|
||||
sqlx::query("COMMIT").execute(&mut *lock_conn).await?;
|
||||
|
||||
for pending in pending_messages {
|
||||
Self::publish_execution_requested_payload(publisher, pending).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
Err(err) => {
|
||||
let rollback_result = sqlx::query("ROLLBACK").execute(&mut *lock_conn).await;
|
||||
if let Err(rollback_err) = rollback_result {
|
||||
error!(
|
||||
"Failed to roll back workflow_execution {} advancement transaction: {}",
|
||||
workflow_execution_id, rollback_err
|
||||
);
|
||||
}
|
||||
Err(err)
|
||||
}
|
||||
}
|
||||
}
|
||||
.await;
|
||||
let unlock_result = sqlx::query("SELECT pg_advisory_unlock($1)")
|
||||
.bind(workflow_execution_id)
|
||||
@@ -1838,13 +1865,12 @@ impl ExecutionScheduler {
|
||||
|
||||
async fn advance_workflow_serialized(
|
||||
conn: &mut PgConnection,
|
||||
publisher: &Publisher,
|
||||
round_robin_counter: &AtomicUsize,
|
||||
execution: &Execution,
|
||||
) -> Result<()> {
|
||||
) -> Result<Vec<PendingExecutionRequested>> {
|
||||
let workflow_task = match &execution.workflow_task {
|
||||
Some(wt) => wt,
|
||||
None => return Ok(()), // Not a workflow task, nothing to do
|
||||
None => return Ok(vec![]), // Not a workflow task, nothing to do
|
||||
};
|
||||
|
||||
let workflow_execution_id = workflow_task.workflow_execution;
|
||||
@@ -1867,7 +1893,7 @@ impl ExecutionScheduler {
|
||||
|
||||
// Load the workflow execution record
|
||||
let workflow_execution =
|
||||
WorkflowExecutionRepository::find_by_id(&mut *conn, workflow_execution_id)
|
||||
WorkflowExecutionRepository::find_by_id_for_update(&mut *conn, workflow_execution_id)
|
||||
.await?
|
||||
.ok_or_else(|| {
|
||||
anyhow::anyhow!("Workflow execution {} not found", workflow_execution_id)
|
||||
@@ -1882,9 +1908,11 @@ impl ExecutionScheduler {
|
||||
"Workflow execution {} already in terminal state {:?}, skipping advance",
|
||||
workflow_execution_id, workflow_execution.status
|
||||
);
|
||||
return Ok(());
|
||||
return Ok(vec![]);
|
||||
}
|
||||
|
||||
let mut pending_messages = Vec::new();
|
||||
|
||||
let parent_execution =
|
||||
ExecutionRepository::find_by_id(&mut *conn, workflow_execution.execution)
|
||||
.await?
|
||||
@@ -1944,7 +1972,7 @@ impl ExecutionScheduler {
|
||||
);
|
||||
}
|
||||
|
||||
return Ok(());
|
||||
return Ok(pending_messages);
|
||||
}
|
||||
|
||||
// Load the workflow definition so we can apply param_schema defaults
|
||||
@@ -2021,11 +2049,11 @@ impl ExecutionScheduler {
|
||||
if free_slots > 0 {
|
||||
if let Err(e) = Self::publish_pending_with_items_children_with_conn(
|
||||
&mut *conn,
|
||||
publisher,
|
||||
&parent_for_pending,
|
||||
workflow_execution_id,
|
||||
task_name,
|
||||
free_slots,
|
||||
&mut pending_messages,
|
||||
)
|
||||
.await
|
||||
{
|
||||
@@ -2060,7 +2088,7 @@ impl ExecutionScheduler {
|
||||
workflow_task.task_index.unwrap_or(-1),
|
||||
siblings_remaining.len(),
|
||||
);
|
||||
return Ok(());
|
||||
return Ok(pending_messages);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------
|
||||
@@ -2093,7 +2121,7 @@ impl ExecutionScheduler {
|
||||
another advance_workflow call already handled final completion, skipping",
|
||||
task_name,
|
||||
);
|
||||
return Ok(());
|
||||
return Ok(pending_messages);
|
||||
}
|
||||
|
||||
// All items done — check if any failed
|
||||
@@ -2280,13 +2308,13 @@ impl ExecutionScheduler {
|
||||
if let Some(task_node) = graph.get_task(next_task_name) {
|
||||
if let Err(e) = Self::dispatch_workflow_task_with_conn(
|
||||
&mut *conn,
|
||||
publisher,
|
||||
round_robin_counter,
|
||||
&parent_execution,
|
||||
&workflow_execution_id,
|
||||
task_node,
|
||||
&wf_ctx,
|
||||
Some(task_name), // predecessor that triggered this task
|
||||
&mut pending_messages,
|
||||
)
|
||||
.await
|
||||
{
|
||||
@@ -2349,7 +2377,7 @@ impl ExecutionScheduler {
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
Ok(pending_messages)
|
||||
}
|
||||
|
||||
/// Count child executions that are still in progress for a workflow.
|
||||
@@ -3139,6 +3167,7 @@ impl ExecutionScheduler {
|
||||
worker_id: &i64,
|
||||
action_ref: &str,
|
||||
config: &Option<JsonValue>,
|
||||
scheduled_attempt_updated_at: DateTime<Utc>,
|
||||
_action: &Action,
|
||||
) -> Result<()> {
|
||||
debug!("Queuing execution {} to worker {}", execution_id, worker_id);
|
||||
@@ -3149,6 +3178,7 @@ impl ExecutionScheduler {
|
||||
worker_id: *worker_id,
|
||||
action_ref: action_ref.to_string(),
|
||||
config: config.clone(),
|
||||
scheduled_attempt_updated_at,
|
||||
};
|
||||
|
||||
let envelope =
|
||||
|
||||
@@ -12,7 +12,10 @@ use anyhow::Result;
|
||||
use attune_common::{
|
||||
models::{enums::ExecutionStatus, Execution},
|
||||
mq::{MessageEnvelope, MessageType, Publisher},
|
||||
repositories::execution::SELECT_COLUMNS as EXECUTION_COLUMNS,
|
||||
repositories::{
|
||||
execution::{UpdateExecutionInput, SELECT_COLUMNS as EXECUTION_COLUMNS},
|
||||
ExecutionRepository,
|
||||
},
|
||||
};
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -178,20 +181,27 @@ impl ExecutionTimeoutMonitor {
|
||||
"original_status": "scheduled"
|
||||
});
|
||||
|
||||
// Update execution status in database
|
||||
sqlx::query(
|
||||
"UPDATE execution
|
||||
SET status = $1,
|
||||
result = $2,
|
||||
updated = NOW()
|
||||
WHERE id = $3",
|
||||
let updated = ExecutionRepository::update_if_status_and_updated_before(
|
||||
&self.pool,
|
||||
execution_id,
|
||||
ExecutionStatus::Scheduled,
|
||||
self.calculate_cutoff_time(),
|
||||
UpdateExecutionInput {
|
||||
status: Some(ExecutionStatus::Failed),
|
||||
result: Some(result.clone()),
|
||||
..Default::default()
|
||||
},
|
||||
)
|
||||
.bind(ExecutionStatus::Failed)
|
||||
.bind(&result)
|
||||
.bind(execution_id)
|
||||
.execute(&self.pool)
|
||||
.await?;
|
||||
|
||||
if updated.is_none() {
|
||||
debug!(
|
||||
"Skipping timeout failure for execution {} because it already left Scheduled or is no longer stale",
|
||||
execution_id
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
info!("Execution {} marked as failed in database", execution_id);
|
||||
|
||||
// Publish completion notification
|
||||
|
||||
@@ -912,6 +912,115 @@ async fn test_queue_stats_persistence() {
|
||||
cleanup_test_data(&pool, pack_id).await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
#[ignore] // Requires database
|
||||
async fn test_release_restore_recovers_active_slot_and_next_queue_head() {
|
||||
let pool = setup_db().await;
|
||||
let timestamp = Utc::now().timestamp();
|
||||
let suffix = format!("restore_release_{}", timestamp);
|
||||
|
||||
let pack_id = create_test_pack(&pool, &suffix).await;
|
||||
let pack_ref = format!("fifo_test_pack_{}", suffix);
|
||||
let action_id = create_test_action(&pool, pack_id, &pack_ref, &suffix).await;
|
||||
let action_ref = format!("fifo_test_action_{}", suffix);
|
||||
|
||||
let manager = ExecutionQueueManager::with_db_pool(QueueConfig::default(), pool.clone());
|
||||
|
||||
let first =
|
||||
create_test_execution(&pool, action_id, &action_ref, ExecutionStatus::Requested).await;
|
||||
let second =
|
||||
create_test_execution(&pool, action_id, &action_ref, ExecutionStatus::Requested).await;
|
||||
let third =
|
||||
create_test_execution(&pool, action_id, &action_ref, ExecutionStatus::Requested).await;
|
||||
|
||||
manager.enqueue(action_id, first, 1, None).await.unwrap();
|
||||
manager.enqueue(action_id, second, 1, None).await.unwrap();
|
||||
manager.enqueue(action_id, third, 1, None).await.unwrap();
|
||||
|
||||
let stats = manager.get_queue_stats(action_id).await.unwrap();
|
||||
assert_eq!(stats.active_count, 1);
|
||||
assert_eq!(stats.queue_length, 2);
|
||||
|
||||
let release = manager
|
||||
.release_active_slot(first)
|
||||
.await
|
||||
.unwrap()
|
||||
.expect("first execution should own an active slot");
|
||||
assert_eq!(release.next_execution_id, Some(second));
|
||||
|
||||
let stats = manager.get_queue_stats(action_id).await.unwrap();
|
||||
assert_eq!(stats.active_count, 1);
|
||||
assert_eq!(stats.queue_length, 1);
|
||||
|
||||
manager.restore_active_slot(first, &release).await.unwrap();
|
||||
|
||||
let stats = manager.get_queue_stats(action_id).await.unwrap();
|
||||
assert_eq!(stats.active_count, 1);
|
||||
assert_eq!(stats.queue_length, 2);
|
||||
assert_eq!(stats.total_completed, 0);
|
||||
|
||||
let next = manager
|
||||
.release_active_slot(first)
|
||||
.await
|
||||
.unwrap()
|
||||
.expect("restored execution should still own the active slot");
|
||||
assert_eq!(next.next_execution_id, Some(second));
|
||||
|
||||
cleanup_test_data(&pool, pack_id).await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
#[ignore] // Requires database
|
||||
async fn test_remove_restore_recovers_queued_execution_position() {
|
||||
let pool = setup_db().await;
|
||||
let timestamp = Utc::now().timestamp();
|
||||
let suffix = format!("restore_queue_{}", timestamp);
|
||||
|
||||
let pack_id = create_test_pack(&pool, &suffix).await;
|
||||
let pack_ref = format!("fifo_test_pack_{}", suffix);
|
||||
let action_id = create_test_action(&pool, pack_id, &pack_ref, &suffix).await;
|
||||
let action_ref = format!("fifo_test_action_{}", suffix);
|
||||
|
||||
let manager = ExecutionQueueManager::with_db_pool(QueueConfig::default(), pool.clone());
|
||||
|
||||
let first =
|
||||
create_test_execution(&pool, action_id, &action_ref, ExecutionStatus::Requested).await;
|
||||
let second =
|
||||
create_test_execution(&pool, action_id, &action_ref, ExecutionStatus::Requested).await;
|
||||
let third =
|
||||
create_test_execution(&pool, action_id, &action_ref, ExecutionStatus::Requested).await;
|
||||
|
||||
manager.enqueue(action_id, first, 1, None).await.unwrap();
|
||||
manager.enqueue(action_id, second, 1, None).await.unwrap();
|
||||
manager.enqueue(action_id, third, 1, None).await.unwrap();
|
||||
|
||||
let removal = manager
|
||||
.remove_queued_execution(second)
|
||||
.await
|
||||
.unwrap()
|
||||
.expect("second execution should be queued");
|
||||
assert_eq!(removal.next_execution_id, None);
|
||||
|
||||
let stats = manager.get_queue_stats(action_id).await.unwrap();
|
||||
assert_eq!(stats.active_count, 1);
|
||||
assert_eq!(stats.queue_length, 1);
|
||||
|
||||
manager.restore_queued_execution(&removal).await.unwrap();
|
||||
|
||||
let stats = manager.get_queue_stats(action_id).await.unwrap();
|
||||
assert_eq!(stats.active_count, 1);
|
||||
assert_eq!(stats.queue_length, 2);
|
||||
|
||||
let release = manager
|
||||
.release_active_slot(first)
|
||||
.await
|
||||
.unwrap()
|
||||
.expect("first execution should own the active slot");
|
||||
assert_eq!(release.next_execution_id, Some(second));
|
||||
|
||||
cleanup_test_data(&pool, pack_id).await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
#[ignore] // Requires database
|
||||
async fn test_queue_full_rejection() {
|
||||
|
||||
Reference in New Issue
Block a user