// Files
// attune/crates/executor/src/execution_manager.rs
//
// 479 lines
// 18 KiB
// Rust
//
//! Execution Manager - Handles execution orchestration and lifecycle events
//!
//! This module is responsible for:
//! - Listening for ExecutionStatusChanged messages from workers
//! - Orchestrating workflow executions (parent-child relationships)
//! - Triggering child executions when parent completes
//! - Handling execution failures and retries
//!
//! ## Ownership Model
//!
//! The Executor owns execution state until it is scheduled to a worker.
//! After scheduling, the Worker owns the state and updates the database directly.
//!
//! - **Executor owns**: Requested → Scheduling → Scheduled
//! - **Worker owns**: Running → Completed/Failed/Cancelled/Timeout
//!
//! The ExecutionManager receives status change notifications for orchestration
//! purposes (e.g., triggering child executions) but does NOT update the database.
use anyhow::Result;
use attune_common::{
models::{enums::ExecutionStatus, Execution},
mq::{
Consumer, ExecutionCancelRequestedPayload, ExecutionRequestedPayload,
ExecutionStatusChangedPayload, MessageEnvelope, MessageType, Publisher,
},
repositories::{
execution::{CreateExecutionInput, ExecutionRepository, UpdateExecutionInput},
workflow::{
UpdateWorkflowExecutionInput, WorkflowDefinitionRepository, WorkflowExecutionRepository,
},
Create, FindById, Update,
},
workflow::{CancellationPolicy, WorkflowDefinition},
};
use sqlx::PgPool;
use std::sync::Arc;
use tracing::{debug, error, info, warn};
/// Execution manager that handles lifecycle and status updates
pub struct ExecutionManager {
    /// Postgres pool used for all orchestration reads and writes.
    pool: PgPool,
    /// Outgoing message-queue publisher (cancel requests, child-execution announcements).
    publisher: Arc<Publisher>,
    /// Incoming consumer subscribed to ExecutionStatusChanged messages.
    consumer: Arc<Consumer>,
}
impl ExecutionManager {
/// Create a new execution manager
pub fn new(pool: PgPool, publisher: Arc<Publisher>, consumer: Arc<Consumer>) -> Self {
Self {
pool,
publisher,
consumer,
}
}
/// Start processing execution status messages
pub async fn start(&self) -> Result<()> {
info!("Starting execution manager");
let pool = self.pool.clone();
let publisher = self.publisher.clone();
// Use the handler pattern to consume messages
self.consumer
.consume_with_handler(
move |envelope: MessageEnvelope<ExecutionStatusChangedPayload>| {
let pool = pool.clone();
let publisher = publisher.clone();
async move {
if let Err(e) =
Self::process_status_change(&pool, &publisher, &envelope).await
{
error!("Error processing status change: {}", e);
// Return error to trigger nack with requeue
return Err(format!("Failed to process status change: {}", e).into());
}
Ok(())
}
},
)
.await?;
Ok(())
}
/// React to one execution status-change notification.
///
/// The worker has already persisted the new status; this method performs
/// orchestration only (cascading cancellation, triggering children) and
/// never writes the execution row itself.
///
/// # Errors
/// Fails if the status string is unknown, the execution row is missing,
/// or a downstream orchestration step fails.
async fn process_status_change(
    pool: &PgPool,
    publisher: &Publisher,
    envelope: &MessageEnvelope<ExecutionStatusChangedPayload>,
) -> Result<()> {
    debug!("Processing execution status change: {:?}", envelope);
    let execution_id = envelope.payload.execution_id;
    let raw_status = &envelope.payload.new_status;
    let status = Self::parse_execution_status(raw_status)?;
    debug!(
        "Received status change notification for execution {}: {}",
        execution_id, raw_status
    );
    // The full row is needed to drive the orchestration decisions below.
    let execution = ExecutionRepository::find_by_id(pool, execution_id)
        .await?
        .ok_or_else(|| anyhow::anyhow!("Execution not found: {}", execution_id))?;
    // Orchestration dispatch; the database was already updated by the worker.
    match status {
        ExecutionStatus::Completed | ExecutionStatus::Failed | ExecutionStatus::Cancelled => {
            info!(
                "Execution {} reached terminal state: {:?}, handling orchestration",
                execution_id, status
            );
            // A cancelled parent first cascades cancellation to any workflow children.
            if matches!(status, ExecutionStatus::Cancelled) {
                Self::handle_workflow_cancellation(pool, publisher, &execution).await?;
            }
            Self::handle_completion(pool, publisher, &execution).await?;
        }
        ExecutionStatus::Canceling => {
            debug!(
                "Execution {} entered canceling state; checking for workflow child cancellation",
                execution_id
            );
            Self::handle_workflow_cancellation(pool, publisher, &execution).await?;
        }
        ExecutionStatus::Running => {
            debug!(
                "Execution {} now running (worker has updated DB)",
                execution_id
            );
        }
        other => {
            debug!(
                "Execution {} status changed to {:?} (no orchestration needed)",
                execution_id, other
            );
        }
    }
    Ok(())
}
/// If `execution` backs a workflow execution, cascade cancellation to its
/// children using the workflow's configured policy; otherwise do nothing.
async fn handle_workflow_cancellation(
    pool: &PgPool,
    publisher: &Publisher,
    execution: &Execution,
) -> Result<()> {
    // Only executions that have a workflow_execution row participate in cascading.
    if WorkflowExecutionRepository::find_by_execution(pool, execution.id)
        .await?
        .is_none()
    {
        return Ok(());
    }
    let policy = Self::resolve_cancellation_policy(pool, execution.id).await;
    Self::cancel_workflow_children_with_policy(pool, publisher, execution.id, policy).await
}
/// Look up the cancellation policy declared by the workflow definition
/// behind `parent_execution_id`.
///
/// Every failure mode (missing workflow execution, missing definition,
/// undeserializable definition JSON) falls back to
/// `CancellationPolicy::default()` so the cancellation cascade can proceed.
async fn resolve_cancellation_policy(
    pool: &PgPool,
    parent_execution_id: i64,
) -> CancellationPolicy {
    let Ok(Some(wf_exec)) =
        WorkflowExecutionRepository::find_by_execution(pool, parent_execution_id).await
    else {
        return CancellationPolicy::default();
    };
    let Ok(Some(wf_def)) =
        WorkflowDefinitionRepository::find_by_id(pool, wf_exec.workflow_def).await
    else {
        return CancellationPolicy::default();
    };
    match serde_json::from_value::<WorkflowDefinition>(wf_def.definition) {
        Ok(def) => def.cancellation_policy,
        Err(e) => {
            warn!(
                "Failed to deserialize workflow definition for workflow_def {}: {}. Falling back to default cancellation policy.",
                wf_exec.workflow_def, e
            );
            CancellationPolicy::default()
        }
    }
}
/// Cascade a cancellation from the workflow parent `parent_execution_id`
/// down through all of its non-terminal children, then recurse into each
/// child's own subtree.
///
/// Behavior per child:
/// - Not yet running (Requested/Scheduling/Scheduled): cancelled directly
///   in the database — no worker owns it yet.
/// - Running/Canceling: depends on `policy`. `CancelRunning` moves the
///   child to Canceling and asks its worker to stop it; `AllowFinish`
///   leaves it alone.
///
/// Finally, the parent's workflow_execution row is marked cancelled (if
/// not already terminal) and the parent execution itself is finalized
/// once no children remain active.
async fn cancel_workflow_children_with_policy(
    pool: &PgPool,
    publisher: &Publisher,
    parent_execution_id: i64,
    policy: CancellationPolicy,
) -> Result<()> {
    // Fetch only children that are still in a non-terminal state.
    let children: Vec<Execution> = sqlx::query_as::<_, Execution>(&format!(
        "SELECT {} FROM execution WHERE parent = $1 AND status NOT IN ('completed', 'failed', 'timeout', 'cancelled', 'abandoned')",
        attune_common::repositories::execution::SELECT_COLUMNS
    ))
    .bind(parent_execution_id)
    .fetch_all(pool)
    .await?;
    // Nothing left to cancel: the parent may already be finalizable.
    if children.is_empty() {
        return Self::finalize_cancelled_workflow_if_idle(pool, parent_execution_id).await;
    }
    info!(
        "Executor cascading cancellation from workflow execution {} to {} child execution(s) with policy {:?}",
        parent_execution_id,
        children.len(),
        policy,
    );
    for child in &children {
        let child_id = child.id;
        if matches!(
            child.status,
            ExecutionStatus::Requested
                | ExecutionStatus::Scheduling
                | ExecutionStatus::Scheduled
        ) {
            // Executor still owns pre-scheduled children, so cancel them directly.
            let update = UpdateExecutionInput {
                status: Some(ExecutionStatus::Cancelled),
                result: Some(serde_json::json!({
                    "error": "Cancelled: parent workflow execution was cancelled"
                })),
                ..Default::default()
            };
            ExecutionRepository::update(pool, child_id, update).await?;
        } else if matches!(
            child.status,
            ExecutionStatus::Running | ExecutionStatus::Canceling
        ) {
            // Worker-owned children: the policy decides whether to interrupt them.
            match policy {
                CancellationPolicy::CancelRunning => {
                    // Avoid a redundant write when the child is already Canceling.
                    if child.status != ExecutionStatus::Canceling {
                        let update = UpdateExecutionInput {
                            status: Some(ExecutionStatus::Canceling),
                            ..Default::default()
                        };
                        ExecutionRepository::update(pool, child_id, update).await?;
                    }
                    if let Some(worker_id) = child.worker {
                        Self::send_cancel_to_worker(publisher, child_id, worker_id).await?;
                    } else {
                        // NOTE(review): a Running child without a worker suggests an
                        // inconsistent row; we only warn and let the cascade continue.
                        warn!(
                            "Workflow child execution {} is {:?} but has no assigned worker",
                            child_id, child.status
                        );
                    }
                }
                CancellationPolicy::AllowFinish => {
                    info!(
                        "AllowFinish policy: leaving running workflow child execution {} alone",
                        child_id
                    );
                }
            }
        }
        // Recurse so grandchildren are also cancelled; Box::pin is required
        // because async recursion produces an infinitely-sized future otherwise.
        Box::pin(Self::cancel_workflow_children_with_policy(
            pool, publisher, child_id, policy,
        ))
        .await?;
    }
    // Mark the parent's workflow_execution row cancelled unless it already
    // reached a terminal state.
    if let Some(wf_exec) =
        WorkflowExecutionRepository::find_by_execution(pool, parent_execution_id).await?
    {
        if !matches!(
            wf_exec.status,
            ExecutionStatus::Completed | ExecutionStatus::Failed | ExecutionStatus::Cancelled
        ) {
            let wf_update = UpdateWorkflowExecutionInput {
                status: Some(ExecutionStatus::Cancelled),
                error_message: Some(
                    "Cancelled: parent workflow execution was cancelled".to_string(),
                ),
                current_tasks: Some(vec![]),
                ..Default::default()
            };
            WorkflowExecutionRepository::update(pool, wf_exec.id, wf_update).await?;
        }
    }
    // Finally, close out the parent execution itself if nothing is active anymore.
    Self::finalize_cancelled_workflow_if_idle(pool, parent_execution_id).await
}
/// Mark the parent execution as Cancelled once none of its children are
/// active anymore.
///
/// If any child is still in an active state (running, canceling, or any
/// pre-scheduled state), the parent is left untouched so the last child's
/// own status change can trigger finalization later.
///
/// # Errors
/// Propagates database query/update failures.
async fn finalize_cancelled_workflow_if_idle(
    pool: &PgPool,
    parent_execution_id: i64,
) -> Result<()> {
    let still_running: Vec<Execution> = sqlx::query_as::<_, Execution>(&format!(
        "SELECT {} FROM execution WHERE parent = $1 AND status IN ('running', 'canceling', 'scheduling', 'scheduled', 'requested')",
        attune_common::repositories::execution::SELECT_COLUMNS
    ))
    .bind(parent_execution_id)
    .fetch_all(pool)
    .await?;
    if still_running.is_empty() {
        let update = UpdateExecutionInput {
            status: Some(ExecutionStatus::Cancelled),
            result: Some(serde_json::json!({
                "error": "Workflow cancelled",
                "succeeded": false,
            })),
            ..Default::default()
        };
        // The updated row is not needed here; `?` already unwraps the Result.
        ExecutionRepository::update(pool, parent_execution_id, update).await?;
    }
    Ok(())
}
/// Publish an `ExecutionCancelRequested` message addressed to the worker
/// currently responsible for `execution_id`.
async fn send_cancel_to_worker(
    publisher: &Publisher,
    execution_id: i64,
    worker_id: i64,
) -> Result<()> {
    // Route directly to the owning worker's cancel queue.
    let routing_key = format!("execution.cancel.worker.{}", worker_id);
    let envelope = MessageEnvelope::new(
        MessageType::ExecutionCancelRequested,
        ExecutionCancelRequestedPayload {
            execution_id,
            worker_id,
        },
    )
    .with_source("executor-service")
    .with_correlation_id(uuid::Uuid::new_v4());
    publisher
        .publish_envelope_with_routing(&envelope, "attune.executions", &routing_key)
        .await?;
    Ok(())
}
/// Map a status string (case-insensitive) to its `ExecutionStatus` variant.
///
/// Both the "cancelled" and "canceled" spellings are accepted.
///
/// # Errors
/// Returns an error carrying the original input when the string matches
/// no known status.
fn parse_execution_status(status: &str) -> Result<ExecutionStatus> {
    let normalized = status.to_lowercase();
    let parsed = match normalized.as_str() {
        "requested" => ExecutionStatus::Requested,
        "scheduling" => ExecutionStatus::Scheduling,
        "scheduled" => ExecutionStatus::Scheduled,
        "running" => ExecutionStatus::Running,
        "completed" => ExecutionStatus::Completed,
        "failed" => ExecutionStatus::Failed,
        "cancelled" | "canceled" => ExecutionStatus::Cancelled,
        "canceling" => ExecutionStatus::Canceling,
        "abandoned" => ExecutionStatus::Abandoned,
        "timeout" => ExecutionStatus::Timeout,
        _ => return Err(anyhow::anyhow!("Invalid execution status: {}", status)),
    };
    Ok(parsed)
}
/// Orchestration hook for an execution that reached a terminal state.
///
/// Child executions (when any are defined) are triggered only on success;
/// failure and cancellation skip them with a warning. The completion
/// notification itself is published by the worker — never here — so the
/// queue manager's active_count is not decremented twice.
async fn handle_completion(
    pool: &PgPool,
    publisher: &Publisher,
    execution: &Execution,
) -> Result<()> {
    info!("Handling completion for execution: {}", execution.id);
    // No children defined: nothing left to orchestrate.
    let Some(child_actions) = Self::get_child_actions(execution).await? else {
        return Ok(());
    };
    if execution.status == ExecutionStatus::Completed {
        Self::trigger_child_executions(pool, publisher, execution, &child_actions).await?;
    } else {
        warn!(
            "Execution {} failed/canceled, skipping child executions",
            execution.id
        );
    }
    Ok(())
}
/// Determine which child actions, if any, should follow `execution`.
///
/// Workflow support is not implemented yet, so this always returns
/// `Ok(None)` and no children are ever triggered.
///
/// TODO: inspect the action's workflow definition and the execution
/// result to extract the next actions.
async fn get_child_actions(_execution: &Execution) -> Result<Option<Vec<String>>> {
    Ok(None)
}
/// Create and announce one child execution per entry in `child_actions`,
/// each linked to the completed `parent`.
///
/// Every child inherits the parent's config, env vars and enforcement,
/// is persisted in the `Requested` state, and is then announced via an
/// `ExecutionRequested` message so scheduling can pick it up.
async fn trigger_child_executions(
    pool: &PgPool,
    publisher: &Publisher,
    parent: &Execution,
    child_actions: &[String],
) -> Result<()> {
    info!(
        "Triggering {} child executions for parent: {}",
        child_actions.len(),
        parent.id
    );
    for child_ref in child_actions {
        // Persist the child first so it has an id before any message goes out.
        let input = CreateExecutionInput {
            action: None,
            action_ref: child_ref.clone(),
            config: parent.config.clone(),     // inherit parent config
            env_vars: parent.env_vars.clone(), // inherit parent env vars
            parent: Some(parent.id),           // link back to the parent execution
            enforcement: parent.enforcement,
            executor: None, // assigned later during scheduling
            worker: None,
            status: ExecutionStatus::Requested,
            result: None,
            workflow_task: None, // plain (non-workflow) execution
        };
        let child = ExecutionRepository::create(pool, input).await?;
        info!(
            "Created child execution {} for parent {}",
            child.id, parent.id
        );
        // Announce the new child so the scheduler can take over.
        let envelope = MessageEnvelope::new(
            MessageType::ExecutionRequested,
            ExecutionRequestedPayload {
                execution_id: child.id,
                action_id: None, // child executions typically don't have action_id set yet
                action_ref: child_ref.clone(),
                parent_id: Some(parent.id),
                enforcement_id: None,
                config: None,
            },
        )
        .with_source("executor");
        publisher.publish_envelope(&envelope).await?;
    }
    Ok(())
}
// REMOVED: publish_completion_notification
// This method was causing duplicate execution.completed messages.
// The worker is responsible for publishing completion notifications,
// not the executor. Removing this prevents double-decrementing the
// queue manager's active_count.
}
#[cfg(test)]
mod tests {
    use super::ExecutionManager;
    use attune_common::models::enums::ExecutionStatus;

    /// Placeholder: constructing an ExecutionManager requires a live
    /// database pool and message-queue handles, so creation is covered
    /// by integration tests rather than unit tests.
    #[test]
    fn test_execution_manager_creation() {}

    /// Every known status string parses to the matching variant,
    /// including both "cancelled" spellings.
    #[test]
    fn test_parse_execution_status() {
        use ExecutionStatus as S;
        assert!(matches!(
            ExecutionManager::parse_execution_status("requested"),
            Ok(S::Requested)
        ));
        assert!(matches!(
            ExecutionManager::parse_execution_status("scheduling"),
            Ok(S::Scheduling)
        ));
        assert!(matches!(
            ExecutionManager::parse_execution_status("scheduled"),
            Ok(S::Scheduled)
        ));
        assert!(matches!(
            ExecutionManager::parse_execution_status("running"),
            Ok(S::Running)
        ));
        assert!(matches!(
            ExecutionManager::parse_execution_status("completed"),
            Ok(S::Completed)
        ));
        assert!(matches!(
            ExecutionManager::parse_execution_status("failed"),
            Ok(S::Failed)
        ));
        assert!(matches!(
            ExecutionManager::parse_execution_status("cancelled"),
            Ok(S::Cancelled)
        ));
        assert!(matches!(
            ExecutionManager::parse_execution_status("canceled"),
            Ok(S::Cancelled)
        ));
        assert!(matches!(
            ExecutionManager::parse_execution_status("canceling"),
            Ok(S::Canceling)
        ));
        assert!(matches!(
            ExecutionManager::parse_execution_status("abandoned"),
            Ok(S::Abandoned)
        ));
        assert!(matches!(
            ExecutionManager::parse_execution_status("timeout"),
            Ok(S::Timeout)
        ));
    }

    /// Parsing is case-insensitive.
    #[test]
    fn test_parse_execution_status_case_insensitive() {
        use ExecutionStatus as S;
        assert!(matches!(
            ExecutionManager::parse_execution_status("RUNNING"),
            Ok(S::Running)
        ));
        assert!(matches!(
            ExecutionManager::parse_execution_status("Completed"),
            Ok(S::Completed)
        ));
    }

    /// Unknown strings are rejected with an error.
    #[test]
    fn test_parse_execution_status_rejects_unknown() {
        assert!(ExecutionManager::parse_execution_status("bogus").is_err());
        assert!(ExecutionManager::parse_execution_status("").is_err());
    }
}