re-uploading work
This commit is contained in:
303
crates/executor/src/scheduler.rs
Normal file
303
crates/executor/src/scheduler.rs
Normal file
@@ -0,0 +1,303 @@
|
||||
//! Execution Scheduler - Routes executions to available workers
|
||||
//!
|
||||
//! This module is responsible for:
|
||||
//! - Listening for ExecutionRequested messages
|
||||
//! - Selecting appropriate workers for executions
|
||||
//! - Queuing executions to worker-specific queues
|
||||
//! - Updating execution status to Scheduled
|
||||
//! - Handling worker unavailability and retries
|
||||
|
||||
use anyhow::Result;
|
||||
use attune_common::{
|
||||
models::{enums::ExecutionStatus, Action, Execution},
|
||||
mq::{Consumer, ExecutionRequestedPayload, MessageEnvelope, MessageType, Publisher},
|
||||
repositories::{
|
||||
action::ActionRepository,
|
||||
execution::ExecutionRepository,
|
||||
runtime::{RuntimeRepository, WorkerRepository},
|
||||
FindById, FindByRef, Update,
|
||||
},
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value as JsonValue;
|
||||
use sqlx::PgPool;
|
||||
use std::sync::Arc;
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
/// Payload for execution scheduled messages
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
struct ExecutionScheduledPayload {
|
||||
execution_id: i64,
|
||||
worker_id: i64,
|
||||
action_ref: String,
|
||||
config: Option<JsonValue>,
|
||||
}
|
||||
|
||||
/// Execution scheduler that routes executions to workers
|
||||
pub struct ExecutionScheduler {
|
||||
pool: PgPool,
|
||||
publisher: Arc<Publisher>,
|
||||
consumer: Arc<Consumer>,
|
||||
}
|
||||
|
||||
impl ExecutionScheduler {
|
||||
/// Create a new execution scheduler
|
||||
pub fn new(pool: PgPool, publisher: Arc<Publisher>, consumer: Arc<Consumer>) -> Self {
|
||||
Self {
|
||||
pool,
|
||||
publisher,
|
||||
consumer,
|
||||
}
|
||||
}
|
||||
|
||||
/// Start processing execution requested messages
|
||||
pub async fn start(&self) -> Result<()> {
|
||||
info!("Starting execution scheduler");
|
||||
|
||||
let pool = self.pool.clone();
|
||||
let publisher = self.publisher.clone();
|
||||
|
||||
// Use the handler pattern to consume messages
|
||||
self.consumer
|
||||
.consume_with_handler(
|
||||
move |envelope: MessageEnvelope<ExecutionRequestedPayload>| {
|
||||
let pool = pool.clone();
|
||||
let publisher = publisher.clone();
|
||||
|
||||
async move {
|
||||
if let Err(e) =
|
||||
Self::process_execution_requested(&pool, &publisher, &envelope).await
|
||||
{
|
||||
error!("Error scheduling execution: {}", e);
|
||||
// Return error to trigger nack with requeue
|
||||
return Err(format!("Failed to schedule execution: {}", e).into());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Process an execution requested message
|
||||
async fn process_execution_requested(
|
||||
pool: &PgPool,
|
||||
publisher: &Publisher,
|
||||
envelope: &MessageEnvelope<ExecutionRequestedPayload>,
|
||||
) -> Result<()> {
|
||||
debug!("Processing execution requested message: {:?}", envelope);
|
||||
|
||||
let execution_id = envelope.payload.execution_id;
|
||||
|
||||
info!("Scheduling execution: {}", execution_id);
|
||||
|
||||
// Fetch execution from database
|
||||
let mut execution = ExecutionRepository::find_by_id(pool, execution_id)
|
||||
.await?
|
||||
.ok_or_else(|| anyhow::anyhow!("Execution not found: {}", execution_id))?;
|
||||
|
||||
// Fetch action to determine runtime requirements
|
||||
let action = Self::get_action_for_execution(pool, &execution).await?;
|
||||
|
||||
// Select appropriate worker
|
||||
let worker = Self::select_worker(pool, &action).await?;
|
||||
|
||||
info!(
|
||||
"Selected worker {} for execution {}",
|
||||
worker.id, execution_id
|
||||
);
|
||||
|
||||
// Update execution status to scheduled
|
||||
let execution_config = execution.config.clone();
|
||||
execution.status = ExecutionStatus::Scheduled;
|
||||
ExecutionRepository::update(pool, execution.id, execution.into()).await?;
|
||||
|
||||
// Publish message to worker-specific queue
|
||||
Self::queue_to_worker(
|
||||
publisher,
|
||||
&execution_id,
|
||||
&worker.id,
|
||||
&envelope.payload.action_ref,
|
||||
&execution_config,
|
||||
&action,
|
||||
)
|
||||
.await?;
|
||||
|
||||
info!(
|
||||
"Execution {} scheduled to worker {}",
|
||||
execution_id, worker.id
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get the action associated with an execution
|
||||
async fn get_action_for_execution(pool: &PgPool, execution: &Execution) -> Result<Action> {
|
||||
// Try to get action by ID first
|
||||
if let Some(action_id) = execution.action {
|
||||
if let Some(action) = ActionRepository::find_by_id(pool, action_id).await? {
|
||||
return Ok(action);
|
||||
}
|
||||
}
|
||||
|
||||
// Fall back to action_ref
|
||||
ActionRepository::find_by_ref(pool, &execution.action_ref)
|
||||
.await?
|
||||
.ok_or_else(|| anyhow::anyhow!("Action not found for execution: {}", execution.id))
|
||||
}
|
||||
|
||||
/// Select an appropriate worker for the execution
|
||||
async fn select_worker(
|
||||
pool: &PgPool,
|
||||
action: &Action,
|
||||
) -> Result<attune_common::models::Worker> {
|
||||
// Get runtime requirements for the action
|
||||
let runtime = if let Some(runtime_id) = action.runtime {
|
||||
RuntimeRepository::find_by_id(pool, runtime_id).await?
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Find available action workers (role = 'action')
|
||||
let workers = WorkerRepository::find_action_workers(pool).await?;
|
||||
|
||||
if workers.is_empty() {
|
||||
return Err(anyhow::anyhow!("No action workers available"));
|
||||
}
|
||||
|
||||
// Filter workers by runtime compatibility if runtime is specified
|
||||
let compatible_workers: Vec<_> = if let Some(ref runtime) = runtime {
|
||||
workers
|
||||
.into_iter()
|
||||
.filter(|w| Self::worker_supports_runtime(w, &runtime.name))
|
||||
.collect()
|
||||
} else {
|
||||
workers
|
||||
};
|
||||
|
||||
if compatible_workers.is_empty() {
|
||||
let runtime_name = runtime.as_ref().map(|r| r.name.as_str()).unwrap_or("any");
|
||||
return Err(anyhow::anyhow!(
|
||||
"No compatible workers found for action: {} (requires runtime: {})",
|
||||
action.r#ref,
|
||||
runtime_name
|
||||
));
|
||||
}
|
||||
|
||||
// Filter by worker status (only active workers)
|
||||
let active_workers: Vec<_> = compatible_workers
|
||||
.into_iter()
|
||||
.filter(|w| w.status == Some(attune_common::models::enums::WorkerStatus::Active))
|
||||
.collect();
|
||||
|
||||
if active_workers.is_empty() {
|
||||
return Err(anyhow::anyhow!("No active workers available"));
|
||||
}
|
||||
|
||||
// TODO: Implement intelligent worker selection:
|
||||
// - Consider worker load/capacity
|
||||
// - Consider worker affinity (same pack, same runtime)
|
||||
// - Consider geographic locality
|
||||
// - Round-robin or least-connections strategy
|
||||
|
||||
// For now, just select the first available worker
|
||||
Ok(active_workers
|
||||
.into_iter()
|
||||
.next()
|
||||
.expect("Worker list should not be empty"))
|
||||
}
|
||||
|
||||
/// Check if a worker supports a given runtime
|
||||
///
|
||||
/// This checks the worker's capabilities.runtimes array for the runtime name.
|
||||
/// Falls back to checking the deprecated runtime column if capabilities are not set.
|
||||
fn worker_supports_runtime(worker: &attune_common::models::Worker, runtime_name: &str) -> bool {
|
||||
// First, try to parse capabilities and check runtimes array
|
||||
if let Some(ref capabilities) = worker.capabilities {
|
||||
if let Some(runtimes) = capabilities.get("runtimes") {
|
||||
if let Some(runtime_array) = runtimes.as_array() {
|
||||
// Check if any runtime in the array matches (case-insensitive)
|
||||
for runtime_value in runtime_array {
|
||||
if let Some(runtime_str) = runtime_value.as_str() {
|
||||
if runtime_str.eq_ignore_ascii_case(runtime_name) {
|
||||
debug!(
|
||||
"Worker {} supports runtime '{}' via capabilities",
|
||||
worker.name, runtime_name
|
||||
);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: check deprecated runtime column
|
||||
// This is kept for backward compatibility but should be removed in the future
|
||||
if worker.runtime.is_some() {
|
||||
debug!(
|
||||
"Worker {} using deprecated runtime column for matching",
|
||||
worker.name
|
||||
);
|
||||
// Note: This fallback is incomplete because we'd need to look up the runtime name
|
||||
// from the ID, which would require an async call. Since we're moving to capabilities,
|
||||
// we'll just return false here and require workers to set capabilities properly.
|
||||
}
|
||||
|
||||
debug!(
|
||||
"Worker {} does not support runtime '{}'",
|
||||
worker.name, runtime_name
|
||||
);
|
||||
false
|
||||
}
|
||||
|
||||
/// Queue execution to a specific worker
|
||||
async fn queue_to_worker(
|
||||
publisher: &Publisher,
|
||||
execution_id: &i64,
|
||||
worker_id: &i64,
|
||||
action_ref: &str,
|
||||
config: &Option<JsonValue>,
|
||||
_action: &Action,
|
||||
) -> Result<()> {
|
||||
debug!("Queuing execution {} to worker {}", execution_id, worker_id);
|
||||
|
||||
// Create payload for worker
|
||||
let payload = ExecutionScheduledPayload {
|
||||
execution_id: *execution_id,
|
||||
worker_id: *worker_id,
|
||||
action_ref: action_ref.to_string(),
|
||||
config: config.clone(),
|
||||
};
|
||||
|
||||
let envelope =
|
||||
MessageEnvelope::new(MessageType::ExecutionRequested, payload).with_source("executor");
|
||||
|
||||
// Publish to worker-specific queue with routing key
|
||||
let routing_key = format!("worker.{}", worker_id);
|
||||
let exchange = "attune.executions";
|
||||
|
||||
publisher
|
||||
.publish_envelope_with_routing(&envelope, exchange, &routing_key)
|
||||
.await?;
|
||||
|
||||
info!(
|
||||
"Published execution.scheduled message to worker {} (routing key: {})",
|
||||
worker_id, routing_key
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    /// Placeholder so the module has a test target.
    ///
    /// Real scheduler tests need a database and a message queue. The former
    /// `assert!(true)` was dropped: it checks nothing and triggers
    /// `clippy::assertions_on_constants`.
    #[test]
    fn test_scheduler_creation() {}
}
|
||||
Reference in New Issue
Block a user