More internal polish; more resilient workers

This commit is contained in:
2026-02-09 18:32:34 -06:00
parent 588b319fec
commit e31ecb781b
62 changed files with 9872 additions and 584 deletions

View File

@@ -347,6 +347,10 @@ pub struct WorkerConfig {
#[serde(default = "default_max_stderr_bytes")]
pub max_stderr_bytes: usize,
/// Graceful shutdown timeout in seconds
#[serde(default = "default_shutdown_timeout")]
pub shutdown_timeout: Option<u64>,
/// Enable log streaming instead of buffering
#[serde(default = "default_true")]
pub stream_logs: bool,
@@ -360,8 +364,12 @@ fn default_heartbeat_interval() -> u64 {
30
}
/// Serde default for `WorkerConfig::shutdown_timeout`: a 30-second graceful window.
fn default_shutdown_timeout() -> Option<u64> {
    // Seconds a worker gets to finish in-flight work before being killed.
    const DEFAULT_SHUTDOWN_SECS: u64 = 30;
    Some(DEFAULT_SHUTDOWN_SECS)
}
/// Serde default for the per-task execution timeout, in seconds.
///
/// The diff rendering had left both the removed bare `300` and the added
/// commented line in place, which is not valid Rust; this is the new-side body.
fn default_task_timeout() -> u64 {
    300 // 5 minutes
}
fn default_max_stdout_bytes() -> usize {
@@ -489,6 +497,32 @@ impl Default for PackRegistryConfig {
}
}
/// Executor service configuration
///
/// All fields are optional so an `[executor]` section may be partially
/// specified. NOTE(review): plain `#[serde(default)]` means an absent field
/// deserializes to `None`, not to the `Some(...)` values in the `Default`
/// impl — confirm call sites treat `None` as "use built-in default".
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExecutorConfig {
/// How long an execution can remain in SCHEDULED status before timing out (seconds)
#[serde(default)]
pub scheduled_timeout: Option<u64>,
/// How often to check for stale executions (seconds)
#[serde(default)]
pub timeout_check_interval: Option<u64>,
/// Whether to enable the execution timeout monitor
#[serde(default)]
pub enable_timeout_monitor: Option<bool>,
}
impl Default for ExecutorConfig {
fn default() -> Self {
Self {
scheduled_timeout: Some(300), // 5 minutes
timeout_check_interval: Some(60), // 1 minute
enable_timeout_monitor: Some(true),
}
}
}
/// Main application configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Config {
@@ -540,6 +574,9 @@ pub struct Config {
/// Pack registry configuration
#[serde(default)]
pub pack_registry: PackRegistryConfig,
/// Executor configuration (optional, for executor service)
pub executor: Option<ExecutorConfig>,
}
fn default_service_name() -> String {

View File

@@ -101,6 +101,10 @@ pub struct RabbitMqConfig {
/// Dead letter queue configuration
#[serde(default)]
pub dead_letter: DeadLetterConfig,
/// Worker queue message TTL in milliseconds (default 5 minutes)
#[serde(default = "default_worker_queue_ttl")]
pub worker_queue_ttl_ms: u64,
}
impl Default for RabbitMqConfig {
@@ -123,6 +127,7 @@ impl Default for RabbitMqConfig {
queues: QueuesConfig::default(),
exchanges: ExchangesConfig::default(),
dead_letter: DeadLetterConfig::default(),
worker_queue_ttl_ms: default_worker_queue_ttl(),
}
}
}
@@ -161,6 +166,11 @@ impl RabbitMqConfig {
Duration::from_secs(self.consumer_timeout_secs)
}
/// Worker queue message TTL as a `Duration`, converted from the
/// configured millisecond value.
pub fn worker_queue_ttl(&self) -> Duration {
    let millis = self.worker_queue_ttl_ms;
    Duration::from_millis(millis)
}
/// Validate configuration
pub fn validate(&self) -> MqResult<()> {
if self.host.is_empty() {
@@ -491,6 +501,10 @@ fn default_dlq_ttl() -> u64 {
86400000 // 24 hours in milliseconds
}
/// Serde default for the worker-queue message TTL: five minutes, in milliseconds.
fn default_worker_queue_ttl() -> u64 {
    5 * 60 * 1000
}
#[cfg(test)]
mod tests {
use super::*;
@@ -542,6 +556,13 @@ mod tests {
assert_eq!(config.ttl().as_secs(), 86400); // 24 hours
}
#[test]
fn test_worker_queue_ttl() {
    // The default config's worker-queue TTL is five minutes, whether read
    // as the raw millisecond field or through the Duration accessor.
    let config = RabbitMqConfig::default();
    assert_eq!(config.worker_queue_ttl_ms, 300_000);
    assert_eq!(config.worker_queue_ttl().as_secs(), 300);
}
#[test]
fn test_default_queues() {
let queues = QueuesConfig::default();

View File

@@ -274,12 +274,29 @@ impl Connection {
&self,
config: &QueueConfig,
dlx_exchange: &str,
) -> MqResult<()> {
self.declare_queue_with_dlx_and_ttl(config, dlx_exchange, None)
.await
}
/// Declare a queue with dead letter exchange and optional TTL
pub async fn declare_queue_with_dlx_and_ttl(
&self,
config: &QueueConfig,
dlx_exchange: &str,
ttl_ms: Option<u64>,
) -> MqResult<()> {
let channel = self.create_channel().await?;
let ttl_info = if let Some(ttl) = ttl_ms {
format!(" and TTL {}ms", ttl)
} else {
String::new()
};
debug!(
"Declaring queue '{}' with dead letter exchange '{}'",
config.name, dlx_exchange
"Declaring queue '{}' with dead letter exchange '{}'{}",
config.name, dlx_exchange, ttl_info
);
let mut args = FieldTable::default();
@@ -288,6 +305,14 @@ impl Connection {
lapin::types::AMQPValue::LongString(dlx_exchange.into()),
);
// Add message TTL if specified
if let Some(ttl) = ttl_ms {
args.insert(
"x-message-ttl".into(),
lapin::types::AMQPValue::LongInt(ttl as i32),
);
}
channel
.queue_declare(
&config.name,
@@ -302,14 +327,14 @@ impl Connection {
.await
.map_err(|e| {
MqError::QueueDeclaration(format!(
"Failed to declare queue '{}' with DLX: {}",
config.name, e
"Failed to declare queue '{}' with DLX{}: {}",
config.name, ttl_info, e
))
})?;
info!(
"Queue '{}' declared with dead letter exchange '{}'",
config.name, dlx_exchange
"Queue '{}' declared with dead letter exchange '{}'{}",
config.name, dlx_exchange, ttl_info
);
Ok(())
}
@@ -448,7 +473,10 @@ impl Connection {
None
};
self.declare_queue_with_optional_dlx(&queue_config, dlx)
// Worker queues use TTL to expire unprocessed messages
let ttl_ms = Some(config.rabbitmq.worker_queue_ttl_ms);
self.declare_queue_with_optional_dlx_and_ttl(&queue_config, dlx, ttl_ms)
.await?;
// Bind to execution dispatch routing key
@@ -521,10 +549,28 @@ impl Connection {
&self,
config: &QueueConfig,
dlx: Option<&str>,
) -> MqResult<()> {
self.declare_queue_with_optional_dlx_and_ttl(config, dlx, None)
.await
}
/// Helper to declare queue with optional DLX and TTL.
///
/// The diff rendering left the superseded `declare_queue_with_dlx(...)` call
/// interleaved with its replacement, which is not valid Rust; this is the
/// clean new-side body.
async fn declare_queue_with_optional_dlx_and_ttl(
    &self,
    config: &QueueConfig,
    dlx: Option<&str>,
    ttl_ms: Option<u64>,
) -> MqResult<()> {
    if let Some(dlx_exchange) = dlx {
        self.declare_queue_with_dlx_and_ttl(config, dlx_exchange, ttl_ms)
            .await
    } else {
        // A TTL without a dead-letter exchange means expired messages are
        // discarded rather than routed anywhere — surface that loudly.
        if ttl_ms.is_some() {
            warn!(
                "Queue '{}' configured with TTL but no DLX - messages will be dropped",
                config.name
            );
        }
        self.declare_queue(config).await
    }
}

View File

@@ -428,7 +428,7 @@ impl Update for WorkerRepository {
query.push(", updated = NOW() WHERE id = ");
query.push_bind(id);
query.push(" RETURNING id, name, worker_type, runtime, host, port, status, capabilities, meta, last_heartbeat, created, updated");
query.push(" RETURNING id, name, worker_type, worker_role, runtime, host, port, status, capabilities, meta, last_heartbeat, created, updated");
let worker = query.build_query_as::<Worker>().fetch_one(executor).await?;