-- Phase 3: Retry Tracking and Action Timeout Configuration -- This migration adds support for: -- 1. Retry tracking on executions (attempt count, max attempts, retry reason) -- 2. Action-level timeout configuration -- 3. Worker health metrics -- Add retry tracking fields to execution table ALTER TABLE execution ADD COLUMN retry_count INTEGER NOT NULL DEFAULT 0, ADD COLUMN max_retries INTEGER, ADD COLUMN retry_reason TEXT, ADD COLUMN original_execution BIGINT REFERENCES execution(id) ON DELETE SET NULL; -- Add index for finding retry chains CREATE INDEX idx_execution_original_execution ON execution(original_execution) WHERE original_execution IS NOT NULL; -- Add timeout configuration to action table ALTER TABLE action ADD COLUMN timeout_seconds INTEGER, ADD COLUMN max_retries INTEGER DEFAULT 0; -- Add comment explaining timeout behavior COMMENT ON COLUMN action.timeout_seconds IS 'Worker queue TTL override in seconds. If NULL, uses global worker_queue_ttl_ms config. Allows per-action timeout tuning.'; COMMENT ON COLUMN action.max_retries IS 'Maximum number of automatic retry attempts for failed executions. 0 = no retries (default).'; COMMENT ON COLUMN execution.retry_count IS 'Current retry attempt number (0 = first attempt, 1 = first retry, etc.)'; COMMENT ON COLUMN execution.max_retries IS 'Maximum retries for this execution. Copied from action.max_retries at creation time.'; COMMENT ON COLUMN execution.retry_reason IS 'Reason for retry (e.g., "worker_unavailable", "transient_error", "manual_retry")'; COMMENT ON COLUMN execution.original_execution IS 'ID of the original execution if this is a retry. Forms a retry chain.'; -- Add worker health tracking fields -- These are stored in the capabilities JSONB field as a "health" object: -- { -- "runtimes": [...], -- "health": { -- "status": "healthy|degraded|unhealthy", -- "last_check": "2026-02-09T12:00:00Z", -- "consecutive_failures": 0, -- "total_executions": 100, -- "failed_executions": 2, -- "average_execution_time_ms": 1500, -- "queue_depth": 5 -- } -- } -- Add index for health-based queries (using JSONB path operators) CREATE INDEX idx_worker_capabilities_health_status ON worker USING GIN ((capabilities -> 'health' -> 'status')); -- Add view for healthy workers (convenience for queries) CREATE OR REPLACE VIEW healthy_workers AS SELECT w.id, w.name, w.worker_type, w.worker_role, w.runtime, w.status, w.capabilities, w.last_heartbeat, (w.capabilities -> 'health' ->> 'status')::TEXT as health_status, (w.capabilities -> 'health' ->> 'queue_depth')::INTEGER as queue_depth, (w.capabilities -> 'health' ->> 'consecutive_failures')::INTEGER as consecutive_failures FROM worker w WHERE w.status = 'active' AND w.last_heartbeat > NOW() - INTERVAL '30 seconds' AND ( -- Healthy if no health info (backward compatible) w.capabilities -> 'health' IS NULL OR -- Or explicitly marked healthy w.capabilities -> 'health' ->> 'status' IN ('healthy', 'degraded') ); COMMENT ON VIEW healthy_workers IS 'Workers that are active, have fresh heartbeat, and are healthy or degraded (not unhealthy)'; -- Add function to get worker queue depth estimate CREATE OR REPLACE FUNCTION get_worker_queue_depth(worker_id_param BIGINT) RETURNS INTEGER AS $$ BEGIN -- Extract queue depth from capabilities.health.queue_depth -- Returns NULL if not available RETURN ( SELECT (capabilities -> 'health' ->> 'queue_depth')::INTEGER FROM worker WHERE id = worker_id_param ); END; $$ LANGUAGE plpgsql STABLE; COMMENT ON FUNCTION get_worker_queue_depth IS 'Extract current queue depth from worker health metadata'; -- Add function to check if execution is retriable CREATE OR REPLACE FUNCTION is_execution_retriable(execution_id_param BIGINT) RETURNS BOOLEAN AS $$ DECLARE exec_record RECORD; BEGIN SELECT e.retry_count, e.max_retries, e.status INTO exec_record FROM execution e WHERE e.id = execution_id_param; IF NOT FOUND THEN RETURN FALSE; END IF; -- Can retry if: -- 1. Status is failed -- 2. max_retries is set and > 0 -- 3. retry_count < max_retries RETURN ( exec_record.status = 'failed' AND exec_record.max_retries IS NOT NULL AND exec_record.max_retries > 0 AND exec_record.retry_count < exec_record.max_retries ); END; $$ LANGUAGE plpgsql STABLE; COMMENT ON FUNCTION is_execution_retriable IS 'Check if a failed execution can be automatically retried based on retry limits'; -- Add indexes for retry queries CREATE INDEX idx_execution_status_retry ON execution(status, retry_count) WHERE status = 'failed' AND retry_count < COALESCE(max_retries, 0);