Files
attune/migrations/20260209000000_phase3_retry_and_health.sql

128 lines
4.7 KiB
PL/PgSQL

-- Phase 3: Retry Tracking and Action Timeout Configuration
-- This migration adds support for:
-- 1. Retry tracking on executions (attempt count, max attempts, retry reason)
-- 2. Action-level timeout configuration
-- 3. Worker health metrics
-- Add retry tracking fields to execution table
ALTER TABLE execution
ADD COLUMN retry_count INTEGER NOT NULL DEFAULT 0,
ADD COLUMN max_retries INTEGER,
ADD COLUMN retry_reason TEXT,
ADD COLUMN original_execution BIGINT REFERENCES execution(id) ON DELETE SET NULL;
-- Add index for finding retry chains
CREATE INDEX idx_execution_original_execution ON execution(original_execution) WHERE original_execution IS NOT NULL;
-- Add timeout configuration to action table
ALTER TABLE action
ADD COLUMN timeout_seconds INTEGER,
ADD COLUMN max_retries INTEGER DEFAULT 0;
-- Add comment explaining timeout behavior
COMMENT ON COLUMN action.timeout_seconds IS 'Worker queue TTL override in seconds. If NULL, uses global worker_queue_ttl_ms config. Allows per-action timeout tuning.';
COMMENT ON COLUMN action.max_retries IS 'Maximum number of automatic retry attempts for failed executions. 0 = no retries (default).';
COMMENT ON COLUMN execution.retry_count IS 'Current retry attempt number (0 = first attempt, 1 = first retry, etc.)';
COMMENT ON COLUMN execution.max_retries IS 'Maximum retries for this execution. Copied from action.max_retries at creation time.';
COMMENT ON COLUMN execution.retry_reason IS 'Reason for retry (e.g., "worker_unavailable", "transient_error", "manual_retry")';
COMMENT ON COLUMN execution.original_execution IS 'ID of the original execution if this is a retry. Forms a retry chain.';
-- Add worker health tracking fields
-- These are stored in the capabilities JSONB field as a "health" object:
-- {
-- "runtimes": [...],
-- "health": {
-- "status": "healthy|degraded|unhealthy",
-- "last_check": "2026-02-09T12:00:00Z",
-- "consecutive_failures": 0,
-- "total_executions": 100,
-- "failed_executions": 2,
-- "average_execution_time_ms": 1500,
-- "queue_depth": 5
-- }
-- }
-- Add index for health-based queries (using JSONB path operators)
CREATE INDEX idx_worker_capabilities_health_status ON worker
USING GIN ((capabilities -> 'health' -> 'status'));
-- Add view for healthy workers (convenience for queries)
CREATE OR REPLACE VIEW healthy_workers AS
SELECT
w.id,
w.name,
w.worker_type,
w.worker_role,
w.runtime,
w.status,
w.capabilities,
w.last_heartbeat,
(w.capabilities -> 'health' ->> 'status')::TEXT as health_status,
(w.capabilities -> 'health' ->> 'queue_depth')::INTEGER as queue_depth,
(w.capabilities -> 'health' ->> 'consecutive_failures')::INTEGER as consecutive_failures
FROM worker w
WHERE
w.status = 'active'
AND w.last_heartbeat > NOW() - INTERVAL '30 seconds'
AND (
-- Healthy if no health info (backward compatible)
w.capabilities -> 'health' IS NULL
OR
-- Or explicitly marked healthy
w.capabilities -> 'health' ->> 'status' IN ('healthy', 'degraded')
);
COMMENT ON VIEW healthy_workers IS 'Workers that are active, have fresh heartbeat, and are healthy or degraded (not unhealthy)';
-- Add function to get worker queue depth estimate
CREATE OR REPLACE FUNCTION get_worker_queue_depth(worker_id_param BIGINT)
RETURNS INTEGER AS $$
BEGIN
-- Extract queue depth from capabilities.health.queue_depth
-- Returns NULL if not available
RETURN (
SELECT (capabilities -> 'health' ->> 'queue_depth')::INTEGER
FROM worker
WHERE id = worker_id_param
);
END;
$$ LANGUAGE plpgsql STABLE;
COMMENT ON FUNCTION get_worker_queue_depth IS 'Extract current queue depth from worker health metadata';
-- Add function to check if execution is retriable
CREATE OR REPLACE FUNCTION is_execution_retriable(execution_id_param BIGINT)
RETURNS BOOLEAN AS $$
DECLARE
exec_record RECORD;
BEGIN
SELECT
e.retry_count,
e.max_retries,
e.status
INTO exec_record
FROM execution e
WHERE e.id = execution_id_param;
IF NOT FOUND THEN
RETURN FALSE;
END IF;
-- Can retry if:
-- 1. Status is failed
-- 2. max_retries is set and > 0
-- 3. retry_count < max_retries
RETURN (
exec_record.status = 'failed'
AND exec_record.max_retries IS NOT NULL
AND exec_record.max_retries > 0
AND exec_record.retry_count < exec_record.max_retries
);
END;
$$ LANGUAGE plpgsql STABLE;
COMMENT ON FUNCTION is_execution_retriable IS 'Check if a failed execution can be automatically retried based on retry limits';
-- Add indexes for retry queries
CREATE INDEX idx_execution_status_retry ON execution(status, retry_count) WHERE status = 'failed' AND retry_count < COALESCE(max_retries, 0);