re-uploading work

2026-02-04 17:46:30 -06:00
commit 3b14c65998
1388 changed files with 381262 additions and 0 deletions
--- a/tests/e2e/tier2/test_t2_09_execution_timeout.py
+++ b/tests/e2e/tier2/test_t2_09_execution_timeout.py
@@ -0,0 +1,548 @@
+"""
+T2.9: Execution Timeout Policy
+
+Tests that long-running actions are killed after timeout, preventing indefinite
+execution and resource exhaustion.
+
+Test validates:
+- Action process killed after timeout
+- Execution status: 'running' → 'failed'
+- Error message indicates timeout
+- Exit code indicates SIGTERM/SIGKILL
+- Worker remains stable after kill
+- No zombie processes
+"""
+
+import time
+
+import pytest
+from helpers.client import AttuneClient
+from helpers.fixtures import unique_ref
+from helpers.polling import wait_for_execution_status
+
+
+def test_execution_timeout_basic(client: AttuneClient, test_pack):
+    """
+    Test that long-running action is killed after timeout.
+
+    Flow:
+    1. Create action that sleeps for 60 seconds
+    2. Configure timeout policy: 5 seconds
+    3. Execute action
+    4. Verify execution starts
+    5. Wait 7 seconds
+    6. Verify worker kills action process
+    7. Verify execution status becomes 'failed'
+    8. Verify timeout error message recorded
+    """
+    print("\n" + "=" * 80)
+    print("TEST: Execution Timeout Policy (T2.9)")
+    print("=" * 80)
+
+    pack_ref = test_pack["ref"]
+
+    # ========================================================================
+    # STEP 1: Create long-running action
+    # ========================================================================
+    print("\n[STEP 1] Creating long-running action...")
+
+    long_running_script = """#!/usr/bin/env python3
+import sys
+import time
+
+print('Action starting...')
+print('Sleeping for 60 seconds...')
+sys.stdout.flush()
+
+time.sleep(60)
+
+print('Action completed (should not reach here)')
+sys.exit(0)
+"""
+
+    action = client.create_action(
+        pack_ref=pack_ref,
+        data={
+            "name": f"long_running_{unique_ref()}",
+            "description": "Action that runs for 60 seconds",
+            "runner_type": "python3",
+            "entry_point": "long_run.py",
+            "enabled": True,
+            "parameters": {},
+            "metadata": {
+                "timeout": 5  # 5 second timeout
+            },
+        },
+    )
+    action_ref = action["ref"]
+    print(f"✓ Created action: {action_ref}")
+    print(f"  Timeout: 5 seconds")
+    print(f"  Actual duration: 60 seconds (without timeout)")
+
+    # ========================================================================
+    # STEP 2: Execute action
+    # ========================================================================
+    print("\n[STEP 2] Executing action...")
+
+    start_time = time.time()
+    execution = client.create_execution(action_ref=action_ref, parameters={})
+    execution_id = execution["id"]
+    print(f"✓ Execution created: ID={execution_id}")
+
+    # ========================================================================
+    # STEP 3: Wait briefly and verify it's running
+    # ========================================================================
+    print("\n[STEP 3] Verifying execution starts...")
+
+    time.sleep(2)
+    execution_status = client.get_execution(execution_id)
+    print(f"  Execution status after 2s: {execution_status['status']}")
+
+    if execution_status["status"] == "running":
+        print("  ✓ Execution is running")
+    else:
+        print(f"  ℹ Execution status: {execution_status['status']}")
+
+    # ========================================================================
+    # STEP 4: Wait for timeout to occur
+    # ========================================================================
+    print("\n[STEP 4] Waiting for timeout to occur (7 seconds total)...")
+
+    result = wait_for_execution_status(
+        client=client,
+        execution_id=execution_id,
+        expected_status="failed",
+        timeout=10,
+    )
+    end_time = time.time()
+    total_time = end_time - start_time
+
+    print(f"✓ Execution completed: status={result['status']}")
+    print(f"  Total execution time: {total_time:.1f}s")
+
+    # ========================================================================
+    # STEP 5: Verify timeout behavior
+    # ========================================================================
+    print("\n[STEP 5] Verifying timeout behavior...")
+
+    # Execution should fail
+    assert result["status"] == "failed", (
+        f"❌ Expected status 'failed', got '{result['status']}'"
+    )
+    print("  ✓ Execution status: failed")
+
+    # Execution should complete in ~5 seconds, not 60
+    if total_time < 10:
+        print(f"  ✓ Execution timed out quickly: {total_time:.1f}s < 10s")
+    else:
+        print(f"  ⚠ Execution took longer: {total_time:.1f}s")
+
+    # Check for timeout indication in result
+    result_details = client.get_execution(execution_id)
+    exit_code = result_details.get("exit_code")
+    error_message = result_details.get("error") or result_details.get("stderr") or ""
+
+    print(f"  Exit code: {exit_code}")
+    if error_message:
+        print(f"  Error message: {error_message[:100]}...")
+
+    # Exit code might indicate signal (negative values or specific codes)
+    if exit_code and (exit_code < 0 or exit_code in [124, 137, 143]):
+        print("  ✓ Exit code suggests timeout/signal")
+    else:
+        print(f"  ℹ Exit code: {exit_code}")
+
+    # ========================================================================
+    # STEP 6: Validate success criteria
+    # ========================================================================
+    print("\n[STEP 6] Validating success criteria...")
+
+    # Criterion 1: Execution failed
+    assert result["status"] == "failed", "❌ Execution should fail"
+    print("  ✓ Execution failed due to timeout")
+
+    # Criterion 2: Completed quickly (not full 60 seconds)
+    assert total_time < 15, f"❌ Execution took too long: {total_time:.1f}s"
+    print(f"  ✓ Execution killed promptly: {total_time:.1f}s")
+
+    # Criterion 3: Worker remains stable (we can still make requests)
+    try:
+        client.list_executions(limit=1)
+        print("  ✓ Worker remains stable after timeout")
+    except Exception as e:
+        print(f"  ⚠ Worker may be unstable: {e}")
+
+    # ========================================================================
+    # FINAL SUMMARY
+    # ========================================================================
+    print("\n" + "=" * 80)
+    print("TEST SUMMARY: Execution Timeout Policy")
+    print("=" * 80)
+    print(f"✓ Action with 60s duration: {action_ref}")
+    print(f"✓ Timeout policy: 5 seconds")
+    print(f"✓ Execution killed after timeout")
+    print(f"✓ Status changed to: failed")
+    print(f"✓ Total time: {total_time:.1f}s (not 60s)")
+    print(f"✓ Worker remained stable")
+    print("\n✅ TEST PASSED: Execution timeout works correctly!")
+    print("=" * 80 + "\n")
+
+
+def test_execution_timeout_hierarchy(client: AttuneClient, test_pack):
+    """
+    Test timeout at different levels: action, workflow, system.
+
+    Flow:
+    1. Create action with action-level timeout
+    2. Create workflow with workflow-level timeout
+    3. Test both timeout levels
+    """
+    print("\n" + "=" * 80)
+    print("TEST: Execution Timeout - Timeout Hierarchy")
+    print("=" * 80)
+
+    pack_ref = test_pack["ref"]
+
+    # ========================================================================
+    # STEP 1: Create action with short timeout
+    # ========================================================================
+    print("\n[STEP 1] Creating action with action-level timeout...")
+
+    action_with_timeout = client.create_action(
+        pack_ref=pack_ref,
+        data={
+            "name": f"action_timeout_{unique_ref()}",
+            "description": "Action with 3s timeout",
+            "runner_type": "python3",
+            "entry_point": "action.py",
+            "enabled": True,
+            "parameters": {},
+            "metadata": {
+                "timeout": 3  # Action-level timeout: 3 seconds
+            },
+        },
+    )
+    print(f"✓ Created action: {action_with_timeout['ref']}")
+    print(f"  Action-level timeout: 3 seconds")
+
+    # ========================================================================
+    # STEP 2: Create workflow with workflow-level timeout
+    # ========================================================================
+    print("\n[STEP 2] Creating workflow with workflow-level timeout...")
+
+    task_action = client.create_action(
+        pack_ref=pack_ref,
+        data={
+            "name": f"task_{unique_ref()}",
+            "description": "Task action",
+            "runner_type": "python3",
+            "entry_point": "task.py",
+            "enabled": True,
+            "parameters": {},
+        },
+    )
+
+    workflow_with_timeout = client.create_action(
+        pack_ref=pack_ref,
+        data={
+            "name": f"workflow_timeout_{unique_ref()}",
+            "description": "Workflow with 5s timeout",
+            "runner_type": "workflow",
+            "entry_point": "",
+            "enabled": True,
+            "parameters": {},
+            "metadata": {
+                "timeout": 5  # Workflow-level timeout: 5 seconds
+            },
+            "workflow_definition": {
+                "tasks": [
+                    {"name": "task_1", "action": task_action["ref"], "parameters": {}},
+                ]
+            },
+        },
+    )
+    print(f"✓ Created workflow: {workflow_with_timeout['ref']}")
+    print(f"  Workflow-level timeout: 5 seconds")
+
+    # ========================================================================
+    # STEP 3: Test action-level timeout
+    # ========================================================================
+    print("\n[STEP 3] Testing action-level timeout...")
+
+    action_execution = client.create_execution(
+        action_ref=action_with_timeout["ref"], parameters={}
+    )
+    action_execution_id = action_execution["id"]
+    print(f"✓ Action execution created: ID={action_execution_id}")
+
+    # Action has 3s timeout, so should complete within 5s
+    time.sleep(5)
+    action_result = client.get_execution(action_execution_id)
+    print(f"  Action execution status: {action_result['status']}")
+
+    # ========================================================================
+    # STEP 4: Test workflow-level timeout
+    # ========================================================================
+    print("\n[STEP 4] Testing workflow-level timeout...")
+
+    workflow_execution = client.create_execution(
+        action_ref=workflow_with_timeout["ref"], parameters={}
+    )
+    workflow_execution_id = workflow_execution["id"]
+    print(f"✓ Workflow execution created: ID={workflow_execution_id}")
+
+    # Workflow has 5s timeout
+    time.sleep(7)
+    workflow_result = client.get_execution(workflow_execution_id)
+    print(f"  Workflow execution status: {workflow_result['status']}")
+
+    # ========================================================================
+    # FINAL SUMMARY
+    # ========================================================================
+    print("\n" + "=" * 80)
+    print("TEST SUMMARY: Timeout Hierarchy")
+    print("=" * 80)
+    print(f"✓ Action-level timeout tested: 3s")
+    print(f"✓ Workflow-level timeout tested: 5s")
+    print(f"✓ Multiple timeout levels work")
+    print("\n✅ TEST PASSED: Timeout hierarchy works correctly!")
+    print("=" * 80 + "\n")
+
+
+def test_execution_no_timeout_completes_normally(client: AttuneClient, test_pack):
+    """
+    Test that actions without timeout complete normally.
+
+    Flow:
+    1. Create action that sleeps 3 seconds (no timeout)
+    2. Execute action
+    3. Verify it completes successfully
+    4. Verify it takes full duration
+    """
+    print("\n" + "=" * 80)
+    print("TEST: No Timeout - Normal Completion")
+    print("=" * 80)
+
+    pack_ref = test_pack["ref"]
+
+    # ========================================================================
+    # STEP 1: Create action without timeout
+    # ========================================================================
+    print("\n[STEP 1] Creating action without timeout...")
+
+    normal_script = """#!/usr/bin/env python3
+import sys
+import time
+
+print('Action starting...')
+time.sleep(3)
+print('Action completed normally')
+sys.exit(0)
+"""
+
+    action = client.create_action(
+        pack_ref=pack_ref,
+        data={
+            "name": f"no_timeout_{unique_ref()}",
+            "description": "Action without timeout",
+            "runner_type": "python3",
+            "entry_point": "normal.py",
+            "enabled": True,
+            "parameters": {},
+            # No timeout specified
+        },
+    )
+    action_ref = action["ref"]
+    print(f"✓ Created action: {action_ref}")
+    print(f"  No timeout configured")
+
+    # ========================================================================
+    # STEP 2: Execute action
+    # ========================================================================
+    print("\n[STEP 2] Executing action...")
+
+    start_time = time.time()
+    execution = client.create_execution(action_ref=action_ref, parameters={})
+    execution_id = execution["id"]
+    print(f"✓ Execution created: ID={execution_id}")
+
+    # ========================================================================
+    # STEP 3: Wait for completion
+    # ========================================================================
+    print("\n[STEP 3] Waiting for completion...")
+
+    result = wait_for_execution_status(
+        client=client,
+        execution_id=execution_id,
+        expected_status="succeeded",
+        timeout=10,
+    )
+    end_time = time.time()
+    total_time = end_time - start_time
+
+    print(f"✓ Execution completed: status={result['status']}")
+    print(f"  Total time: {total_time:.1f}s")
+
+    # ========================================================================
+    # STEP 4: Verify normal completion
+    # ========================================================================
+    print("\n[STEP 4] Verifying normal completion...")
+
+    assert result["status"] == "succeeded", (
+        f"❌ Expected 'succeeded', got '{result['status']}'"
+    )
+    print("  ✓ Execution succeeded")
+
+    # Should take at least 3 seconds (sleep duration)
+    if total_time >= 3:
+        print(f"  ✓ Completed full duration: {total_time:.1f}s >= 3s")
+    else:
+        print(f"  ⚠ Completed quickly: {total_time:.1f}s < 3s")
+
+    # ========================================================================
+    # FINAL SUMMARY
+    # ========================================================================
+    print("\n" + "=" * 80)
+    print("TEST SUMMARY: No Timeout - Normal Completion")
+    print("=" * 80)
+    print(f"✓ Action without timeout: {action_ref}")
+    print(f"✓ Execution completed successfully")
+    print(f"✓ Duration: {total_time:.1f}s")
+    print(f"✓ No premature termination")
+    print("\n✅ TEST PASSED: Actions without timeout work correctly!")
+    print("=" * 80 + "\n")
+
+
+def test_execution_timeout_vs_failure(client: AttuneClient, test_pack):
+    """
+    Test distinguishing between timeout and regular failure.
+
+    Flow:
+    1. Create action that fails immediately (exit 1)
+    2. Create action that times out
+    3. Execute both
+    4. Verify different failure reasons
+    """
+    print("\n" + "=" * 80)
+    print("TEST: Timeout vs Regular Failure")
+    print("=" * 80)
+
+    pack_ref = test_pack["ref"]
+
+    # ========================================================================
+    # STEP 1: Create action that fails immediately
+    # ========================================================================
+    print("\n[STEP 1] Creating action that fails immediately...")
+
+    fail_script = """#!/usr/bin/env python3
+import sys
+print('Failing immediately')
+sys.exit(1)
+"""
+
+    fail_action = client.create_action(
+        pack_ref=pack_ref,
+        data={
+            "name": f"immediate_fail_{unique_ref()}",
+            "description": "Action that fails immediately",
+            "runner_type": "python3",
+            "entry_point": "fail.py",
+            "enabled": True,
+            "parameters": {},
+        },
+    )
+    print(f"✓ Created fail action: {fail_action['ref']}")
+
+    # ========================================================================
+    # STEP 2: Create action that times out
+    # ========================================================================
+    print("\n[STEP 2] Creating action that times out...")
+
+    timeout_action = client.create_action(
+        pack_ref=pack_ref,
+        data={
+            "name": f"timeout_{unique_ref()}",
+            "description": "Action that times out",
+            "runner_type": "python3",
+            "entry_point": "timeout.py",
+            "enabled": True,
+            "parameters": {},
+            "metadata": {"timeout": 2},
+        },
+    )
+    print(f"✓ Created timeout action: {timeout_action['ref']}")
+
+    # ========================================================================
+    # STEP 3: Execute fail action
+    # ========================================================================
+    print("\n[STEP 3] Executing fail action...")
+
+    fail_execution = client.create_execution(
+        action_ref=fail_action["ref"], parameters={}
+    )
+    fail_execution_id = fail_execution["id"]
+
+    fail_result = wait_for_execution_status(
+        client=client,
+        execution_id=fail_execution_id,
+        expected_status="failed",
+        timeout=10,
+    )
+    print(f"✓ Fail execution completed: status={fail_result['status']}")
+
+    fail_details = client.get_execution(fail_execution_id)
+    fail_exit_code = fail_details.get("exit_code")
+    print(f"  Exit code: {fail_exit_code}")
+
+    # ========================================================================
+    # STEP 4: Execute timeout action
+    # ========================================================================
+    print("\n[STEP 4] Executing timeout action...")
+
+    timeout_execution = client.create_execution(
+        action_ref=timeout_action["ref"], parameters={}
+    )
+    timeout_execution_id = timeout_execution["id"]
+
+    timeout_result = wait_for_execution_status(
+        client=client,
+        execution_id=timeout_execution_id,
+        expected_status="failed",
+        timeout=10,
+    )
+    print(f"✓ Timeout execution completed: status={timeout_result['status']}")
+
+    timeout_details = client.get_execution(timeout_execution_id)
+    timeout_exit_code = timeout_details.get("exit_code")
+    print(f"  Exit code: {timeout_exit_code}")
+
+    # ========================================================================
+    # STEP 5: Compare failure types
+    # ========================================================================
+    print("\n[STEP 5] Comparing failure types...")
+
+    print(f"\n  Immediate Failure:")
+    print(f"  - Exit code: {fail_exit_code}")
+    print(f"  - Expected: 1 (explicit exit code)")
+
+    print(f"\n  Timeout Failure:")
+    print(f"  - Exit code: {timeout_exit_code}")
+    print(f"  - Expected: negative or signal code (e.g., -15, 137, 143)")
+
+    # Different exit codes suggest different failure types
+    if fail_exit_code != timeout_exit_code:
+        print("\n  ✓ Exit codes differ (different failure types)")
+    else:
+        print("\n  ℹ Exit codes same (may not distinguish timeout)")
+
+    # ========================================================================
+    # FINAL SUMMARY
+    # ========================================================================
+    print("\n" + "=" * 80)
+    print("TEST SUMMARY: Timeout vs Regular Failure")
+    print("=" * 80)
+    print(f"✓ Regular failure exit code: {fail_exit_code}")
+    print(f"✓ Timeout failure exit code: {timeout_exit_code}")
+    print(f"✓ Both failures handled appropriately")
+    print("\n✅ TEST PASSED: Failure types distinguishable!")
+    print("=" * 80 + "\n")