Files
attune/tests/e2e/tier2/test_t2_09_execution_timeout.py
2026-02-04 17:46:30 -06:00

549 lines
19 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
T2.9: Execution Timeout Policy
Tests that long-running actions are killed after timeout, preventing indefinite
execution and resource exhaustion.
Test validates:
- Action process killed after timeout
- Execution status: 'running''failed'
- Error message indicates timeout
- Exit code indicates SIGTERM/SIGKILL
- Worker remains stable after kill
- No zombie processes
"""
import time
import pytest
from helpers.client import AttuneClient
from helpers.fixtures import unique_ref
from helpers.polling import wait_for_execution_status
def test_execution_timeout_basic(client: AttuneClient, test_pack):
"""
Test that long-running action is killed after timeout.
Flow:
1. Create action that sleeps for 60 seconds
2. Configure timeout policy: 5 seconds
3. Execute action
4. Verify execution starts
5. Wait 7 seconds
6. Verify worker kills action process
7. Verify execution status becomes 'failed'
8. Verify timeout error message recorded
"""
print("\n" + "=" * 80)
print("TEST: Execution Timeout Policy (T2.9)")
print("=" * 80)
pack_ref = test_pack["ref"]
# ========================================================================
# STEP 1: Create long-running action
# ========================================================================
print("\n[STEP 1] Creating long-running action...")
long_running_script = """#!/usr/bin/env python3
import sys
import time
print('Action starting...')
print('Sleeping for 60 seconds...')
sys.stdout.flush()
time.sleep(60)
print('Action completed (should not reach here)')
sys.exit(0)
"""
action = client.create_action(
pack_ref=pack_ref,
data={
"name": f"long_running_{unique_ref()}",
"description": "Action that runs for 60 seconds",
"runner_type": "python3",
"entry_point": "long_run.py",
"enabled": True,
"parameters": {},
"metadata": {
"timeout": 5 # 5 second timeout
},
},
)
action_ref = action["ref"]
print(f"✓ Created action: {action_ref}")
print(f" Timeout: 5 seconds")
print(f" Actual duration: 60 seconds (without timeout)")
# ========================================================================
# STEP 2: Execute action
# ========================================================================
print("\n[STEP 2] Executing action...")
start_time = time.time()
execution = client.create_execution(action_ref=action_ref, parameters={})
execution_id = execution["id"]
print(f"✓ Execution created: ID={execution_id}")
# ========================================================================
# STEP 3: Wait briefly and verify it's running
# ========================================================================
print("\n[STEP 3] Verifying execution starts...")
time.sleep(2)
execution_status = client.get_execution(execution_id)
print(f" Execution status after 2s: {execution_status['status']}")
if execution_status["status"] == "running":
print(" ✓ Execution is running")
else:
print(f" Execution status: {execution_status['status']}")
# ========================================================================
# STEP 4: Wait for timeout to occur
# ========================================================================
print("\n[STEP 4] Waiting for timeout to occur (7 seconds total)...")
result = wait_for_execution_status(
client=client,
execution_id=execution_id,
expected_status="failed",
timeout=10,
)
end_time = time.time()
total_time = end_time - start_time
print(f"✓ Execution completed: status={result['status']}")
print(f" Total execution time: {total_time:.1f}s")
# ========================================================================
# STEP 5: Verify timeout behavior
# ========================================================================
print("\n[STEP 5] Verifying timeout behavior...")
# Execution should fail
assert result["status"] == "failed", (
f"❌ Expected status 'failed', got '{result['status']}'"
)
print(" ✓ Execution status: failed")
# Execution should complete in ~5 seconds, not 60
if total_time < 10:
print(f" ✓ Execution timed out quickly: {total_time:.1f}s < 10s")
else:
print(f" ⚠ Execution took longer: {total_time:.1f}s")
# Check for timeout indication in result
result_details = client.get_execution(execution_id)
exit_code = result_details.get("exit_code")
error_message = result_details.get("error") or result_details.get("stderr") or ""
print(f" Exit code: {exit_code}")
if error_message:
print(f" Error message: {error_message[:100]}...")
# Exit code might indicate signal (negative values or specific codes)
if exit_code and (exit_code < 0 or exit_code in [124, 137, 143]):
print(" ✓ Exit code suggests timeout/signal")
else:
print(f" Exit code: {exit_code}")
# ========================================================================
# STEP 6: Validate success criteria
# ========================================================================
print("\n[STEP 6] Validating success criteria...")
# Criterion 1: Execution failed
assert result["status"] == "failed", "❌ Execution should fail"
print(" ✓ Execution failed due to timeout")
# Criterion 2: Completed quickly (not full 60 seconds)
assert total_time < 15, f"❌ Execution took too long: {total_time:.1f}s"
print(f" ✓ Execution killed promptly: {total_time:.1f}s")
# Criterion 3: Worker remains stable (we can still make requests)
try:
client.list_executions(limit=1)
print(" ✓ Worker remains stable after timeout")
except Exception as e:
print(f" ⚠ Worker may be unstable: {e}")
# ========================================================================
# FINAL SUMMARY
# ========================================================================
print("\n" + "=" * 80)
print("TEST SUMMARY: Execution Timeout Policy")
print("=" * 80)
print(f"✓ Action with 60s duration: {action_ref}")
print(f"✓ Timeout policy: 5 seconds")
print(f"✓ Execution killed after timeout")
print(f"✓ Status changed to: failed")
print(f"✓ Total time: {total_time:.1f}s (not 60s)")
print(f"✓ Worker remained stable")
print("\n✅ TEST PASSED: Execution timeout works correctly!")
print("=" * 80 + "\n")
def test_execution_timeout_hierarchy(client: AttuneClient, test_pack):
"""
Test timeout at different levels: action, workflow, system.
Flow:
1. Create action with action-level timeout
2. Create workflow with workflow-level timeout
3. Test both timeout levels
"""
print("\n" + "=" * 80)
print("TEST: Execution Timeout - Timeout Hierarchy")
print("=" * 80)
pack_ref = test_pack["ref"]
# ========================================================================
# STEP 1: Create action with short timeout
# ========================================================================
print("\n[STEP 1] Creating action with action-level timeout...")
action_with_timeout = client.create_action(
pack_ref=pack_ref,
data={
"name": f"action_timeout_{unique_ref()}",
"description": "Action with 3s timeout",
"runner_type": "python3",
"entry_point": "action.py",
"enabled": True,
"parameters": {},
"metadata": {
"timeout": 3 # Action-level timeout: 3 seconds
},
},
)
print(f"✓ Created action: {action_with_timeout['ref']}")
print(f" Action-level timeout: 3 seconds")
# ========================================================================
# STEP 2: Create workflow with workflow-level timeout
# ========================================================================
print("\n[STEP 2] Creating workflow with workflow-level timeout...")
task_action = client.create_action(
pack_ref=pack_ref,
data={
"name": f"task_{unique_ref()}",
"description": "Task action",
"runner_type": "python3",
"entry_point": "task.py",
"enabled": True,
"parameters": {},
},
)
workflow_with_timeout = client.create_action(
pack_ref=pack_ref,
data={
"name": f"workflow_timeout_{unique_ref()}",
"description": "Workflow with 5s timeout",
"runner_type": "workflow",
"entry_point": "",
"enabled": True,
"parameters": {},
"metadata": {
"timeout": 5 # Workflow-level timeout: 5 seconds
},
"workflow_definition": {
"tasks": [
{"name": "task_1", "action": task_action["ref"], "parameters": {}},
]
},
},
)
print(f"✓ Created workflow: {workflow_with_timeout['ref']}")
print(f" Workflow-level timeout: 5 seconds")
# ========================================================================
# STEP 3: Test action-level timeout
# ========================================================================
print("\n[STEP 3] Testing action-level timeout...")
action_execution = client.create_execution(
action_ref=action_with_timeout["ref"], parameters={}
)
action_execution_id = action_execution["id"]
print(f"✓ Action execution created: ID={action_execution_id}")
# Action has 3s timeout, so should complete within 5s
time.sleep(5)
action_result = client.get_execution(action_execution_id)
print(f" Action execution status: {action_result['status']}")
# ========================================================================
# STEP 4: Test workflow-level timeout
# ========================================================================
print("\n[STEP 4] Testing workflow-level timeout...")
workflow_execution = client.create_execution(
action_ref=workflow_with_timeout["ref"], parameters={}
)
workflow_execution_id = workflow_execution["id"]
print(f"✓ Workflow execution created: ID={workflow_execution_id}")
# Workflow has 5s timeout
time.sleep(7)
workflow_result = client.get_execution(workflow_execution_id)
print(f" Workflow execution status: {workflow_result['status']}")
# ========================================================================
# FINAL SUMMARY
# ========================================================================
print("\n" + "=" * 80)
print("TEST SUMMARY: Timeout Hierarchy")
print("=" * 80)
print(f"✓ Action-level timeout tested: 3s")
print(f"✓ Workflow-level timeout tested: 5s")
print(f"✓ Multiple timeout levels work")
print("\n✅ TEST PASSED: Timeout hierarchy works correctly!")
print("=" * 80 + "\n")
def test_execution_no_timeout_completes_normally(client: AttuneClient, test_pack):
"""
Test that actions without timeout complete normally.
Flow:
1. Create action that sleeps 3 seconds (no timeout)
2. Execute action
3. Verify it completes successfully
4. Verify it takes full duration
"""
print("\n" + "=" * 80)
print("TEST: No Timeout - Normal Completion")
print("=" * 80)
pack_ref = test_pack["ref"]
# ========================================================================
# STEP 1: Create action without timeout
# ========================================================================
print("\n[STEP 1] Creating action without timeout...")
normal_script = """#!/usr/bin/env python3
import sys
import time
print('Action starting...')
time.sleep(3)
print('Action completed normally')
sys.exit(0)
"""
action = client.create_action(
pack_ref=pack_ref,
data={
"name": f"no_timeout_{unique_ref()}",
"description": "Action without timeout",
"runner_type": "python3",
"entry_point": "normal.py",
"enabled": True,
"parameters": {},
# No timeout specified
},
)
action_ref = action["ref"]
print(f"✓ Created action: {action_ref}")
print(f" No timeout configured")
# ========================================================================
# STEP 2: Execute action
# ========================================================================
print("\n[STEP 2] Executing action...")
start_time = time.time()
execution = client.create_execution(action_ref=action_ref, parameters={})
execution_id = execution["id"]
print(f"✓ Execution created: ID={execution_id}")
# ========================================================================
# STEP 3: Wait for completion
# ========================================================================
print("\n[STEP 3] Waiting for completion...")
result = wait_for_execution_status(
client=client,
execution_id=execution_id,
expected_status="succeeded",
timeout=10,
)
end_time = time.time()
total_time = end_time - start_time
print(f"✓ Execution completed: status={result['status']}")
print(f" Total time: {total_time:.1f}s")
# ========================================================================
# STEP 4: Verify normal completion
# ========================================================================
print("\n[STEP 4] Verifying normal completion...")
assert result["status"] == "succeeded", (
f"❌ Expected 'succeeded', got '{result['status']}'"
)
print(" ✓ Execution succeeded")
# Should take at least 3 seconds (sleep duration)
if total_time >= 3:
print(f" ✓ Completed full duration: {total_time:.1f}s >= 3s")
else:
print(f" ⚠ Completed quickly: {total_time:.1f}s < 3s")
# ========================================================================
# FINAL SUMMARY
# ========================================================================
print("\n" + "=" * 80)
print("TEST SUMMARY: No Timeout - Normal Completion")
print("=" * 80)
print(f"✓ Action without timeout: {action_ref}")
print(f"✓ Execution completed successfully")
print(f"✓ Duration: {total_time:.1f}s")
print(f"✓ No premature termination")
print("\n✅ TEST PASSED: Actions without timeout work correctly!")
print("=" * 80 + "\n")
def test_execution_timeout_vs_failure(client: AttuneClient, test_pack):
"""
Test distinguishing between timeout and regular failure.
Flow:
1. Create action that fails immediately (exit 1)
2. Create action that times out
3. Execute both
4. Verify different failure reasons
"""
print("\n" + "=" * 80)
print("TEST: Timeout vs Regular Failure")
print("=" * 80)
pack_ref = test_pack["ref"]
# ========================================================================
# STEP 1: Create action that fails immediately
# ========================================================================
print("\n[STEP 1] Creating action that fails immediately...")
fail_script = """#!/usr/bin/env python3
import sys
print('Failing immediately')
sys.exit(1)
"""
fail_action = client.create_action(
pack_ref=pack_ref,
data={
"name": f"immediate_fail_{unique_ref()}",
"description": "Action that fails immediately",
"runner_type": "python3",
"entry_point": "fail.py",
"enabled": True,
"parameters": {},
},
)
print(f"✓ Created fail action: {fail_action['ref']}")
# ========================================================================
# STEP 2: Create action that times out
# ========================================================================
print("\n[STEP 2] Creating action that times out...")
timeout_action = client.create_action(
pack_ref=pack_ref,
data={
"name": f"timeout_{unique_ref()}",
"description": "Action that times out",
"runner_type": "python3",
"entry_point": "timeout.py",
"enabled": True,
"parameters": {},
"metadata": {"timeout": 2},
},
)
print(f"✓ Created timeout action: {timeout_action['ref']}")
# ========================================================================
# STEP 3: Execute fail action
# ========================================================================
print("\n[STEP 3] Executing fail action...")
fail_execution = client.create_execution(
action_ref=fail_action["ref"], parameters={}
)
fail_execution_id = fail_execution["id"]
fail_result = wait_for_execution_status(
client=client,
execution_id=fail_execution_id,
expected_status="failed",
timeout=10,
)
print(f"✓ Fail execution completed: status={fail_result['status']}")
fail_details = client.get_execution(fail_execution_id)
fail_exit_code = fail_details.get("exit_code")
print(f" Exit code: {fail_exit_code}")
# ========================================================================
# STEP 4: Execute timeout action
# ========================================================================
print("\n[STEP 4] Executing timeout action...")
timeout_execution = client.create_execution(
action_ref=timeout_action["ref"], parameters={}
)
timeout_execution_id = timeout_execution["id"]
timeout_result = wait_for_execution_status(
client=client,
execution_id=timeout_execution_id,
expected_status="failed",
timeout=10,
)
print(f"✓ Timeout execution completed: status={timeout_result['status']}")
timeout_details = client.get_execution(timeout_execution_id)
timeout_exit_code = timeout_details.get("exit_code")
print(f" Exit code: {timeout_exit_code}")
# ========================================================================
# STEP 5: Compare failure types
# ========================================================================
print("\n[STEP 5] Comparing failure types...")
print(f"\n Immediate Failure:")
print(f" - Exit code: {fail_exit_code}")
print(f" - Expected: 1 (explicit exit code)")
print(f"\n Timeout Failure:")
print(f" - Exit code: {timeout_exit_code}")
print(f" - Expected: negative or signal code (e.g., -15, 137, 143)")
# Different exit codes suggest different failure types
if fail_exit_code != timeout_exit_code:
print("\n ✓ Exit codes differ (different failure types)")
else:
print("\n Exit codes same (may not distinguish timeout)")
# ========================================================================
# FINAL SUMMARY
# ========================================================================
print("\n" + "=" * 80)
print("TEST SUMMARY: Timeout vs Regular Failure")
print("=" * 80)
print(f"✓ Regular failure exit code: {fail_exit_code}")
print(f"✓ Timeout failure exit code: {timeout_exit_code}")
print(f"✓ Both failures handled appropriately")
print("\n✅ TEST PASSED: Failure types distinguishable!")
print("=" * 80 + "\n")