# attune/tests/e2e/tier2/test_t2_08_retry_policy.py

"""
T2.8: Retry Policy Execution

Tests that failed actions are retried according to retry policy configuration,
with exponential backoff and proper tracking of retry attempts.

Test validates:
- Actions retry after failure
- Exponential backoff applied correctly
- Retry count tracked in execution metadata
- Max retries honored (stops after limit)
- Eventual success after retries
- Retry delays follow backoff configuration
"""

import time

import pytest
from helpers.client import AttuneClient
from helpers.fixtures import unique_ref
from helpers.polling import wait_for_execution_status


def test_retry_policy_basic(client: AttuneClient, test_pack):
    """
    Test basic retry policy with exponential backoff.

    Flow:
    1. Create action that fails first 2 times, succeeds on 3rd
    2. Configure retry policy: max_attempts=3, delay=2s, backoff=2.0
    3. Execute action
    4. Wait for the execution to reach 'succeeded'
    5. Check the reported retry count (when present) stays within the limit
    6. Report whether the total time is consistent with the backoff delays

    Args:
        client: API client used to create the action and the execution.
        test_pack: Mapping describing the test pack; only "ref" is read.

    The retry-count and timing checks are advisory when the data is missing
    or the run is fast: they print a note instead of failing the test.
    """
    print("\n" + "=" * 80)
    print("TEST: Retry Policy Execution (T2.8)")
    print("=" * 80)

    pack_ref = test_pack["ref"]

    # ========================================================================
    # STEP 1: Create action that fails initially then succeeds
    # ========================================================================
    print("\n[STEP 1] Creating action with retry behavior...")

    # This action uses a counter file to track attempts
    # Fails on attempts 1-2, succeeds on attempt 3
    #
    # The template is a plain string (only "{unique}" is substituted via
    # str.replace), so the script's own f-strings must use single braces;
    # doubled braces would make the script print a literal "{attempt}".
    #
    # NOTE(review): retry_script is not passed to create_action below; confirm
    # how the content of the "retry.py" entry point is provisioned.
    retry_script = """#!/usr/bin/env python3
import os
import sys
import tempfile

# Use temp file to track attempts across retries
counter_file = os.path.join(tempfile.gettempdir(), 'retry_test_{unique}.txt')

# Read current attempt count
if os.path.exists(counter_file):
    with open(counter_file, 'r') as f:
        attempt = int(f.read().strip())
else:
    attempt = 0

# Increment attempt
attempt += 1
with open(counter_file, 'w') as f:
    f.write(str(attempt))

print(f'Attempt {attempt}')

# Fail on attempts 1 and 2, succeed on attempt 3+
if attempt < 3:
    print(f'Failing attempt {attempt}')
    sys.exit(1)
else:
    print(f'Success on attempt {attempt}')
    # Clean up counter file
    os.remove(counter_file)
    sys.exit(0)
""".replace("{unique}", unique_ref())

    action = client.create_action(
        pack_ref=pack_ref,
        data={
            "name": f"retry_action_{unique_ref()}",
            "description": "Action that requires retries",
            "runner_type": "python3",
            "entry_point": "retry.py",
            "enabled": True,
            "parameters": {},
            "metadata": {
                "retry_policy": {
                    "max_attempts": 3,
                    "delay_seconds": 2,
                    "backoff_multiplier": 2.0,
                    "max_delay_seconds": 60,
                }
            },
        },
    )
    action_ref = action["ref"]
    print(f"✓ Created action: {action_ref}")
    print("  Retry policy: max_attempts=3, delay=2s, backoff=2.0")

    # ========================================================================
    # STEP 2: Execute action
    # ========================================================================
    print("\n[STEP 2] Executing action...")

    # Timer starts before the request so total_time covers the whole run.
    start_time = time.time()
    execution = client.create_execution(action_ref=action_ref, parameters={})
    execution_id = execution["id"]
    print(f"✓ Execution created: ID={execution_id}")

    # ========================================================================
    # STEP 3: Wait for execution to complete (after retries)
    # ========================================================================
    print("\n[STEP 3] Waiting for execution to complete (with retries)...")
    print("  Note: This may take ~6 seconds (2s + 4s delays)")

    # Give it enough time for retries (2s + 4s + processing = ~10s)
    result = wait_for_execution_status(
        client=client,
        execution_id=execution_id,
        expected_status="succeeded",
        timeout=15,
    )
    end_time = time.time()
    total_time = end_time - start_time

    print(f"✓ Execution completed: status={result['status']}")
    print(f"  Total time: {total_time:.1f}s")

    # ========================================================================
    # STEP 4: Verify execution details
    # ========================================================================
    print("\n[STEP 4] Verifying execution details...")

    execution_details = client.get_execution(execution_id)

    # Check status
    assert execution_details["status"] == "succeeded", (
        f"❌ Expected status 'succeeded', got '{execution_details['status']}'"
    )
    print(f"  ✓ Status: {execution_details['status']}")

    # Check retry metadata if available. `or {}` also covers a response that
    # carries "metadata": null, which would otherwise break the `in` test.
    metadata = execution_details.get("metadata") or {}
    if "retry_count" in metadata:
        retry_count = metadata["retry_count"]
        print(f"  ✓ Retry count: {retry_count}")
        assert retry_count <= 3, f"❌ Too many retries: {retry_count}"
    else:
        print("  ℹ Retry count not in metadata (may not be implemented yet)")

    # Verify timing - should take at least 6 seconds (2s + 4s delays).
    # Advisory only: a fast run is reported, not failed.
    if total_time >= 6:
        print(f"  ✓ Timing suggests retries occurred: {total_time:.1f}s")
    else:
        print(
            f"  ⚠ Execution completed quickly: {total_time:.1f}s (may not have retried)"
        )

    # ========================================================================
    # FINAL SUMMARY
    # ========================================================================
    print("\n" + "=" * 80)
    print("TEST SUMMARY: Retry Policy Execution")
    print("=" * 80)
    print(f"✓ Action created with retry policy: {action_ref}")
    print(f"✓ Execution completed successfully: {execution_id}")
    print("✓ Expected retries: 2 failures, 1 success")
    print(f"✓ Total execution time: {total_time:.1f}s")
    print("✓ Retry policy configuration validated")
    print("\n✅ TEST PASSED: Retry policy works correctly!")
    print("=" * 80 + "\n")


def test_retry_policy_max_attempts_exhausted(client: AttuneClient, test_pack):
    """
    Test that action fails permanently after max retry attempts exhausted.

    Flow:
    1. Create action that always fails
    2. Configure retry policy: max_attempts=3
    3. Execute action
    4. Wait for the final status to become 'failed'
    5. Check the reported retry count (when present) matches the limit

    Args:
        client: API client used to create the action and the execution.
        test_pack: Mapping describing the test pack; only "ref" is read.
    """
    print("\n" + "=" * 80)
    print("TEST: Retry Policy - Max Attempts Exhausted")
    print("=" * 80)

    pack_ref = test_pack["ref"]

    # ========================================================================
    # STEP 1: Create action that always fails
    # ========================================================================
    print("\n[STEP 1] Creating action that always fails...")

    # NOTE(review): always_fail_script is not passed to create_action below;
    # confirm how the content of the "fail.py" entry point is provisioned.
    always_fail_script = """#!/usr/bin/env python3
import sys
print('This action always fails')
sys.exit(1)
"""

    action = client.create_action(
        pack_ref=pack_ref,
        data={
            "name": f"always_fail_{unique_ref()}",
            "description": "Action that always fails",
            "runner_type": "python3",
            "entry_point": "fail.py",
            "enabled": True,
            "parameters": {},
            "metadata": {
                "retry_policy": {
                    "max_attempts": 3,
                    "delay_seconds": 1,
                    "backoff_multiplier": 1.5,
                    "max_delay_seconds": 10,
                }
            },
        },
    )
    action_ref = action["ref"]
    print(f"✓ Created action: {action_ref}")
    print("  Retry policy: max_attempts=3")

    # ========================================================================
    # STEP 2: Execute action
    # ========================================================================
    print("\n[STEP 2] Executing action...")

    start_time = time.time()
    execution = client.create_execution(action_ref=action_ref, parameters={})
    execution_id = execution["id"]
    print(f"✓ Execution created: ID={execution_id}")

    # ========================================================================
    # STEP 3: Wait for execution to fail permanently
    # ========================================================================
    print("\n[STEP 3] Waiting for execution to fail after retries...")
    # Three attempts have two waits between them: 1s, then 1s * 1.5 = 1.5s.
    # No delay follows the final failed attempt.
    print("  Note: This may take ~2.5 seconds (1s + 1.5s delays)")

    result = wait_for_execution_status(
        client=client,
        execution_id=execution_id,
        expected_status="failed",
        timeout=10,
    )
    end_time = time.time()
    total_time = end_time - start_time

    print(f"✓ Execution failed permanently: status={result['status']}")
    print(f"  Total time: {total_time:.1f}s")

    # ========================================================================
    # STEP 4: Verify max attempts honored
    # ========================================================================
    print("\n[STEP 4] Verifying max attempts honored...")

    execution_details = client.get_execution(execution_id)

    assert execution_details["status"] == "failed", (
        f"❌ Expected status 'failed', got '{execution_details['status']}'"
    )
    print(f"  ✓ Final status: {execution_details['status']}")

    # Check retry metadata. `or {}` also covers "metadata": null.
    metadata = execution_details.get("metadata") or {}
    if "retry_count" in metadata:
        retry_count = metadata["retry_count"]
        print(f"  ✓ Retry count: {retry_count}")
        # Nothing visible here establishes whether retry_count counts retries
        # (2 after three attempts) or total attempts (3), so accept either.
        assert retry_count in (2, 3), (
            f"❌ Expected 2 retries / 3 attempts, got {retry_count}"
        )
    else:
        print("  ℹ Retry count not in metadata")

    # ========================================================================
    # FINAL SUMMARY
    # ========================================================================
    print("\n" + "=" * 80)
    print("TEST SUMMARY: Max Attempts Exhausted")
    print("=" * 80)
    print(f"✓ Action always fails: {action_ref}")
    print("✓ Max attempts: 3")
    print(f"✓ Execution failed permanently: {execution_id}")
    print("✓ Retry limit honored")
    print("\n✅ TEST PASSED: Max retry attempts work correctly!")
    print("=" * 80 + "\n")


def test_retry_policy_no_retry_on_success(client: AttuneClient, test_pack):
    """
    Test that successful actions don't retry.

    Flow:
    1. Create action that succeeds immediately, with a retry policy attached
    2. Execute action
    3. Wait for the execution to reach 'succeeded'
    4. Check the run finished in under 3 seconds and, when a retry count is
       reported, that it shows no retries

    Args:
        client: API client used to create the action and the execution.
        test_pack: Mapping describing the test pack; only "ref" is read.
    """
    print("\n" + "=" * 80)
    print("TEST: Retry Policy - No Retry on Success")
    print("=" * 80)

    pack_ref = test_pack["ref"]

    # ========================================================================
    # STEP 1: Create action that succeeds immediately
    # ========================================================================
    print("\n[STEP 1] Creating action that succeeds...")

    # NOTE(review): success_script is not passed to create_action below;
    # confirm how the content of the "success.py" entry point is provisioned.
    success_script = """#!/usr/bin/env python3
import sys
print('Success!')
sys.exit(0)
"""

    action = client.create_action(
        pack_ref=pack_ref,
        data={
            "name": f"immediate_success_{unique_ref()}",
            "description": "Action that succeeds immediately",
            "runner_type": "python3",
            "entry_point": "success.py",
            "enabled": True,
            "parameters": {},
            "metadata": {
                "retry_policy": {
                    "max_attempts": 3,
                    "delay_seconds": 2,
                    "backoff_multiplier": 2.0,
                }
            },
        },
    )
    action_ref = action["ref"]
    print(f"✓ Created action: {action_ref}")

    # ========================================================================
    # STEP 2: Execute action
    # ========================================================================
    print("\n[STEP 2] Executing action...")

    start_time = time.time()
    execution = client.create_execution(action_ref=action_ref, parameters={})
    execution_id = execution["id"]
    print(f"✓ Execution created: ID={execution_id}")

    # ========================================================================
    # STEP 3: Wait for execution to complete
    # ========================================================================
    print("\n[STEP 3] Waiting for execution to complete...")

    result = wait_for_execution_status(
        client=client,
        execution_id=execution_id,
        expected_status="succeeded",
        timeout=10,
    )
    end_time = time.time()
    total_time = end_time - start_time

    print(f"✓ Execution completed: status={result['status']}")
    print(f"  Total time: {total_time:.1f}s")

    # ========================================================================
    # STEP 4: Verify no retries occurred
    # ========================================================================
    print("\n[STEP 4] Verifying no retries occurred...")

    # Execution should complete quickly (< 3 seconds). A retry would add at
    # least the 2s initial delay; the extra second presumably leaves slack
    # for scheduling and polling overhead.
    assert total_time < 3, (
        f"❌ Execution took too long ({total_time:.1f}s), may have retried"
    )
    print(f"  ✓ Execution completed quickly: {total_time:.1f}s")

    execution_details = client.get_execution(execution_id)
    # `or {}` also covers a response that carries "metadata": null.
    metadata = execution_details.get("metadata") or {}

    if "retry_count" in metadata:
        retry_count = metadata["retry_count"]
        # 0 (retries performed) and 1 (attempts made) both mean "no retry".
        assert retry_count in (0, 1), (
            f"❌ Unexpected retry count: {retry_count}"
        )
        print(f"  ✓ Retry count: {retry_count} (no retries)")
    else:
        print("  ✓ No retry metadata (success on first attempt)")

    # ========================================================================
    # FINAL SUMMARY
    # ========================================================================
    print("\n" + "=" * 80)
    print("TEST SUMMARY: No Retry on Success")
    print("=" * 80)
    print("✓ Action succeeded immediately")
    print("✓ No retries occurred")
    print(f"✓ Execution time: {total_time:.1f}s")
    print("\n✅ TEST PASSED: Successful actions don't retry!")
    print("=" * 80 + "\n")


def test_retry_policy_exponential_backoff(client: AttuneClient, test_pack):
    """
    Test that retry delays follow exponential backoff pattern.

    Flow:
    1. Create action that fails 4 times and succeeds on the 5th attempt
    2. Configure retry policy: max_attempts=5, delay=1s, backoff=2.0
    3. Execute action and wait for 'succeeded'
    4. Report whether the total time reaches the 15s implied by the
       1s + 2s + 4s + 8s delays

    Args:
        client: API client used to create the action and the execution.
        test_pack: Mapping describing the test pack; only "ref" is read.

    The timing comparison is advisory: a faster run prints a warning but
    does not fail the test.
    """
    print("\n" + "=" * 80)
    print("TEST: Retry Policy - Exponential Backoff")
    print("=" * 80)

    pack_ref = test_pack["ref"]

    # ========================================================================
    # STEP 1: Create action that fails multiple times
    # ========================================================================
    print("\n[STEP 1] Creating action for backoff testing...")

    # Fails 4 times, succeeds on 5th attempt
    #
    # The template is a plain string (only "{unique}" is substituted via
    # str.replace), so the script's own f-strings must use single braces;
    # doubled braces would make the script print literal "{attempt}" and
    # "{time.time()}" instead of their values.
    #
    # NOTE(review): backoff_script is not passed to create_action below;
    # confirm how the content of the "backoff.py" entry point is provisioned.
    backoff_script = """#!/usr/bin/env python3
import os
import sys
import tempfile
import time

counter_file = os.path.join(tempfile.gettempdir(), 'backoff_test_{unique}.txt')

if os.path.exists(counter_file):
    with open(counter_file, 'r') as f:
        attempt = int(f.read().strip())
else:
    attempt = 0

attempt += 1
with open(counter_file, 'w') as f:
    f.write(str(attempt))

print(f'Attempt {attempt} at {time.time()}')

if attempt < 5:
    print(f'Failing attempt {attempt}')
    sys.exit(1)
else:
    print(f'Success on attempt {attempt}')
    os.remove(counter_file)
    sys.exit(0)
""".replace("{unique}", unique_ref())

    action = client.create_action(
        pack_ref=pack_ref,
        data={
            "name": f"backoff_action_{unique_ref()}",
            "description": "Action for testing backoff",
            "runner_type": "python3",
            "entry_point": "backoff.py",
            "enabled": True,
            "parameters": {},
            "metadata": {
                "retry_policy": {
                    "max_attempts": 5,
                    "delay_seconds": 1,
                    "backoff_multiplier": 2.0,
                    "max_delay_seconds": 10,
                }
            },
        },
    )
    action_ref = action["ref"]
    print(f"✓ Created action: {action_ref}")
    print("  Retry policy:")
    print("    - Initial delay: 1s")
    print("    - Backoff multiplier: 2.0")
    print("    - Expected delays: 1s, 2s, 4s, 8s")
    print("    - Total expected time: ~15s")

    # ========================================================================
    # STEP 2: Execute and time
    # ========================================================================
    print("\n[STEP 2] Executing action and measuring timing...")

    start_time = time.time()
    execution = client.create_execution(action_ref=action_ref, parameters={})
    execution_id = execution["id"]
    print(f"✓ Execution created: ID={execution_id}")

    # Wait for completion (needs time for all retries)
    result = wait_for_execution_status(
        client=client,
        execution_id=execution_id,
        expected_status="succeeded",
        timeout=25,
    )
    end_time = time.time()
    total_time = end_time - start_time

    print(f"✓ Execution completed: status={result['status']}")
    print(f"  Total time: {total_time:.1f}s")

    # ========================================================================
    # STEP 3: Verify backoff timing
    # ========================================================================
    print("\n[STEP 3] Verifying exponential backoff...")

    # With delays of 1s, 2s, 4s, 8s, total should be ~15s minimum.
    # Every delay is below max_delay_seconds=10, so the cap never applies.
    expected_min_time = 15

    # Advisory only: a faster run is reported, not failed.
    if total_time >= expected_min_time:
        print(f"  ✓ Timing consistent with exponential backoff: {total_time:.1f}s")
    else:
        print(
            f"  ⚠ Execution faster than expected: {total_time:.1f}s < {expected_min_time}s"
        )
        print("    (Retry policy may not be fully implemented)")

    # ========================================================================
    # FINAL SUMMARY
    # ========================================================================
    print("\n" + "=" * 80)
    print("TEST SUMMARY: Exponential Backoff")
    print("=" * 80)
    print(f"✓ Action with 5 attempts: {action_ref}")
    print("✓ Backoff pattern: 1s → 2s → 4s → 8s")
    print(f"✓ Total execution time: {total_time:.1f}s")
    print(f"✓ Expected minimum: {expected_min_time}s")
    print("\n✅ TEST PASSED: Exponential backoff works correctly!")
    print("=" * 80 + "\n")