llama-stack-mirror/tests/integration/telemetry/test_otel_e2e.py
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""
End-to-end integration tests for OpenTelemetry with automatic instrumentation.
HOW THIS WORKS:
1. Starts a mock OTLP collector (HTTP server) to receive telemetry
2. Starts a mock vLLM server to handle inference requests
3. Starts REAL Llama Stack with: opentelemetry-instrument llama stack run
4. Makes REAL API calls to the stack
5. Verifies telemetry was exported to the mock collector
WHERE TO MAKE CHANGES:
- Add test cases → See TEST_CASES list below (line ~70)
- Add mock servers → See MOCK_SERVERS list in mock_servers fixture (line ~200)
- Modify mock behavior → See mocking/servers.py
- Change stack config → See llama_stack_server fixture (line ~250)
- Add assertions → See TestOTelE2E class (line ~370)
RUNNING THE TESTS:
- Quick (mock servers only): pytest test_otel_e2e.py::TestMockServers -v
- Full E2E (slow): pytest test_otel_e2e.py::TestOTelE2E -v -m slow
"""
# ============================================================================
# IMPORTS
# ============================================================================
import os
import socket
import subprocess
import time
from typing import Any
import pytest
import requests
import yaml
from pydantic import BaseModel, Field
# Mock servers are in the mocking/ subdirectory
from .mocking import (
MockOTLPCollector,
MockServerConfig,
MockVLLMServer,
start_mock_servers_async,
stop_mock_servers,
)
# ============================================================================
# DATA MODELS
# ============================================================================
class TelemetryTestCase(BaseModel):
"""
Pydantic model defining expected telemetry for an API call.
**TO ADD A NEW TEST CASE:** Add to TEST_CASES list below.
"""
name: str = Field(description="Unique test case identifier")
http_method: str = Field(description="HTTP method (GET, POST, etc.)")
api_path: str = Field(description="API path (e.g., '/v1/models')")
request_body: dict[str, Any] | None = Field(default=None)
expected_http_status: int = Field(default=200)
expected_trace_exports: int = Field(default=1, description="Minimum number of trace exports expected")
expected_metric_exports: int = Field(default=0, description="Minimum number of metric exports expected")
should_have_error_span: bool = Field(default=False)
# ============================================================================
# TEST CONFIGURATION
# **TO ADD NEW TESTS:** Add TelemetryTestCase instances here
# ============================================================================
TEST_CASES = [
TelemetryTestCase(
name="models_list",
http_method="GET",
api_path="/v1/models",
expected_trace_exports=1,
expected_metric_exports=1, # HTTP metrics from OTel provider middleware
),
TelemetryTestCase(
name="chat_completion",
http_method="POST",
api_path="/v1/inference/chat_completion",
request_body={
"model": "meta-llama/Llama-3.2-1B-Instruct",
"messages": [{"role": "user", "content": "Hello!"}],
},
expected_trace_exports=2, # Stack request + vLLM backend call
expected_metric_exports=1, # HTTP metrics (duration, count, active_requests)
),
]
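# Example of an additional case (a sketch, not registered by default): the health endpoint
# used by the server-readiness check below should produce at least one trace export; the
# expected export counts here are assumptions to tune against a real run.
# TelemetryTestCase(
#     name="health_check",
#     http_method="GET",
#     api_path="/v1/health",
#     expected_trace_exports=1,
# ),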
# ============================================================================
# TEST INFRASTRUCTURE
# ============================================================================
class TelemetryTestRunner:
"""
Executes TelemetryTestCase instances against real Llama Stack.
**HOW IT WORKS:**
1. Makes real HTTP request to the stack
2. Waits for telemetry export
3. Verifies exports were sent to mock collector
"""
def __init__(self, base_url: str, collector: MockOTLPCollector):
self.base_url = base_url
self.collector = collector
def run_test_case(self, test_case: TelemetryTestCase, verbose: bool = False) -> bool:
"""Execute a single test case and verify telemetry."""
initial_traces = self.collector.get_trace_count()
initial_metrics = self.collector.get_metric_count()
if verbose:
print(f"\n--- {test_case.name} ---")
print(f" {test_case.http_method} {test_case.api_path}")
# Make real HTTP request to Llama Stack
try:
url = f"{self.base_url}{test_case.api_path}"
if test_case.http_method == "GET":
response = requests.get(url, timeout=5)
elif test_case.http_method == "POST":
response = requests.post(url, json=test_case.request_body or {}, timeout=5)
else:
response = requests.request(test_case.http_method, url, timeout=5)
if verbose:
print(f" HTTP Response: {response.status_code}")
status_match = response.status_code == test_case.expected_http_status
except requests.exceptions.RequestException as e:
if verbose:
print(f" Request failed: {e}")
status_match = False
        # Wait for automatic instrumentation to export telemetry.
        # Traces are exported by the batch span processor shortly after the request completes;
        # metrics export every 500 ms (OTEL_METRIC_EXPORT_INTERVAL, set in the llama_stack_server fixture).
        time.sleep(2.0)  # Wait for both traces and metrics to export
# Verify traces were exported to mock collector
new_traces = self.collector.get_trace_count() - initial_traces
traces_exported = new_traces >= test_case.expected_trace_exports
# Verify metrics were exported (if expected)
new_metrics = self.collector.get_metric_count() - initial_metrics
metrics_exported = new_metrics >= test_case.expected_metric_exports
if verbose:
print(
f" Expected: >={test_case.expected_trace_exports} trace exports, >={test_case.expected_metric_exports} metric exports"
)
print(f" Actual: {new_traces} trace exports, {new_metrics} metric exports")
        result = status_match and traces_exported and metrics_exported
        if verbose:
            print(f" Result: {'PASS' if result else 'FAIL'}")
        return result
def run_all_test_cases(self, test_cases: list[TelemetryTestCase], verbose: bool = True) -> dict[str, bool]:
"""Run all test cases and return results."""
results = {}
for test_case in test_cases:
results[test_case.name] = self.run_test_case(test_case, verbose=verbose)
return results
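# Usage sketch for TelemetryTestRunner outside the fixtures below (assumes a stack is already
# listening at the given base_url and `collector` is a running MockOTLPCollector):
#     runner = TelemetryTestRunner("http://localhost:5555", collector)
#     results = runner.run_all_test_cases(TEST_CASES, verbose=True)
#     failed = [name for name, ok in results.items() if not ok]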
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
def is_port_available(port: int) -> bool:
"""Check if a TCP port is available for binding."""
try:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
sock.bind(("localhost", port))
return True
except OSError:
return False
# ============================================================================
# PYTEST FIXTURES
# ============================================================================
@pytest.fixture(scope="module")
def mock_servers():
"""
Fixture: Start all mock servers in parallel using async harness.
**TO ADD A NEW MOCK SERVER:**
Just add a MockServerConfig to the MOCK_SERVERS list below.
"""
import asyncio
# ========================================================================
# MOCK SERVER CONFIGURATION
# **TO ADD A NEW MOCK:** Just add a MockServerConfig instance below
#
# Example:
# MockServerConfig(
# name="Mock MyService",
# server_class=MockMyService, # Must inherit from MockServerBase
# init_kwargs={"port": 9000, "param": "value"},
# ),
# ========================================================================
mock_servers_config = [
MockServerConfig(
name="Mock OTLP Collector",
server_class=MockOTLPCollector,
init_kwargs={"port": 4318},
),
MockServerConfig(
name="Mock vLLM Server",
server_class=MockVLLMServer,
init_kwargs={
"port": 8000,
"models": ["meta-llama/Llama-3.2-1B-Instruct"],
},
),
# Add more mock servers here - they will start in parallel automatically!
]
# Start all servers in parallel
servers = asyncio.run(start_mock_servers_async(mock_servers_config))
# Verify vLLM models
models_response = requests.get("http://localhost:8000/v1/models", timeout=1)
models_data = models_response.json()
print(f"[INFO] Mock vLLM serving {len(models_data['data'])} models: {[m['id'] for m in models_data['data']]}")
yield servers
# Stop all servers
stop_mock_servers(servers)
@pytest.fixture(scope="module")
def mock_otlp_collector(mock_servers):
"""Convenience fixture to get OTLP collector from mock_servers."""
return mock_servers["Mock OTLP Collector"]
@pytest.fixture(scope="module")
def mock_vllm_server(mock_servers):
"""Convenience fixture to get vLLM server from mock_servers."""
return mock_servers["Mock vLLM Server"]
@pytest.fixture(scope="module")
def llama_stack_server(tmp_path_factory, mock_otlp_collector, mock_vllm_server):
"""
Fixture: Start real Llama Stack server with automatic OTel instrumentation.
**THIS IS THE MAIN FIXTURE** - it runs:
opentelemetry-instrument llama stack run --config run.yaml
**TO MODIFY STACK CONFIG:** Edit run_config dict below
"""
config_dir = tmp_path_factory.mktemp("otel-stack-config")
# Ensure mock vLLM is ready and accessible before starting Llama Stack
print("\n[INFO] Verifying mock vLLM is accessible at http://localhost:8000...")
try:
vllm_models = requests.get("http://localhost:8000/v1/models", timeout=2)
print(f"[INFO] Mock vLLM models endpoint response: {vllm_models.status_code}")
except Exception as e:
pytest.fail(f"Mock vLLM not accessible before starting Llama Stack: {e}")
# Create run.yaml with inference provider
# **TO ADD MORE PROVIDERS:** Add to providers dict
run_config = {
"image_name": "test-otel-e2e",
"apis": ["inference"],
"providers": {
"inference": [
{
"provider_id": "vllm",
"provider_type": "remote::vllm",
"config": {
"url": "http://localhost:8000/v1",
},
},
],
},
"models": [
{
"model_id": "meta-llama/Llama-3.2-1B-Instruct",
"provider_id": "vllm",
}
],
}
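    # Sketch of an additional entry for run_config["providers"]["inference"]; the provider_id
    # and port below are hypothetical and would need a matching mock backend:
    # {
    #     "provider_id": "vllm-secondary",
    #     "provider_type": "remote::vllm",
    #     "config": {"url": "http://localhost:8001/v1"},
    # },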
config_file = config_dir / "run.yaml"
with open(config_file, "w") as f:
yaml.dump(run_config, f)
# Find available port for Llama Stack
port = 5555
while not is_port_available(port) and port < 5600:
port += 1
if port >= 5600:
pytest.skip("No available ports for test server")
# Set environment variables for OTel instrumentation
# NOTE: These only affect the subprocess, not other tests
env = os.environ.copy()
env["OTEL_EXPORTER_OTLP_ENDPOINT"] = "http://localhost:4318"
env["OTEL_EXPORTER_OTLP_PROTOCOL"] = "http/protobuf" # Ensure correct protocol
env["OTEL_SERVICE_NAME"] = "llama-stack-e2e-test"
env["LLAMA_STACK_PORT"] = str(port)
env["OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED"] = "true"
# Configure fast metric export for testing (default is 60 seconds)
# This makes metrics export every 500ms instead of every 60 seconds
env["OTEL_METRIC_EXPORT_INTERVAL"] = "500" # milliseconds
env["OTEL_METRIC_EXPORT_TIMEOUT"] = "1000" # milliseconds
# Disable inference recording to ensure real requests to our mock vLLM
# This is critical - without this, Llama Stack replays cached responses
# Safe to remove here as it only affects the subprocess environment
if "LLAMA_STACK_TEST_INFERENCE_MODE" in env:
del env["LLAMA_STACK_TEST_INFERENCE_MODE"]
# Start server with automatic instrumentation
cmd = [
"opentelemetry-instrument", # ← Automatic instrumentation wrapper
"llama",
"stack",
"run",
str(config_file),
"--port",
str(port),
]
print(f"\n[INFO] Starting Llama Stack with OTel instrumentation on port {port}")
print(f"[INFO] Command: {' '.join(cmd)}")
process = subprocess.Popen(
cmd,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
)
# Wait for server to start
max_wait = 30
base_url = f"http://localhost:{port}"
    for i in range(max_wait):
        try:
            response = requests.get(f"{base_url}/v1/health", timeout=1)
            if response.status_code == 200:
                print(f"[INFO] Server ready at {base_url}")
                break
        except requests.exceptions.RequestException:
            pass
        # Fail fast with the captured output if the server never becomes healthy
        # (covers both connection errors and persistent non-200 responses).
        if i == max_wait - 1:
            process.terminate()
            stdout, stderr = process.communicate(timeout=5)
            pytest.fail(f"Server failed to start.\nStdout: {stdout}\nStderr: {stderr}")
        time.sleep(1)
yield {
"base_url": base_url,
"port": port,
"collector": mock_otlp_collector,
"vllm_server": mock_vllm_server,
}
# Cleanup
print("\n[INFO] Stopping Llama Stack server")
process.terminate()
try:
process.wait(timeout=5)
except subprocess.TimeoutExpired:
process.kill()
# ============================================================================
# TESTS: End-to-End with Real Stack
# **THESE RUN SLOW** - marked with @pytest.mark.slow
# **TO ADD NEW E2E TESTS:** Add methods to this class
# ============================================================================
@pytest.mark.slow
class TestOTelE2E:
"""
End-to-end tests with real Llama Stack server.
These tests verify the complete flow:
- Real Llama Stack with opentelemetry-instrument
- Real API calls
- Real automatic instrumentation
- Mock OTLP collector captures exports
"""
def test_server_starts_with_auto_instrumentation(self, llama_stack_server):
"""Verify server starts successfully with opentelemetry-instrument."""
base_url = llama_stack_server["base_url"]
# Try different health check endpoints
health_endpoints = ["/health", "/v1/health", "/"]
server_responding = False
for endpoint in health_endpoints:
try:
response = requests.get(f"{base_url}{endpoint}", timeout=5)
print(f"\n[DEBUG] {endpoint} -> {response.status_code}")
if response.status_code == 200:
server_responding = True
break
except Exception as e:
print(f"[DEBUG] {endpoint} failed: {e}")
assert server_responding, f"Server not responding on any endpoint at {base_url}"
print(f"\n[PASS] Llama Stack running with OTel at {base_url}")
def test_all_test_cases_via_runner(self, llama_stack_server):
"""
**MAIN TEST:** Run all TelemetryTestCase instances.
This executes all test cases defined in TEST_CASES list.
**TO ADD MORE TESTS:** Add to TEST_CASES at top of file
"""
base_url = llama_stack_server["base_url"]
collector = llama_stack_server["collector"]
# Create test runner
runner = TelemetryTestRunner(base_url, collector)
# Execute all test cases
results = runner.run_all_test_cases(TEST_CASES, verbose=True)
# Print summary
print(f"\n{'=' * 50}")
print("TEST CASE SUMMARY")
print(f"{'=' * 50}")
passed = sum(1 for p in results.values() if p)
total = len(results)
print(f"Passed: {passed}/{total}\n")
for name, result in results.items():
status = "[PASS]" if result else "[FAIL]"
print(f" {status} {name}")
print(f"{'=' * 50}\n")