llama-stack-mirror/tests/integration/instrumentation/test_otel_e2e.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""
End-to-end tests for the OpenTelemetry inline provider.

What this does:
- Boots mock OTLP and mock vLLM
- Starts a real Llama Stack with inline OTel
- Calls real HTTP APIs
- Verifies traces, metrics, and custom metric names (non-empty)
"""

# ============================================================================
# IMPORTS
# ============================================================================

import os
import socket
import subprocess
import time
from typing import Any

import pytest
import requests
import yaml
from pydantic import BaseModel, Field

# Mock servers are in the mocking/ subdirectory
from .mocking import (
    MockOTLPCollector,
    MockServerConfig,
    MockVLLMServer,
    start_mock_servers_async,
    stop_mock_servers,
)

# ============================================================================
# DATA MODELS
# ============================================================================


class TelemetryTestCase(BaseModel):
    """
    Pydantic model defining expected telemetry for an API call.

    **TO ADD A NEW TEST CASE:** Add to TEST_CASES list below.
    """

    name: str = Field(description="Unique test case identifier")
    http_method: str = Field(description="HTTP method (GET, POST, etc.)")
    api_path: str = Field(description="API path (e.g., '/v1/models')")
    request_body: dict[str, Any] | None = Field(default=None)
    expected_http_status: int = Field(default=200)
    expected_trace_exports: int = Field(default=1, description="Minimum number of trace exports expected")
    expected_metric_exports: int = Field(default=0, description="Minimum number of metric exports expected")
    should_have_error_span: bool = Field(default=False)
    expected_metrics: list[str] = Field(
        default_factory=list, description="List of metric names that should be captured"
    )
    expected_min_spans: int | None = Field(
        default=None, description="If set, minimum number of spans expected in the new trace(s) generated by this test"
    )


# ============================================================================
# TEST CONFIGURATION
# **TO ADD NEW TESTS:** Add TelemetryTestCase instances here
# ============================================================================

# Custom metric names (defined in llama_stack/providers/inline/telemetry/otel/otel.py)

CUSTOM_METRICS_BASE = [
    "http.server.request.duration",
    "http.server.request.count",
]

CUSTOM_METRICS_STREAMING = [
    "http.server.streaming.duration",
    "http.server.streaming.count",
]

TEST_CASES = [
    TelemetryTestCase(
        name="models_list",
        http_method="GET",
        api_path="/v1/models",
        expected_trace_exports=1,  # Single trace with 2-3 spans (GET, http send)
        expected_metric_exports=1,  # Metrics export periodically, but we'll wait for them
        expected_metrics=[],  # First request: middleware may not be initialized yet
        expected_min_spans=2,
    ),
    TelemetryTestCase(
        name="chat_completion",
        http_method="POST",
        api_path="/v1/chat/completions",
        request_body={
            "model": "meta-llama/Llama-3.2-1B-Instruct",
            "messages": [{"role": "user", "content": "Hello!"}],
        },
        expected_trace_exports=1,  # Single trace with 4 spans (POST, http receive, 2x http send)
        expected_metric_exports=1,  # Metrics export periodically
        expected_metrics=CUSTOM_METRICS_BASE,
        expected_min_spans=3,
    ),
    TelemetryTestCase(
        name="chat_completion_streaming",
        http_method="POST",
        api_path="/v1/chat/completions",
        request_body={
            "model": "meta-llama/Llama-3.2-1B-Instruct",
            "messages": [{"role": "user", "content": "Streaming test"}],
            "stream": True,  # Enable streaming response
        },
        expected_trace_exports=1,  # Single trace with streaming spans
        expected_metric_exports=1,  # Metrics export periodically
        # Validate both base and streaming metrics with polling
        expected_metrics=CUSTOM_METRICS_BASE + CUSTOM_METRICS_STREAMING,
        expected_min_spans=4,
    ),
]


# ============================================================================
# TEST INFRASTRUCTURE
# ============================================================================


class TelemetryTestRunner:
    """
    Executes TelemetryTestCase instances against real Llama Stack.

    **HOW IT WORKS:**
    1. Makes real HTTP request to the stack
    2. Waits for telemetry export
    3. Verifies exports were sent to mock collector
    4. Validates custom metrics by name (if expected_metrics is specified)
    5. Ensures metrics have non-empty data points
    """

    def __init__(
        self,
        base_url: str,
        collector: MockOTLPCollector,
        poll_timeout_seconds: float = 8.0,
        poll_interval_seconds: float = 0.1,
    ):
        self.base_url = base_url
        self.collector = collector
        self.poll_timeout_seconds = poll_timeout_seconds  # how long to wait for telemetry to be exported
        self.poll_interval_seconds = poll_interval_seconds  # how often to poll for telemetry

    def run_test_case(self, test_case: TelemetryTestCase, verbose: bool = False) -> bool:
        """Execute a single test case and verify telemetry."""
        initial_traces = self.collector.get_trace_count()
        prior_trace_ids = self.collector.get_all_trace_ids()
        initial_metrics = self.collector.get_metric_count()

        if verbose:
            print(f"\n--- {test_case.name} ---")
            print(f"  {test_case.http_method} {test_case.api_path}")
            if test_case.expected_metrics:
                print(f"  Expected custom metrics: {', '.join(test_case.expected_metrics)}")

        # Make real HTTP request to Llama Stack
        is_streaming_test = test_case.request_body and test_case.request_body.get("stream", False)
        try:
            url = f"{self.base_url}{test_case.api_path}"

            # Streaming requests need longer timeout to complete
            timeout = 10 if is_streaming_test else 5

            if test_case.http_method == "GET":
                response = requests.get(url, timeout=timeout)
            elif test_case.http_method == "POST":
                response = requests.post(url, json=test_case.request_body or {}, timeout=timeout)
            else:
                response = requests.request(test_case.http_method, url, timeout=timeout)

            if verbose:
                print(f"  HTTP Response: {response.status_code}")

            status_match = response.status_code == test_case.expected_http_status

        except requests.exceptions.RequestException as e:
            if verbose:
                print(f"  Request exception: {type(e).__name__}")
            # For streaming requests, exceptions are expected due to mock server behavior
            # The important part is whether telemetry metrics were captured
            status_match = is_streaming_test  # Pass streaming tests, fail non-streaming

        # Poll until all telemetry expectations are met or timeout (single loop for speed)
        missing_metrics: list[str] = []
        empty_metrics: list[str] = []
        new_trace_ids: set[str] = set()

        def compute_status() -> tuple[bool, bool, bool, bool]:
            traces_ok_local = (self.collector.get_trace_count() - initial_traces) >= test_case.expected_trace_exports
            metrics_count_ok_local = (
                self.collector.get_metric_count() - initial_metrics
            ) >= test_case.expected_metric_exports

            metrics_ok_local = True
            if test_case.expected_metrics:
                missing_metrics.clear()
                empty_metrics.clear()
                for metric_name in test_case.expected_metrics:
                    if not self.collector.has_metric(metric_name):
                        missing_metrics.append(metric_name)
                    else:
                        data_points = self.collector.get_metric_by_name(metric_name)
                        if len(data_points) == 0:
                            empty_metrics.append(metric_name)
                metrics_ok_local = len(missing_metrics) == 0 and len(empty_metrics) == 0

            spans_ok_local = True
            if test_case.expected_min_spans is not None:
                nonlocal new_trace_ids
                new_trace_ids = self.collector.get_new_trace_ids(prior_trace_ids)
                if not new_trace_ids:
                    spans_ok_local = False
                else:
                    counts = self.collector.get_trace_span_counts()
                    min_spans: int = int(test_case.expected_min_spans or 0)
                    spans_ok_local = all(counts.get(tid, 0) >= min_spans for tid in new_trace_ids)

            return traces_ok_local, metrics_count_ok_local, metrics_ok_local, spans_ok_local

        # Poll until all telemetry expectations are met or timeout (single loop for speed)
        start = time.time()
        traces_ok, metrics_count_ok, metrics_by_name_validated, spans_ok = compute_status()
        while time.time() - start < self.poll_timeout_seconds:
            if traces_ok and metrics_count_ok and metrics_by_name_validated and spans_ok:
                break
            time.sleep(self.poll_interval_seconds)
            traces_ok, metrics_count_ok, metrics_by_name_validated, spans_ok = compute_status()

        if verbose:
            total_http_requests = len(getattr(self.collector, "all_http_requests", []))
            print(f"  [DEBUG] OTLP POST requests: {total_http_requests}")
            print(
                f"  Expected: >={test_case.expected_trace_exports} traces, >={test_case.expected_metric_exports} metrics"
            )
            print(
                f"  Actual: {self.collector.get_trace_count() - initial_traces} traces, {self.collector.get_metric_count() - initial_metrics} metrics"
            )

            if test_case.expected_metrics:
                print("  Custom metrics:")
                for metric_name in test_case.expected_metrics:
                    n = len(self.collector.get_metric_by_name(metric_name))
                    status = "✓" if n > 0 else "✗"
                    print(f"    {status} {metric_name}: {n}")
                if missing_metrics:
                    print(f"  Missing: {missing_metrics}")
                if empty_metrics:
                    print(f"  Empty: {empty_metrics}")

            if test_case.expected_min_spans is not None:
                counts = self.collector.get_trace_span_counts()
                span_counts = {tid: counts[tid] for tid in new_trace_ids}
                print(f"  New trace IDs: {sorted(new_trace_ids)}")
                print(f"  Span counts: {span_counts}")

            result = bool(
                (status_match or is_streaming_test)
                and traces_ok
                and metrics_count_ok
                and metrics_by_name_validated
                and spans_ok
            )
            print(f"  Result: {'PASS' if result else 'FAIL'}")

        return bool(
            (status_match or is_streaming_test)
            and traces_ok
            and metrics_count_ok
            and metrics_by_name_validated
            and spans_ok
        )

    def run_all_test_cases(self, test_cases: list[TelemetryTestCase], verbose: bool = True) -> dict[str, bool]:
        """Run all test cases and return results."""
        results = {}
        for test_case in test_cases:
            results[test_case.name] = self.run_test_case(test_case, verbose=verbose)
        return results


# ============================================================================
# HELPER FUNCTIONS
# ============================================================================


def is_port_available(port: int) -> bool:
    """Check if a TCP port is available for binding."""
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.bind(("localhost", port))
            return True
    except OSError:
        return False


# ============================================================================
# PYTEST FIXTURES
# ============================================================================


@pytest.fixture(scope="module")
def mock_servers():
    """
    Fixture: Start all mock servers in parallel using async harness.

    **TO ADD A NEW MOCK SERVER:**
    Just add a MockServerConfig to the MOCK_SERVERS list below.
    """
    import asyncio

    # ========================================================================
    # MOCK SERVER CONFIGURATION
    # **TO ADD A NEW MOCK:** Just add a MockServerConfig instance below
    #
    # Example:
    #   MockServerConfig(
    #       name="Mock MyService",
    #       server_class=MockMyService,  # Must inherit from MockServerBase
    #       init_kwargs={"port": 9000, "param": "value"},
    #   ),
    # ========================================================================
    mock_servers_config = [
        MockServerConfig(
            name="Mock OTLP Collector",
            server_class=MockOTLPCollector,
            init_kwargs={"port": 4318},
        ),
        MockServerConfig(
            name="Mock vLLM Server",
            server_class=MockVLLMServer,
            init_kwargs={
                "port": 8000,
                "models": ["meta-llama/Llama-3.2-1B-Instruct"],
            },
        ),
        # Add more mock servers here - they will start in parallel automatically!
    ]

    # Start all servers in parallel
    servers = asyncio.run(start_mock_servers_async(mock_servers_config))

    # Verify vLLM models
    models_response = requests.get("http://localhost:8000/v1/models", timeout=1)
    models_data = models_response.json()
    print(f"[INFO] Mock vLLM serving {len(models_data['data'])} models: {[m['id'] for m in models_data['data']]}")

    yield servers

    # Stop all servers
    stop_mock_servers(servers)


@pytest.fixture(scope="module")
def mock_otlp_collector(mock_servers):
    """Convenience fixture to get OTLP collector from mock_servers."""
    return mock_servers["Mock OTLP Collector"]


@pytest.fixture(scope="module")
def mock_vllm_server(mock_servers):
    """Convenience fixture to get vLLM server from mock_servers."""
    return mock_servers["Mock vLLM Server"]


@pytest.fixture(scope="module")
def llama_stack_server(tmp_path_factory, mock_otlp_collector, mock_vllm_server):
    """
    Fixture: Start real Llama Stack server with inline OTel provider.

    **THIS IS THE MAIN FIXTURE** - it runs:
        opentelemetry-instrument llama stack run --config run.yaml

    **TO MODIFY STACK CONFIG:** Edit run_config dict below
    """
    config_dir = tmp_path_factory.mktemp("otel-stack-config")

    # Ensure mock vLLM is ready and accessible before starting Llama Stack
    print("\n[INFO] Verifying mock vLLM is accessible at http://localhost:8000...")
    try:
        vllm_models = requests.get("http://localhost:8000/v1/models", timeout=2)
        print(f"[INFO] Mock vLLM models endpoint response: {vllm_models.status_code}")
    except Exception as e:
        pytest.fail(f"Mock vLLM not accessible before starting Llama Stack: {e}")

    # Create run.yaml with inference and telemetry providers
    run_config = {
        "image_name": "test-otel-e2e",
        "apis": ["inference"],
        "providers": {
            "inference": [
                {
                    "provider_id": "vllm",
                    "provider_type": "remote::vllm",
                    "config": {
                        "url": "http://localhost:8000/v1",
                    },
                },
            ],
        },
        "instrumentation": {
            "provider": "otel",  # Discriminator for Pydantic
            "config": {
                "service_name": "llama-stack-e2e-test",
                "span_processor": "simple",
            },
        },
        "server": {
            "host": "127.0.0.1",
        },
        "models": [
            {
                "model_id": "meta-llama/Llama-3.2-1B-Instruct",
                "provider_id": "vllm",
            }
        ],
    }

    config_file = config_dir / "run.yaml"
    with open(config_file, "w") as f:
        yaml.dump(run_config, f)

    # Find available port for Llama Stack
    port = 5555
    while not is_port_available(port) and port < 5600:
        port += 1

    if port >= 5600:
        pytest.skip("No available ports for test server")

    # Set environment variables for OTel instrumentation
    # NOTE: These only affect the subprocess, not other tests
    env = os.environ.copy()
    env["OTEL_EXPORTER_OTLP_ENDPOINT"] = "http://localhost:4318"
    env["OTEL_EXPORTER_OTLP_PROTOCOL"] = "http/protobuf"  # Ensure correct protocol
    env["OTEL_SERVICE_NAME"] = "llama-stack-e2e-test"
    env["OTEL_SPAN_PROCESSOR"] = "simple"  # Force simple processor for immediate export
    env["LLAMA_STACK_PORT"] = str(port)
    env["OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED"] = "true"

    # Configure fast metric export for testing (default is 60 seconds)
    # This makes metrics export every 500ms instead of every 60 seconds
    env["OTEL_METRIC_EXPORT_INTERVAL"] = "500"  # milliseconds
    env["OTEL_METRIC_EXPORT_TIMEOUT"] = "1000"  # milliseconds

    # Disable inference recording to ensure real requests to our mock vLLM
    # This is critical - without this, Llama Stack replays cached responses
    # Safe to remove here as it only affects the subprocess environment
    if "LLAMA_STACK_TEST_INFERENCE_MODE" in env:
        del env["LLAMA_STACK_TEST_INFERENCE_MODE"]

    # Start server with automatic instrumentation
    cmd = [
        "opentelemetry-instrument",  # ← Automatic instrumentation wrapper
        "llama",
        "stack",
        "run",
        str(config_file),
        "--port",
        str(port),
    ]

    print(f"\n[INFO] Starting Llama Stack with OTel instrumentation on port {port}")
    print(f"[INFO] Command: {' '.join(cmd)}")

    process = subprocess.Popen(
        cmd,
        env=env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # Merge stderr into stdout
        text=True,
    )

    # Wait for server to start
    max_wait = 30
    base_url = f"http://127.0.0.1:{port}"
    startup_output = []

    for i in range(max_wait):
        # Collect server output non-blocking
        import select

        if process.stdout and select.select([process.stdout], [], [], 0)[0]:
            line = process.stdout.readline()
            if line:
                startup_output.append(line)

        try:
            response = requests.get(f"{base_url}/v1/health", timeout=1)
            if response.status_code == 200:
                print(f"[INFO] Server ready at {base_url}")
                # Print relevant initialization logs
                print(f"[DEBUG] Captured {len(startup_output)} lines of server output")
                relevant_logs = [
                    line
                    for line in startup_output
                    if any(keyword in line.lower() for keyword in ["telemetry", "otel", "provider", "error creating"])
                ]
                if relevant_logs:
                    print("[DEBUG] Relevant server logs:")
                    for log in relevant_logs[-10:]:  # Last 10 relevant lines
                        print(f"  {log.strip()}")
                time.sleep(0.5)
                break
        except requests.exceptions.RequestException:
            if i == max_wait - 1:
                process.terminate()
                stdout, _ = process.communicate(timeout=5)
                pytest.fail(f"Server failed to start.\nOutput: {stdout}")
            time.sleep(1)

    yield {
        "base_url": base_url,
        "port": port,
        "collector": mock_otlp_collector,
        "vllm_server": mock_vllm_server,
    }

    # Cleanup
    print("\n[INFO] Stopping Llama Stack server")
    process.terminate()
    try:
        process.wait(timeout=5)
    except subprocess.TimeoutExpired:
        process.kill()


# ============================================================================
# TESTS: End-to-End with Real Stack
# **THESE RUN SLOW** - marked with @pytest.mark.slow
# **TO ADD NEW E2E TESTS:** Add methods to this class
# ============================================================================


@pytest.mark.slow
class TestOTelE2E:
    """
    End-to-end tests with real Llama Stack server.

    These tests verify the complete flow:
    - Real Llama Stack with inline OTel provider
    - Real API calls
    - Automatic trace and metric collection
    - Mock OTLP collector captures exports
    """

    def test_server_starts_with_auto_instrumentation(self, llama_stack_server):
        """Verify server starts successfully with inline OTel provider."""
        base_url = llama_stack_server["base_url"]

        # Try different health check endpoints
        health_endpoints = ["/health", "/v1/health", "/"]
        server_responding = False

        for endpoint in health_endpoints:
            try:
                response = requests.get(f"{base_url}{endpoint}", timeout=5)
                print(f"\n[DEBUG] {endpoint} -> {response.status_code}")
                if response.status_code == 200:
                    server_responding = True
                    break
            except Exception as e:
                print(f"[DEBUG] {endpoint} failed: {e}")

        assert server_responding, f"Server not responding on any endpoint at {base_url}"

        print(f"\n[PASS] Llama Stack running with OTel at {base_url}")

    def test_all_test_cases_via_runner(self, llama_stack_server):
        """
        **MAIN TEST:** Run all TelemetryTestCase instances with custom metrics validation.

        This executes all test cases defined in TEST_CASES list and validates:
        1. Traces are exported to the collector
        2. Metrics are exported to the collector
        3. Custom metrics (defined in CUSTOM_METRICS_BASE, CUSTOM_METRICS_STREAMING)
           are captured by name with non-empty data points

        Each test case specifies which metrics to validate via expected_metrics field.

        **TO ADD MORE TESTS:**
        - Add TelemetryTestCase to TEST_CASES (line ~132)
        - Reference CUSTOM_METRICS_BASE or CUSTOM_METRICS_STREAMING in expected_metrics
        - See examples in existing test cases

        **TO ADD NEW METRICS:**
        - Add metric to otel.py
        - Add metric name to CUSTOM_METRICS_BASE or CUSTOM_METRICS_STREAMING (line ~122)
        - Update test cases that should validate it
        """
        base_url = llama_stack_server["base_url"]
        collector = llama_stack_server["collector"]

        # Create test runner
        runner = TelemetryTestRunner(base_url, collector)

        # Execute all test cases (set verbose=False for cleaner output)
        results = runner.run_all_test_cases(TEST_CASES, verbose=False)

        print(f"\n{'=' * 50}\nTEST CASE SUMMARY\n{'=' * 50}")
        passed = sum(1 for p in results.values() if p)
        total = len(results)
        print(f"Passed: {passed}/{total}\n")

        failed = [name for name, ok in results.items() if not ok]
        for name, ok in results.items():
            print(f"  {'[PASS]' if ok else '[FAIL]'} {name}")

        print(f"{'=' * 50}\n")
        assert not failed, f"Some test cases failed: {failed}"