feat(telemetry:major): End to End Testing, Metric Capture, SQL Alchemy Injection

2025-12-15 02:52:37 +00:00 · 2025-10-03 12:17:41 -04:00 · 2025-10-03 12:17:41 -04:00 · 7e3cf1fb20
commit 7e3cf1fb20
parent e815738936
26 changed files with 2075 additions and 1006 deletions
--- a/tests/integration/telemetry/test_otel_e2e.py
+++ b/tests/integration/telemetry/test_otel_e2e.py
@ -0,0 +1,622 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""
+End-to-end tests for the OpenTelemetry inline provider.
+
+What this does:
+- Boots mock OTLP and mock vLLM
+- Starts a real Llama Stack with inline OTel
+- Calls real HTTP APIs
+- Verifies traces, metrics, and custom metric names (non-empty)
+"""
+
+# ============================================================================
+# IMPORTS
+# ============================================================================
+
+import os
+import socket
+import subprocess
+import time
+from typing import Any
+
+import pytest
+import requests
+import yaml
+from pydantic import BaseModel, Field
+
+# Mock servers are in the mocking/ subdirectory
+from .mocking import (
+    MockOTLPCollector,
+    MockServerConfig,
+    MockVLLMServer,
+    start_mock_servers_async,
+    stop_mock_servers,
+)
+
+# ============================================================================
+# DATA MODELS
+# ============================================================================
+
+
+class TelemetryTestCase(BaseModel):
+    """
+    Declarative description of one API call and the telemetry it should produce.
+
+    The first group of fields describes the HTTP request to issue; the second
+    group describes what the mock OTLP collector is expected to have received
+    afterwards.
+
+    **TO ADD A NEW TEST CASE:** Add to TEST_CASES list below.
+    """
+
+    # --- Request to issue ---------------------------------------------------
+    name: str = Field(description="Unique test case identifier")
+    http_method: str = Field(description="HTTP method (GET, POST, etc.)")
+    api_path: str = Field(description="API path (e.g., '/v1/models')")
+    # Optional JSON payload; a truthy "stream" key marks the case as streaming.
+    request_body: dict[str, Any] | None = None
+    expected_http_status: int = 200
+
+    # --- Telemetry expected at the collector --------------------------------
+    expected_trace_exports: int = Field(
+        default=1,
+        description="Minimum number of trace exports expected",
+    )
+    expected_metric_exports: int = Field(
+        default=0,
+        description="Minimum number of metric exports expected",
+    )
+    # NOTE(review): nothing visible in this file reads this flag - confirm it is still needed.
+    should_have_error_span: bool = False
+    expected_metrics: list[str] = Field(
+        default_factory=list,
+        description="List of metric names that should be captured",
+    )
+    expected_min_spans: int | None = Field(
+        default=None,
+        description="If set, minimum number of spans expected in the new trace(s) generated by this test",
+    )
+
+
+# ============================================================================
+# TEST CONFIGURATION
+# **TO ADD NEW TESTS:** Add TelemetryTestCase instances here
+# ============================================================================
+
+# Custom metric names (defined in llama_stack/providers/inline/telemetry/otel/otel.py)
+
+# Metric names asserted for both chat-completion cases below (streaming and non-streaming).
+CUSTOM_METRICS_BASE = [
+    "http.server.request.duration",
+    "http.server.request.count",
+]
+
+# Extra metric names asserted only for the streaming chat-completion case below.
+CUSTOM_METRICS_STREAMING = [
+    "http.server.streaming.duration",
+    "http.server.streaming.count",
+]
+
+# Cases are executed in list order by TelemetryTestRunner.run_all_test_cases.
+# NOTE: expected_min_spans is a lower bound, so it may be smaller than the
+# span count quoted in the neighbouring comment.
+TEST_CASES = [
+    TelemetryTestCase(
+        name="models_list",
+        http_method="GET",
+        api_path="/v1/models",
+        expected_trace_exports=1,  # Single trace with 2-3 spans (GET, http send)
+        expected_metric_exports=1,  # Metrics export periodically, but we'll wait for them
+        expected_metrics=[],  # First request: middleware may not be initialized yet, so no names are asserted
+        expected_min_spans=2,
+    ),
+    TelemetryTestCase(
+        name="chat_completion",
+        http_method="POST",
+        api_path="/v1/chat/completions",
+        request_body={
+            "model": "meta-llama/Llama-3.2-1B-Instruct",
+            "messages": [{"role": "user", "content": "Hello!"}],
+        },
+        expected_trace_exports=1,  # Single trace with 4 spans (POST, http receive, 2x http send)
+        expected_metric_exports=1,  # Metrics export periodically
+        expected_metrics=CUSTOM_METRICS_BASE,
+        expected_min_spans=3,  # Lower bound; see the span breakdown above
+    ),
+    TelemetryTestCase(
+        name="chat_completion_streaming",
+        http_method="POST",
+        api_path="/v1/chat/completions",
+        request_body={
+            "model": "meta-llama/Llama-3.2-1B-Instruct",
+            "messages": [{"role": "user", "content": "Streaming test"}],
+            "stream": True,  # Enable streaming response
+        },
+        expected_trace_exports=1,  # Single trace with streaming spans
+        expected_metric_exports=1,  # Metrics export periodically
+        # Validate both base and streaming metrics with polling
+        expected_metrics=CUSTOM_METRICS_BASE + CUSTOM_METRICS_STREAMING,
+        expected_min_spans=4,
+    ),
+]
+
+
+# ============================================================================
+# TEST INFRASTRUCTURE
+# ============================================================================
+
+
+class TelemetryTestRunner:
+    """
+    Executes TelemetryTestCase instances against real Llama Stack.
+
+    **HOW IT WORKS:**
+    1. Makes real HTTP request to the stack
+    2. Polls the mock collector until every expectation is met or the timeout expires
+    3. Verifies trace/metric exports were sent to mock collector
+    4. Validates custom metrics by name (if expected_metrics is specified)
+    5. Ensures metrics have non-empty data points
+    6. Checks the span count of each newly seen trace (if expected_min_spans is specified)
+    """
+
+    def __init__(
+        self,
+        base_url: str,
+        collector: MockOTLPCollector,
+        poll_timeout_seconds: float = 8.0,
+        poll_interval_seconds: float = 0.1,
+    ):
+        """
+        Args:
+            base_url: Root URL of the running stack; ``api_path`` is appended verbatim.
+            collector: Mock OTLP collector that receives the stack's exports.
+            poll_timeout_seconds: How long to wait for telemetry to be exported.
+            poll_interval_seconds: How often to poll the collector while waiting.
+        """
+        self.base_url = base_url
+        self.collector = collector
+        self.poll_timeout_seconds = poll_timeout_seconds
+        self.poll_interval_seconds = poll_interval_seconds
+
+    def run_test_case(self, test_case: TelemetryTestCase, verbose: bool = False) -> bool:
+        """
+        Execute a single test case and verify telemetry.
+
+        Returns:
+            True when every telemetry expectation was met before the poll
+            timeout and the HTTP status matched. Streaming cases are exempt
+            from the status check (the mock server may cut the stream short);
+            only their telemetry is judged.
+        """
+        # Snapshot collector state so export counts and trace ids are measured
+        # relative to this call rather than to the whole session.
+        initial_traces = self.collector.get_trace_count()
+        prior_trace_ids = self.collector.get_all_trace_ids()
+        initial_metrics = self.collector.get_metric_count()
+
+        if verbose:
+            print(f"\n--- {test_case.name} ---")
+            print(f"  {test_case.http_method} {test_case.api_path}")
+            if test_case.expected_metrics:
+                print(f"  Expected custom metrics: {', '.join(test_case.expected_metrics)}")
+
+        is_streaming_test = bool(test_case.request_body and test_case.request_body.get("stream", False))
+        status_match = self._send_request(test_case, is_streaming_test, verbose)
+
+        # Poll until all telemetry expectations are met or timeout (single loop for speed).
+        # A monotonic clock keeps the timeout immune to wall-clock adjustments.
+        deadline = time.monotonic() + self.poll_timeout_seconds
+        status = self._telemetry_status(test_case, initial_traces, initial_metrics, prior_trace_ids)
+        while not status["ok"] and time.monotonic() < deadline:
+            time.sleep(self.poll_interval_seconds)
+            status = self._telemetry_status(test_case, initial_traces, initial_metrics, prior_trace_ids)
+
+        result = bool((status_match or is_streaming_test) and status["ok"])
+
+        if verbose:
+            self._print_report(test_case, status, initial_traces, initial_metrics, result)
+
+        return result
+
+    def run_all_test_cases(self, test_cases: list[TelemetryTestCase], verbose: bool = True) -> dict[str, bool]:
+        """Run all test cases in order and return ``{test case name: passed}``."""
+        return {test_case.name: self.run_test_case(test_case, verbose=verbose) for test_case in test_cases}
+
+    def _send_request(self, test_case: TelemetryTestCase, is_streaming_test: bool, verbose: bool) -> bool:
+        """Issue the HTTP request and report whether the status matched expectations."""
+        url = f"{self.base_url}{test_case.api_path}"
+
+        # Streaming requests need longer timeout to complete
+        timeout = 10 if is_streaming_test else 5
+
+        # GET never carries a body, POST always sends JSON (an empty object when
+        # no body is configured), and any other method forwards the configured
+        # body so it is not silently dropped.
+        if test_case.http_method == "GET":
+            payload = None
+        elif test_case.http_method == "POST":
+            payload = test_case.request_body or {}
+        else:
+            payload = test_case.request_body
+
+        try:
+            response = requests.request(test_case.http_method, url, json=payload, timeout=timeout)
+        except requests.exceptions.RequestException as e:
+            if verbose:
+                print(f"  Request exception: {type(e).__name__}")
+            # For streaming requests, exceptions are expected due to mock server behavior.
+            # The important part is whether telemetry metrics were captured.
+            return is_streaming_test  # Pass streaming tests, fail non-streaming
+
+        if verbose:
+            print(f"  HTTP Response: {response.status_code}")
+        return response.status_code == test_case.expected_http_status
+
+    def _telemetry_status(
+        self,
+        test_case: TelemetryTestCase,
+        initial_traces: int,
+        initial_metrics: int,
+        prior_trace_ids: Any,
+    ) -> dict[str, Any]:
+        """Take one reading of the collector and compare it with the test case's expectations."""
+        traces_ok = (self.collector.get_trace_count() - initial_traces) >= test_case.expected_trace_exports
+        metrics_count_ok = (self.collector.get_metric_count() - initial_metrics) >= test_case.expected_metric_exports
+
+        missing_metrics: list[str] = []
+        empty_metrics: list[str] = []
+        for metric_name in test_case.expected_metrics:
+            if not self.collector.has_metric(metric_name):
+                missing_metrics.append(metric_name)
+            elif len(self.collector.get_metric_by_name(metric_name)) == 0:
+                empty_metrics.append(metric_name)
+        metrics_ok = not missing_metrics and not empty_metrics
+
+        new_trace_ids: set[str] = set()
+        spans_ok = True
+        if test_case.expected_min_spans is not None:
+            min_spans = test_case.expected_min_spans
+            new_trace_ids = self.collector.get_new_trace_ids(prior_trace_ids)
+            counts = self.collector.get_trace_span_counts()
+            # At least one new trace must exist, and every new trace must be big enough.
+            spans_ok = bool(new_trace_ids) and all(counts.get(tid, 0) >= min_spans for tid in new_trace_ids)
+
+        return {
+            "ok": traces_ok and metrics_count_ok and metrics_ok and spans_ok,
+            "missing_metrics": missing_metrics,
+            "empty_metrics": empty_metrics,
+            "new_trace_ids": new_trace_ids,
+        }
+
+    def _print_report(
+        self,
+        test_case: TelemetryTestCase,
+        status: dict[str, Any],
+        initial_traces: int,
+        initial_metrics: int,
+        result: bool,
+    ) -> None:
+        """Print the verbose expected-vs-actual summary for one test case."""
+        total_http_requests = len(getattr(self.collector, "all_http_requests", []))
+        print(f"  [DEBUG] OTLP POST requests: {total_http_requests}")
+        print(f"  Expected: >={test_case.expected_trace_exports} traces, >={test_case.expected_metric_exports} metrics")
+        print(
+            f"  Actual: {self.collector.get_trace_count() - initial_traces} traces, "
+            f"{self.collector.get_metric_count() - initial_metrics} metrics"
+        )
+
+        if test_case.expected_metrics:
+            print("  Custom metrics:")
+            for metric_name in test_case.expected_metrics:
+                # Same has_metric guard as the polling code, so a metric that
+                # never arrived is reported as 0 instead of being looked up.
+                if self.collector.has_metric(metric_name):
+                    n = len(self.collector.get_metric_by_name(metric_name))
+                else:
+                    n = 0
+                marker = "✓" if n > 0 else "✗"
+                print(f"    {marker} {metric_name}: {n}")
+            if status["missing_metrics"]:
+                print(f"  Missing: {status['missing_metrics']}")
+            if status["empty_metrics"]:
+                print(f"  Empty: {status['empty_metrics']}")
+
+        if test_case.expected_min_spans is not None:
+            counts = self.collector.get_trace_span_counts()
+            # .get() mirrors the polling code: a new trace id without a recorded
+            # span count is reported as 0 rather than raising KeyError.
+            span_counts = {tid: counts.get(tid, 0) for tid in status["new_trace_ids"]}
+            print(f"  New trace IDs: {sorted(status['new_trace_ids'])}")
+            print(f"  Span counts: {span_counts}")
+
+        print(f"  Result: {'PASS' if result else 'FAIL'}")
+
+
+# ============================================================================
+# HELPER FUNCTIONS
+# ============================================================================
+
+
+def is_port_available(port: int) -> bool:
+    """Return True if a TCP socket can currently be bound to ``localhost:port``."""
+    bindable = True
+    try:
+        # The probe socket is closed again as soon as the bind attempt finishes.
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
+            probe.bind(("localhost", port))
+    except OSError:
+        bindable = False
+    return bindable
+
+
+# ============================================================================
+# PYTEST FIXTURES
+# ============================================================================
+
+
+@pytest.fixture(scope="module")
+def mock_servers():
+    """
+    Fixture: Start all mock servers in parallel using async harness.
+
+    Yields the mapping returned by ``start_mock_servers_async``; the
+    convenience fixtures in this module look servers up in it by their
+    ``MockServerConfig.name``. The servers are stopped again even when the
+    post-start sanity check fails.
+
+    **TO ADD A NEW MOCK SERVER:**
+    Just add a MockServerConfig to the MOCK_SERVERS list below.
+    """
+    import asyncio
+
+    # ========================================================================
+    # MOCK SERVER CONFIGURATION
+    # **TO ADD A NEW MOCK:** Just add a MockServerConfig instance below
+    #
+    # Example:
+    #   MockServerConfig(
+    #       name="Mock MyService",
+    #       server_class=MockMyService,  # Must inherit from MockServerBase
+    #       init_kwargs={"port": 9000, "param": "value"},
+    #   ),
+    # ========================================================================
+    mock_servers_config = [
+        MockServerConfig(
+            name="Mock OTLP Collector",
+            server_class=MockOTLPCollector,
+            init_kwargs={"port": 4318},
+        ),
+        MockServerConfig(
+            name="Mock vLLM Server",
+            server_class=MockVLLMServer,
+            init_kwargs={
+                "port": 8000,
+                "models": ["meta-llama/Llama-3.2-1B-Instruct"],
+            },
+        ),
+        # Add more mock servers here - they will start in parallel automatically!
+    ]
+
+    # Start all servers in parallel
+    servers = asyncio.run(start_mock_servers_async(mock_servers_config))
+
+    # From here on the servers are running, so everything (including the
+    # sanity check) sits inside try/finally to guarantee they get stopped.
+    try:
+        # Verify vLLM models
+        models_response = requests.get("http://localhost:8000/v1/models", timeout=1)
+        models_response.raise_for_status()  # fail with a clear HTTP error instead of a JSON/KeyError
+        models_data = models_response.json()
+        print(f"[INFO] Mock vLLM serving {len(models_data['data'])} models: {[m['id'] for m in models_data['data']]}")
+
+        yield servers
+    finally:
+        # Stop all servers
+        stop_mock_servers(servers)
+
+
+@pytest.fixture(scope="module")
+def mock_otlp_collector(mock_servers):
+    """Expose the mock OTLP collector from the ``mock_servers`` mapping."""
+    collector = mock_servers["Mock OTLP Collector"]
+    return collector
+
+
+@pytest.fixture(scope="module")
+def mock_vllm_server(mock_servers):
+    """Expose the mock vLLM server from the ``mock_servers`` mapping."""
+    vllm_server = mock_servers["Mock vLLM Server"]
+    return vllm_server
+
+
+@pytest.fixture(scope="module")
+def llama_stack_server(tmp_path_factory, mock_otlp_collector, mock_vllm_server):
+    """
+    Fixture: Start real Llama Stack server with inline OTel provider.
+
+    **THIS IS THE MAIN FIXTURE** - it runs:
+        opentelemetry-instrument llama stack run <run.yaml> --port <port>
+
+    The server's combined stdout/stderr is written to ``server.log`` inside
+    the temporary config directory instead of a pipe, so the server cannot
+    block on a full pipe buffer while the tests are running.
+
+    Yields:
+        dict with ``base_url``, ``port``, ``collector`` and ``vllm_server``.
+
+    The subprocess is terminated on teardown and also when startup fails.
+
+    **TO MODIFY STACK CONFIG:** Edit run_config dict below
+    """
+    config_dir = tmp_path_factory.mktemp("otel-stack-config")
+
+    # Ensure mock vLLM is ready and accessible before starting Llama Stack
+    print("\n[INFO] Verifying mock vLLM is accessible at http://localhost:8000...")
+    try:
+        vllm_models = requests.get("http://localhost:8000/v1/models", timeout=2)
+    except requests.exceptions.RequestException as e:
+        pytest.fail(f"Mock vLLM not accessible before starting Llama Stack: {e}")
+    print(f"[INFO] Mock vLLM models endpoint response: {vllm_models.status_code}")
+
+    # Create run.yaml with inference and telemetry providers
+    # **TO ADD MORE PROVIDERS:** Add to providers dict
+    run_config = {
+        "image_name": "test-otel-e2e",
+        "apis": ["inference"],
+        "providers": {
+            "inference": [
+                {
+                    "provider_id": "vllm",
+                    "provider_type": "remote::vllm",
+                    "config": {
+                        "url": "http://localhost:8000/v1",
+                    },
+                },
+            ],
+            "telemetry": [
+                {
+                    "provider_id": "otel",
+                    "provider_type": "inline::otel",
+                    "config": {
+                        "service_name": "llama-stack-e2e-test",
+                        "span_processor": "simple",
+                    },
+                },
+            ],
+        },
+        "models": [
+            {
+                "model_id": "meta-llama/Llama-3.2-1B-Instruct",
+                "provider_id": "vllm",
+            }
+        ],
+    }
+
+    config_file = config_dir / "run.yaml"
+    with open(config_file, "w", encoding="utf-8") as f:
+        yaml.dump(run_config, f)
+
+    # Find available port for Llama Stack (first free port in 5555..5599)
+    port = next((candidate for candidate in range(5555, 5600) if is_port_available(candidate)), None)
+    if port is None:
+        pytest.skip("No available ports for test server")
+
+    # Set environment variables for OTel instrumentation
+    # NOTE: These only affect the subprocess, not other tests
+    env = os.environ.copy()
+    env["OTEL_EXPORTER_OTLP_ENDPOINT"] = "http://localhost:4318"
+    env["OTEL_EXPORTER_OTLP_PROTOCOL"] = "http/protobuf"  # Ensure correct protocol
+    env["OTEL_SERVICE_NAME"] = "llama-stack-e2e-test"
+    env["OTEL_SPAN_PROCESSOR"] = "simple"  # Force simple processor for immediate export
+    env["LLAMA_STACK_PORT"] = str(port)
+    env["OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED"] = "true"
+
+    # Configure fast metric export for testing (default is 60 seconds)
+    # This makes metrics export every 500ms instead of every 60 seconds
+    env["OTEL_METRIC_EXPORT_INTERVAL"] = "500"  # milliseconds
+    env["OTEL_METRIC_EXPORT_TIMEOUT"] = "1000"  # milliseconds
+
+    # Disable inference recording to ensure real requests to our mock vLLM
+    # This is critical - without this, Llama Stack replays cached responses
+    # Safe to remove here as it only affects the subprocess environment
+    env.pop("LLAMA_STACK_TEST_INFERENCE_MODE", None)
+
+    # Start server with automatic instrumentation
+    cmd = [
+        "opentelemetry-instrument",  # ← Automatic instrumentation wrapper
+        "llama",
+        "stack",
+        "run",
+        str(config_file),
+        "--port",
+        str(port),
+    ]
+
+    print(f"\n[INFO] Starting Llama Stack with OTel instrumentation on port {port}")
+    print(f"[INFO] Command: {' '.join(cmd)}")
+
+    max_wait = 30  # health-check attempts, roughly one per second
+    base_url = f"http://localhost:{port}"
+    log_path = config_dir / "server.log"
+
+    def read_server_log() -> str:
+        """Return everything the server has written to its log so far."""
+        return log_path.read_text(encoding="utf-8", errors="replace")
+
+    with open(log_path, "w", encoding="utf-8") as log_file:
+        process = subprocess.Popen(
+            cmd,
+            env=env,
+            stdout=log_file,  # a file never fills up the way an undrained pipe does
+            stderr=subprocess.STDOUT,  # Merge stderr into stdout
+        )
+
+        try:
+            # Wait for server to start
+            ready = False
+            for _ in range(max_wait):
+                if process.poll() is not None:
+                    # The server exited on its own; waiting longer cannot help.
+                    break
+                try:
+                    response = requests.get(f"{base_url}/v1/health", timeout=1)
+                    ready = response.status_code == 200
+                except requests.exceptions.RequestException:
+                    ready = False
+                if ready:
+                    break
+                # Back off on connection errors and non-200 answers alike.
+                time.sleep(1)
+
+            if not ready:
+                pytest.fail(f"Server failed to start.\nOutput: {read_server_log()}")
+
+            print(f"[INFO] Server ready at {base_url}")
+            # Print relevant initialization logs
+            startup_output = read_server_log().splitlines()
+            print(f"[DEBUG] Captured {len(startup_output)} lines of server output")
+            relevant_logs = [
+                line
+                for line in startup_output
+                if any(keyword in line.lower() for keyword in ["telemetry", "otel", "provider", "error creating"])
+            ]
+            if relevant_logs:
+                print("[DEBUG] Relevant server logs:")
+                for log in relevant_logs[-10:]:  # Last 10 relevant lines
+                    print(f"  {log.strip()}")
+            time.sleep(0.5)
+
+            yield {
+                "base_url": base_url,
+                "port": port,
+                "collector": mock_otlp_collector,
+                "vllm_server": mock_vllm_server,
+            }
+        finally:
+            # Cleanup - also reached when startup fails, so the process never leaks
+            print("\n[INFO] Stopping Llama Stack server")
+            if process.poll() is None:
+                process.terminate()
+            try:
+                process.wait(timeout=5)
+            except subprocess.TimeoutExpired:
+                process.kill()
+                process.wait()  # reap the killed process
+
+
+# ============================================================================
+# TESTS: End-to-End with Real Stack
+# **THESE RUN SLOW** - marked with @pytest.mark.slow
+# **TO ADD NEW E2E TESTS:** Add methods to this class
+# ============================================================================
+
+
+@pytest.mark.slow
+class TestOTelE2E:
+    """
+    End-to-end tests with real Llama Stack server.
+
+    Covered flow:
+    - Real Llama Stack with inline OTel provider
+    - Real API calls
+    - Automatic trace and metric collection
+    - Mock OTLP collector captures exports
+    """
+
+    def test_server_starts_with_auto_instrumentation(self, llama_stack_server):
+        """Verify server starts successfully with inline OTel provider."""
+        base_url = llama_stack_server["base_url"]
+
+        # Probe the candidate health endpoints in order; the first 200 wins.
+        server_responding = False
+        for endpoint in ("/health", "/v1/health", "/"):
+            try:
+                status_code = requests.get(f"{base_url}{endpoint}", timeout=5).status_code
+            except Exception as e:
+                print(f"[DEBUG] {endpoint} failed: {e}")
+                continue
+
+            print(f"\n[DEBUG] {endpoint} -> {status_code}")
+            if status_code == 200:
+                server_responding = True
+                break
+
+        assert server_responding, f"Server not responding on any endpoint at {base_url}"
+
+        print(f"\n[PASS] Llama Stack running with OTel at {base_url}")
+
+    def test_all_test_cases_via_runner(self, llama_stack_server):
+        """
+        **MAIN TEST:** Run all TelemetryTestCase instances with custom metrics validation.
+
+        Every entry of TEST_CASES is executed through TelemetryTestRunner, which checks:
+        1. Traces are exported to the collector
+        2. Metrics are exported to the collector
+        3. Custom metrics (defined in CUSTOM_METRICS_BASE, CUSTOM_METRICS_STREAMING)
+           are captured by name with non-empty data points
+
+        Which metric names a case validates is controlled by its expected_metrics field.
+
+        **TO ADD MORE TESTS:**
+        - Add TelemetryTestCase to the module-level TEST_CASES list
+        - Reference CUSTOM_METRICS_BASE or CUSTOM_METRICS_STREAMING in expected_metrics
+        - See examples in existing test cases
+
+        **TO ADD NEW METRICS:**
+        - Add metric to otel.py
+        - Add metric name to the module-level CUSTOM_METRICS_BASE or CUSTOM_METRICS_STREAMING list
+        - Update test cases that should validate it
+        """
+        runner = TelemetryTestRunner(llama_stack_server["base_url"], llama_stack_server["collector"])
+
+        # Per-case output stays quiet (verbose=False); only the summary below is printed.
+        results = runner.run_all_test_cases(TEST_CASES, verbose=False)
+
+        banner = "=" * 50
+        print(f"\n{banner}\nTEST CASE SUMMARY\n{banner}")
+        passed = len([outcome for outcome in results.values() if outcome])
+        print(f"Passed: {passed}/{len(results)}\n")
+
+        failed = []
+        for name, ok in results.items():
+            if not ok:
+                failed.append(name)
+            label = "[PASS]" if ok else "[FAIL]"
+            print(f"  {label} {name}")
+
+        print(f"{banner}\n")
+        assert not failed, f"Some test cases failed: {failed}"