# Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. """ End-to-end tests for the OpenTelemetry inline provider. What this does: - Boots mock OTLP and mock vLLM - Starts a real Llama Stack with inline OTel - Calls real HTTP APIs - Verifies traces, metrics, and custom metric names (non-empty) """ # ============================================================================ # IMPORTS # ============================================================================ import os import socket import subprocess import time from typing import Any import pytest import requests import yaml from pydantic import BaseModel, Field # Mock servers are in the mocking/ subdirectory from .mocking import ( MockOTLPCollector, MockServerConfig, MockVLLMServer, start_mock_servers_async, stop_mock_servers, ) # ============================================================================ # DATA MODELS # ============================================================================ class TelemetryTestCase(BaseModel): """ Pydantic model defining expected telemetry for an API call. **TO ADD A NEW TEST CASE:** Add to TEST_CASES list below. """ name: str = Field(description="Unique test case identifier") http_method: str = Field(description="HTTP method (GET, POST, etc.)") api_path: str = Field(description="API path (e.g., '/v1/models')") request_body: dict[str, Any] | None = Field(default=None) expected_http_status: int = Field(default=200) expected_trace_exports: int = Field(default=1, description="Minimum number of trace exports expected") expected_metric_exports: int = Field(default=0, description="Minimum number of metric exports expected") should_have_error_span: bool = Field(default=False) expected_metrics: list[str] = Field( default_factory=list, description="List of metric names that should be captured" ) expected_min_spans: int | None = Field( default=None, description="If set, minimum number of spans expected in the new trace(s) generated by this test" ) # ============================================================================ # TEST CONFIGURATION # **TO ADD NEW TESTS:** Add TelemetryTestCase instances here # ============================================================================ # Custom metric names (defined in llama_stack/providers/inline/telemetry/otel/otel.py) CUSTOM_METRICS_BASE = [ "http.server.request.duration", "http.server.request.count", ] CUSTOM_METRICS_STREAMING = [ "http.server.streaming.duration", "http.server.streaming.count", ] TEST_CASES = [ TelemetryTestCase( name="models_list", http_method="GET", api_path="/v1/models", expected_trace_exports=1, # Single trace with 2-3 spans (GET, http send) expected_metric_exports=1, # Metrics export periodically, but we'll wait for them expected_metrics=[], # First request: middleware may not be initialized yet expected_min_spans=2, ), TelemetryTestCase( name="chat_completion", http_method="POST", api_path="/v1/chat/completions", request_body={ "model": "meta-llama/Llama-3.2-1B-Instruct", "messages": [{"role": "user", "content": "Hello!"}], }, expected_trace_exports=1, # Single trace with 4 spans (POST, http receive, 2x http send) expected_metric_exports=1, # Metrics export periodically expected_metrics=CUSTOM_METRICS_BASE, expected_min_spans=3, ), TelemetryTestCase( name="chat_completion_streaming", http_method="POST", api_path="/v1/chat/completions", request_body={ "model": "meta-llama/Llama-3.2-1B-Instruct", "messages": [{"role": "user", "content": "Streaming test"}], "stream": True, # Enable streaming response }, expected_trace_exports=1, # Single trace with streaming spans expected_metric_exports=1, # Metrics export periodically # Validate both base and streaming metrics with polling expected_metrics=CUSTOM_METRICS_BASE + CUSTOM_METRICS_STREAMING, expected_min_spans=4, ), ] # ============================================================================ # TEST INFRASTRUCTURE # ============================================================================ class TelemetryTestRunner: """ Executes TelemetryTestCase instances against real Llama Stack. **HOW IT WORKS:** 1. Makes real HTTP request to the stack 2. Waits for telemetry export 3. Verifies exports were sent to mock collector 4. Validates custom metrics by name (if expected_metrics is specified) 5. Ensures metrics have non-empty data points """ def __init__( self, base_url: str, collector: MockOTLPCollector, poll_timeout_seconds: float = 8.0, poll_interval_seconds: float = 0.1, ): self.base_url = base_url self.collector = collector self.poll_timeout_seconds = poll_timeout_seconds # how long to wait for telemetry to be exported self.poll_interval_seconds = poll_interval_seconds # how often to poll for telemetry def run_test_case(self, test_case: TelemetryTestCase, verbose: bool = False) -> bool: """Execute a single test case and verify telemetry.""" initial_traces = self.collector.get_trace_count() prior_trace_ids = self.collector.get_all_trace_ids() initial_metrics = self.collector.get_metric_count() if verbose: print(f"\n--- {test_case.name} ---") print(f" {test_case.http_method} {test_case.api_path}") if test_case.expected_metrics: print(f" Expected custom metrics: {', '.join(test_case.expected_metrics)}") # Make real HTTP request to Llama Stack is_streaming_test = test_case.request_body and test_case.request_body.get("stream", False) try: url = f"{self.base_url}{test_case.api_path}" # Streaming requests need longer timeout to complete timeout = 10 if is_streaming_test else 5 if test_case.http_method == "GET": response = requests.get(url, timeout=timeout) elif test_case.http_method == "POST": response = requests.post(url, json=test_case.request_body or {}, timeout=timeout) else: response = requests.request(test_case.http_method, url, timeout=timeout) if verbose: print(f" HTTP Response: {response.status_code}") status_match = response.status_code == test_case.expected_http_status except requests.exceptions.RequestException as e: if verbose: print(f" Request exception: {type(e).__name__}") # For streaming requests, exceptions are expected due to mock server behavior # The important part is whether telemetry metrics were captured status_match = is_streaming_test # Pass streaming tests, fail non-streaming # Poll until all telemetry expectations are met or timeout (single loop for speed) missing_metrics: list[str] = [] empty_metrics: list[str] = [] new_trace_ids: set[str] = set() def compute_status() -> tuple[bool, bool, bool, bool]: traces_ok_local = (self.collector.get_trace_count() - initial_traces) >= test_case.expected_trace_exports metrics_count_ok_local = ( self.collector.get_metric_count() - initial_metrics ) >= test_case.expected_metric_exports metrics_ok_local = True if test_case.expected_metrics: missing_metrics.clear() empty_metrics.clear() for metric_name in test_case.expected_metrics: if not self.collector.has_metric(metric_name): missing_metrics.append(metric_name) else: data_points = self.collector.get_metric_by_name(metric_name) if len(data_points) == 0: empty_metrics.append(metric_name) metrics_ok_local = len(missing_metrics) == 0 and len(empty_metrics) == 0 spans_ok_local = True if test_case.expected_min_spans is not None: nonlocal new_trace_ids new_trace_ids = self.collector.get_new_trace_ids(prior_trace_ids) if not new_trace_ids: spans_ok_local = False else: counts = self.collector.get_trace_span_counts() min_spans: int = int(test_case.expected_min_spans or 0) spans_ok_local = all(counts.get(tid, 0) >= min_spans for tid in new_trace_ids) return traces_ok_local, metrics_count_ok_local, metrics_ok_local, spans_ok_local # Poll until all telemetry expectations are met or timeout (single loop for speed) start = time.time() traces_ok, metrics_count_ok, metrics_by_name_validated, spans_ok = compute_status() while time.time() - start < self.poll_timeout_seconds: if traces_ok and metrics_count_ok and metrics_by_name_validated and spans_ok: break time.sleep(self.poll_interval_seconds) traces_ok, metrics_count_ok, metrics_by_name_validated, spans_ok = compute_status() if verbose: total_http_requests = len(getattr(self.collector, "all_http_requests", [])) print(f" [DEBUG] OTLP POST requests: {total_http_requests}") print( f" Expected: >={test_case.expected_trace_exports} traces, >={test_case.expected_metric_exports} metrics" ) print( f" Actual: {self.collector.get_trace_count() - initial_traces} traces, {self.collector.get_metric_count() - initial_metrics} metrics" ) if test_case.expected_metrics: print(" Custom metrics:") for metric_name in test_case.expected_metrics: n = len(self.collector.get_metric_by_name(metric_name)) status = "✓" if n > 0 else "✗" print(f" {status} {metric_name}: {n}") if missing_metrics: print(f" Missing: {missing_metrics}") if empty_metrics: print(f" Empty: {empty_metrics}") if test_case.expected_min_spans is not None: counts = self.collector.get_trace_span_counts() span_counts = {tid: counts[tid] for tid in new_trace_ids} print(f" New trace IDs: {sorted(new_trace_ids)}") print(f" Span counts: {span_counts}") result = bool( (status_match or is_streaming_test) and traces_ok and metrics_count_ok and metrics_by_name_validated and spans_ok ) print(f" Result: {'PASS' if result else 'FAIL'}") return bool( (status_match or is_streaming_test) and traces_ok and metrics_count_ok and metrics_by_name_validated and spans_ok ) def run_all_test_cases(self, test_cases: list[TelemetryTestCase], verbose: bool = True) -> dict[str, bool]: """Run all test cases and return results.""" results = {} for test_case in test_cases: results[test_case.name] = self.run_test_case(test_case, verbose=verbose) return results # ============================================================================ # HELPER FUNCTIONS # ============================================================================ def is_port_available(port: int) -> bool: """Check if a TCP port is available for binding.""" try: with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: sock.bind(("localhost", port)) return True except OSError: return False # ============================================================================ # PYTEST FIXTURES # ============================================================================ @pytest.fixture(scope="module") def mock_servers(): """ Fixture: Start all mock servers in parallel using async harness. **TO ADD A NEW MOCK SERVER:** Just add a MockServerConfig to the MOCK_SERVERS list below. """ import asyncio # ======================================================================== # MOCK SERVER CONFIGURATION # **TO ADD A NEW MOCK:** Just add a MockServerConfig instance below # # Example: # MockServerConfig( # name="Mock MyService", # server_class=MockMyService, # Must inherit from MockServerBase # init_kwargs={"port": 9000, "param": "value"}, # ), # ======================================================================== mock_servers_config = [ MockServerConfig( name="Mock OTLP Collector", server_class=MockOTLPCollector, init_kwargs={"port": 4318}, ), MockServerConfig( name="Mock vLLM Server", server_class=MockVLLMServer, init_kwargs={ "port": 8000, "models": ["meta-llama/Llama-3.2-1B-Instruct"], }, ), # Add more mock servers here - they will start in parallel automatically! ] # Start all servers in parallel servers = asyncio.run(start_mock_servers_async(mock_servers_config)) # Verify vLLM models models_response = requests.get("http://localhost:8000/v1/models", timeout=1) models_data = models_response.json() print(f"[INFO] Mock vLLM serving {len(models_data['data'])} models: {[m['id'] for m in models_data['data']]}") yield servers # Stop all servers stop_mock_servers(servers) @pytest.fixture(scope="module") def mock_otlp_collector(mock_servers): """Convenience fixture to get OTLP collector from mock_servers.""" return mock_servers["Mock OTLP Collector"] @pytest.fixture(scope="module") def mock_vllm_server(mock_servers): """Convenience fixture to get vLLM server from mock_servers.""" return mock_servers["Mock vLLM Server"] @pytest.fixture(scope="module") def llama_stack_server(tmp_path_factory, mock_otlp_collector, mock_vllm_server): """ Fixture: Start real Llama Stack server with inline OTel provider. **THIS IS THE MAIN FIXTURE** - it runs: opentelemetry-instrument llama stack run --config run.yaml **TO MODIFY STACK CONFIG:** Edit run_config dict below """ config_dir = tmp_path_factory.mktemp("otel-stack-config") # Ensure mock vLLM is ready and accessible before starting Llama Stack print("\n[INFO] Verifying mock vLLM is accessible at http://localhost:8000...") try: vllm_models = requests.get("http://localhost:8000/v1/models", timeout=2) print(f"[INFO] Mock vLLM models endpoint response: {vllm_models.status_code}") except Exception as e: pytest.fail(f"Mock vLLM not accessible before starting Llama Stack: {e}") # Create run.yaml with inference and telemetry providers # **TO ADD MORE PROVIDERS:** Add to providers dict run_config = { "image_name": "test-otel-e2e", "apis": ["inference"], "providers": { "inference": [ { "provider_id": "vllm", "provider_type": "remote::vllm", "config": { "url": "http://localhost:8000/v1", }, }, ], "telemetry": [ { "provider_id": "otel", "provider_type": "inline::otel", "config": { "service_name": "llama-stack-e2e-test", "span_processor": "simple", }, }, ], }, "models": [ { "model_id": "meta-llama/Llama-3.2-1B-Instruct", "provider_id": "vllm", } ], } config_file = config_dir / "run.yaml" with open(config_file, "w") as f: yaml.dump(run_config, f) # Find available port for Llama Stack port = 5555 while not is_port_available(port) and port < 5600: port += 1 if port >= 5600: pytest.skip("No available ports for test server") # Set environment variables for OTel instrumentation # NOTE: These only affect the subprocess, not other tests env = os.environ.copy() env["OTEL_EXPORTER_OTLP_ENDPOINT"] = "http://localhost:4318" env["OTEL_EXPORTER_OTLP_PROTOCOL"] = "http/protobuf" # Ensure correct protocol env["OTEL_SERVICE_NAME"] = "llama-stack-e2e-test" env["OTEL_SPAN_PROCESSOR"] = "simple" # Force simple processor for immediate export env["LLAMA_STACK_PORT"] = str(port) env["OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED"] = "true" # Configure fast metric export for testing (default is 60 seconds) # This makes metrics export every 500ms instead of every 60 seconds env["OTEL_METRIC_EXPORT_INTERVAL"] = "500" # milliseconds env["OTEL_METRIC_EXPORT_TIMEOUT"] = "1000" # milliseconds # Disable inference recording to ensure real requests to our mock vLLM # This is critical - without this, Llama Stack replays cached responses # Safe to remove here as it only affects the subprocess environment if "LLAMA_STACK_TEST_INFERENCE_MODE" in env: del env["LLAMA_STACK_TEST_INFERENCE_MODE"] # Start server with automatic instrumentation cmd = [ "opentelemetry-instrument", # ← Automatic instrumentation wrapper "llama", "stack", "run", str(config_file), "--port", str(port), ] print(f"\n[INFO] Starting Llama Stack with OTel instrumentation on port {port}") print(f"[INFO] Command: {' '.join(cmd)}") process = subprocess.Popen( cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, # Merge stderr into stdout text=True, ) # Wait for server to start max_wait = 30 base_url = f"http://localhost:{port}" startup_output = [] for i in range(max_wait): # Collect server output non-blocking import select if process.stdout and select.select([process.stdout], [], [], 0)[0]: line = process.stdout.readline() if line: startup_output.append(line) try: response = requests.get(f"{base_url}/v1/health", timeout=1) if response.status_code == 200: print(f"[INFO] Server ready at {base_url}") # Print relevant initialization logs print(f"[DEBUG] Captured {len(startup_output)} lines of server output") relevant_logs = [ line for line in startup_output if any(keyword in line.lower() for keyword in ["telemetry", "otel", "provider", "error creating"]) ] if relevant_logs: print("[DEBUG] Relevant server logs:") for log in relevant_logs[-10:]: # Last 10 relevant lines print(f" {log.strip()}") time.sleep(0.5) break except requests.exceptions.RequestException: if i == max_wait - 1: process.terminate() stdout, _ = process.communicate(timeout=5) pytest.fail(f"Server failed to start.\nOutput: {stdout}") time.sleep(1) yield { "base_url": base_url, "port": port, "collector": mock_otlp_collector, "vllm_server": mock_vllm_server, } # Cleanup print("\n[INFO] Stopping Llama Stack server") process.terminate() try: process.wait(timeout=5) except subprocess.TimeoutExpired: process.kill() # ============================================================================ # TESTS: End-to-End with Real Stack # **THESE RUN SLOW** - marked with @pytest.mark.slow # **TO ADD NEW E2E TESTS:** Add methods to this class # ============================================================================ @pytest.mark.slow class TestOTelE2E: """ End-to-end tests with real Llama Stack server. These tests verify the complete flow: - Real Llama Stack with inline OTel provider - Real API calls - Automatic trace and metric collection - Mock OTLP collector captures exports """ def test_server_starts_with_auto_instrumentation(self, llama_stack_server): """Verify server starts successfully with inline OTel provider.""" base_url = llama_stack_server["base_url"] # Try different health check endpoints health_endpoints = ["/health", "/v1/health", "/"] server_responding = False for endpoint in health_endpoints: try: response = requests.get(f"{base_url}{endpoint}", timeout=5) print(f"\n[DEBUG] {endpoint} -> {response.status_code}") if response.status_code == 200: server_responding = True break except Exception as e: print(f"[DEBUG] {endpoint} failed: {e}") assert server_responding, f"Server not responding on any endpoint at {base_url}" print(f"\n[PASS] Llama Stack running with OTel at {base_url}") def test_all_test_cases_via_runner(self, llama_stack_server): """ **MAIN TEST:** Run all TelemetryTestCase instances with custom metrics validation. This executes all test cases defined in TEST_CASES list and validates: 1. Traces are exported to the collector 2. Metrics are exported to the collector 3. Custom metrics (defined in CUSTOM_METRICS_BASE, CUSTOM_METRICS_STREAMING) are captured by name with non-empty data points Each test case specifies which metrics to validate via expected_metrics field. **TO ADD MORE TESTS:** - Add TelemetryTestCase to TEST_CASES (line ~132) - Reference CUSTOM_METRICS_BASE or CUSTOM_METRICS_STREAMING in expected_metrics - See examples in existing test cases **TO ADD NEW METRICS:** - Add metric to otel.py - Add metric name to CUSTOM_METRICS_BASE or CUSTOM_METRICS_STREAMING (line ~122) - Update test cases that should validate it """ base_url = llama_stack_server["base_url"] collector = llama_stack_server["collector"] # Create test runner runner = TelemetryTestRunner(base_url, collector) # Execute all test cases (set verbose=False for cleaner output) results = runner.run_all_test_cases(TEST_CASES, verbose=False) print(f"\n{'=' * 50}\nTEST CASE SUMMARY\n{'=' * 50}") passed = sum(1 for p in results.values() if p) total = len(results) print(f"Passed: {passed}/{total}\n") failed = [name for name, ok in results.items() if not ok] for name, ok in results.items(): print(f" {'[PASS]' if ok else '[FAIL]'} {name}") print(f"{'=' * 50}\n") assert not failed, f"Some test cases failed: {failed}"