fix(major::pr): re-architect instrumentation library

This commit is contained in:
Emilio Garcia 2025-10-06 17:54:05 -04:00
parent 7e3cf1fb20
commit 8fe3a25158
21 changed files with 422 additions and 462 deletions

View file

@ -9,7 +9,7 @@ from pathlib import Path
from typing import Annotated, Any, Literal, Self
from urllib.parse import urlparse
from pydantic import BaseModel, Field, field_validator, model_validator
from pydantic import BaseModel, Field, TypeAdapter, field_validator, model_validator
from llama_stack.apis.benchmarks import Benchmark, BenchmarkInput
from llama_stack.apis.datasetio import DatasetIO
@ -26,7 +26,10 @@ from llama_stack.apis.tools import ToolGroup, ToolGroupInput, ToolRuntime
from llama_stack.apis.vector_dbs import VectorDB, VectorDBInput
from llama_stack.apis.vector_io import VectorIO
from llama_stack.core.access_control.datatypes import AccessRule
from llama_stack.core.instrumentation import InstrumentationProvider
from llama_stack.core.utils.dynamic import instantiate_class_type
from llama_stack.providers.datatypes import Api, ProviderSpec
from llama_stack.providers.registry.instrumentation import instrumentation_registry
from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig
from llama_stack.providers.utils.sqlstore.sqlstore import SqlStoreConfig
@ -493,6 +496,12 @@ If not specified, a default SQLite store will be used.""",
logging: LoggingConfig | None = Field(default=None, description="Configuration for Llama Stack Logging")
# Middleware/instrumentation providers (not full APIs)
instrumentation: InstrumentationProvider | None = Field(
default=None,
description="Instrumentation provider for observability",
)
server: ServerConfig = Field(
default_factory=ServerConfig,
description="Configuration for the HTTP(S) server",
@ -517,11 +526,31 @@ If not specified, a default SQLite store will be used.""",
return Path(v)
return v
@field_validator("instrumentation", mode="before")
@classmethod
def load_instrumentation(cls, v: InstrumentationProvider | dict[str, Any] | None):
    """Coerce the raw ``instrumentation`` config value into a provider instance.

    Accepts an already-constructed :class:`InstrumentationProvider` (or ``None``)
    unchanged. A dict from run.yaml is resolved through the string-based
    ``instrumentation_registry`` and the provider/config classes are imported
    lazily via ``instantiate_class_type`` to avoid import cycles.

    :param v: raw config value — provider instance, mapping, or ``None``
    :returns: a validated provider instance, or ``None``
    :raises ValueError: if the value is not a mapping, the ``provider``
        discriminator is missing or not a string, or the provider is unknown
    """
    if v is None or isinstance(v, InstrumentationProvider):
        return v
    # Anything else must be a mapping; calling .get() on e.g. a list would
    # raise an opaque AttributeError instead of a validation error.
    if not isinstance(v, dict):
        raise ValueError("instrumentation must be a mapping or an InstrumentationProvider")
    provider_type = v.get("provider")
    if not isinstance(provider_type, str):
        raise ValueError("instrumentation.provider must be a string")
    entry = instrumentation_registry.get(provider_type)
    if entry is None:
        # Include the registered names so the config error is actionable.
        known = ", ".join(sorted(instrumentation_registry))
        raise ValueError(f"Unknown instrumentation provider: {provider_type} (known providers: {known})")
    cfg_cls = instantiate_class_type(entry.config_class)
    prv_cls = instantiate_class_type(entry.provider_class)
    cfg_data = v.get("config") or {}
    cfg = TypeAdapter(cfg_cls).validate_python(cfg_data)
    return prv_cls(provider=provider_type, config=cfg)
class BuildConfig(BaseModel):
version: int = LLAMA_STACK_BUILD_CONFIG_VERSION
distribution_spec: DistributionSpec = Field(description="The distribution spec to build including API providers. ")
distribution_spec: DistributionSpec = Field(description="The distribution spec to build including API providers.")
image_type: str = Field(
default="venv",
description="Type of package to build (container | venv)",

View file

@ -0,0 +1,33 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""Protocol for instrumentation providers."""
from abc import abstractmethod
from fastapi import FastAPI
from pydantic import BaseModel, Field
class InstrumentationProvider(BaseModel):
    """Abstract base for middleware-style observability providers.

    Subclasses wire tracing/metrics/logging into the server process. Unlike
    full API providers they expose no endpoints of their own — their only hook
    is :meth:`fastapi_middleware`, called once at app construction time.
    """

    # Discriminator used to select the concrete provider from run config.
    provider: str = Field(description="Provider identifier for discriminated unions")
    config: BaseModel

    @abstractmethod
    def fastapi_middleware(self, app: FastAPI) -> None:
        """Attach this provider's middleware/instrumentation to *app*.

        :param app: the FastAPI application to instrument
        """
        ...

View file

@ -400,9 +400,9 @@ def create_app() -> StackApp:
if cors_config:
app.add_middleware(CORSMiddleware, **cors_config.model_dump())
if Api.telemetry in impls:
impls[Api.telemetry].fastapi_middleware(app)
impls[Api.telemetry].sqlalchemy_instrumentation()
# Apply instrumentation provider (e.g., OpenTelemetry)
if config.instrumentation:
config.instrumentation.fastapi_middleware(app)
# Load external APIs if configured
external_apis = load_external_apis(config)

View file

@ -1,22 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from abc import abstractmethod
from fastapi import FastAPI
from pydantic import BaseModel
class TelemetryProvider(BaseModel):
    """Contract for supplying telemetry to the application.

    Concrete providers implement :meth:`fastapi_middleware` to hook their
    tracing/metrics instrumentation into the HTTP server.
    """

    @abstractmethod
    def fastapi_middleware(self, app: FastAPI, *args, **kwargs):
        """Inject FastAPI middleware that instruments the application for telemetry.

        :param app: the FastAPI application to instrument
        """
        ...

View file

@ -3,8 +3,3 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -0,0 +1,27 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Literal
from pydantic import BaseModel, Field
class OTelConfig(BaseModel):
    """Configuration for the OpenTelemetry instrumentation provider.

    Only a couple of knobs live here; the bulk of OTel behavior is driven by
    the standard ``OTEL_*`` environment variables.
    See: https://opentelemetry.io/docs/specs/otel/configuration/sdk-configuration-variables/
    """

    # Logical service name attached to exported telemetry.
    service_name: str | None = Field(
        default=None,
        description="Service name (overridden by OTEL_SERVICE_NAME env var)",
    )
    # "batch" favors throughput; "simple" exports each span immediately.
    span_processor: Literal["batch", "simple"] = Field(
        default="batch",
        description="Span processor type (overridden by OTEL_SPAN_PROCESSOR env var)",
    )

View file

@ -0,0 +1,123 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import time
from opentelemetry import trace
from opentelemetry.metrics import Counter, Histogram
from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
from starlette.types import ASGIApp, Message, Receive, Scope, Send
from llama_stack.log import get_logger
logger = get_logger(name=__name__, category="instrumentation::otel")
class StreamingMetricsMiddleware:
    """ASGI middleware to track streaming response metrics.

    Detects ``text/event-stream`` responses, marks the active span with
    ``http.response.is_streaming``, and records the total stream duration in
    ``http.streaming.total_duration_ms`` when the final body chunk is sent.

    :param app: The ASGI app to wrap
    """

    def __init__(self, app: ASGIApp):
        self.app = app

    async def __call__(self, scope: Scope, receive: Receive, send: Send):
        # Pass through non-HTTP traffic (websocket, lifespan) untouched.
        if scope["type"] != "http":
            await self.app(scope, receive, send)
            return
        logger.debug(f"StreamingMetricsMiddleware called for {scope.get('method')} {scope.get('path')}")
        # perf_counter is monotonic; time.time() is wall-clock and can jump
        # (NTP adjustments), yielding negative or inflated durations.
        start_time = time.perf_counter()
        is_streaming = False

        async def send_wrapper(message: Message):
            nonlocal is_streaming
            # Detect streaming responses by headers
            if message["type"] == "http.response.start":
                # ASGI guarantees header names are lowercase byte strings.
                for name, value in message.get("headers", []):
                    if name == b"content-type" and b"text/event-stream" in value:
                        is_streaming = True
                        # Add streaming attribute to current span
                        current_span = trace.get_current_span()
                        if current_span and current_span.is_recording():
                            current_span.set_attribute("http.response.is_streaming", True)
                        break
            # Record total duration when response body completes
            elif message["type"] == "http.response.body" and not message.get("more_body", False):
                if is_streaming:
                    current_span = trace.get_current_span()
                    if current_span and current_span.is_recording():
                        total_duration_ms = (time.perf_counter() - start_time) * 1000
                        current_span.set_attribute("http.streaming.total_duration_ms", total_duration_ms)
            await send(message)

        await self.app(scope, receive, send_wrapper)
class MetricsSpanExporter(SpanExporter):
    """
    Records additional custom HTTP metrics during otel span export.

    :param request_duration: Histogram to record request duration
    :param streaming_duration: Histogram to record streaming duration
    :param streaming_requests: Counter to record streaming requests
    :param request_count: Counter to record request count
    """

    def __init__(
        self,
        request_duration: Histogram,
        streaming_duration: Histogram,
        streaming_requests: Counter,
        request_count: Counter,
    ):
        self.request_duration = request_duration
        self.streaming_duration = streaming_duration
        self.streaming_requests = streaming_requests
        self.request_count = request_count

    def export(self, spans):
        """Derive HTTP metrics from finished server spans; always reports success."""
        for span in spans:
            # Only HTTP server spans (those carrying http.method) are relevant.
            if not span.attributes or not span.attributes.get("http.method"):
                continue
            logger.debug(f"Processing span: {span.name}")
            if span.end_time is None or span.start_time is None:
                continue
            # Span timestamps are nanoseconds since epoch.
            duration_ms = (span.end_time - span.start_time) / 1_000_000
            is_streaming = span.attributes.get("http.response.is_streaming", False)
            attributes = {
                "http.method": str(span.attributes.get("http.method", "UNKNOWN")),
                "http.route": str(span.attributes.get("http.route", span.attributes.get("http.target", "/"))),
                "http.status_code": str(span.attributes.get("http.status_code", 0)),
                "trace_id": str(span.attributes.get("trace_id", "")),
                "span_id": str(span.attributes.get("span_id", "")),
            }
            # Record request count and duration
            logger.debug(f"Recording metrics: duration={duration_ms}ms, attributes={attributes}")
            self.request_count.add(1, attributes)
            self.request_duration.record(duration_ms, attributes)
            if is_streaming:
                logger.debug(f"MetricsSpanExporter: Recording streaming metrics for {span.name}")
                self.streaming_requests.add(1, attributes)
                stream_duration = span.attributes.get("http.streaming.total_duration_ms")
                # Explicit None check so a legitimate 0 ms stream is still
                # recorded (truthiness would drop it); plain tuple form instead
                # of the unusual tuple-wrapped PEP 604 union.
                if stream_duration is not None and isinstance(stream_duration, (int, float)):
                    self.streaming_duration.record(float(stream_duration), attributes)
        return SpanExportResult.SUCCESS

    def shutdown(self):
        # Nothing to release; instruments are owned by the MeterProvider.
        pass

View file

@ -0,0 +1,148 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
from fastapi import FastAPI
from opentelemetry import metrics, trace
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import (
BatchSpanProcessor,
SimpleSpanProcessor,
)
from llama_stack.core.instrumentation import InstrumentationProvider
from llama_stack.log import get_logger
from .config import OTelConfig
from .middleware import MetricsSpanExporter, StreamingMetricsMiddleware
logger = get_logger(name=__name__, category="instrumentation::otel")
class OTelInstrumentationProvider(InstrumentationProvider):
    """OpenTelemetry instrumentation provider.

    Sets up the global tracer/meter providers at construction time and injects
    FastAPI instrumentation, streaming-aware ASGI middleware, and custom HTTP
    metrics via :meth:`fastapi_middleware`.
    """

    provider: str = "otel"  # Discriminator value

    def model_post_init(self, __context):
        """Initialize OpenTelemetry after Pydantic validation."""
        assert isinstance(self.config, OTelConfig)  # Type hint for IDE/linter
        # Warn if OTLP endpoints not configured
        if not os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT"):
            if not os.environ.get("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"):
                logger.warning("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT not set. Traces will not be exported.")
            if not os.environ.get("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT"):
                logger.warning("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT not set. Metrics will not be exported.")
        resource_attributes = {}
        # Per the OTel configuration spec, environment variables take precedence
        # over code configuration. Resource.create() already detects
        # OTEL_SERVICE_NAME, and explicitly-passed attributes override the
        # detected ones — so only pass the config value when the env var is
        # absent (this is what OTelConfig.service_name documents).
        if self.config.service_name and not os.environ.get("OTEL_SERVICE_NAME"):
            resource_attributes["service.name"] = self.config.service_name
        resource = Resource.create(resource_attributes)
        # Configure the tracer provider (always, since llama stack run spawns subprocess without opentelemetry-instrument)
        tracer_provider = TracerProvider(resource=resource)
        trace.set_tracer_provider(tracer_provider)
        # Configure OTLP span exporter
        otlp_span_exporter = OTLPSpanExporter()
        # OTEL_SPAN_PROCESSOR env var overrides the config value, as documented
        # on OTelConfig.span_processor.
        span_processor = os.environ.get("OTEL_SPAN_PROCESSOR", self.config.span_processor)
        if span_processor == "batch":
            tracer_provider.add_span_processor(BatchSpanProcessor(otlp_span_exporter))
        else:
            tracer_provider.add_span_processor(SimpleSpanProcessor(otlp_span_exporter))
        # Configure meter provider with OTLP exporter for metrics
        metric_reader = PeriodicExportingMetricReader(OTLPMetricExporter())
        meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader])
        metrics.set_meter_provider(meter_provider)
        logger.info("Initialized OpenTelemetry Instrumentation")
        logger.debug(f"OpenTelemetry Instrumentation configuration: {self.config}")

    def fastapi_middleware(self, app: FastAPI):
        """Inject OpenTelemetry middleware into FastAPI.

        Adds FastAPIInstrumentor tracing, a streaming-metrics ASGI middleware,
        and a span-export hook that records custom HTTP counters/histograms.

        :param app: the FastAPI application to instrument
        """
        meter = metrics.get_meter("llama_stack.http.server")
        # HTTP Metrics following OTel semantic conventions
        # https://opentelemetry.io/docs/specs/semconv/http/http-metrics/
        request_duration = meter.create_histogram(
            "http.server.request.duration",
            unit="ms",
            description="Duration of HTTP requests (time-to-first-byte for streaming)",
        )
        streaming_duration = meter.create_histogram(
            "http.server.streaming.duration",
            unit="ms",
            description="Total duration of streaming responses (from start to stream completion)",
        )
        request_count = meter.create_counter(
            "http.server.request.count", unit="requests", description="Total number of HTTP requests"
        )
        streaming_requests = meter.create_counter(
            "http.server.streaming.count", unit="requests", description="Number of streaming requests"
        )

        # Hook to enrich spans and record initial metrics
        def server_request_hook(span, scope):
            """
            Called by FastAPIInstrumentor for each request.

            This only reads from scope (ASGI dict), never touches request body.
            Safe to use without interfering with body parsing.
            """
            method = scope.get("method", "UNKNOWN")
            path = scope.get("path", "/")
            # Add custom attributes
            span.set_attribute("service.component", "llama-stack-api")
            span.set_attribute("http.request", path)
            span.set_attribute("http.method", method)
            attributes = {
                "http.request": path,
                "http.method": method,
                "trace_id": span.attributes.get("trace_id", ""),
                "span_id": span.attributes.get("span_id", ""),
            }
            request_count.add(1, attributes)
            logger.debug(f"server_request_hook: recorded request_count for {method} {path}, attributes={attributes}")

        # NOTE: This is called BEFORE routes are added to the app
        # FastAPIInstrumentor.instrument_app() patches build_middleware_stack(),
        # which will be called on first request (after routes are added), so hooks should work.
        logger.debug("Instrumenting FastAPI (routes will be added later)")
        FastAPIInstrumentor.instrument_app(
            app,
            server_request_hook=server_request_hook,
        )
        logger.debug(f"FastAPI instrumented: {getattr(app, '_is_instrumented_by_opentelemetry', False)}")
        # Add pure ASGI middleware for streaming metrics (always add, regardless of instrumentation)
        app.add_middleware(StreamingMetricsMiddleware)
        # Add metrics span processor
        provider = trace.get_tracer_provider()
        if isinstance(provider, TracerProvider):
            metrics_exporter = MetricsSpanExporter(
                request_duration=request_duration,
                streaming_duration=streaming_duration,
                streaming_requests=streaming_requests,
                request_count=request_count,
            )
            provider.add_span_processor(BatchSpanProcessor(metrics_exporter))

View file

@ -1,32 +0,0 @@
# Open Telemetry Native Instrumentation
This instrumentation package is simple, and follows expected open telemetry standards. It injects middleware for distributed tracing into all ingress and egress points into the application, and can be tuned and configured with OTEL environment variables.
## Set Up
First, bootstrap and install all necessary libraries for open telemetry:
```
uv run opentelemetry-bootstrap -a requirements | uv pip install --requirement -
```
Make sure you export required environment variables for open telemetry:
```
export OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf
export OTEL_EXPORTER_OTLP_ENDPOINT="http://localhost:4318"
```
If you want certain endpoints to be ignored from the FastAPI telemetry, set the following environment variable:
```
export OTEL_PYTHON_FASTAPI_EXCLUDED_URLS="client/.*/info,healthcheck"
```
Finally, run Llama Stack with automatic code injection:
```
uv run opentelemetry-instrument llama stack run --config myconfig.yaml
```
#### Open Telemetry Configuration Environment Variables
Environment Variables: https://opentelemetry.io/docs/specs/otel/configuration/sdk-environment-variables/

View file

@ -1,23 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .config import OTelTelemetryConfig
__all__ = ["OTelTelemetryConfig"]
async def get_provider_impl(config: OTelTelemetryConfig, deps):
    """Registry entry point: construct the OTel telemetry provider.

    Nothing is awaited here — all initialization happens synchronously inside
    Pydantic's ``model_post_init`` when the provider is constructed.

    :param config: validated OTel telemetry configuration
    :param deps: resolved provider dependencies (unused)
    """
    # Imported lazily so merely importing this module stays cheap.
    from .otel import OTelTelemetryProvider

    return OTelTelemetryProvider(config=config)

View file

@ -1,50 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Literal
from pydantic import BaseModel, Field
type BatchSpanProcessor = Literal["batch"]
type SimpleSpanProcessor = Literal["simple"]
class OTelTelemetryConfig(BaseModel):
    """
    The configuration for the OpenTelemetry telemetry provider.

    Most configuration is set using environment variables.
    See https://opentelemetry.io/docs/specs/otel/configuration/sdk-configuration-variables/ for more information.
    """

    # Required logical service name; OTEL_SERVICE_NAME / OTEL_RESOURCE_ATTRIBUTES
    # take precedence at runtime.
    service_name: str = Field(
        description="""The name of the service to be monitored.
        Is overridden by the OTEL_SERVICE_NAME or OTEL_RESOURCE_ATTRIBUTES environment variables.""",
    )
    # Optional version/environment resource attributes (env vars win).
    service_version: str | None = Field(
        default=None,
        description="""The version of the service to be monitored.
        Is overriden by the OTEL_RESOURCE_ATTRIBUTES environment variable.""",
    )
    deployment_environment: str | None = Field(
        default=None,
        description="""The name of the environment of the service to be monitored.
        Is overriden by the OTEL_RESOURCE_ATTRIBUTES environment variable.""",
    )
    # "batch" buffers spans for throughput; "simple" exports immediately.
    span_processor: BatchSpanProcessor | SimpleSpanProcessor | None = Field(
        description="""The span processor to use.
        Is overriden by the OTEL_SPAN_PROCESSOR environment variable.""",
        default="batch",
    )

    @classmethod
    def sample_run_config(cls, __distro_dir__: str = "") -> dict[str, Any]:
        """Sample configuration for use in distributions."""
        # ${env.X:=default} placeholders are expanded by the stack's config loader.
        return {
            "service_name": "${env.OTEL_SERVICE_NAME:=llama-stack}",
            "service_version": "${env.OTEL_SERVICE_VERSION:=}",
            "deployment_environment": "${env.OTEL_DEPLOYMENT_ENVIRONMENT:=}",
            "span_processor": "${env.OTEL_SPAN_PROCESSOR:=batch}",
        }

View file

@ -1,301 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
import time
from fastapi import FastAPI
from opentelemetry import metrics, trace
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
from opentelemetry.metrics import Counter, Histogram
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import (
BatchSpanProcessor,
SimpleSpanProcessor,
SpanExporter,
SpanExportResult,
)
from sqlalchemy import Engine
from starlette.types import ASGIApp, Message, Receive, Scope, Send
from llama_stack.core.telemetry.telemetry import TelemetryProvider
from llama_stack.log import get_logger
from .config import OTelTelemetryConfig
logger = get_logger(name=__name__, category="telemetry::otel")
class StreamingMetricsMiddleware:
    """
    Pure ASGI middleware to track streaming response metrics.

    This follows Starlette best practices by implementing pure ASGI,
    which is more efficient and less prone to bugs than BaseHTTPMiddleware.

    Marks the active span when the response is text/event-stream and records
    the total stream duration on the final body message.
    """

    def __init__(self, app: ASGIApp):
        # Downstream ASGI application to wrap.
        self.app = app

    async def __call__(self, scope: Scope, receive: Receive, send: Send):
        # Pass through non-HTTP traffic (websocket, lifespan) untouched.
        if scope["type"] != "http":
            await self.app(scope, receive, send)
            return
        logger.debug(f"StreamingMetricsMiddleware called for {scope.get('method')} {scope.get('path')}")
        # NOTE(review): time.time() is wall-clock; a monotonic clock would be
        # safer for durations.
        start_time = time.time()
        # Track if this is a streaming response
        is_streaming = False

        async def send_wrapper(message: Message):
            nonlocal is_streaming
            # Detect streaming responses by headers
            if message["type"] == "http.response.start":
                headers = message.get("headers", [])
                # ASGI header names are lowercase byte strings.
                for name, value in headers:
                    if name == b"content-type" and b"text/event-stream" in value:
                        is_streaming = True
                        # Add streaming attribute to current span
                        current_span = trace.get_current_span()
                        if current_span and current_span.is_recording():
                            current_span.set_attribute("http.response.is_streaming", True)
                        break
            # Record total duration when response body completes
            elif message["type"] == "http.response.body" and not message.get("more_body", False):
                if is_streaming:
                    current_span = trace.get_current_span()
                    if current_span and current_span.is_recording():
                        total_duration_ms = (time.time() - start_time) * 1000
                        current_span.set_attribute("http.streaming.total_duration_ms", total_duration_ms)
            await send(message)

        await self.app(scope, receive, send_wrapper)
class MetricsSpanExporter(SpanExporter):
    """Records HTTP metrics from span data.

    Fed spans by a BatchSpanProcessor; extracts request/streaming metrics and
    forwards them to the supplied meter instruments.
    """

    def __init__(
        self,
        request_duration: Histogram,
        streaming_duration: Histogram,
        streaming_requests: Counter,
        request_count: Counter,
    ):
        # Meter instruments owned by the caller; this exporter only records.
        self.request_duration = request_duration
        self.streaming_duration = streaming_duration
        self.streaming_requests = streaming_requests
        self.request_count = request_count

    def export(self, spans):
        """Derive HTTP metrics from finished spans; never fails the export."""
        logger.debug(f"MetricsSpanExporter.export called with {len(spans)} spans")
        for span in spans:
            # Only HTTP server spans (those carrying http.method) are relevant.
            if not span.attributes or not span.attributes.get("http.method"):
                continue
            logger.debug(f"Processing span: {span.name}")
            if span.end_time is None or span.start_time is None:
                continue
            # Calculate time-to-first-byte duration
            # (span timestamps are nanoseconds since epoch)
            duration_ns = span.end_time - span.start_time
            duration_ms = duration_ns / 1_000_000
            # Check if this was a streaming response
            is_streaming = span.attributes.get("http.response.is_streaming", False)
            attributes = {
                "http.method": str(span.attributes.get("http.method", "UNKNOWN")),
                "http.route": str(span.attributes.get("http.route", span.attributes.get("http.target", "/"))),
                "http.status_code": str(span.attributes.get("http.status_code", 0)),
            }
            # set distributed trace attributes
            if span.attributes.get("trace_id"):
                attributes["trace_id"] = str(span.attributes.get("trace_id"))
            if span.attributes.get("span_id"):
                attributes["span_id"] = str(span.attributes.get("span_id"))
            # Record request count and duration
            logger.debug(f"Recording metrics: duration={duration_ms}ms, attributes={attributes}")
            self.request_count.add(1, attributes)
            self.request_duration.record(duration_ms, attributes)
            logger.debug("Metrics recorded successfully")
            # For streaming, record separately
            if is_streaming:
                logger.debug(f"MetricsSpanExporter: Recording streaming metrics for {span.name}")
                self.streaming_requests.add(1, attributes)
                # If full streaming duration is available
                stream_total_duration = span.attributes.get("http.streaming.total_duration_ms")
                if stream_total_duration and isinstance(stream_total_duration, int | float):
                    logger.debug(f"MetricsSpanExporter: Recording streaming duration: {stream_total_duration}ms")
                    self.streaming_duration.record(float(stream_total_duration), attributes)
                else:
                    logger.warning(
                        "MetricsSpanExporter: Streaming span has no http.streaming.total_duration_ms attribute"
                    )
        return SpanExportResult.SUCCESS

    def shutdown(self):
        # Nothing to release; instruments are owned by the MeterProvider.
        pass
# NOTE: DO NOT ALLOW LLM TO MODIFY THIS WITHOUT TESTING AND SUPERVISION: it frequently breaks otel integrations
class OTelTelemetryProvider(TelemetryProvider):
    """
    A simple Open Telemetry native telemetry provider.

    Installs global tracer/meter providers at construction time and injects
    FastAPI/ASGI instrumentation via fastapi_middleware().
    """

    # Validated OTel settings; environment variables take precedence below.
    config: OTelTelemetryConfig

    def model_post_init(self, __context):
        """Initialize provider after Pydantic validation."""
        # Do not fail the application, but warn the user if the endpoints are not set properly.
        if not os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT"):
            if not os.environ.get("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"):
                logger.warning(
                    "OTEL_EXPORTER_OTLP_ENDPOINT or OTEL_EXPORTER_OTLP_TRACES_ENDPOINT is not set. Traces will not be exported."
                )
            if not os.environ.get("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT"):
                logger.warning(
                    "OTEL_EXPORTER_OTLP_ENDPOINT or OTEL_EXPORTER_OTLP_METRICS_ENDPOINT is not set. Metrics will not be exported."
                )
        # Respect OTEL design standards where environment variables get highest precedence
        service_name = os.environ.get("OTEL_SERVICE_NAME")
        if not service_name:
            service_name = self.config.service_name
        # Create resource with service name
        resource = Resource.create({"service.name": service_name})
        # Configure the tracer provider (always, since llama stack run spawns subprocess without opentelemetry-instrument)
        tracer_provider = TracerProvider(resource=resource)
        trace.set_tracer_provider(tracer_provider)
        # Configure OTLP span exporter
        otlp_span_exporter = OTLPSpanExporter()
        # Add span processor (simple for immediate export, batch for performance)
        span_processor_type = os.environ.get("OTEL_SPAN_PROCESSOR", "batch")
        if span_processor_type == "batch":
            tracer_provider.add_span_processor(BatchSpanProcessor(otlp_span_exporter))
        else:
            tracer_provider.add_span_processor(SimpleSpanProcessor(otlp_span_exporter))
        # Configure meter provider with OTLP exporter for metrics
        metric_reader = PeriodicExportingMetricReader(OTLPMetricExporter())
        meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader])
        metrics.set_meter_provider(meter_provider)
        logger.info(
            f"Initialized OpenTelemetry provider with service.name={service_name}, span_processor={span_processor_type}"
        )

    def fastapi_middleware(self, app: FastAPI):
        """
        Instrument FastAPI with OTel for automatic tracing and metrics.

        Captures telemetry for both regular and streaming HTTP requests:
        - Distributed traces (via FastAPIInstrumentor)
        - HTTP request metrics (count, duration, status)
        - Streaming-specific metrics (time-to-first-byte, total stream duration)

        :param app: the FastAPI application to instrument
        """
        # Create meter for HTTP metrics
        meter = metrics.get_meter("llama_stack.http.server")
        # HTTP Metrics following OTel semantic conventions
        # https://opentelemetry.io/docs/specs/semconv/http/http-metrics/
        request_duration = meter.create_histogram(
            "http.server.request.duration",
            unit="ms",
            description="Duration of HTTP requests (time-to-first-byte for streaming)",
        )
        streaming_duration = meter.create_histogram(
            "http.server.streaming.duration",
            unit="ms",
            description="Total duration of streaming responses (from start to stream completion)",
        )
        request_count = meter.create_counter(
            "http.server.request.count", unit="requests", description="Total number of HTTP requests"
        )
        streaming_requests = meter.create_counter(
            "http.server.streaming.count", unit="requests", description="Number of streaming requests"
        )

        # Hook to enrich spans and record initial metrics
        def server_request_hook(span, scope):
            """
            Called by FastAPIInstrumentor for each request.

            This only reads from scope (ASGI dict), never touches request body.
            Safe to use without interfering with body parsing.
            """
            method = scope.get("method", "UNKNOWN")
            path = scope.get("path", "/")
            # Add custom attributes
            span.set_attribute("service.component", "llama-stack-api")
            span.set_attribute("http.request", path)
            span.set_attribute("http.method", method)
            attributes = {
                "http.request": path,
                "http.method": method,
                "trace_id": span.attributes.get("trace_id", ""),
                "span_id": span.attributes.get("span_id", ""),
            }
            request_count.add(1, attributes)
            logger.debug(f"server_request_hook: recorded request_count for {method} {path}, attributes={attributes}")

        # NOTE: This is called BEFORE routes are added to the app
        # FastAPIInstrumentor.instrument_app() patches build_middleware_stack(),
        # which will be called on first request (after routes are added), so hooks should work.
        logger.debug("Instrumenting FastAPI (routes will be added later)")
        FastAPIInstrumentor.instrument_app(
            app,
            server_request_hook=server_request_hook,
        )
        logger.debug(f"FastAPI instrumented: {getattr(app, '_is_instrumented_by_opentelemetry', False)}")
        # Add pure ASGI middleware for streaming metrics (always add, regardless of instrumentation)
        app.add_middleware(StreamingMetricsMiddleware)
        # Add metrics span processor
        provider = trace.get_tracer_provider()
        logger.debug(f"TracerProvider: {provider}")
        if isinstance(provider, TracerProvider):
            metrics_exporter = MetricsSpanExporter(
                request_duration=request_duration,
                streaming_duration=streaming_duration,
                streaming_requests=streaming_requests,
                request_count=request_count,
            )
            provider.add_span_processor(BatchSpanProcessor(metrics_exporter))
            logger.debug("Added MetricsSpanExporter as BatchSpanProcessor")
        else:
            logger.warning(
                f"TracerProvider is not TracerProvider instance, it's {type(provider)}. MetricsSpanExporter not added."
            )

View file

@ -0,0 +1,41 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""Registry for instrumentation providers (non-API providers).
This registry is string-based to avoid importing provider modules at import
time (prevents circular imports). Classes are instantiated lazily by the
StackRunConfig validator using `instantiate_class_type`.
Please implement your instrumentation provider as a subclass of `InstrumentationProvider` and register it in this registry.
Example:
```
from llama_stack.core.instrumentation import InstrumentationProvider
class MyInstrumentationProvider(InstrumentationProvider):
    def fastapi_middleware(self, app: FastAPI) -> None:
pass
```
"""
from typing import NamedTuple


class InstrumentationEntry(NamedTuple):
    """One registry record; classes are referenced by dotted path, not imported."""

    provider_class: str  # fully-qualified class path
    config_class: str  # fully-qualified class path
    description: str


# Maps the `instrumentation.provider` discriminator from run config to its entry.
instrumentation_registry: dict[str, InstrumentationEntry] = {
    "otel": InstrumentationEntry(
        provider_class="llama_stack.providers.inline.instrumentation.otel.otel.OTelInstrumentationProvider",
        config_class="llama_stack.providers.inline.instrumentation.otel.config.OTelConfig",
        description="OpenTelemetry instrumentation",
    ),
}

View file

@ -26,16 +26,4 @@ def available_providers() -> list[ProviderSpec]:
config_class="llama_stack.providers.inline.telemetry.meta_reference.config.TelemetryConfig",
description="Meta's reference implementation of telemetry and observability using OpenTelemetry.",
),
InlineProviderSpec(
api=Api.telemetry,
provider_type="inline::otel",
pip_packages=[
"opentelemetry-sdk",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-instrumentation-fastapi",
],
module="llama_stack.providers.inline.telemetry.otel",
config_class="llama_stack.providers.inline.telemetry.otel.config.OTelTelemetryConfig",
description="Native OpenTelemetry provider with full access to OTel Tracer and Meter APIs for advanced instrumentation.",
),
]

View file

@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -395,7 +395,6 @@ def llama_stack_server(tmp_path_factory, mock_otlp_collector, mock_vllm_server):
pytest.fail(f"Mock vLLM not accessible before starting Llama Stack: {e}")
# Create run.yaml with inference and telemetry providers
# **TO ADD MORE PROVIDERS:** Add to providers dict
run_config = {
"image_name": "test-otel-e2e",
"apis": ["inference"],
@ -409,16 +408,16 @@ def llama_stack_server(tmp_path_factory, mock_otlp_collector, mock_vllm_server):
},
},
],
"telemetry": [
{
"provider_id": "otel",
"provider_type": "inline::otel",
"config": {
"service_name": "llama-stack-e2e-test",
"span_processor": "simple",
},
},
],
},
"instrumentation": {
"provider": "otel", # Discriminator for Pydantic
"config": {
"service_name": "llama-stack-e2e-test",
"span_processor": "simple",
},
},
"server": {
"host": "127.0.0.1",
},
"models": [
{
@ -485,7 +484,7 @@ def llama_stack_server(tmp_path_factory, mock_otlp_collector, mock_vllm_server):
# Wait for server to start
max_wait = 30
base_url = f"http://localhost:{port}"
base_url = f"http://127.0.0.1:{port}"
startup_output = []
for i in range(max_wait):