fix(major::pr): re-architect instrumentation library

This commit is contained in:
Emilio Garcia 2025-10-06 17:54:05 -04:00
parent 7e3cf1fb20
commit 8fe3a25158
21 changed files with 422 additions and 462 deletions

View file

@ -9,7 +9,7 @@ from pathlib import Path
from typing import Annotated, Any, Literal, Self
from urllib.parse import urlparse
from pydantic import BaseModel, Field, field_validator, model_validator
from pydantic import BaseModel, Field, TypeAdapter, field_validator, model_validator
from llama_stack.apis.benchmarks import Benchmark, BenchmarkInput
from llama_stack.apis.datasetio import DatasetIO
@ -26,7 +26,10 @@ from llama_stack.apis.tools import ToolGroup, ToolGroupInput, ToolRuntime
from llama_stack.apis.vector_dbs import VectorDB, VectorDBInput
from llama_stack.apis.vector_io import VectorIO
from llama_stack.core.access_control.datatypes import AccessRule
from llama_stack.core.instrumentation import InstrumentationProvider
from llama_stack.core.utils.dynamic import instantiate_class_type
from llama_stack.providers.datatypes import Api, ProviderSpec
from llama_stack.providers.registry.instrumentation import instrumentation_registry
from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig
from llama_stack.providers.utils.sqlstore.sqlstore import SqlStoreConfig
@ -493,6 +496,12 @@ If not specified, a default SQLite store will be used.""",
logging: LoggingConfig | None = Field(default=None, description="Configuration for Llama Stack Logging")
# Middleware/instrumentation providers (not full APIs)
instrumentation: InstrumentationProvider | None = Field(
default=None,
description="Instrumentation provider for observability",
)
server: ServerConfig = Field(
default_factory=ServerConfig,
description="Configuration for the HTTP(S) server",
@ -517,11 +526,31 @@ If not specified, a default SQLite store will be used.""",
return Path(v)
return v
@field_validator("instrumentation", mode="before")
@classmethod
def load_instrumentation(cls, v: InstrumentationProvider | dict[str, Any] | None):
    """Coerce the raw ``instrumentation`` config value into a provider instance.

    Accepts an already-constructed :class:`InstrumentationProvider` (or ``None``)
    unchanged. A dict from run.yaml is resolved through the string-based
    ``instrumentation_registry`` and the provider/config classes are imported
    lazily via ``instantiate_class_type`` to avoid import cycles.

    :param v: raw config value — provider instance, mapping, or ``None``
    :returns: a validated provider instance, or ``None``
    :raises ValueError: if the value is not a mapping, the ``provider``
        discriminator is missing or not a string, or the provider is unknown
    """
    if v is None or isinstance(v, InstrumentationProvider):
        return v
    # Anything else must be a mapping; calling .get() on e.g. a list would
    # raise an opaque AttributeError instead of a validation error.
    if not isinstance(v, dict):
        raise ValueError("instrumentation must be a mapping or an InstrumentationProvider")
    provider_type = v.get("provider")
    if not isinstance(provider_type, str):
        raise ValueError("instrumentation.provider must be a string")
    entry = instrumentation_registry.get(provider_type)
    if entry is None:
        # Include the registered names so the config error is actionable.
        known = ", ".join(sorted(instrumentation_registry))
        raise ValueError(f"Unknown instrumentation provider: {provider_type} (known providers: {known})")
    cfg_cls = instantiate_class_type(entry.config_class)
    prv_cls = instantiate_class_type(entry.provider_class)
    cfg_data = v.get("config") or {}
    cfg = TypeAdapter(cfg_cls).validate_python(cfg_data)
    return prv_cls(provider=provider_type, config=cfg)
class BuildConfig(BaseModel):
version: int = LLAMA_STACK_BUILD_CONFIG_VERSION
distribution_spec: DistributionSpec = Field(description="The distribution spec to build including API providers. ")
distribution_spec: DistributionSpec = Field(description="The distribution spec to build including API providers.")
image_type: str = Field(
default="venv",
description="Type of package to build (container | venv)",

View file

@ -0,0 +1,33 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""Protocol for instrumentation providers."""
from abc import abstractmethod
from fastapi import FastAPI
from pydantic import BaseModel, Field
class InstrumentationProvider(BaseModel):
    """Abstract base for middleware-style observability providers.

    Subclasses wire tracing/metrics/logging into the server process. Unlike
    full API providers they expose no endpoints of their own — their only hook
    is :meth:`fastapi_middleware`, called once at app construction time.
    """

    # Discriminator used to select the concrete provider from run config.
    provider: str = Field(description="Provider identifier for discriminated unions")
    config: BaseModel

    @abstractmethod
    def fastapi_middleware(self, app: FastAPI) -> None:
        """Attach this provider's middleware/instrumentation to *app*.

        :param app: the FastAPI application to instrument
        """
        ...

View file

@ -400,9 +400,9 @@ def create_app() -> StackApp:
if cors_config:
app.add_middleware(CORSMiddleware, **cors_config.model_dump())
if Api.telemetry in impls:
impls[Api.telemetry].fastapi_middleware(app)
impls[Api.telemetry].sqlalchemy_instrumentation()
# Apply instrumentation provider (e.g., OpenTelemetry)
if config.instrumentation:
config.instrumentation.fastapi_middleware(app)
# Load external APIs if configured
external_apis = load_external_apis(config)

View file

@ -1,22 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from abc import abstractmethod
from fastapi import FastAPI
from pydantic import BaseModel
class TelemetryProvider(BaseModel):
    """Contract for supplying telemetry to the application.

    Concrete providers implement :meth:`fastapi_middleware` to hook their
    tracing/metrics instrumentation into the HTTP server.
    """

    @abstractmethod
    def fastapi_middleware(self, app: FastAPI, *args, **kwargs):
        """Inject FastAPI middleware that instruments the application for telemetry.

        :param app: the FastAPI application to instrument
        """
        ...

View file

@ -3,8 +3,3 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -0,0 +1,27 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Literal
from pydantic import BaseModel, Field
class OTelConfig(BaseModel):
    """Configuration for the OpenTelemetry instrumentation provider.

    Only a couple of knobs live here; the bulk of OTel behavior is driven by
    the standard ``OTEL_*`` environment variables.
    See: https://opentelemetry.io/docs/specs/otel/configuration/sdk-configuration-variables/
    """

    # Logical service name attached to exported telemetry.
    service_name: str | None = Field(
        default=None,
        description="Service name (overridden by OTEL_SERVICE_NAME env var)",
    )
    # "batch" favors throughput; "simple" exports each span immediately.
    span_processor: Literal["batch", "simple"] = Field(
        default="batch",
        description="Span processor type (overridden by OTEL_SPAN_PROCESSOR env var)",
    )

View file

@ -0,0 +1,123 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import time
from opentelemetry import trace
from opentelemetry.metrics import Counter, Histogram
from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
from starlette.types import ASGIApp, Message, Receive, Scope, Send
from llama_stack.log import get_logger
logger = get_logger(name=__name__, category="instrumentation::otel")
class StreamingMetricsMiddleware:
    """ASGI middleware to track streaming response metrics.

    Detects ``text/event-stream`` responses, marks the active span with
    ``http.response.is_streaming``, and records the total stream duration in
    ``http.streaming.total_duration_ms`` when the final body chunk is sent.

    :param app: The ASGI app to wrap
    """

    def __init__(self, app: ASGIApp):
        self.app = app

    async def __call__(self, scope: Scope, receive: Receive, send: Send):
        # Pass through non-HTTP traffic (websocket, lifespan) untouched.
        if scope["type"] != "http":
            await self.app(scope, receive, send)
            return
        logger.debug(f"StreamingMetricsMiddleware called for {scope.get('method')} {scope.get('path')}")
        # perf_counter is monotonic; time.time() is wall-clock and can jump
        # (NTP adjustments), yielding negative or inflated durations.
        start_time = time.perf_counter()
        is_streaming = False

        async def send_wrapper(message: Message):
            nonlocal is_streaming
            # Detect streaming responses by headers
            if message["type"] == "http.response.start":
                # ASGI guarantees header names are lowercase byte strings.
                for name, value in message.get("headers", []):
                    if name == b"content-type" and b"text/event-stream" in value:
                        is_streaming = True
                        # Add streaming attribute to current span
                        current_span = trace.get_current_span()
                        if current_span and current_span.is_recording():
                            current_span.set_attribute("http.response.is_streaming", True)
                        break
            # Record total duration when response body completes
            elif message["type"] == "http.response.body" and not message.get("more_body", False):
                if is_streaming:
                    current_span = trace.get_current_span()
                    if current_span and current_span.is_recording():
                        total_duration_ms = (time.perf_counter() - start_time) * 1000
                        current_span.set_attribute("http.streaming.total_duration_ms", total_duration_ms)
            await send(message)

        await self.app(scope, receive, send_wrapper)
class MetricsSpanExporter(SpanExporter):
    """
    Records additional custom HTTP metrics during otel span export.

    :param request_duration: Histogram to record request duration
    :param streaming_duration: Histogram to record streaming duration
    :param streaming_requests: Counter to record streaming requests
    :param request_count: Counter to record request count
    """

    def __init__(
        self,
        request_duration: Histogram,
        streaming_duration: Histogram,
        streaming_requests: Counter,
        request_count: Counter,
    ):
        self.request_duration = request_duration
        self.streaming_duration = streaming_duration
        self.streaming_requests = streaming_requests
        self.request_count = request_count

    def export(self, spans):
        """Derive HTTP metrics from finished server spans; always reports success."""
        for span in spans:
            # Only HTTP server spans (those carrying http.method) are relevant.
            if not span.attributes or not span.attributes.get("http.method"):
                continue
            logger.debug(f"Processing span: {span.name}")
            if span.end_time is None or span.start_time is None:
                continue
            # Span timestamps are nanoseconds since epoch.
            duration_ms = (span.end_time - span.start_time) / 1_000_000
            is_streaming = span.attributes.get("http.response.is_streaming", False)
            attributes = {
                "http.method": str(span.attributes.get("http.method", "UNKNOWN")),
                "http.route": str(span.attributes.get("http.route", span.attributes.get("http.target", "/"))),
                "http.status_code": str(span.attributes.get("http.status_code", 0)),
                "trace_id": str(span.attributes.get("trace_id", "")),
                "span_id": str(span.attributes.get("span_id", "")),
            }
            # Record request count and duration
            logger.debug(f"Recording metrics: duration={duration_ms}ms, attributes={attributes}")
            self.request_count.add(1, attributes)
            self.request_duration.record(duration_ms, attributes)
            if is_streaming:
                logger.debug(f"MetricsSpanExporter: Recording streaming metrics for {span.name}")
                self.streaming_requests.add(1, attributes)
                stream_duration = span.attributes.get("http.streaming.total_duration_ms")
                # Explicit None check so a legitimate 0 ms stream is still
                # recorded (truthiness would drop it); plain tuple form instead
                # of the unusual tuple-wrapped PEP 604 union.
                if stream_duration is not None and isinstance(stream_duration, (int, float)):
                    self.streaming_duration.record(float(stream_duration), attributes)
        return SpanExportResult.SUCCESS

    def shutdown(self):
        # Nothing to release; instruments are owned by the MeterProvider.
        pass

View file

@ -0,0 +1,148 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
from fastapi import FastAPI
from opentelemetry import metrics, trace
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import (
BatchSpanProcessor,
SimpleSpanProcessor,
)
from llama_stack.core.instrumentation import InstrumentationProvider
from llama_stack.log import get_logger
from .config import OTelConfig
from .middleware import MetricsSpanExporter, StreamingMetricsMiddleware
logger = get_logger(name=__name__, category="instrumentation::otel")
class OTelInstrumentationProvider(InstrumentationProvider):
    """OpenTelemetry instrumentation provider.

    Sets up the global tracer/meter providers at construction time and injects
    FastAPI instrumentation, streaming-aware ASGI middleware, and custom HTTP
    metrics via :meth:`fastapi_middleware`.
    """

    provider: str = "otel"  # Discriminator value

    def model_post_init(self, __context):
        """Initialize OpenTelemetry after Pydantic validation."""
        assert isinstance(self.config, OTelConfig)  # Type hint for IDE/linter
        # Warn if OTLP endpoints not configured
        if not os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT"):
            if not os.environ.get("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"):
                logger.warning("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT not set. Traces will not be exported.")
            if not os.environ.get("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT"):
                logger.warning("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT not set. Metrics will not be exported.")
        resource_attributes = {}
        # Per the OTel configuration spec, environment variables take precedence
        # over code configuration. Resource.create() already detects
        # OTEL_SERVICE_NAME, and explicitly-passed attributes override the
        # detected ones — so only pass the config value when the env var is
        # absent (this is what OTelConfig.service_name documents).
        if self.config.service_name and not os.environ.get("OTEL_SERVICE_NAME"):
            resource_attributes["service.name"] = self.config.service_name
        resource = Resource.create(resource_attributes)
        # Configure the tracer provider (always, since llama stack run spawns subprocess without opentelemetry-instrument)
        tracer_provider = TracerProvider(resource=resource)
        trace.set_tracer_provider(tracer_provider)
        # Configure OTLP span exporter
        otlp_span_exporter = OTLPSpanExporter()
        # OTEL_SPAN_PROCESSOR env var overrides the config value, as documented
        # on OTelConfig.span_processor.
        span_processor = os.environ.get("OTEL_SPAN_PROCESSOR", self.config.span_processor)
        if span_processor == "batch":
            tracer_provider.add_span_processor(BatchSpanProcessor(otlp_span_exporter))
        else:
            tracer_provider.add_span_processor(SimpleSpanProcessor(otlp_span_exporter))
        # Configure meter provider with OTLP exporter for metrics
        metric_reader = PeriodicExportingMetricReader(OTLPMetricExporter())
        meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader])
        metrics.set_meter_provider(meter_provider)
        logger.info("Initialized OpenTelemetry Instrumentation")
        logger.debug(f"OpenTelemetry Instrumentation configuration: {self.config}")

    def fastapi_middleware(self, app: FastAPI):
        """Inject OpenTelemetry middleware into FastAPI.

        Adds FastAPIInstrumentor tracing, a streaming-metrics ASGI middleware,
        and a span-export hook that records custom HTTP counters/histograms.

        :param app: the FastAPI application to instrument
        """
        meter = metrics.get_meter("llama_stack.http.server")
        # HTTP Metrics following OTel semantic conventions
        # https://opentelemetry.io/docs/specs/semconv/http/http-metrics/
        request_duration = meter.create_histogram(
            "http.server.request.duration",
            unit="ms",
            description="Duration of HTTP requests (time-to-first-byte for streaming)",
        )
        streaming_duration = meter.create_histogram(
            "http.server.streaming.duration",
            unit="ms",
            description="Total duration of streaming responses (from start to stream completion)",
        )
        request_count = meter.create_counter(
            "http.server.request.count", unit="requests", description="Total number of HTTP requests"
        )
        streaming_requests = meter.create_counter(
            "http.server.streaming.count", unit="requests", description="Number of streaming requests"
        )

        # Hook to enrich spans and record initial metrics
        def server_request_hook(span, scope):
            """
            Called by FastAPIInstrumentor for each request.

            This only reads from scope (ASGI dict), never touches request body.
            Safe to use without interfering with body parsing.
            """
            method = scope.get("method", "UNKNOWN")
            path = scope.get("path", "/")
            # Add custom attributes
            span.set_attribute("service.component", "llama-stack-api")
            span.set_attribute("http.request", path)
            span.set_attribute("http.method", method)
            attributes = {
                "http.request": path,
                "http.method": method,
                "trace_id": span.attributes.get("trace_id", ""),
                "span_id": span.attributes.get("span_id", ""),
            }
            request_count.add(1, attributes)
            logger.debug(f"server_request_hook: recorded request_count for {method} {path}, attributes={attributes}")

        # NOTE: This is called BEFORE routes are added to the app
        # FastAPIInstrumentor.instrument_app() patches build_middleware_stack(),
        # which will be called on first request (after routes are added), so hooks should work.
        logger.debug("Instrumenting FastAPI (routes will be added later)")
        FastAPIInstrumentor.instrument_app(
            app,
            server_request_hook=server_request_hook,
        )
        logger.debug(f"FastAPI instrumented: {getattr(app, '_is_instrumented_by_opentelemetry', False)}")
        # Add pure ASGI middleware for streaming metrics (always add, regardless of instrumentation)
        app.add_middleware(StreamingMetricsMiddleware)
        # Add metrics span processor
        provider = trace.get_tracer_provider()
        if isinstance(provider, TracerProvider):
            metrics_exporter = MetricsSpanExporter(
                request_duration=request_duration,
                streaming_duration=streaming_duration,
                streaming_requests=streaming_requests,
                request_count=request_count,
            )
            provider.add_span_processor(BatchSpanProcessor(metrics_exporter))

View file

@ -1,32 +0,0 @@
# Open Telemetry Native Instrumentation
This instrumentation package is simple, and follows expected open telemetry standards. It injects middleware for distributed tracing into all ingress and egress points into the application, and can be tuned and configured with OTEL environment variables.
## Set Up
First, bootstrap and install all necessary libraries for open telemetry:
```
uv run opentelemetry-bootstrap -a requirements | uv pip install --requirement -
```
Make sure you export required environment variables for open telemetry:
```
export OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf
export OTEL_EXPORTER_OTLP_ENDPOINT="http://localhost:4318"
```
If you want certain endpoints to be ignored from the FastAPI telemetry, set the following environment variable:
```
export OTEL_PYTHON_FASTAPI_EXCLUDED_URLS="client/.*/info,healthcheck"
```
Finally, run Llama Stack with automatic code injection:
```
uv run opentelemetry-instrument llama stack run --config myconfig.yaml
```
#### Open Telemetry Configuration Environment Variables
Environment Variables: https://opentelemetry.io/docs/specs/otel/configuration/sdk-environment-variables/

View file

@ -1,23 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .config import OTelTelemetryConfig
__all__ = ["OTelTelemetryConfig"]
async def get_provider_impl(config: OTelTelemetryConfig, deps):
    """Registry entry point: construct the OTel telemetry provider.

    Nothing is awaited here — all initialization happens synchronously inside
    Pydantic's ``model_post_init`` when the provider is constructed.

    :param config: validated OTel telemetry configuration
    :param deps: resolved provider dependencies (unused)
    """
    # Imported lazily so merely importing this module stays cheap.
    from .otel import OTelTelemetryProvider

    return OTelTelemetryProvider(config=config)

View file

@ -1,50 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Literal
from pydantic import BaseModel, Field
type BatchSpanProcessor = Literal["batch"]
type SimpleSpanProcessor = Literal["simple"]
class OTelTelemetryConfig(BaseModel):
    """
    The configuration for the OpenTelemetry telemetry provider.

    Most configuration is set using environment variables.
    See https://opentelemetry.io/docs/specs/otel/configuration/sdk-configuration-variables/ for more information.
    """

    # Required logical service name; OTEL_SERVICE_NAME / OTEL_RESOURCE_ATTRIBUTES
    # take precedence at runtime.
    service_name: str = Field(
        description="""The name of the service to be monitored.
        Is overridden by the OTEL_SERVICE_NAME or OTEL_RESOURCE_ATTRIBUTES environment variables.""",
    )
    # Optional version/environment resource attributes (env vars win).
    service_version: str | None = Field(
        default=None,
        description="""The version of the service to be monitored.
        Is overriden by the OTEL_RESOURCE_ATTRIBUTES environment variable.""",
    )
    deployment_environment: str | None = Field(
        default=None,
        description="""The name of the environment of the service to be monitored.
        Is overriden by the OTEL_RESOURCE_ATTRIBUTES environment variable.""",
    )
    # "batch" buffers spans for throughput; "simple" exports immediately.
    span_processor: BatchSpanProcessor | SimpleSpanProcessor | None = Field(
        description="""The span processor to use.
        Is overriden by the OTEL_SPAN_PROCESSOR environment variable.""",
        default="batch",
    )

    @classmethod
    def sample_run_config(cls, __distro_dir__: str = "") -> dict[str, Any]:
        """Sample configuration for use in distributions."""
        # ${env.X:=default} placeholders are expanded by the stack's config loader.
        return {
            "service_name": "${env.OTEL_SERVICE_NAME:=llama-stack}",
            "service_version": "${env.OTEL_SERVICE_VERSION:=}",
            "deployment_environment": "${env.OTEL_DEPLOYMENT_ENVIRONMENT:=}",
            "span_processor": "${env.OTEL_SPAN_PROCESSOR:=batch}",
        }

View file

@ -1,301 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
import time
from fastapi import FastAPI
from opentelemetry import metrics, trace
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
from opentelemetry.metrics import Counter, Histogram
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import (
BatchSpanProcessor,
SimpleSpanProcessor,
SpanExporter,
SpanExportResult,
)
from sqlalchemy import Engine
from starlette.types import ASGIApp, Message, Receive, Scope, Send
from llama_stack.core.telemetry.telemetry import TelemetryProvider
from llama_stack.log import get_logger
from .config import OTelTelemetryConfig
logger = get_logger(name=__name__, category="telemetry::otel")
class StreamingMetricsMiddleware:
    """
    Pure ASGI middleware to track streaming response metrics.

    This follows Starlette best practices by implementing pure ASGI,
    which is more efficient and less prone to bugs than BaseHTTPMiddleware.

    Marks the active span when the response is text/event-stream and records
    the total stream duration on the final body message.
    """

    def __init__(self, app: ASGIApp):
        # Downstream ASGI application to wrap.
        self.app = app

    async def __call__(self, scope: Scope, receive: Receive, send: Send):
        # Pass through non-HTTP traffic (websocket, lifespan) untouched.
        if scope["type"] != "http":
            await self.app(scope, receive, send)
            return
        logger.debug(f"StreamingMetricsMiddleware called for {scope.get('method')} {scope.get('path')}")
        # NOTE(review): time.time() is wall-clock; a monotonic clock would be
        # safer for durations.
        start_time = time.time()
        # Track if this is a streaming response
        is_streaming = False

        async def send_wrapper(message: Message):
            nonlocal is_streaming
            # Detect streaming responses by headers
            if message["type"] == "http.response.start":
                headers = message.get("headers", [])
                # ASGI header names are lowercase byte strings.
                for name, value in headers:
                    if name == b"content-type" and b"text/event-stream" in value:
                        is_streaming = True
                        # Add streaming attribute to current span
                        current_span = trace.get_current_span()
                        if current_span and current_span.is_recording():
                            current_span.set_attribute("http.response.is_streaming", True)
                        break
            # Record total duration when response body completes
            elif message["type"] == "http.response.body" and not message.get("more_body", False):
                if is_streaming:
                    current_span = trace.get_current_span()
                    if current_span and current_span.is_recording():
                        total_duration_ms = (time.time() - start_time) * 1000
                        current_span.set_attribute("http.streaming.total_duration_ms", total_duration_ms)
            await send(message)

        await self.app(scope, receive, send_wrapper)
class MetricsSpanExporter(SpanExporter):
    """Records HTTP metrics from span data.

    Fed spans by a BatchSpanProcessor; extracts request/streaming metrics and
    forwards them to the supplied meter instruments.
    """

    def __init__(
        self,
        request_duration: Histogram,
        streaming_duration: Histogram,
        streaming_requests: Counter,
        request_count: Counter,
    ):
        # Meter instruments owned by the caller; this exporter only records.
        self.request_duration = request_duration
        self.streaming_duration = streaming_duration
        self.streaming_requests = streaming_requests
        self.request_count = request_count

    def export(self, spans):
        """Derive HTTP metrics from finished spans; never fails the export."""
        logger.debug(f"MetricsSpanExporter.export called with {len(spans)} spans")
        for span in spans:
            # Only HTTP server spans (those carrying http.method) are relevant.
            if not span.attributes or not span.attributes.get("http.method"):
                continue
            logger.debug(f"Processing span: {span.name}")
            if span.end_time is None or span.start_time is None:
                continue
            # Calculate time-to-first-byte duration
            # (span timestamps are nanoseconds since epoch)
            duration_ns = span.end_time - span.start_time
            duration_ms = duration_ns / 1_000_000
            # Check if this was a streaming response
            is_streaming = span.attributes.get("http.response.is_streaming", False)
            attributes = {
                "http.method": str(span.attributes.get("http.method", "UNKNOWN")),
                "http.route": str(span.attributes.get("http.route", span.attributes.get("http.target", "/"))),
                "http.status_code": str(span.attributes.get("http.status_code", 0)),
            }
            # set distributed trace attributes
            if span.attributes.get("trace_id"):
                attributes["trace_id"] = str(span.attributes.get("trace_id"))
            if span.attributes.get("span_id"):
                attributes["span_id"] = str(span.attributes.get("span_id"))
            # Record request count and duration
            logger.debug(f"Recording metrics: duration={duration_ms}ms, attributes={attributes}")
            self.request_count.add(1, attributes)
            self.request_duration.record(duration_ms, attributes)
            logger.debug("Metrics recorded successfully")
            # For streaming, record separately
            if is_streaming:
                logger.debug(f"MetricsSpanExporter: Recording streaming metrics for {span.name}")
                self.streaming_requests.add(1, attributes)
                # If full streaming duration is available
                stream_total_duration = span.attributes.get("http.streaming.total_duration_ms")
                if stream_total_duration and isinstance(stream_total_duration, int | float):
                    logger.debug(f"MetricsSpanExporter: Recording streaming duration: {stream_total_duration}ms")
                    self.streaming_duration.record(float(stream_total_duration), attributes)
                else:
                    logger.warning(
                        "MetricsSpanExporter: Streaming span has no http.streaming.total_duration_ms attribute"
                    )
        return SpanExportResult.SUCCESS

    def shutdown(self):
        # Nothing to release; instruments are owned by the MeterProvider.
        pass
# NOTE: DO NOT ALLOW LLM TO MODIFY THIS WITHOUT TESTING AND SUPERVISION: it frequently breaks otel integrations
class OTelTelemetryProvider(TelemetryProvider):
    """
    A simple Open Telemetry native telemetry provider.

    Installs global tracer/meter providers at construction time and injects
    FastAPI/ASGI instrumentation via fastapi_middleware().
    """

    # Validated OTel settings; environment variables take precedence below.
    config: OTelTelemetryConfig

    def model_post_init(self, __context):
        """Initialize provider after Pydantic validation."""
        # Do not fail the application, but warn the user if the endpoints are not set properly.
        if not os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT"):
            if not os.environ.get("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"):
                logger.warning(
                    "OTEL_EXPORTER_OTLP_ENDPOINT or OTEL_EXPORTER_OTLP_TRACES_ENDPOINT is not set. Traces will not be exported."
                )
            if not os.environ.get("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT"):
                logger.warning(
                    "OTEL_EXPORTER_OTLP_ENDPOINT or OTEL_EXPORTER_OTLP_METRICS_ENDPOINT is not set. Metrics will not be exported."
                )
        # Respect OTEL design standards where environment variables get highest precedence
        service_name = os.environ.get("OTEL_SERVICE_NAME")
        if not service_name:
            service_name = self.config.service_name
        # Create resource with service name
        resource = Resource.create({"service.name": service_name})
        # Configure the tracer provider (always, since llama stack run spawns subprocess without opentelemetry-instrument)
        tracer_provider = TracerProvider(resource=resource)
        trace.set_tracer_provider(tracer_provider)
        # Configure OTLP span exporter
        otlp_span_exporter = OTLPSpanExporter()
        # Add span processor (simple for immediate export, batch for performance)
        span_processor_type = os.environ.get("OTEL_SPAN_PROCESSOR", "batch")
        if span_processor_type == "batch":
            tracer_provider.add_span_processor(BatchSpanProcessor(otlp_span_exporter))
        else:
            tracer_provider.add_span_processor(SimpleSpanProcessor(otlp_span_exporter))
        # Configure meter provider with OTLP exporter for metrics
        metric_reader = PeriodicExportingMetricReader(OTLPMetricExporter())
        meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader])
        metrics.set_meter_provider(meter_provider)
        logger.info(
            f"Initialized OpenTelemetry provider with service.name={service_name}, span_processor={span_processor_type}"
        )

    def fastapi_middleware(self, app: FastAPI):
        """
        Instrument FastAPI with OTel for automatic tracing and metrics.

        Captures telemetry for both regular and streaming HTTP requests:
        - Distributed traces (via FastAPIInstrumentor)
        - HTTP request metrics (count, duration, status)
        - Streaming-specific metrics (time-to-first-byte, total stream duration)

        :param app: the FastAPI application to instrument
        """
        # Create meter for HTTP metrics
        meter = metrics.get_meter("llama_stack.http.server")
        # HTTP Metrics following OTel semantic conventions
        # https://opentelemetry.io/docs/specs/semconv/http/http-metrics/
        request_duration = meter.create_histogram(
            "http.server.request.duration",
            unit="ms",
            description="Duration of HTTP requests (time-to-first-byte for streaming)",
        )
        streaming_duration = meter.create_histogram(
            "http.server.streaming.duration",
            unit="ms",
            description="Total duration of streaming responses (from start to stream completion)",
        )
        request_count = meter.create_counter(
            "http.server.request.count", unit="requests", description="Total number of HTTP requests"
        )
        streaming_requests = meter.create_counter(
            "http.server.streaming.count", unit="requests", description="Number of streaming requests"
        )

        # Hook to enrich spans and record initial metrics
        def server_request_hook(span, scope):
            """
            Called by FastAPIInstrumentor for each request.

            This only reads from scope (ASGI dict), never touches request body.
            Safe to use without interfering with body parsing.
            """
            method = scope.get("method", "UNKNOWN")
            path = scope.get("path", "/")
            # Add custom attributes
            span.set_attribute("service.component", "llama-stack-api")
            span.set_attribute("http.request", path)
            span.set_attribute("http.method", method)
            attributes = {
                "http.request": path,
                "http.method": method,
                "trace_id": span.attributes.get("trace_id", ""),
                "span_id": span.attributes.get("span_id", ""),
            }
            request_count.add(1, attributes)
            logger.debug(f"server_request_hook: recorded request_count for {method} {path}, attributes={attributes}")

        # NOTE: This is called BEFORE routes are added to the app
        # FastAPIInstrumentor.instrument_app() patches build_middleware_stack(),
        # which will be called on first request (after routes are added), so hooks should work.
        logger.debug("Instrumenting FastAPI (routes will be added later)")
        FastAPIInstrumentor.instrument_app(
            app,
            server_request_hook=server_request_hook,
        )
        logger.debug(f"FastAPI instrumented: {getattr(app, '_is_instrumented_by_opentelemetry', False)}")
        # Add pure ASGI middleware for streaming metrics (always add, regardless of instrumentation)
        app.add_middleware(StreamingMetricsMiddleware)
        # Add metrics span processor
        provider = trace.get_tracer_provider()
        logger.debug(f"TracerProvider: {provider}")
        if isinstance(provider, TracerProvider):
            metrics_exporter = MetricsSpanExporter(
                request_duration=request_duration,
                streaming_duration=streaming_duration,
                streaming_requests=streaming_requests,
                request_count=request_count,
            )
            provider.add_span_processor(BatchSpanProcessor(metrics_exporter))
            logger.debug("Added MetricsSpanExporter as BatchSpanProcessor")
        else:
            logger.warning(
                f"TracerProvider is not TracerProvider instance, it's {type(provider)}. MetricsSpanExporter not added."
            )

View file

@ -0,0 +1,41 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""Registry for instrumentation providers (non-API providers).
This registry is string-based to avoid importing provider modules at import
time (prevents circular imports). Classes are instantiated lazily by the
StackRunConfig validator using `instantiate_class_type`.
Please implement your instrumentation provider as a subclass of `InstrumentationProvider` and register it in this registry.
Example:
```
from llama_stack.core.instrumentation import InstrumentationProvider
class MyInstrumentationProvider(InstrumentationProvider):
    def fastapi_middleware(self, app: FastAPI) -> None:
pass
```
"""
from typing import NamedTuple


class InstrumentationEntry(NamedTuple):
    """One registry record; classes are referenced by dotted path, not imported."""

    provider_class: str  # fully-qualified class path
    config_class: str  # fully-qualified class path
    description: str


# Maps the `instrumentation.provider` discriminator from run config to its entry.
instrumentation_registry: dict[str, InstrumentationEntry] = {
    "otel": InstrumentationEntry(
        provider_class="llama_stack.providers.inline.instrumentation.otel.otel.OTelInstrumentationProvider",
        config_class="llama_stack.providers.inline.instrumentation.otel.config.OTelConfig",
        description="OpenTelemetry instrumentation",
    ),
}

View file

@ -26,16 +26,4 @@ def available_providers() -> list[ProviderSpec]:
config_class="llama_stack.providers.inline.telemetry.meta_reference.config.TelemetryConfig",
description="Meta's reference implementation of telemetry and observability using OpenTelemetry.",
),
InlineProviderSpec(
api=Api.telemetry,
provider_type="inline::otel",
pip_packages=[
"opentelemetry-sdk",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-instrumentation-fastapi",
],
module="llama_stack.providers.inline.telemetry.otel",
config_class="llama_stack.providers.inline.telemetry.otel.config.OTelTelemetryConfig",
description="Native OpenTelemetry provider with full access to OTel Tracer and Meter APIs for advanced instrumentation.",
),
]

View file

@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -395,7 +395,6 @@ def llama_stack_server(tmp_path_factory, mock_otlp_collector, mock_vllm_server):
pytest.fail(f"Mock vLLM not accessible before starting Llama Stack: {e}")
# Create run.yaml with inference and telemetry providers
# **TO ADD MORE PROVIDERS:** Add to providers dict
run_config = {
"image_name": "test-otel-e2e",
"apis": ["inference"],
@ -409,16 +408,16 @@ def llama_stack_server(tmp_path_factory, mock_otlp_collector, mock_vllm_server):
},
},
],
"telemetry": [
{
"provider_id": "otel",
"provider_type": "inline::otel",
"config": {
"service_name": "llama-stack-e2e-test",
"span_processor": "simple",
},
},
],
},
"instrumentation": {
"provider": "otel", # Discriminator for Pydantic
"config": {
"service_name": "llama-stack-e2e-test",
"span_processor": "simple",
},
},
"server": {
"host": "127.0.0.1",
},
"models": [
{
@ -485,7 +484,7 @@ def llama_stack_server(tmp_path_factory, mock_otlp_collector, mock_vllm_server):
# Wait for server to start
max_wait = 30
base_url = f"http://localhost:{port}"
base_url = f"http://127.0.0.1:{port}"
startup_output = []
for i in range(max_wait):