fix(telemetry): add integration and unit tests for otel provider

This commit is contained in:
Emilio Garcia 2025-10-02 17:46:53 -04:00
parent e45e77f7b0
commit 9a0294ab4f
11 changed files with 1052 additions and 30 deletions

View file

@ -48,12 +48,7 @@ from llama_stack.core.utils.config import redact_sensitive_fields
from llama_stack.core.utils.context import preserve_contexts_async_generator
from llama_stack.core.utils.exec import in_notebook
from llama_stack.log import get_logger
from llama_stack.providers.utils.telemetry.tracing import (
CURRENT_TRACE_CONTEXT,
end_trace,
setup_logger,
start_trace,
)
logger = get_logger(name=__name__, category="core")
@ -293,8 +288,6 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
raise _e
assert self.impls is not None
if Api.telemetry in self.impls:
setup_logger(self.impls[Api.telemetry])
if not os.environ.get("PYTEST_CURRENT_TEST"):
console = Console()
@ -380,13 +373,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
body, field_names = self._handle_file_uploads(options, body)
body = self._convert_body(path, options.method, body, exclude_params=set(field_names))
trace_path = webmethod.descriptive_name or route_path
await start_trace(trace_path, {"__location__": "library_client"})
try:
result = await matched_func(**body)
finally:
await end_trace()
result = await matched_func(**body)
# Handle FastAPI Response objects (e.g., from file content retrieval)
if isinstance(result, FastAPIResponse):
@ -444,9 +431,6 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
body = self._convert_body(path, options.method, body)
trace_path = webmethod.descriptive_name or route_path
await start_trace(trace_path, {"__location__": "library_client"})
async def gen():
try:
async for chunk in await func(**body):
@ -454,9 +438,9 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
sse_event = f"data: {data}\n\n"
yield sse_event.encode("utf-8")
finally:
await end_trace()
pass
wrapped_gen = preserve_contexts_async_generator(gen(), [CURRENT_TRACE_CONTEXT, PROVIDER_DATA_VAR])
wrapped_gen = preserve_contexts_async_generator(gen(), [PROVIDER_DATA_VAR])
mock_response = httpx.Response(
status_code=httpx.codes.OK,

View file

@ -74,6 +74,8 @@ logger = get_logger(name=__name__, category="core::server")
def warn_with_traceback(message, category, filename, lineno, file=None, line=None):
log = file if hasattr(file, "write") else sys.stderr
if log is None:
return
traceback.print_stack(file=log)
log.write(warnings.formatwarning(message, category, filename, lineno, line))

View file

View file

@ -0,0 +1,49 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from abc import abstractmethod
from fastapi import FastAPI
from pydantic import BaseModel
from typing import Any
class TelemetryProvider(BaseModel):
"""
TelemetryProvider standardizes how telemetry is provided to the application.
"""
@abstractmethod
def fastapi_middleware(self, app: FastAPI, *args, **kwargs):
"""
Injects FastAPI middleware that instruments the application for telemetry.
"""
...
@abstractmethod
def custom_trace(self, name: str, *args, **kwargs) -> Any:
"""
Creates a custom trace.
"""
...
@abstractmethod
def record_count(self, name: str, *args, **kwargs):
"""
Increments a counter metric.
"""
...
@abstractmethod
def record_histogram(self, name: str, *args, **kwargs):
"""
Records a histogram metric.
"""
...
@abstractmethod
def record_up_down_counter(self, name: str, *args, **kwargs):
"""
Records an up/down counter metric.
"""
...

View file

@ -10,17 +10,23 @@ First, bootstrap and install all necessary libraries for open telemtry:
uv run opentelemetry-bootstrap -a requirements | uv pip install --requirement -
```
Then, run with automatic code injection:
Make sure you export required environment variables for open telemetry:
```
uv run opentelemetry-instrument llama stack run --config myconfig.yaml
export OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf
export OTEL_EXPORTER_OTLP_ENDPOINT="http://localhost:4318"
```
### Excluded Fast API URLs
If you want certian endpoints to be ignored from the fast API telemetry, set the following environment variable:
```
export OTEL_PYTHON_FASTAPI_EXCLUDED_URLS="client/.*/info,healthcheck"
```
#### Environment Variables
Finaly, run Llama Stack with automatic code injection:
```
uv run opentelemetry-instrument llama stack run --config myconfig.yaml
```
#### Open Telemetry Configuration Environment Variables
Environment Variables: https://opentelemetry.io/docs/specs/otel/configuration/sdk-environment-variables/

View file

@ -6,6 +6,7 @@ from pydantic import BaseModel, Field
type BatchSpanProcessor = Literal["batch"]
type SimpleSpanProcessor = Literal["simple"]
class OTelTelemetryConfig(BaseModel):
"""
The configuration for the OpenTelemetry telemetry provider.

View file

@ -1,12 +1,18 @@
import os
import threading
from opentelemetry import trace, metrics
from opentelemetry.context.context import Context
from opentelemetry.sdk.resources import Attributes, Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor, SimpleSpanProcessor
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.metrics import Counter, UpDownCounter, Histogram, ObservableGauge
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.trace import Span, SpanKind, _Links
from typing import Sequence
from pydantic import PrivateAttr
from llama_stack.core.telemetry.tracing import TelemetryProvider
from llama_stack.log import get_logger
@ -22,8 +28,17 @@ class OTelTelemetryProvider(TelemetryProvider):
"""
A simple Open Telemetry native telemetry provider.
"""
def __init__(self, config: OTelTelemetryConfig):
self.config = config
config: OTelTelemetryConfig
_counters: dict[str, Counter] = PrivateAttr(default_factory=dict)
_up_down_counters: dict[str, UpDownCounter] = PrivateAttr(default_factory=dict)
_histograms: dict[str, Histogram] = PrivateAttr(default_factory=dict)
_gauges: dict[str, ObservableGauge] = PrivateAttr(default_factory=dict)
def model_post_init(self, __context):
"""Initialize provider after Pydantic validation."""
self._lock = threading.Lock()
attributes: Attributes = {
key: value
for key, value in {
@ -52,7 +67,7 @@ class OTelTelemetryProvider(TelemetryProvider):
meter_provider = MeterProvider(resource=resource)
metrics.set_meter_provider(meter_provider)
# Do not fail the application, but warn the user if the endpoints are not set properly
# Do not fail the application, but warn the user if the endpoints are not set properly.
if not os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT"):
if not os.environ.get("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"):
logger.warning("OTEL_EXPORTER_OTLP_ENDPOINT or OTEL_EXPORTER_OTLP_TRACES_ENDPOINT is not set. Traces will not be exported.")
@ -61,3 +76,66 @@ class OTelTelemetryProvider(TelemetryProvider):
def fastapi_middleware(self, app: FastAPI):
FastAPIInstrumentor.instrument_app(app)
def custom_trace(self,
name: str,
context: Context | None = None,
kind: SpanKind = SpanKind.INTERNAL,
attributes: Attributes = {},
links: _Links = None,
start_time: int | None = None,
record_exception: bool = True,
set_status_on_exception: bool = True) -> Span:
"""
Creates a custom tracing span using the Open Telemetry SDK.
"""
tracer = trace.get_tracer(__name__)
return tracer.start_span(name, context, kind, attributes, links, start_time, record_exception, set_status_on_exception)
def record_count(self, name: str, amount: int|float, context: Context | None = None, attributes: dict[str, str] | None = None, unit: str = "", description: str = ""):
"""
Increments a counter metric using the Open Telemetry SDK that are indexed by the meter name.
This function is designed to be compatible with other popular telemetry providers design patterns,
like Datadog and New Relic.
"""
meter = metrics.get_meter(__name__)
with self._lock:
if name not in self._counters:
self._counters[name] = meter.create_counter(name, unit=unit, description=description)
counter = self._counters[name]
counter.add(amount, attributes=attributes, context=context)
def record_histogram(self, name: str, value: int|float, context: Context | None = None, attributes: dict[str, str] | None = None, unit: str = "", description: str = "", explicit_bucket_boundaries_advisory: Sequence[float] | None = None):
"""
Records a histogram metric using the Open Telemetry SDK that are indexed by the meter name.
This function is designed to be compatible with other popular telemetry providers design patterns,
like Datadog and New Relic.
"""
meter = metrics.get_meter(__name__)
with self._lock:
if name not in self._histograms:
self._histograms[name] = meter.create_histogram(name, unit=unit, description=description, explicit_bucket_boundaries_advisory=explicit_bucket_boundaries_advisory)
histogram = self._histograms[name]
histogram.record(value, attributes=attributes, context=context)
def record_up_down_counter(self, name: str, value: int|float, context: Context | None = None, attributes: dict[str, str] | None = None, unit: str = "", description: str = ""):
"""
Records an up/down counter metric using the Open Telemetry SDK that are indexed by the meter name.
This function is designed to be compatible with other popular telemetry providers design patterns,
like Datadog and New Relic.
"""
meter = metrics.get_meter(__name__)
with self._lock:
if name not in self._up_down_counters:
self._up_down_counters[name] = meter.create_up_down_counter(name, unit=unit, description=description)
up_down_counter = self._up_down_counters[name]
up_down_counter.add(value, attributes=attributes, context=context)

View file

@ -3,6 +3,8 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
#
# Deprecated. Use the Open Telemetry SDK instead.
import asyncio
import contextvars