mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-10-04 04:04:14 +00:00
fix(telemetry): add integration and unit tests for otel provider
This commit is contained in:
parent
e45e77f7b0
commit
9a0294ab4f
11 changed files with 1052 additions and 30 deletions
|
@ -48,12 +48,7 @@ from llama_stack.core.utils.config import redact_sensitive_fields
|
|||
from llama_stack.core.utils.context import preserve_contexts_async_generator
|
||||
from llama_stack.core.utils.exec import in_notebook
|
||||
from llama_stack.log import get_logger
|
||||
from llama_stack.providers.utils.telemetry.tracing import (
|
||||
CURRENT_TRACE_CONTEXT,
|
||||
end_trace,
|
||||
setup_logger,
|
||||
start_trace,
|
||||
)
|
||||
|
||||
|
||||
logger = get_logger(name=__name__, category="core")
|
||||
|
||||
|
@ -293,8 +288,6 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
|
|||
raise _e
|
||||
|
||||
assert self.impls is not None
|
||||
if Api.telemetry in self.impls:
|
||||
setup_logger(self.impls[Api.telemetry])
|
||||
|
||||
if not os.environ.get("PYTEST_CURRENT_TEST"):
|
||||
console = Console()
|
||||
|
@ -380,13 +373,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
|
|||
body, field_names = self._handle_file_uploads(options, body)
|
||||
|
||||
body = self._convert_body(path, options.method, body, exclude_params=set(field_names))
|
||||
|
||||
trace_path = webmethod.descriptive_name or route_path
|
||||
await start_trace(trace_path, {"__location__": "library_client"})
|
||||
try:
|
||||
result = await matched_func(**body)
|
||||
finally:
|
||||
await end_trace()
|
||||
result = await matched_func(**body)
|
||||
|
||||
# Handle FastAPI Response objects (e.g., from file content retrieval)
|
||||
if isinstance(result, FastAPIResponse):
|
||||
|
@ -444,9 +431,6 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
|
|||
|
||||
body = self._convert_body(path, options.method, body)
|
||||
|
||||
trace_path = webmethod.descriptive_name or route_path
|
||||
await start_trace(trace_path, {"__location__": "library_client"})
|
||||
|
||||
async def gen():
|
||||
try:
|
||||
async for chunk in await func(**body):
|
||||
|
@ -454,9 +438,9 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
|
|||
sse_event = f"data: {data}\n\n"
|
||||
yield sse_event.encode("utf-8")
|
||||
finally:
|
||||
await end_trace()
|
||||
pass
|
||||
|
||||
wrapped_gen = preserve_contexts_async_generator(gen(), [CURRENT_TRACE_CONTEXT, PROVIDER_DATA_VAR])
|
||||
wrapped_gen = preserve_contexts_async_generator(gen(), [PROVIDER_DATA_VAR])
|
||||
|
||||
mock_response = httpx.Response(
|
||||
status_code=httpx.codes.OK,
|
||||
|
|
|
@ -74,6 +74,8 @@ logger = get_logger(name=__name__, category="core::server")
|
|||
|
||||
def warn_with_traceback(message, category, filename, lineno, file=None, line=None):
    """Write the current call stack followed by the formatted warning.

    Takes the same arguments as ``warnings.showwarning``. Output goes to
    ``file`` when it exposes a ``write`` attribute, otherwise to
    ``sys.stderr``. If the selected stream is ``None`` nothing is written.
    """
    if hasattr(file, "write"):
        stream = file
    else:
        stream = sys.stderr

    # No usable stream: silently skip instead of raising.
    if stream is None:
        return

    # Stack first, then the warning text, so the warning is the last thing printed.
    traceback.print_stack(file=stream)
    formatted = warnings.formatwarning(message, category, filename, lineno, line)
    stream.write(formatted)
|
||||
|
||||
|
|
0
llama_stack/core/telemetry/__initi__.py
Normal file
0
llama_stack/core/telemetry/__initi__.py
Normal file
49
llama_stack/core/telemetry/telemetry.py
Normal file
49
llama_stack/core/telemetry/telemetry.py
Normal file
|
@ -0,0 +1,49 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
from abc import abstractmethod
|
||||
from fastapi import FastAPI
|
||||
from pydantic import BaseModel
|
||||
from typing import Any
|
||||
|
||||
|
||||
class TelemetryProvider(BaseModel):
    """
    Common interface for telemetry backends used by the application.

    Concrete providers implement every method below; the extra ``*args`` /
    ``**kwargs`` leave room for backend-specific options.
    """

    @abstractmethod
    def fastapi_middleware(self, app: FastAPI, *args, **kwargs):
        """Install middleware on ``app`` that instruments it for telemetry."""

    @abstractmethod
    def custom_trace(self, name: str, *args, **kwargs) -> Any:
        """Create a custom trace called ``name`` and return the backend's handle for it."""

    @abstractmethod
    def record_count(self, name: str, *args, **kwargs):
        """Increment the counter metric called ``name``."""

    @abstractmethod
    def record_histogram(self, name: str, *args, **kwargs):
        """Record a value on the histogram metric called ``name``."""

    @abstractmethod
    def record_up_down_counter(self, name: str, *args, **kwargs):
        """Record a change on the up/down counter metric called ``name``."""
|
|
@ -10,17 +10,23 @@ First, bootstrap and install all necessary libraries for open telemtry:
|
|||
uv run opentelemetry-bootstrap -a requirements | uv pip install --requirement -
|
||||
```
|
||||
|
||||
Then, run with automatic code injection:
|
||||
|
||||
Make sure you export the required environment variables for OpenTelemetry:
|
||||
```
|
||||
uv run opentelemetry-instrument llama stack run --config myconfig.yaml
|
||||
export OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf
|
||||
export OTEL_EXPORTER_OTLP_ENDPOINT="http://localhost:4318"
|
||||
```
|
||||
|
||||
### Excluded FastAPI URLs
|
||||
If you want certain endpoints to be excluded from the FastAPI telemetry, set the following environment variable:
|
||||
|
||||
```
|
||||
export OTEL_PYTHON_FASTAPI_EXCLUDED_URLS="client/.*/info,healthcheck"
|
||||
```
|
||||
|
||||
#### Environment Variables
|
||||
Finally, run Llama Stack with automatic code injection:
|
||||
|
||||
```
|
||||
uv run opentelemetry-instrument llama stack run --config myconfig.yaml
|
||||
```
|
||||
|
||||
#### Open Telemetry Configuration Environment Variables
|
||||
Environment Variables: https://opentelemetry.io/docs/specs/otel/configuration/sdk-environment-variables/
|
||||
|
|
|
@ -6,6 +6,7 @@ from pydantic import BaseModel, Field
|
|||
type BatchSpanProcessor = Literal["batch"]
|
||||
type SimpleSpanProcessor = Literal["simple"]
|
||||
|
||||
|
||||
class OTelTelemetryConfig(BaseModel):
|
||||
"""
|
||||
The configuration for the OpenTelemetry telemetry provider.
|
||||
|
|
|
@ -1,12 +1,18 @@
|
|||
import os
|
||||
import threading
|
||||
|
||||
from opentelemetry import trace, metrics
|
||||
from opentelemetry.context.context import Context
|
||||
from opentelemetry.sdk.resources import Attributes, Resource
|
||||
from opentelemetry.sdk.trace import TracerProvider
|
||||
from opentelemetry.sdk.trace.export import BatchSpanProcessor, SimpleSpanProcessor
|
||||
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
|
||||
from opentelemetry.sdk.metrics import MeterProvider
|
||||
from opentelemetry.metrics import Counter, UpDownCounter, Histogram, ObservableGauge
|
||||
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
|
||||
from opentelemetry.trace import Span, SpanKind, _Links
|
||||
from typing import Sequence
|
||||
from pydantic import PrivateAttr
|
||||
|
||||
from llama_stack.core.telemetry.tracing import TelemetryProvider
|
||||
from llama_stack.log import get_logger
|
||||
|
@ -22,8 +28,17 @@ class OTelTelemetryProvider(TelemetryProvider):
|
|||
"""
|
||||
A simple Open Telemetry native telemetry provider.
|
||||
"""
|
||||
def __init__(self, config: OTelTelemetryConfig):
|
||||
self.config = config
|
||||
config: OTelTelemetryConfig
|
||||
_counters: dict[str, Counter] = PrivateAttr(default_factory=dict)
|
||||
_up_down_counters: dict[str, UpDownCounter] = PrivateAttr(default_factory=dict)
|
||||
_histograms: dict[str, Histogram] = PrivateAttr(default_factory=dict)
|
||||
_gauges: dict[str, ObservableGauge] = PrivateAttr(default_factory=dict)
|
||||
|
||||
|
||||
def model_post_init(self, __context):
|
||||
"""Initialize provider after Pydantic validation."""
|
||||
self._lock = threading.Lock()
|
||||
|
||||
attributes: Attributes = {
|
||||
key: value
|
||||
for key, value in {
|
||||
|
@ -52,7 +67,7 @@ class OTelTelemetryProvider(TelemetryProvider):
|
|||
meter_provider = MeterProvider(resource=resource)
|
||||
metrics.set_meter_provider(meter_provider)
|
||||
|
||||
# Do not fail the application, but warn the user if the endpoints are not set properly
|
||||
# Do not fail the application, but warn the user if the endpoints are not set properly.
|
||||
if not os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT"):
|
||||
if not os.environ.get("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"):
|
||||
logger.warning("OTEL_EXPORTER_OTLP_ENDPOINT or OTEL_EXPORTER_OTLP_TRACES_ENDPOINT is not set. Traces will not be exported.")
|
||||
|
@ -61,3 +76,66 @@ class OTelTelemetryProvider(TelemetryProvider):
|
|||
|
||||
def fastapi_middleware(self, app: FastAPI):
    """Instrument ``app`` by handing it to ``FastAPIInstrumentor.instrument_app``."""
    FastAPIInstrumentor.instrument_app(app)
|
||||
|
||||
def custom_trace(
    self,
    name: str,
    context: Context | None = None,
    kind: SpanKind = SpanKind.INTERNAL,
    attributes: Attributes | None = None,
    links: _Links = None,
    start_time: int | None = None,
    record_exception: bool = True,
    set_status_on_exception: bool = True,
) -> Span:
    """
    Create a custom tracing span using the OpenTelemetry SDK.

    The span is started on the tracer returned by ``trace.get_tracer(__name__)``
    and every argument is forwarded unchanged to ``Tracer.start_span``.

    NOTE(review): per the OpenTelemetry API docs, ``start_span`` neither makes
    the span current nor ends it, so the caller presumably has to call
    ``span.end()`` -- confirm against the callers.

    :param name: Name of the span.
    :param context: Optional parent context for the span.
    :param kind: Span kind; defaults to ``SpanKind.INTERNAL``.
    :param attributes: Optional span attributes.
    :param links: Optional links to other spans.
    :param start_time: Optional explicit start time.
    :param record_exception: Forwarded to ``start_span``.
    :param set_status_on_exception: Forwarded to ``start_span``.
    :returns: The started span.
    """
    # ``attributes`` defaults to None rather than a shared mutable ``{}``.
    tracer = trace.get_tracer(__name__)
    # Keyword arguments keep the call correct even if the positional order of
    # ``start_span`` ever differs between tracer implementations.
    return tracer.start_span(
        name,
        context=context,
        kind=kind,
        attributes=attributes,
        links=links,
        start_time=start_time,
        record_exception=record_exception,
        set_status_on_exception=set_status_on_exception,
    )
|
||||
|
||||
|
||||
def record_count(
    self,
    name: str,
    amount: int | float,
    context: Context | None = None,
    attributes: dict[str, str] | None = None,
    unit: str = "",
    description: str = "",
):
    """
    Increment a counter metric using the OpenTelemetry SDK.

    Counters are cached on the provider, indexed by ``name``. This shape is
    designed to be compatible with the design patterns of other popular
    telemetry providers, like Datadog and New Relic.

    :param name: Metric name; also the cache key for the counter instrument.
    :param amount: Amount to add to the counter.
    :param context: Optional context forwarded to ``Counter.add``.
    :param attributes: Optional attributes forwarded to ``Counter.add``.
    :param unit: Unit used when the counter is first created; ignored on
        later calls for the same ``name`` because the cached counter is reused.
    :param description: Description used when the counter is first created;
        ignored on later calls for the same ``name``.
    """
    # The lock guards the cache lookup and creation so each name gets one instrument.
    with self._lock:
        counter = self._counters.get(name)
        if counter is None:
            # Resolve the meter only on a cache miss to keep the hot path cheap.
            meter = metrics.get_meter(__name__)
            counter = meter.create_counter(name, unit=unit, description=description)
            self._counters[name] = counter

    counter.add(amount, attributes=attributes, context=context)
|
||||
|
||||
|
||||
def record_histogram(
    self,
    name: str,
    value: int | float,
    context: Context | None = None,
    attributes: dict[str, str] | None = None,
    unit: str = "",
    description: str = "",
    explicit_bucket_boundaries_advisory: Sequence[float] | None = None,
):
    """
    Record a histogram metric using the OpenTelemetry SDK.

    Histograms are cached on the provider, indexed by ``name``. This shape is
    designed to be compatible with the design patterns of other popular
    telemetry providers, like Datadog and New Relic.

    :param name: Metric name; also the cache key for the histogram instrument.
    :param value: Value to record.
    :param context: Optional context forwarded to ``Histogram.record``.
    :param attributes: Optional attributes forwarded to ``Histogram.record``.
    :param unit: Unit used when the histogram is first created; ignored on
        later calls for the same ``name`` because the cached histogram is reused.
    :param description: Description used when the histogram is first created;
        ignored on later calls for the same ``name``.
    :param explicit_bucket_boundaries_advisory: Bucket boundaries passed to
        ``create_histogram`` on first creation; ignored on later calls.
    """
    # The lock guards the cache lookup and creation so each name gets one instrument.
    with self._lock:
        histogram = self._histograms.get(name)
        if histogram is None:
            # Resolve the meter only on a cache miss to keep the hot path cheap.
            meter = metrics.get_meter(__name__)
            histogram = meter.create_histogram(
                name,
                unit=unit,
                description=description,
                explicit_bucket_boundaries_advisory=explicit_bucket_boundaries_advisory,
            )
            self._histograms[name] = histogram

    histogram.record(value, attributes=attributes, context=context)
|
||||
|
||||
|
||||
def record_up_down_counter(
    self,
    name: str,
    value: int | float,
    context: Context | None = None,
    attributes: dict[str, str] | None = None,
    unit: str = "",
    description: str = "",
):
    """
    Record an up/down counter metric using the OpenTelemetry SDK.

    Up/down counters are cached on the provider, indexed by ``name``. This
    shape is designed to be compatible with the design patterns of other
    popular telemetry providers, like Datadog and New Relic.

    :param name: Metric name; also the cache key for the instrument.
    :param value: Amount to add to the counter.
    :param context: Optional context forwarded to ``UpDownCounter.add``.
    :param attributes: Optional attributes forwarded to ``UpDownCounter.add``.
    :param unit: Unit used when the counter is first created; ignored on
        later calls for the same ``name`` because the cached counter is reused.
    :param description: Description used when the counter is first created;
        ignored on later calls for the same ``name``.
    """
    # The lock guards the cache lookup and creation so each name gets one instrument.
    with self._lock:
        up_down_counter = self._up_down_counters.get(name)
        if up_down_counter is None:
            # Resolve the meter only on a cache miss to keep the hot path cheap.
            meter = metrics.get_meter(__name__)
            up_down_counter = meter.create_up_down_counter(name, unit=unit, description=description)
            self._up_down_counters[name] = up_down_counter

    up_down_counter.add(value, attributes=attributes, context=context)
|
||||
|
|
|
@ -3,6 +3,8 @@
|
|||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
#
|
||||
# Deprecated. Use the Open Telemetry SDK instead.
|
||||
|
||||
import asyncio
|
||||
import contextvars
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue