feat(telemetry:major): End-to-End Testing, Metric Capture, SQLAlchemy Instrumentation

This commit is contained in:
Emilio Garcia 2025-10-03 12:17:41 -04:00
parent 9a0294ab4f
commit 4aa2dc110d
19 changed files with 1854 additions and 881 deletions

View file

@ -424,6 +424,7 @@ def create_app(
if Api.telemetry in impls:
impls[Api.telemetry].fastapi_middleware(app)
impls[Api.telemetry].sqlalchemy_instrumentation()
# Load external APIs if configured
external_apis = load_external_apis(config)

View file

@ -6,7 +6,13 @@
from abc import abstractmethod
from fastapi import FastAPI
from pydantic import BaseModel
from typing import Any
from opentelemetry.trace import Tracer
from opentelemetry.metrics import Meter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.resources import Attributes
from sqlalchemy import Engine
class TelemetryProvider(BaseModel):
@ -19,31 +25,34 @@ class TelemetryProvider(BaseModel):
Injects FastAPI middleware that instruments the application for telemetry.
"""
...
@abstractmethod
def custom_trace(self, name: str, *args, **kwargs) -> Any:
def sqlalchemy_instrumentation(self, engine: Engine | None = None):
"""
Creates a custom trace.
Injects SQLAlchemy instrumentation that instruments the application for telemetry.
"""
...
@abstractmethod
def record_count(self, name: str, *args, **kwargs):
def get_tracer(self,
instrumenting_module_name: str,
instrumenting_library_version: str | None = None,
tracer_provider: TracerProvider | None = None,
schema_url: str | None = None,
attributes: Attributes | None = None
) -> Tracer:
"""
Increments a counter metric.
Gets a tracer.
"""
...
@abstractmethod
def record_histogram(self, name: str, *args, **kwargs):
def get_meter(self, name: str,
version: str = "",
meter_provider: MeterProvider | None = None,
schema_url: str | None = None,
attributes: Attributes | None = None) -> Meter:
"""
Records a histogram metric.
"""
...
@abstractmethod
def record_up_down_counter(self, name: str, *args, **kwargs):
"""
Records an up/down counter metric.
Gets a meter.
"""
...

View file

@ -1,20 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from abc import abstractmethod
from fastapi import FastAPI
from pydantic import BaseModel
class TelemetryProvider(BaseModel):
    """
    Common interface through which telemetry is provided to the application.
    """

    @abstractmethod
    def fastapi_middleware(self, app: FastAPI, *args, **kwargs):
        """
        Install middleware on ``app`` that instruments the application for telemetry.

        Abstract: concrete providers supply the implementation.
        """

View file

@ -0,0 +1,24 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .config import OTelTelemetryConfig
__all__ = ["OTelTelemetryConfig"]
async def get_provider_impl(config: OTelTelemetryConfig, deps):
    """
    Build and return the OTel telemetry provider for ``config``.

    Called by the Llama Stack registry to instantiate the provider.
    ``deps`` is accepted as part of the signature but is not used here.
    """
    # Deferred to call time rather than module import time.
    from .otel import OTelTelemetryProvider

    # Construction is synchronous (Pydantic model_post_init does the setup),
    # so there is nothing to await before handing the instance back.
    provider = OTelTelemetryProvider(config=config)
    return provider

View file

@ -1,4 +1,4 @@
from typing import Literal
from typing import Any, Literal
from pydantic import BaseModel, Field
@ -11,17 +11,19 @@ class OTelTelemetryConfig(BaseModel):
"""
The configuration for the OpenTelemetry telemetry provider.
Most configuration is set using environment variables.
See https://opentelemetry.io/docs/specs/otel/configuration/sdk-environment-variables/ for more information.
See https://opentelemetry.io/docs/specs/otel/configuration/sdk-configuration-variables/ for more information.
"""
service_name: str = Field(
description="""The name of the service to be monitored.
Is overridden by the OTEL_SERVICE_NAME or OTEL_RESOURCE_ATTRIBUTES environment variables.""",
)
service_version: str | None = Field(
default=None,
description="""The version of the service to be monitored.
Is overriden by the OTEL_RESOURCE_ATTRIBUTES environment variable."""
)
deployment_environment: str | None = Field(
default=None,
description="""The name of the environment of the service to be monitored.
Is overriden by the OTEL_RESOURCE_ATTRIBUTES environment variable."""
)
@ -30,3 +32,13 @@ class OTelTelemetryConfig(BaseModel):
Is overriden by the OTEL_SPAN_PROCESSOR environment variable.""",
default="batch"
)
@classmethod
def sample_run_config(cls, __distro_dir__: str = "") -> dict[str, Any]:
    """
    Return the sample configuration used in distributions.

    Every value is a ``${env.NAME:=default}`` template string; the
    ``__distro_dir__`` argument is not consulted.
    """
    return dict(
        service_name="${env.OTEL_SERVICE_NAME:=llama-stack}",
        service_version="${env.OTEL_SERVICE_VERSION:=}",
        deployment_environment="${env.OTEL_DEPLOYMENT_ENVIRONMENT:=}",
        span_processor="${env.OTEL_SPAN_PROCESSOR:=batch}",
    )

View file

@ -1,22 +1,21 @@
import os
import threading
from opentelemetry import trace, metrics
from opentelemetry.context.context import Context
from opentelemetry.sdk.resources import Attributes, Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor, SimpleSpanProcessor
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.metrics import Counter, UpDownCounter, Histogram, ObservableGauge
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.trace import Span, SpanKind, _Links
from typing import Sequence
from pydantic import PrivateAttr
from opentelemetry.trace import Tracer
from opentelemetry.metrics import Meter
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
from llama_stack.core.telemetry.tracing import TelemetryProvider
from llama_stack.core.telemetry.telemetry import TelemetryProvider
from llama_stack.log import get_logger
from sqlalchemy import Engine
from .config import OTelTelemetryConfig
from fastapi import FastAPI
@ -29,15 +28,9 @@ class OTelTelemetryProvider(TelemetryProvider):
A simple Open Telemetry native telemetry provider.
"""
config: OTelTelemetryConfig
_counters: dict[str, Counter] = PrivateAttr(default_factory=dict)
_up_down_counters: dict[str, UpDownCounter] = PrivateAttr(default_factory=dict)
_histograms: dict[str, Histogram] = PrivateAttr(default_factory=dict)
_gauges: dict[str, ObservableGauge] = PrivateAttr(default_factory=dict)
def model_post_init(self, __context):
"""Initialize provider after Pydantic validation."""
self._lock = threading.Lock()
attributes: Attributes = {
key: value
@ -74,68 +67,114 @@ class OTelTelemetryProvider(TelemetryProvider):
if not os.environ.get("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT"):
logger.warning("OTEL_EXPORTER_OTLP_ENDPOINT or OTEL_EXPORTER_OTLP_METRICS_ENDPOINT is not set. Metrics will not be exported.")
def fastapi_middleware(self, app: FastAPI):
    """
    Instrument FastAPI with OTel for automatic tracing and metrics.

    Captures:
    - Distributed traces for all HTTP requests (via FastAPIInstrumentor)
    - HTTP server metrics recorded by a custom ``http`` middleware:
      request duration (histogram, seconds), active requests
      (up/down counter) and total request count (counter)

    :param app: The FastAPI application to instrument.
    """
    # Function-scope import: resolved once when the middleware is registered
    # instead of on every request.
    import time

    # Enable automatic tracing
    FastAPIInstrumentor.instrument_app(app)

    # Add custom middleware for HTTP metrics
    meter = self.get_meter("llama_stack.http.server")

    # Create HTTP metrics following semantic conventions
    # https://opentelemetry.io/docs/specs/semconv/http/http-metrics/
    # The conventions define http.server.request.duration in seconds, so the
    # instrument is declared with unit "s" and fed second-valued samples.
    request_duration = meter.create_histogram(
        "http.server.request.duration",
        unit="s",
        description="Duration of HTTP server requests",
    )

    active_requests = meter.create_up_down_counter(
        "http.server.active_requests",
        unit="requests",
        description="Number of active HTTP server requests",
    )

    request_count = meter.create_counter(
        "http.server.request.count",
        unit="requests",
        description="Total number of HTTP server requests",
    )

    # Add middleware to record metrics
    @app.middleware("http")  # type: ignore[misc]
    async def http_metrics_middleware(request, call_next):
        # NOTE(review): the raw URL path is reported as "http.route"; paths
        # with embedded identifiers may produce many distinct attribute sets.
        # Confirm whether the matched route template should be used instead.
        request_attributes = {
            "http.method": request.method,
            "http.route": request.url.path,
        }

        # Record active request
        active_requests.add(1, request_attributes)

        # perf_counter is monotonic, so the measured duration cannot be
        # skewed (or go negative) when the system clock is adjusted.
        start_time = time.perf_counter()
        status_code = 500  # Reported when call_next raises before a response exists

        try:
            response = await call_next(request)
            status_code = response.status_code
            return response
        finally:
            # Runs on both the success and the exception path; an exception
            # from call_next keeps propagating after the metrics are recorded.
            duration_s = time.perf_counter() - start_time
            response_attributes = {
                **request_attributes,
                "http.status_code": status_code,
            }

            request_duration.record(duration_s, response_attributes)
            request_count.add(1, response_attributes)
            # Same attributes as the matching add(1) so the series balances.
            active_requests.add(-1, request_attributes)
def custom_trace(self,
def sqlalchemy_instrumentation(self, engine: Engine | None = None):
    """
    Instrument SQLAlchemy with OTel via ``SQLAlchemyInstrumentor``.

    :param engine: Engine to pass to the instrumentor. When ``None``,
        ``instrument()`` is called without an ``engine`` argument.
    """
    # Compare against None explicitly so the decision never depends on the
    # truthiness of the engine object itself.
    instrument_kwargs = {} if engine is None else {"engine": engine}
    SQLAlchemyInstrumentor().instrument(**instrument_kwargs)
def get_tracer(self,
    instrumenting_module_name: str,
    instrumenting_library_version: str | None = None,
    tracer_provider: TracerProvider | None = None,
    schema_url: str | None = None,
    attributes: Attributes | None = None
) -> Tracer:
    """
    Return a tracer obtained from ``opentelemetry.trace.get_tracer``.

    All arguments are forwarded unchanged, by keyword.
    """
    tracer_options = {
        "instrumenting_module_name": instrumenting_module_name,
        "instrumenting_library_version": instrumenting_library_version,
        "tracer_provider": tracer_provider,
        "schema_url": schema_url,
        "attributes": attributes,
    }
    return trace.get_tracer(**tracer_options)
def get_meter(self,
name: str,
context: Context | None = None,
kind: SpanKind = SpanKind.INTERNAL,
attributes: Attributes = {},
links: _Links = None,
start_time: int | None = None,
record_exception: bool = True,
set_status_on_exception: bool = True) -> Span:
"""
Creates a custom tracing span using the Open Telemetry SDK.
"""
tracer = trace.get_tracer(__name__)
return tracer.start_span(name, context, kind, attributes, links, start_time, record_exception, set_status_on_exception)
def record_count(self, name: str, amount: int|float, context: Context | None = None, attributes: dict[str, str] | None = None, unit: str = "", description: str = ""):
    """
    Add ``amount`` to the counter called ``name``, creating it on first use.

    Counters are cached by name in ``self._counters`` and the cache is
    accessed while holding ``self._lock``. ``unit`` and ``description`` are
    only used when the counter is created. The name-indexed style mirrors the
    design patterns of other telemetry providers such as Datadog and New Relic.
    """
    otel_meter = metrics.get_meter(__name__)
    with self._lock:
        try:
            instrument = self._counters[name]
        except KeyError:
            # First use of this name: create the instrument and remember it.
            instrument = otel_meter.create_counter(name, unit=unit, description=description)
            self._counters[name] = instrument
    instrument.add(amount, attributes=attributes, context=context)
def record_histogram(self, name: str, value: int|float, context: Context | None = None, attributes: dict[str, str] | None = None, unit: str = "", description: str = "", explicit_bucket_boundaries_advisory: Sequence[float] | None = None):
    """
    Record ``value`` on the histogram called ``name``, creating it on first use.

    Histograms are cached by name in ``self._histograms`` and the cache is
    accessed while holding ``self._lock``. ``unit``, ``description`` and
    ``explicit_bucket_boundaries_advisory`` are only used when the histogram
    is created. The name-indexed style mirrors the design patterns of other
    telemetry providers such as Datadog and New Relic.
    """
    otel_meter = metrics.get_meter(__name__)
    with self._lock:
        try:
            instrument = self._histograms[name]
        except KeyError:
            # First use of this name: create the instrument and remember it.
            instrument = otel_meter.create_histogram(
                name,
                unit=unit,
                description=description,
                explicit_bucket_boundaries_advisory=explicit_bucket_boundaries_advisory,
            )
            self._histograms[name] = instrument
    instrument.record(value, attributes=attributes, context=context)
def record_up_down_counter(self, name: str, value: int|float, context: Context | None = None, attributes: dict[str, str] | None = None, unit: str = "", description: str = ""):
    """
    Add ``value`` to the up/down counter called ``name``, creating it on first use.

    Up/down counters are cached by name in ``self._up_down_counters`` and the
    cache is accessed while holding ``self._lock``. ``unit`` and
    ``description`` are only used when the instrument is created. The
    name-indexed style mirrors the design patterns of other telemetry
    providers such as Datadog and New Relic.
    """
    otel_meter = metrics.get_meter(__name__)
    with self._lock:
        try:
            instrument = self._up_down_counters[name]
        except KeyError:
            # First use of this name: create the instrument and remember it.
            instrument = otel_meter.create_up_down_counter(name, unit=unit, description=description)
            self._up_down_counters[name] = instrument
    instrument.add(value, attributes=attributes, context=context)
version: str = "",
meter_provider: MeterProvider | None = None,
schema_url: str | None = None,
attributes: Attributes | None = None
) -> Meter:
return metrics.get_meter(
name=name,
version=version,
meter_provider=meter_provider,
schema_url=schema_url,
attributes=attributes
)

View file

@ -26,4 +26,16 @@ def available_providers() -> list[ProviderSpec]:
config_class="llama_stack.providers.inline.telemetry.meta_reference.config.TelemetryConfig",
description="Meta's reference implementation of telemetry and observability using OpenTelemetry.",
),
InlineProviderSpec(
    api=Api.telemetry,
    provider_type="inline::otel",
    pip_packages=[
        "opentelemetry-sdk",
        "opentelemetry-exporter-otlp-proto-http",
        "opentelemetry-instrumentation-fastapi",
        # The otel provider module imports opentelemetry.instrumentation.sqlalchemy,
        # so its distribution has to be installed with the provider.
        "opentelemetry-instrumentation-sqlalchemy",
    ],
    module="llama_stack.providers.inline.telemetry.otel",
    config_class="llama_stack.providers.inline.telemetry.otel.config.OTelTelemetryConfig",
    description="Native OpenTelemetry provider with full access to OTel Tracer and Meter APIs for advanced instrumentation.",
),
]