mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-10-04 04:04:14 +00:00
feat(telemetry:major): End to End Testing, Metric Capture, SQL Alchemy Injection
This commit is contained in:
parent
9a0294ab4f
commit
4aa2dc110d
19 changed files with 1854 additions and 881 deletions
|
@ -0,0 +1,24 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from .config import OTelTelemetryConfig
|
||||
|
||||
__all__ = ["OTelTelemetryConfig"]
|
||||
|
||||
|
||||
async def get_provider_impl(config: OTelTelemetryConfig, deps):
    """
    Build and return the OTel telemetry provider for ``config``.

    This is the entry point the Llama Stack registry calls to instantiate
    the provider. ``deps`` is part of the expected signature but is not
    used by this function.
    """
    # Imported here rather than at package import time.
    from .otel import OTelTelemetryProvider

    # Construction is fully synchronous (setup happens in the provider's
    # Pydantic model_post_init hook), so nothing needs to be awaited.
    provider = OTelTelemetryProvider(config=config)
    return provider
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Literal
|
||||
from typing import Any, Literal
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
@ -11,17 +11,19 @@ class OTelTelemetryConfig(BaseModel):
|
|||
"""
|
||||
The configuration for the OpenTelemetry telemetry provider.
|
||||
Most configuration is set using environment variables.
|
||||
See https://opentelemetry.io/docs/specs/otel/configuration/sdk-environment-variables/ for more information.
|
||||
See https://opentelemetry.io/docs/specs/otel/configuration/sdk-configuration-variables/ for more information.
|
||||
"""
|
||||
service_name: str = Field(
|
||||
description="""The name of the service to be monitored.
|
||||
Is overridden by the OTEL_SERVICE_NAME or OTEL_RESOURCE_ATTRIBUTES environment variables.""",
|
||||
)
|
||||
service_version: str | None = Field(
|
||||
default=None,
|
||||
description="""The version of the service to be monitored.
|
||||
Is overriden by the OTEL_RESOURCE_ATTRIBUTES environment variable."""
|
||||
)
|
||||
deployment_environment: str | None = Field(
|
||||
default=None,
|
||||
description="""The name of the environment of the service to be monitored.
|
||||
Is overriden by the OTEL_RESOURCE_ATTRIBUTES environment variable."""
|
||||
)
|
||||
|
@ -30,3 +32,13 @@ class OTelTelemetryConfig(BaseModel):
|
|||
Is overriden by the OTEL_SPAN_PROCESSOR environment variable.""",
|
||||
default="batch"
|
||||
)
|
||||
|
||||
@classmethod
def sample_run_config(cls, __distro_dir__: str = "") -> dict[str, Any]:
    """
    Return the sample run configuration used by distributions.

    Each value is a ``${env.NAME:=default}`` placeholder string; the
    ``__distro_dir__`` argument is accepted but not used.
    """
    # Keys mirror the config fields; insertion order is kept stable.
    run_config: dict[str, Any] = {
        "service_name": "${env.OTEL_SERVICE_NAME:=llama-stack}",
        "service_version": "${env.OTEL_SERVICE_VERSION:=}",
        "deployment_environment": "${env.OTEL_DEPLOYMENT_ENVIRONMENT:=}",
        "span_processor": "${env.OTEL_SPAN_PROCESSOR:=batch}",
    }
    return run_config
|
||||
|
|
|
@ -1,22 +1,21 @@
|
|||
import os
|
||||
import threading
|
||||
|
||||
from opentelemetry import trace, metrics
|
||||
from opentelemetry.context.context import Context
|
||||
from opentelemetry.sdk.resources import Attributes, Resource
|
||||
from opentelemetry.sdk.trace import TracerProvider
|
||||
from opentelemetry.sdk.trace.export import BatchSpanProcessor, SimpleSpanProcessor
|
||||
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
|
||||
from opentelemetry.sdk.metrics import MeterProvider
|
||||
from opentelemetry.metrics import Counter, UpDownCounter, Histogram, ObservableGauge
|
||||
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
|
||||
from opentelemetry.trace import Span, SpanKind, _Links
|
||||
from typing import Sequence
|
||||
from pydantic import PrivateAttr
|
||||
from opentelemetry.trace import Tracer
|
||||
from opentelemetry.metrics import Meter
|
||||
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
|
||||
|
||||
from llama_stack.core.telemetry.tracing import TelemetryProvider
|
||||
from llama_stack.core.telemetry.telemetry import TelemetryProvider
|
||||
from llama_stack.log import get_logger
|
||||
|
||||
from sqlalchemy import Engine
|
||||
|
||||
from .config import OTelTelemetryConfig
|
||||
from fastapi import FastAPI
|
||||
|
||||
|
@ -29,15 +28,9 @@ class OTelTelemetryProvider(TelemetryProvider):
|
|||
A simple Open Telemetry native telemetry provider.
|
||||
"""
|
||||
config: OTelTelemetryConfig
|
||||
_counters: dict[str, Counter] = PrivateAttr(default_factory=dict)
|
||||
_up_down_counters: dict[str, UpDownCounter] = PrivateAttr(default_factory=dict)
|
||||
_histograms: dict[str, Histogram] = PrivateAttr(default_factory=dict)
|
||||
_gauges: dict[str, ObservableGauge] = PrivateAttr(default_factory=dict)
|
||||
|
||||
|
||||
def model_post_init(self, __context):
|
||||
"""Initialize provider after Pydantic validation."""
|
||||
self._lock = threading.Lock()
|
||||
|
||||
attributes: Attributes = {
|
||||
key: value
|
||||
|
@ -74,68 +67,114 @@ class OTelTelemetryProvider(TelemetryProvider):
|
|||
if not os.environ.get("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT"):
|
||||
logger.warning("OTEL_EXPORTER_OTLP_ENDPOINT or OTEL_EXPORTER_OTLP_METRICS_ENDPOINT is not set. Metrics will not be exported.")
|
||||
|
||||
|
||||
def fastapi_middleware(self, app: FastAPI):
    """
    Instrument FastAPI with OTel for automatic tracing and metrics.

    Captures:
    - Distributed traces for all HTTP requests (via FastAPIInstrumentor)
    - HTTP metrics recorded by a custom ``http`` middleware registered on ``app``:
      ``http.server.request.duration`` (histogram, milliseconds),
      ``http.server.active_requests`` (up/down counter) and
      ``http.server.request.count`` (counter)

    :param app: The FastAPI application to instrument; it is modified in place.
    """
    # Imported once per instrumentation call instead of on every request.
    import time

    # Enable automatic tracing
    FastAPIInstrumentor.instrument_app(app)

    # Instruments for the custom HTTP metrics middleware
    meter = self.get_meter("llama_stack.http.server")

    # Metric names follow https://opentelemetry.io/docs/specs/semconv/http/http-metrics/
    # NOTE(review): that spec appears to use seconds ("s") for the duration histogram and
    # the matched route template for "http.route"; the millisecond unit and raw URL path
    # are kept here so existing consumers of these metrics are not broken - confirm.
    request_duration = meter.create_histogram(
        "http.server.request.duration",
        unit="ms",
        description="Duration of HTTP server requests",
    )

    active_requests = meter.create_up_down_counter(
        "http.server.active_requests",
        unit="requests",
        description="Number of active HTTP server requests",
    )

    request_count = meter.create_counter(
        "http.server.request.count",
        unit="requests",
        description="Total number of HTTP server requests",
    )

    # Add middleware to record metrics
    @app.middleware("http")  # type: ignore[misc]
    async def http_metrics_middleware(request, call_next):
        # The same attribute set is used for the +1 and the -1 so the
        # active-request gauge always returns to its previous value.
        request_attributes = {
            "http.method": request.method,
            "http.route": request.url.path,
        }
        active_requests.add(1, request_attributes)

        # perf_counter() is monotonic, so the measured duration cannot be
        # skewed (or turn negative) when the system clock is adjusted.
        started = time.perf_counter()
        status_code = 500  # Reported when the handler raises before producing a response

        try:
            response = await call_next(request)
            status_code = response.status_code
            return response
        finally:
            # Runs on both success and failure; exceptions still propagate.
            duration_ms = (time.perf_counter() - started) * 1000

            response_attributes = {
                **request_attributes,
                "http.status_code": status_code,
            }

            request_duration.record(duration_ms, response_attributes)
            request_count.add(1, response_attributes)
            active_requests.add(-1, request_attributes)
|
||||
|
||||
def custom_trace(self,
|
||||
|
||||
def sqlalchemy_instrumentation(self, engine: Engine | None = None):
    """
    Enable OTel instrumentation for SQLAlchemy.

    :param engine: Optional engine forwarded to ``SQLAlchemyInstrumentor().instrument``
        as the ``engine`` keyword. When omitted, ``instrument()`` is called with no
        arguments.
    """
    kwargs = {}
    # Compare against None explicitly so the decision never depends on the
    # truthiness of the engine object itself.
    if engine is not None:
        kwargs["engine"] = engine
    SQLAlchemyInstrumentor().instrument(**kwargs)
|
||||
|
||||
|
||||
def get_tracer(
    self,
    instrumenting_module_name: str,
    instrumenting_library_version: str | None = None,
    tracer_provider: TracerProvider | None = None,
    schema_url: str | None = None,
    attributes: Attributes | None = None,
) -> Tracer:
    """
    Return a tracer by delegating to ``opentelemetry.trace.get_tracer``.

    Every argument is forwarded unchanged, by keyword, to the OTel API.
    """
    tracer_kwargs = {
        "instrumenting_module_name": instrumenting_module_name,
        "instrumenting_library_version": instrumenting_library_version,
        "tracer_provider": tracer_provider,
        "schema_url": schema_url,
        "attributes": attributes,
    }
    return trace.get_tracer(**tracer_kwargs)
|
||||
|
||||
|
||||
def get_meter(self,
|
||||
name: str,
|
||||
context: Context | None = None,
|
||||
kind: SpanKind = SpanKind.INTERNAL,
|
||||
attributes: Attributes = {},
|
||||
links: _Links = None,
|
||||
start_time: int | None = None,
|
||||
record_exception: bool = True,
|
||||
set_status_on_exception: bool = True) -> Span:
|
||||
"""
|
||||
Creates a custom tracing span using the Open Telemetry SDK.
|
||||
"""
|
||||
tracer = trace.get_tracer(__name__)
|
||||
return tracer.start_span(name, context, kind, attributes, links, start_time, record_exception, set_status_on_exception)
|
||||
|
||||
|
||||
def record_count(
    self,
    name: str,
    amount: int | float,
    context: Context | None = None,
    attributes: dict[str, str] | None = None,
    unit: str = "",
    description: str = "",
):
    """
    Add ``amount`` to the counter registered under ``name``.

    The counter is created on first use (with the given ``unit`` and
    ``description``) and cached by name, so later calls reuse the same
    instrument. The name-keyed call style mirrors what other telemetry
    providers such as Datadog and New Relic offer.
    """
    meter = metrics.get_meter(__name__)

    # Lookup and creation happen under the lock so only one counter is
    # ever created per name.
    with self._lock:
        counter = self._counters.get(name)
        if counter is None:
            counter = meter.create_counter(name, unit=unit, description=description)
            self._counters[name] = counter

    counter.add(amount, attributes=attributes, context=context)
|
||||
|
||||
|
||||
def record_histogram(
    self,
    name: str,
    value: int | float,
    context: Context | None = None,
    attributes: dict[str, str] | None = None,
    unit: str = "",
    description: str = "",
    explicit_bucket_boundaries_advisory: Sequence[float] | None = None,
):
    """
    Record ``value`` on the histogram registered under ``name``.

    The histogram is created on first use (with the given ``unit``,
    ``description`` and bucket-boundary advisory) and cached by name, so
    later calls reuse the same instrument. The name-keyed call style
    mirrors what other telemetry providers such as Datadog and New Relic
    offer.
    """
    meter = metrics.get_meter(__name__)

    # Lookup and creation happen under the lock so only one histogram is
    # ever created per name.
    with self._lock:
        histogram = self._histograms.get(name)
        if histogram is None:
            histogram = meter.create_histogram(
                name,
                unit=unit,
                description=description,
                explicit_bucket_boundaries_advisory=explicit_bucket_boundaries_advisory,
            )
            self._histograms[name] = histogram

    histogram.record(value, attributes=attributes, context=context)
|
||||
|
||||
|
||||
def record_up_down_counter(
    self,
    name: str,
    value: int | float,
    context: Context | None = None,
    attributes: dict[str, str] | None = None,
    unit: str = "",
    description: str = "",
):
    """
    Add ``value`` to the up/down counter registered under ``name``.

    The instrument is created on first use (with the given ``unit`` and
    ``description``) and cached by name, so later calls reuse the same
    instrument. The name-keyed call style mirrors what other telemetry
    providers such as Datadog and New Relic offer.
    """
    meter = metrics.get_meter(__name__)

    # Lookup and creation happen under the lock so only one instrument is
    # ever created per name.
    with self._lock:
        up_down_counter = self._up_down_counters.get(name)
        if up_down_counter is None:
            up_down_counter = meter.create_up_down_counter(name, unit=unit, description=description)
            self._up_down_counters[name] = up_down_counter

    up_down_counter.add(value, attributes=attributes, context=context)
|
||||
version: str = "",
|
||||
meter_provider: MeterProvider | None = None,
|
||||
schema_url: str | None = None,
|
||||
attributes: Attributes | None = None
|
||||
) -> Meter:
|
||||
return metrics.get_meter(
|
||||
name=name,
|
||||
version=version,
|
||||
meter_provider=meter_provider,
|
||||
schema_url=schema_url,
|
||||
attributes=attributes
|
||||
)
|
|
@ -26,4 +26,16 @@ def available_providers() -> list[ProviderSpec]:
|
|||
config_class="llama_stack.providers.inline.telemetry.meta_reference.config.TelemetryConfig",
|
||||
description="Meta's reference implementation of telemetry and observability using OpenTelemetry.",
|
||||
),
|
||||
InlineProviderSpec(
    api=Api.telemetry,
    provider_type="inline::otel",
    # The provider module imports both the FastAPI and the SQLAlchemy OTel
    # instrumentors, so both instrumentation packages must be installed.
    pip_packages=[
        "opentelemetry-sdk",
        "opentelemetry-exporter-otlp-proto-http",
        "opentelemetry-instrumentation-fastapi",
        "opentelemetry-instrumentation-sqlalchemy",
    ],
    module="llama_stack.providers.inline.telemetry.otel",
    config_class="llama_stack.providers.inline.telemetry.otel.config.OTelTelemetryConfig",
    description="Native OpenTelemetry provider with full access to OTel Tracer and Meter APIs for advanced instrumentation.",
),
|
||||
]
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue