Merge branch 'main' into routeur

2025-12-03 01:48:05 +00:00 · 2025-12-02 09:42:09 +01:00 · 2025-12-02 09:42:09 +01:00 · 3ce509e94a
commit 3ce509e94a
parent 98f202b607 ee107aadd6
87 changed files with 67526 additions and 4478 deletions
--- a/.github/workflows/backward-compat.yml
+++ b/.github/workflows/backward-compat.yml
@ -32,7 +32,7 @@ jobs:
          fetch-depth: 0  # Need full history to access main branch
      - name: Set up Python
-        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
        with:
          python-version: '3.12'
@ -410,7 +410,7 @@ jobs:
          fetch-depth: 0
      - name: Set up Python
-        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
        with:
          python-version: '3.12'
--- a/.github/workflows/conformance.yml
+++ b/.github/workflows/conformance.yml
@ -64,6 +64,7 @@ jobs:
          ref: ${{ github.event.pull_request.base.ref }}
          path: 'base'
      # Cache oasdiff to avoid checksum failures and speed up builds
      - name: Cache oasdiff
        if: steps.skip-check.outputs.skip != 'true'
@ -136,6 +137,23 @@ jobs:
        run: |
          oasdiff breaking --fail-on ERR $BASE_SPEC $CURRENT_SPEC --match-path '^/v1/'
      # Run oasdiff to detect breaking changes in the API specification when compared to the OpenAI OpenAPI spec
      - name: Run OpenAPI Breaking Change Diff Against OpenAI API
        if: steps.skip-check.outputs.skip != 'true'
        continue-on-error: true
        shell: bash
        run: |
          OPENAI_SPEC=docs/static/openai-spec-2.3.0.yml
          LLAMA_STACK_SPEC=docs/static/llama-stack-spec.yaml
          # Compare Llama Stack spec against OpenAI spec.
          # This finds breaking changes in our implementation of common endpoints.
          # By using our spec as the base, we avoid errors for endpoints we don't implement.
          oasdiff breaking --fail-on ERR \
            "$LLAMA_STACK_SPEC" \
            "$OPENAI_SPEC" \
            --strip-prefix-base "/v1"
      # Report when test is skipped
      - name: Report skip reason
        if: steps.skip-check.outputs.skip == 'true'
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@ -30,7 +30,7 @@ jobs:
          fetch-depth: ${{ github.actor == 'dependabot[bot]' && 0 || 1 }}
      - name: Set up Python
-        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
        with:
          python-version: '3.12'
          cache: pip
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -19,6 +19,7 @@ repos:
    -   id: no-commit-to-branch
    -   id: check-yaml
        args: ["--unsafe"]
        exclude: 'docs/static/openai-spec-2.3.0.yml'
    -   id: detect-private-key
    -   id: mixed-line-ending
        args: [--fix=lf] # Forces line endings to be replaced with LF (line feed)
--- a/benchmarking/k8s-benchmark/stack-configmap.yaml
+++ b/benchmarking/k8s-benchmark/stack-configmap.yaml
@ -9,7 +9,6 @@ data:
    - inference
    - files
    - safety
    - telemetry
    - tool_runtime
    - vector_io
    providers:
@ -67,12 +66,6 @@ data:
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
      telemetry:
      - provider_id: meta-reference
        provider_type: inline::meta-reference
        config:
          service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
          sinks: ${env.TELEMETRY_SINKS:=console}
      tool_runtime:
      - provider_id: brave-search
        provider_type: remote::brave-search
--- a/benchmarking/k8s-benchmark/stack_run_config.yaml
+++ b/benchmarking/k8s-benchmark/stack_run_config.yaml
@ -126,8 +126,6 @@ registered_resources:
    provider_id: rag-runtime
 server:
  port: 8323
 telemetry:
  enabled: true
 vector_stores:
  default_provider_id: chromadb
  default_embedding_model:
--- a/client-sdks/stainless/openapi.yml
+++ b/client-sdks/stainless/openapi.yml
@ -2101,6 +2101,7 @@ paths:
            schema:
              $ref: '#/components/schemas/InvokeToolRequest'
        required: true
      deprecated: true
  /v1/tool-runtime/list-tools:
    get:
      responses:
@ -2152,6 +2153,7 @@ paths:
          - $ref: '#/components/schemas/URL'
          - type: 'null'
          title: Mcp Endpoint
      deprecated: true
  /v1/toolgroups:
    get:
      responses:
@ -2178,6 +2180,7 @@ paths:
      summary: List Tool Groups
      description: List tool groups with optional provider.
      operationId: list_tool_groups_v1_toolgroups_get
      deprecated: true
    post:
      responses:
        '400':
@ -2239,6 +2242,7 @@ paths:
        schema:
          type: string
        description: 'Path parameter: toolgroup_id'
      deprecated: true
    delete:
      responses:
        '400':
@ -2303,6 +2307,7 @@ paths:
          - type: string
          - type: 'null'
          title: Toolgroup Id
      deprecated: true
  /v1/tools/{tool_name}:
    get:
      responses:
@ -2336,6 +2341,7 @@ paths:
        schema:
          type: string
        description: 'Path parameter: tool_name'
      deprecated: true
  /v1/vector-io/insert:
    post:
      responses:
@ -6812,6 +6818,12 @@ components:
          anyOf:
          - type: integer
          - type: 'null'
        metadata:
          anyOf:
          - additionalProperties:
              type: string
            type: object
          - type: 'null'
        input:
          items:
            anyOf:
@ -7215,6 +7227,12 @@ components:
          anyOf:
          - type: integer
          - type: 'null'
        metadata:
          anyOf:
          - additionalProperties:
              type: string
            type: object
          - type: 'null'
      type: object
      required:
      - input
@ -7346,6 +7364,12 @@ components:
          anyOf:
          - type: integer
          - type: 'null'
        metadata:
          anyOf:
          - additionalProperties:
              type: string
            type: object
          - type: 'null'
      type: object
      required:
      - created_at
@ -12196,227 +12220,6 @@ components:
      - $ref: '#/components/schemas/OpenAIResponseContentPartReasoningText'
        title: OpenAIResponseContentPartReasoningText
      title: OpenAIResponseContentPartOutputText | OpenAIResponseContentPartRefusal | OpenAIResponseContentPartReasoningText
    SpanEndPayload:
      description: Payload for a span end event.
      properties:
        type:
          const: span_end
          default: span_end
          title: Type
          type: string
        status:
          $ref: '#/components/schemas/SpanStatus'
      required:
      - status
      title: SpanEndPayload
      type: object
    SpanStartPayload:
      description: Payload for a span start event.
      properties:
        type:
          const: span_start
          default: span_start
          title: Type
          type: string
        name:
          title: Name
          type: string
        parent_span_id:
          anyOf:
          - type: string
          - type: 'null'
          nullable: true
      required:
      - name
      title: SpanStartPayload
      type: object
    SpanStatus:
      description: The status of a span indicating whether it completed successfully or with an error.
      enum:
      - ok
      - error
      title: SpanStatus
      type: string
    StructuredLogPayload:
      discriminator:
        mapping:
          span_end: '#/components/schemas/SpanEndPayload'
          span_start: '#/components/schemas/SpanStartPayload'
        propertyName: type
      oneOf:
      - $ref: '#/components/schemas/SpanStartPayload'
        title: SpanStartPayload
      - $ref: '#/components/schemas/SpanEndPayload'
        title: SpanEndPayload
      title: SpanStartPayload | SpanEndPayload
    LogSeverity:
      description: The severity level of a log message.
      enum:
      - verbose
      - debug
      - info
      - warn
      - error
      - critical
      title: LogSeverity
      type: string
    MetricEvent:
      description: A metric event containing a measured value.
      properties:
        trace_id:
          title: Trace Id
          type: string
        span_id:
          title: Span Id
          type: string
        timestamp:
          format: date-time
          title: Timestamp
          type: string
        attributes:
          anyOf:
          - additionalProperties:
              anyOf:
              - type: string
              - type: integer
              - type: number
              - type: boolean
              - type: 'null'
              title: string | ... (4 variants)
            type: object
          - type: 'null'
        type:
          const: metric
          default: metric
          title: Type
          type: string
        metric:
          title: Metric
          type: string
        value:
          anyOf:
          - type: integer
          - type: number
          title: integer | number
        unit:
          title: Unit
          type: string
      required:
      - trace_id
      - span_id
      - timestamp
      - metric
      - value
      - unit
      title: MetricEvent
      type: object
    StructuredLogEvent:
      description: A structured log event containing typed payload data.
      properties:
        trace_id:
          title: Trace Id
          type: string
        span_id:
          title: Span Id
          type: string
        timestamp:
          format: date-time
          title: Timestamp
          type: string
        attributes:
          anyOf:
          - additionalProperties:
              anyOf:
              - type: string
              - type: integer
              - type: number
              - type: boolean
              - type: 'null'
              title: string | ... (4 variants)
            type: object
          - type: 'null'
        type:
          const: structured_log
          default: structured_log
          title: Type
          type: string
        payload:
          discriminator:
            mapping:
              span_end: '#/components/schemas/SpanEndPayload'
              span_start: '#/components/schemas/SpanStartPayload'
            propertyName: type
          oneOf:
          - $ref: '#/components/schemas/SpanStartPayload'
            title: SpanStartPayload
          - $ref: '#/components/schemas/SpanEndPayload'
            title: SpanEndPayload
          title: SpanStartPayload | SpanEndPayload
      required:
      - trace_id
      - span_id
      - timestamp
      - payload
      title: StructuredLogEvent
      type: object
    UnstructuredLogEvent:
      description: An unstructured log event containing a simple text message.
      properties:
        trace_id:
          title: Trace Id
          type: string
        span_id:
          title: Span Id
          type: string
        timestamp:
          format: date-time
          title: Timestamp
          type: string
        attributes:
          anyOf:
          - additionalProperties:
              anyOf:
              - type: string
              - type: integer
              - type: number
              - type: boolean
              - type: 'null'
              title: string | ... (4 variants)
            type: object
          - type: 'null'
        type:
          const: unstructured_log
          default: unstructured_log
          title: Type
          type: string
        message:
          title: Message
          type: string
        severity:
          $ref: '#/components/schemas/LogSeverity'
      required:
      - trace_id
      - span_id
      - timestamp
      - message
      - severity
      title: UnstructuredLogEvent
      type: object
    Event:
      discriminator:
        mapping:
          metric: '#/components/schemas/MetricEvent'
          structured_log: '#/components/schemas/StructuredLogEvent'
          unstructured_log: '#/components/schemas/UnstructuredLogEvent'
        propertyName: type
      oneOf:
      - $ref: '#/components/schemas/UnstructuredLogEvent'
        title: UnstructuredLogEvent
      - $ref: '#/components/schemas/MetricEvent'
        title: MetricEvent
      - $ref: '#/components/schemas/StructuredLogEvent'
        title: StructuredLogEvent
      title: UnstructuredLogEvent | MetricEvent | StructuredLogEvent
    MetricInResponse:
      description: A metric value included in API responses.
      properties:
@ -13279,236 +13082,6 @@ components:
      - logger_config
      title: PostTrainingRLHFRequest
      type: object
    Span:
      description: A span representing a single operation within a trace.
      properties:
        span_id:
          title: Span Id
          type: string
        trace_id:
          title: Trace Id
          type: string
        parent_span_id:
          anyOf:
          - type: string
          - type: 'null'
          nullable: true
        name:
          title: Name
          type: string
        start_time:
          format: date-time
          title: Start Time
          type: string
        end_time:
          anyOf:
          - format: date-time
            type: string
          - type: 'null'
          nullable: true
        attributes:
          anyOf:
          - additionalProperties: true
            type: object
          - type: 'null'
      required:
      - span_id
      - trace_id
      - name
      - start_time
      title: Span
      type: object
    Trace:
      description: A trace representing the complete execution path of a request across multiple operations.
      properties:
        trace_id:
          title: Trace Id
          type: string
        root_span_id:
          title: Root Span Id
          type: string
        start_time:
          format: date-time
          title: Start Time
          type: string
        end_time:
          anyOf:
          - format: date-time
            type: string
          - type: 'null'
          nullable: true
      required:
      - trace_id
      - root_span_id
      - start_time
      title: Trace
      type: object
    EventType:
      description: The type of telemetry event being logged.
      enum:
      - unstructured_log
      - structured_log
      - metric
      title: EventType
      type: string
    StructuredLogType:
      description: The type of structured log event payload.
      enum:
      - span_start
      - span_end
      title: StructuredLogType
      type: string
    EvalTrace:
      description: A trace record for evaluation purposes.
      properties:
        session_id:
          title: Session Id
          type: string
        step:
          title: Step
          type: string
        input:
          title: Input
          type: string
        output:
          title: Output
          type: string
        expected_output:
          title: Expected Output
          type: string
      required:
      - session_id
      - step
      - input
      - output
      - expected_output
      title: EvalTrace
      type: object
    SpanWithStatus:
      description: A span that includes status information.
      properties:
        span_id:
          title: Span Id
          type: string
        trace_id:
          title: Trace Id
          type: string
        parent_span_id:
          anyOf:
          - type: string
          - type: 'null'
          nullable: true
        name:
          title: Name
          type: string
        start_time:
          format: date-time
          title: Start Time
          type: string
        end_time:
          anyOf:
          - format: date-time
            type: string
          - type: 'null'
          nullable: true
        attributes:
          anyOf:
          - additionalProperties: true
            type: object
          - type: 'null'
        status:
          anyOf:
          - $ref: '#/components/schemas/SpanStatus'
            title: SpanStatus
          - type: 'null'
          nullable: true
          title: SpanStatus
      required:
      - span_id
      - trace_id
      - name
      - start_time
      title: SpanWithStatus
      type: object
    QueryConditionOp:
      description: Comparison operators for query conditions.
      enum:
      - eq
      - ne
      - gt
      - lt
      title: QueryConditionOp
      type: string
    QueryCondition:
      description: A condition for filtering query results.
      properties:
        key:
          title: Key
          type: string
        op:
          $ref: '#/components/schemas/QueryConditionOp'
        value:
          title: Value
      required:
      - key
      - op
      - value
      title: QueryCondition
      type: object
    MetricLabel:
      description: A label associated with a metric.
      properties:
        name:
          title: Name
          type: string
        value:
          title: Value
          type: string
      required:
      - name
      - value
      title: MetricLabel
      type: object
    MetricDataPoint:
      description: A single data point in a metric time series.
      properties:
        timestamp:
          title: Timestamp
          type: integer
        value:
          title: Value
          type: number
        unit:
          title: Unit
          type: string
      required:
      - timestamp
      - value
      - unit
      title: MetricDataPoint
      type: object
    MetricSeries:
      description: A time series of metric data points.
      properties:
        metric:
          title: Metric
          type: string
        labels:
          items:
            $ref: '#/components/schemas/MetricLabel'
          title: Labels
          type: array
        values:
          items:
            $ref: '#/components/schemas/MetricDataPoint'
          title: Values
          type: array
      required:
      - metric
      - labels
      - values
      title: MetricSeries
      type: object
  responses:
    BadRequest400:
      description: The request was invalid or malformed
--- a/docs/docs/building_applications/telemetry.mdx
+++ b/docs/docs/building_applications/telemetry.mdx
@ -10,203 +10,34 @@ import TabItem from '@theme/TabItem';
 # Telemetry
-The Llama Stack uses OpenTelemetry to provide comprehensive tracing, metrics, and logging capabilities.
+The preferred way to instrument Llama Stack is with OpenTelemetry. Llama Stack enriches the data
 collected by OpenTelemetry to capture helpful information about the performance and behavior of your
 application. Here is an example of how to forward your telemetry to an OTLP collector from Llama Stack:
 ```sh
 export OTEL_EXPORTER_OTLP_ENDPOINT="http://127.0.0.1:4318"
 export OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf
 export OTEL_SERVICE_NAME="llama-stack-server"
-## Automatic Metrics Generation
+uv pip install opentelemetry-distro opentelemetry-exporter-otlp
 uv run opentelemetry-bootstrap -a requirements | uv pip install --requirement -
-Llama Stack automatically generates metrics during inference operations. These metrics are aggregated at the **inference request level** and provide insights into token usage and model performance.
+uv run opentelemetry-instrument llama stack run run.yaml
 ### Available Metrics
 The following metrics are automatically generated for each inference request:
 | Metric Name | Type | Unit | Description | Labels |
 |-------------|------|------|-------------|--------|
 | `llama_stack_prompt_tokens_total` | Counter | `tokens` | Number of tokens in the input prompt | `model_id`, `provider_id` |
 | `llama_stack_completion_tokens_total` | Counter | `tokens` | Number of tokens in the generated response | `model_id`, `provider_id` |
 | `llama_stack_tokens_total` | Counter | `tokens` | Total tokens used (prompt + completion) | `model_id`, `provider_id` |
 ### Metric Generation Flow
 1. **Token Counting**: During inference operations (chat completion, completion, etc.), the system counts tokens in both input prompts and generated responses
 2. **Metric Construction**: For each request, `MetricEvent` objects are created with the token counts
 3. **Telemetry Logging**: Metrics are sent to the configured telemetry sinks
 4. **OpenTelemetry Export**: When OpenTelemetry is enabled, metrics are exposed as standard OpenTelemetry counters
 ### Metric Aggregation Level
 All metrics are generated and aggregated at the **inference request level**. This means:
 - Each individual inference request generates its own set of metrics
 - Metrics are not pre-aggregated across multiple requests
 - Aggregation (sums, averages, etc.) can be performed by your observability tools (Prometheus, Grafana, etc.)
 - Each metric includes labels for `model_id` and `provider_id` to enable filtering and grouping
 ### Example Metric Event
 ```python
 MetricEvent(
    trace_id="1234567890abcdef",
    span_id="abcdef1234567890",
    metric="total_tokens",
    value=150,
    timestamp=1703123456.789,
    unit="tokens",
    attributes={
        "model_id": "meta-llama/Llama-3.2-3B-Instruct",
        "provider_id": "tgi"
    },
 )
 ```
 ## Telemetry Sinks
-Choose from multiple sink types based on your observability needs:
+### Known issues
-<Tabs>
+Some database instrumentation libraries have a known bug where spans get wrapped twice, or do not get connected to a trace.
-<TabItem value="opentelemetry" label="OpenTelemetry">
+To prevent this, you can disable database-specific tracing and rely solely on the SQLAlchemy tracing. If you are using
 `sqlite3` as your database, for example, you can disable the additional tracing like this:
-Send events to an OpenTelemetry Collector for integration with observability platforms:
+```sh
-
+export OTEL_PYTHON_DISABLED_INSTRUMENTATIONS="sqlite3"
 **Use Cases:**
 - Visualizing traces in tools like Jaeger
 - Collecting metrics for Prometheus
 - Integration with enterprise observability stacks
 **Features:**
 - Standard OpenTelemetry format
 - Compatible with all OpenTelemetry collectors
 - Supports both traces and metrics
 </TabItem>
 <TabItem value="console" label="Console">
 Print events to the console for immediate debugging:
 **Use Cases:**
 - Development and testing
 - Quick debugging sessions
 - Simple logging without external tools
 **Features:**
 - Immediate output visibility
 - No setup required
 - Human-readable format
 </TabItem>
 </Tabs>
 ## Configuration
 ### Meta-Reference Provider
 Currently, only the meta-reference provider is implemented. It can be configured to send events to multiple sink types:
 ```yaml
 telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "llama-stack-service"
      sinks: ['console', 'otel_trace', 'otel_metric']
      otel_exporter_otlp_endpoint: "http://localhost:4318"
 ```
 ### Environment Variables
 Configure telemetry behavior using environment variables:
 - **`OTEL_EXPORTER_OTLP_ENDPOINT`**: OpenTelemetry Collector endpoint (default: `http://localhost:4318`)
 - **`OTEL_SERVICE_NAME`**: Service name for telemetry (default: empty string)
 - **`TELEMETRY_SINKS`**: Comma-separated list of sinks (default: `[]`)
 ### Quick Setup: Complete Telemetry Stack
 Use the automated setup script to launch the complete telemetry stack (Jaeger, OpenTelemetry Collector, Prometheus, and Grafana):
 ```bash
 ./scripts/telemetry/setup_telemetry.sh
 ```
 This sets up:
 - **Jaeger UI**: http://localhost:16686 (traces visualization)
 - **Prometheus**: http://localhost:9090 (metrics)
 - **Grafana**: http://localhost:3000 (dashboards with auto-configured data sources)
 - **OTEL Collector**: http://localhost:4318 (OTLP endpoint)
 Once running, you can visualize traces by navigating to [Grafana](http://localhost:3000/) and logging in with username `admin` and password `admin`.
 ## Querying Metrics
 When using the OpenTelemetry sink, metrics are exposed in standard format and can be queried through various tools:
 <Tabs>
 <TabItem value="prometheus" label="Prometheus Queries">
 Example Prometheus queries for analyzing token usage:
 ```promql
 # Total tokens used across all models
 sum(llama_stack_tokens_total)
 # Tokens per model
 sum by (model_id) (llama_stack_tokens_total)
 # Per-second rate of token consumption, averaged over the last 5 minutes
 rate(llama_stack_tokens_total[5m])
 # Token usage by provider
 sum by (provider_id) (llama_stack_tokens_total)
 ```
 </TabItem>
 <TabItem value="grafana" label="Grafana Dashboards">
 Create dashboards using Prometheus as a data source:
 - **Token Usage Over Time**: Line charts showing token consumption trends
 - **Model Performance**: Comparison of different models by token efficiency
 - **Provider Analysis**: Breakdown of usage across different providers
 - **Request Patterns**: Understanding peak usage times and patterns
 </TabItem>
 <TabItem value="otlp" label="OpenTelemetry Collector">
 Forward metrics to other observability systems:
 - Export to multiple backends simultaneously
 - Apply transformations and filtering
 - Integrate with existing monitoring infrastructure
 </TabItem>
 </Tabs>
 ## Best Practices
 ### 🔍 **Monitoring Strategy**
 - Use OpenTelemetry for production environments
 - Set up alerts on key metrics like token usage and error rates
 ### 📊 **Metrics Analysis**
 - Track token usage trends to optimize costs
 - Monitor response times across different models
 - Analyze usage patterns to improve resource allocation
 ### 🚨 **Alerting & Debugging**
 - Set up alerts for unusual token consumption spikes
 - Use trace data to debug performance issues
 - Monitor error rates and failure patterns
 ### 🔧 **Configuration Management**
 - Use environment variables for flexible deployment
 - Ensure proper network access to OpenTelemetry collectors
 ## Related Resources
 - **[Agents](./agent)** - Monitoring agent execution with telemetry
 - **[Evaluations](./evals)** - Using telemetry data for performance evaluation
 - **[Getting Started Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)** - Telemetry examples and queries
 - **[OpenTelemetry Documentation](https://opentelemetry.io/)** - Comprehensive observability framework
 - **[Jaeger Documentation](https://www.jaegertracing.io/)** - Distributed tracing visualization
--- a/docs/docs/concepts/apis/index.mdx
+++ b/docs/docs/concepts/apis/index.mdx
@ -17,7 +17,6 @@ A Llama Stack API is described as a collection of REST endpoints following OpenA
 - **Eval**: generate outputs (via Inference or Agents) and perform scoring
 - **VectorIO**: perform operations on vector stores, such as adding documents, searching, and deleting documents
 - **Files**: manage file uploads, storage, and retrieval
 - **Telemetry**: collect telemetry data from the system
 - **Post Training**: fine-tune a model
 - **Tool Runtime**: interact with various tools and protocols
 - **Responses**: generate responses from an LLM
--- a/docs/docs/distributions/k8s/stack-configmap.yaml
+++ b/docs/docs/distributions/k8s/stack-configmap.yaml
@ -8,7 +8,6 @@ data:
    - inference
    - files
    - safety
    - telemetry
    - tool_runtime
    - vector_io
    providers:
@ -73,12 +72,6 @@ data:
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
      telemetry:
      - provider_id: meta-reference
        provider_type: inline::meta-reference
        config:
          service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
          sinks: ${env.TELEMETRY_SINKS:=console}
      tool_runtime:
      - provider_id: brave-search
        provider_type: remote::brave-search
--- a/docs/docs/distributions/k8s/stack_run_config.yaml
+++ b/docs/docs/distributions/k8s/stack_run_config.yaml
@ -140,8 +140,6 @@ server:
  auth:
    provider_config:
      type: github_token
 telemetry:
  enabled: true
 vector_stores:
  default_provider_id: chromadb
  default_embedding_model:
--- a/docs/docs/distributions/list_of_distributions.mdx
+++ b/docs/docs/distributions/list_of_distributions.mdx
@ -28,7 +28,7 @@ Llama Stack provides several pre-configured distributions to help you get starte
 - Run locally with Ollama for development
 ```bash
-docker pull llama-stack/distribution-starter
+docker pull llamastack/distribution-starter
 ```
 **Guides:** [Starter Distribution Guide](self_hosted_distro/starter)
@ -41,7 +41,7 @@ docker pull llama-stack/distribution-starter
 - Need to run inference locally
 ```bash
-docker pull llama-stack/distribution-meta-reference-gpu
+docker pull llamastack/distribution-meta-reference-gpu
 ```
 **Guides:** [Meta Reference GPU Guide](self_hosted_distro/meta-reference-gpu)
--- a/docs/docs/distributions/self_hosted_distro/starter.md
+++ b/docs/docs/distributions/self_hosted_distro/starter.md
@ -116,10 +116,6 @@ The following environment variables can be configured:
 - `BRAVE_SEARCH_API_KEY`: Brave Search API key
 - `TAVILY_SEARCH_API_KEY`: Tavily Search API key
 ### Telemetry Configuration
 - `OTEL_SERVICE_NAME`: OpenTelemetry service name
 - `OTEL_EXPORTER_OTLP_ENDPOINT`: OpenTelemetry collector endpoint URL
 ## Enabling Providers
 You can enable specific providers by setting appropriate environment variables. For example,
@ -265,7 +261,7 @@ The starter distribution uses SQLite for local storage of various components:
 2. **Flexible Configuration**: Easy to enable/disable providers based on your needs
 3. **No Local GPU Required**: Most providers are cloud-based, making it accessible to developers without high-end hardware
 4. **Easy Migration**: Start with hosted providers and gradually move to local ones as needed
-5. **Production Ready**: Includes safety, evaluation, and telemetry components
+5. **Production Ready**: Includes safety and evaluation
 6. **Tool Integration**: Comes with web search, RAG, and model context protocol tools
 The starter distribution is ideal for developers who want to experiment with different AI providers, build prototypes quickly, or create applications that can work with multiple AI backends.
--- a/docs/docs/distributions/starting_llama_stack_server.mdx
+++ b/docs/docs/distributions/starting_llama_stack_server.mdx
@ -27,7 +27,7 @@ If you have built a container image and want to deploy it in a Kubernetes cluste
 Control log output via environment variables before starting the server.
- `LLAMA_STACK_LOGGING` sets per-component levels, e.g. `LLAMA_STACK_LOGGING=server=debug;core=info`.
+- `LLAMA_STACK_LOGGING` sets per-component levels, e.g. `LLAMA_STACK_LOGGING=server=debug,core=info`.
 - Supported categories: `all`, `core`, `server`, `router`, `inference`, `agents`, `safety`, `eval`, `tools`, `client`.
 - Levels: `debug`, `info`, `warning`, `error`, `critical` (default is `info`). Use `all=<level>` to apply globally.
 - `LLAMA_STACK_LOG_FILE=/path/to/log` mirrors logs to a file while still printing to stdout.
--- a/docs/docs/references/python_sdk_reference/index.md
+++ b/docs/docs/references/python_sdk_reference/index.md
@ -360,32 +360,6 @@ Methods:
 - <code title="post /v1/synthetic-data-generation/generate">client.synthetic_data_generation.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/synthetic_data_generation.py">generate</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/synthetic_data_generation_generate_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/synthetic_data_generation_response.py">SyntheticDataGenerationResponse</a></code>
 ## Telemetry
 Types:
 ```python
 from llama_stack_client.types import (
    QuerySpansResponse,
    SpanWithStatus,
    Trace,
    TelemetryGetSpanResponse,
    TelemetryGetSpanTreeResponse,
    TelemetryQuerySpansResponse,
    TelemetryQueryTracesResponse,
 )
 ```
 Methods:
 - <code title="get /v1/telemetry/traces/{trace_id}/spans/{span_id}">client.telemetry.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/telemetry.py">get_span</a>(span_id, \*, trace_id) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/telemetry_get_span_response.py">TelemetryGetSpanResponse</a></code>
 - <code title="get /v1/telemetry/spans/{span_id}/tree">client.telemetry.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/telemetry.py">get_span_tree</a>(span_id, \*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/telemetry_get_span_tree_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/telemetry_get_span_tree_response.py">TelemetryGetSpanTreeResponse</a></code>
 - <code title="get /v1/telemetry/traces/{trace_id}">client.telemetry.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/telemetry.py">get_trace</a>(trace_id) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/trace.py">Trace</a></code>
 - <code title="post /v1/telemetry/events">client.telemetry.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/telemetry.py">log_event</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/telemetry_log_event_params.py">params</a>) -> None</code>
 - <code title="get /v1/telemetry/spans">client.telemetry.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/telemetry.py">query_spans</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/telemetry_query_spans_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/telemetry_query_spans_response.py">TelemetryQuerySpansResponse</a></code>
 - <code title="get /v1/telemetry/traces">client.telemetry.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/telemetry.py">query_traces</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/telemetry_query_traces_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/telemetry_query_traces_response.py">TelemetryQueryTracesResponse</a></code>
 - <code title="post /v1/telemetry/spans/export">client.telemetry.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/telemetry.py">save_spans_to_dataset</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/telemetry_save_spans_to_dataset_params.py">params</a>) -> None</code>
 ## Datasetio
 Types:
--- a/docs/src/pages/index.js
+++ b/docs/src/pages/index.js
@ -13,7 +13,7 @@ function HomepageHeader() {
        <div className={styles.heroContent}>
          <h1 className={styles.heroTitle}>Build AI Applications with Llama Stack</h1>
          <p className={styles.heroSubtitle}>
-            Unified APIs for Inference, RAG, Agents, Tools, Safety, and Telemetry
+            Unified APIs for Inference, RAG, Agents, Tools, and Safety
          </p>
          <div className={styles.buttons}>
            <Link
@ -206,7 +206,7 @@ export default function Home() {
  return (
    <Layout
      title="Build AI Applications"
-      description="The open-source framework for building generative AI applications with unified APIs for Inference, RAG, Agents, Tools, Safety, and Telemetry.">
+      description="The open-source framework for building generative AI applications with unified APIs for Inference, RAG, Agents, Tools, Safety, and Evals.">
      <HomepageHeader />
      <main>
        <QuickStart />
--- a/docs/static/deprecated-llama-stack-spec.yaml
+++ b/docs/static/deprecated-llama-stack-spec.yaml
@ -378,6 +378,91 @@ paths:
          type: string
        description: 'Path parameter: identifier'
      deprecated: true
  /v1/tool-runtime/invoke:
    post:
      responses:
        '200':
          description: A ToolInvocationResult.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ToolInvocationResult'
        '400':
          description: Bad Request
          $ref: '#/components/responses/BadRequest400'
        '429':
          description: Too Many Requests
          $ref: '#/components/responses/TooManyRequests429'
        '500':
          description: Internal Server Error
          $ref: '#/components/responses/InternalServerError500'
        default:
          description: Default Response
          $ref: '#/components/responses/DefaultError'
      tags:
      - Tool Runtime
      summary: Invoke Tool
      description: Run a tool with the given arguments.
      operationId: invoke_tool_v1_tool_runtime_invoke_post
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/InvokeToolRequest'
        required: true
      deprecated: true
  /v1/tool-runtime/list-tools:
    get:
      responses:
        '200':
          description: A ListToolDefsResponse.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ListToolDefsResponse'
        '400':
          $ref: '#/components/responses/BadRequest400'
          description: Bad Request
        '429':
          $ref: '#/components/responses/TooManyRequests429'
          description: Too Many Requests
        '500':
          $ref: '#/components/responses/InternalServerError500'
          description: Internal Server Error
        default:
          $ref: '#/components/responses/DefaultError'
          description: Default Response
      tags:
      - Tool Runtime
      summary: List Runtime Tools
      description: List all tools in the runtime.
      operationId: list_runtime_tools_v1_tool_runtime_list_tools_get
      parameters:
      - name: authorization
        in: query
        required: false
        schema:
          anyOf:
          - type: string
          - type: 'null'
          title: Authorization
      - name: tool_group_id
        in: query
        required: false
        schema:
          anyOf:
          - type: string
          - type: 'null'
          title: Tool Group Id
      - name: mcp_endpoint
        in: query
        required: false
        schema:
          anyOf:
          - $ref: '#/components/schemas/URL'
          - type: 'null'
          title: Mcp Endpoint
      deprecated: true
  /v1/toolgroups:
    get:
      responses:
@ -404,6 +489,7 @@ paths:
      summary: List Tool Groups
      description: List tool groups with optional provider.
      operationId: list_tool_groups_v1_toolgroups_get
      deprecated: true
    post:
      responses:
        '400':
@ -465,6 +551,7 @@ paths:
        schema:
          type: string
        description: 'Path parameter: toolgroup_id'
      deprecated: true
    delete:
      responses:
        '400':
@ -494,6 +581,76 @@ paths:
          type: string
        description: 'Path parameter: toolgroup_id'
      deprecated: true
  /v1/tools:
    get:
      responses:
        '200':
          description: A ListToolDefsResponse.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ListToolDefsResponse'
        '400':
          $ref: '#/components/responses/BadRequest400'
          description: Bad Request
        '429':
          $ref: '#/components/responses/TooManyRequests429'
          description: Too Many Requests
        '500':
          $ref: '#/components/responses/InternalServerError500'
          description: Internal Server Error
        default:
          $ref: '#/components/responses/DefaultError'
          description: Default Response
      tags:
      - Tool Groups
      summary: List Tools
      description: List tools with optional tool group.
      operationId: list_tools_v1_tools_get
      parameters:
      - name: toolgroup_id
        in: query
        required: false
        schema:
          anyOf:
          - type: string
          - type: 'null'
          title: Toolgroup Id
      deprecated: true
  /v1/tools/{tool_name}:
    get:
      responses:
        '200':
          description: A ToolDef.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ToolDef'
        '400':
          description: Bad Request
          $ref: '#/components/responses/BadRequest400'
        '429':
          description: Too Many Requests
          $ref: '#/components/responses/TooManyRequests429'
        '500':
          description: Internal Server Error
          $ref: '#/components/responses/InternalServerError500'
        default:
          description: Default Response
          $ref: '#/components/responses/DefaultError'
      tags:
      - Tool Groups
      summary: Get Tool
      description: Get a tool by its name.
      operationId: get_tool_v1_tools__tool_name__get
      parameters:
      - name: tool_name
        in: path
        required: true
        schema:
          type: string
        description: 'Path parameter: tool_name'
      deprecated: true
  /v1beta/datasets:
    get:
      responses:
@ -3645,6 +3802,12 @@ components:
          anyOf:
          - type: integer
          - type: 'null'
        metadata:
          anyOf:
          - additionalProperties:
              type: string
            type: object
          - type: 'null'
        input:
          items:
            anyOf:
@ -4048,6 +4211,12 @@ components:
          anyOf:
          - type: integer
          - type: 'null'
        metadata:
          anyOf:
          - additionalProperties:
              type: string
            type: object
          - type: 'null'
      type: object
      required:
      - input
@ -4179,6 +4348,12 @@ components:
          anyOf:
          - type: integer
          - type: 'null'
        metadata:
          anyOf:
          - additionalProperties:
              type: string
            type: object
          - type: 'null'
      type: object
      required:
      - created_at
@ -9029,227 +9204,6 @@ components:
      - $ref: '#/components/schemas/OpenAIResponseContentPartReasoningText'
        title: OpenAIResponseContentPartReasoningText
      title: OpenAIResponseContentPartOutputText | OpenAIResponseContentPartRefusal | OpenAIResponseContentPartReasoningText
    SpanEndPayload:
      description: Payload for a span end event.
      properties:
        type:
          const: span_end
          default: span_end
          title: Type
          type: string
        status:
          $ref: '#/components/schemas/SpanStatus'
      required:
      - status
      title: SpanEndPayload
      type: object
    SpanStartPayload:
      description: Payload for a span start event.
      properties:
        type:
          const: span_start
          default: span_start
          title: Type
          type: string
        name:
          title: Name
          type: string
        parent_span_id:
          anyOf:
          - type: string
          - type: 'null'
          nullable: true
      required:
      - name
      title: SpanStartPayload
      type: object
    SpanStatus:
      description: The status of a span indicating whether it completed successfully or with an error.
      enum:
      - ok
      - error
      title: SpanStatus
      type: string
    StructuredLogPayload:
      discriminator:
        mapping:
          span_end: '#/components/schemas/SpanEndPayload'
          span_start: '#/components/schemas/SpanStartPayload'
        propertyName: type
      oneOf:
      - $ref: '#/components/schemas/SpanStartPayload'
        title: SpanStartPayload
      - $ref: '#/components/schemas/SpanEndPayload'
        title: SpanEndPayload
      title: SpanStartPayload | SpanEndPayload
    LogSeverity:
      description: The severity level of a log message.
      enum:
      - verbose
      - debug
      - info
      - warn
      - error
      - critical
      title: LogSeverity
      type: string
    MetricEvent:
      description: A metric event containing a measured value.
      properties:
        trace_id:
          title: Trace Id
          type: string
        span_id:
          title: Span Id
          type: string
        timestamp:
          format: date-time
          title: Timestamp
          type: string
        attributes:
          anyOf:
          - additionalProperties:
              anyOf:
              - type: string
              - type: integer
              - type: number
              - type: boolean
              - type: 'null'
              title: string | ... (4 variants)
            type: object
          - type: 'null'
        type:
          const: metric
          default: metric
          title: Type
          type: string
        metric:
          title: Metric
          type: string
        value:
          anyOf:
          - type: integer
          - type: number
          title: integer | number
        unit:
          title: Unit
          type: string
      required:
      - trace_id
      - span_id
      - timestamp
      - metric
      - value
      - unit
      title: MetricEvent
      type: object
    StructuredLogEvent:
      description: A structured log event containing typed payload data.
      properties:
        trace_id:
          title: Trace Id
          type: string
        span_id:
          title: Span Id
          type: string
        timestamp:
          format: date-time
          title: Timestamp
          type: string
        attributes:
          anyOf:
          - additionalProperties:
              anyOf:
              - type: string
              - type: integer
              - type: number
              - type: boolean
              - type: 'null'
              title: string | ... (4 variants)
            type: object
          - type: 'null'
        type:
          const: structured_log
          default: structured_log
          title: Type
          type: string
        payload:
          discriminator:
            mapping:
              span_end: '#/components/schemas/SpanEndPayload'
              span_start: '#/components/schemas/SpanStartPayload'
            propertyName: type
          oneOf:
          - $ref: '#/components/schemas/SpanStartPayload'
            title: SpanStartPayload
          - $ref: '#/components/schemas/SpanEndPayload'
            title: SpanEndPayload
          title: SpanStartPayload | SpanEndPayload
      required:
      - trace_id
      - span_id
      - timestamp
      - payload
      title: StructuredLogEvent
      type: object
    UnstructuredLogEvent:
      description: An unstructured log event containing a simple text message.
      properties:
        trace_id:
          title: Trace Id
          type: string
        span_id:
          title: Span Id
          type: string
        timestamp:
          format: date-time
          title: Timestamp
          type: string
        attributes:
          anyOf:
          - additionalProperties:
              anyOf:
              - type: string
              - type: integer
              - type: number
              - type: boolean
              - type: 'null'
              title: string | ... (4 variants)
            type: object
          - type: 'null'
        type:
          const: unstructured_log
          default: unstructured_log
          title: Type
          type: string
        message:
          title: Message
          type: string
        severity:
          $ref: '#/components/schemas/LogSeverity'
      required:
      - trace_id
      - span_id
      - timestamp
      - message
      - severity
      title: UnstructuredLogEvent
      type: object
    Event:
      discriminator:
        mapping:
          metric: '#/components/schemas/MetricEvent'
          structured_log: '#/components/schemas/StructuredLogEvent'
          unstructured_log: '#/components/schemas/UnstructuredLogEvent'
        propertyName: type
      oneOf:
      - $ref: '#/components/schemas/UnstructuredLogEvent'
        title: UnstructuredLogEvent
      - $ref: '#/components/schemas/MetricEvent'
        title: MetricEvent
      - $ref: '#/components/schemas/StructuredLogEvent'
        title: StructuredLogEvent
      title: UnstructuredLogEvent | MetricEvent | StructuredLogEvent
    MetricInResponse:
      description: A metric value included in API responses.
      properties:
@ -10112,236 +10066,6 @@ components:
      - logger_config
      title: PostTrainingRLHFRequest
      type: object
    Span:
      description: A span representing a single operation within a trace.
      properties:
        span_id:
          title: Span Id
          type: string
        trace_id:
          title: Trace Id
          type: string
        parent_span_id:
          anyOf:
          - type: string
          - type: 'null'
          nullable: true
        name:
          title: Name
          type: string
        start_time:
          format: date-time
          title: Start Time
          type: string
        end_time:
          anyOf:
          - format: date-time
            type: string
          - type: 'null'
          nullable: true
        attributes:
          anyOf:
          - additionalProperties: true
            type: object
          - type: 'null'
      required:
      - span_id
      - trace_id
      - name
      - start_time
      title: Span
      type: object
    Trace:
      description: A trace representing the complete execution path of a request across multiple operations.
      properties:
        trace_id:
          title: Trace Id
          type: string
        root_span_id:
          title: Root Span Id
          type: string
        start_time:
          format: date-time
          title: Start Time
          type: string
        end_time:
          anyOf:
          - format: date-time
            type: string
          - type: 'null'
          nullable: true
      required:
      - trace_id
      - root_span_id
      - start_time
      title: Trace
      type: object
    EventType:
      description: The type of telemetry event being logged.
      enum:
      - unstructured_log
      - structured_log
      - metric
      title: EventType
      type: string
    StructuredLogType:
      description: The type of structured log event payload.
      enum:
      - span_start
      - span_end
      title: StructuredLogType
      type: string
    EvalTrace:
      description: A trace record for evaluation purposes.
      properties:
        session_id:
          title: Session Id
          type: string
        step:
          title: Step
          type: string
        input:
          title: Input
          type: string
        output:
          title: Output
          type: string
        expected_output:
          title: Expected Output
          type: string
      required:
      - session_id
      - step
      - input
      - output
      - expected_output
      title: EvalTrace
      type: object
    SpanWithStatus:
      description: A span that includes status information.
      properties:
        span_id:
          title: Span Id
          type: string
        trace_id:
          title: Trace Id
          type: string
        parent_span_id:
          anyOf:
          - type: string
          - type: 'null'
          nullable: true
        name:
          title: Name
          type: string
        start_time:
          format: date-time
          title: Start Time
          type: string
        end_time:
          anyOf:
          - format: date-time
            type: string
          - type: 'null'
          nullable: true
        attributes:
          anyOf:
          - additionalProperties: true
            type: object
          - type: 'null'
        status:
          anyOf:
          - $ref: '#/components/schemas/SpanStatus'
            title: SpanStatus
          - type: 'null'
          nullable: true
          title: SpanStatus
      required:
      - span_id
      - trace_id
      - name
      - start_time
      title: SpanWithStatus
      type: object
    QueryConditionOp:
      description: Comparison operators for query conditions.
      enum:
      - eq
      - ne
      - gt
      - lt
      title: QueryConditionOp
      type: string
    QueryCondition:
      description: A condition for filtering query results.
      properties:
        key:
          title: Key
          type: string
        op:
          $ref: '#/components/schemas/QueryConditionOp'
        value:
          title: Value
      required:
      - key
      - op
      - value
      title: QueryCondition
      type: object
    MetricLabel:
      description: A label associated with a metric.
      properties:
        name:
          title: Name
          type: string
        value:
          title: Value
          type: string
      required:
      - name
      - value
      title: MetricLabel
      type: object
    MetricDataPoint:
      description: A single data point in a metric time series.
      properties:
        timestamp:
          title: Timestamp
          type: integer
        value:
          title: Value
          type: number
        unit:
          title: Unit
          type: string
      required:
      - timestamp
      - value
      - unit
      title: MetricDataPoint
      type: object
    MetricSeries:
      description: A time series of metric data points.
      properties:
        metric:
          title: Metric
          type: string
        labels:
          items:
            $ref: '#/components/schemas/MetricLabel'
          title: Labels
          type: array
        values:
          items:
            $ref: '#/components/schemas/MetricDataPoint'
          title: Values
          type: array
      required:
      - metric
      - labels
      - values
      title: MetricSeries
      type: object
  responses:
    BadRequest400:
      description: The request was invalid or malformed
--- a/docs/static/experimental-llama-stack-spec.yaml
+++ b/docs/static/experimental-llama-stack-spec.yaml
@ -3370,6 +3370,12 @@ components:
          anyOf:
          - type: integer
          - type: 'null'
        metadata:
          anyOf:
          - additionalProperties:
              type: string
            type: object
          - type: 'null'
        input:
          items:
            anyOf:
@ -3770,6 +3776,12 @@ components:
          anyOf:
          - type: integer
          - type: 'null'
        metadata:
          anyOf:
          - additionalProperties:
              type: string
            type: object
          - type: 'null'
      type: object
      required:
      - created_at
@ -7986,227 +7998,6 @@ components:
      - $ref: '#/components/schemas/OpenAIResponseContentPartReasoningText'
        title: OpenAIResponseContentPartReasoningText
      title: OpenAIResponseContentPartOutputText | OpenAIResponseContentPartRefusal | OpenAIResponseContentPartReasoningText
    SpanEndPayload:
      description: Payload for a span end event.
      properties:
        type:
          const: span_end
          default: span_end
          title: Type
          type: string
        status:
          $ref: '#/components/schemas/SpanStatus'
      required:
      - status
      title: SpanEndPayload
      type: object
    SpanStartPayload:
      description: Payload for a span start event.
      properties:
        type:
          const: span_start
          default: span_start
          title: Type
          type: string
        name:
          title: Name
          type: string
        parent_span_id:
          anyOf:
          - type: string
          - type: 'null'
          nullable: true
      required:
      - name
      title: SpanStartPayload
      type: object
    SpanStatus:
      description: The status of a span indicating whether it completed successfully or with an error.
      enum:
      - ok
      - error
      title: SpanStatus
      type: string
    StructuredLogPayload:
      discriminator:
        mapping:
          span_end: '#/components/schemas/SpanEndPayload'
          span_start: '#/components/schemas/SpanStartPayload'
        propertyName: type
      oneOf:
      - $ref: '#/components/schemas/SpanStartPayload'
        title: SpanStartPayload
      - $ref: '#/components/schemas/SpanEndPayload'
        title: SpanEndPayload
      title: SpanStartPayload | SpanEndPayload
    LogSeverity:
      description: The severity level of a log message.
      enum:
      - verbose
      - debug
      - info
      - warn
      - error
      - critical
      title: LogSeverity
      type: string
    MetricEvent:
      description: A metric event containing a measured value.
      properties:
        trace_id:
          title: Trace Id
          type: string
        span_id:
          title: Span Id
          type: string
        timestamp:
          format: date-time
          title: Timestamp
          type: string
        attributes:
          anyOf:
          - additionalProperties:
              anyOf:
              - type: string
              - type: integer
              - type: number
              - type: boolean
              - type: 'null'
              title: string | ... (4 variants)
            type: object
          - type: 'null'
        type:
          const: metric
          default: metric
          title: Type
          type: string
        metric:
          title: Metric
          type: string
        value:
          anyOf:
          - type: integer
          - type: number
          title: integer | number
        unit:
          title: Unit
          type: string
      required:
      - trace_id
      - span_id
      - timestamp
      - metric
      - value
      - unit
      title: MetricEvent
      type: object
    StructuredLogEvent:
      description: A structured log event containing typed payload data.
      properties:
        trace_id:
          title: Trace Id
          type: string
        span_id:
          title: Span Id
          type: string
        timestamp:
          format: date-time
          title: Timestamp
          type: string
        attributes:
          anyOf:
          - additionalProperties:
              anyOf:
              - type: string
              - type: integer
              - type: number
              - type: boolean
              - type: 'null'
              title: string | ... (4 variants)
            type: object
          - type: 'null'
        type:
          const: structured_log
          default: structured_log
          title: Type
          type: string
        payload:
          discriminator:
            mapping:
              span_end: '#/components/schemas/SpanEndPayload'
              span_start: '#/components/schemas/SpanStartPayload'
            propertyName: type
          oneOf:
          - $ref: '#/components/schemas/SpanStartPayload'
            title: SpanStartPayload
          - $ref: '#/components/schemas/SpanEndPayload'
            title: SpanEndPayload
          title: SpanStartPayload | SpanEndPayload
      required:
      - trace_id
      - span_id
      - timestamp
      - payload
      title: StructuredLogEvent
      type: object
    UnstructuredLogEvent:
      description: An unstructured log event containing a simple text message.
      properties:
        trace_id:
          title: Trace Id
          type: string
        span_id:
          title: Span Id
          type: string
        timestamp:
          format: date-time
          title: Timestamp
          type: string
        attributes:
          anyOf:
          - additionalProperties:
              anyOf:
              - type: string
              - type: integer
              - type: number
              - type: boolean
              - type: 'null'
              title: string | ... (4 variants)
            type: object
          - type: 'null'
        type:
          const: unstructured_log
          default: unstructured_log
          title: Type
          type: string
        message:
          title: Message
          type: string
        severity:
          $ref: '#/components/schemas/LogSeverity'
      required:
      - trace_id
      - span_id
      - timestamp
      - message
      - severity
      title: UnstructuredLogEvent
      type: object
    Event:
      discriminator:
        mapping:
          metric: '#/components/schemas/MetricEvent'
          structured_log: '#/components/schemas/StructuredLogEvent'
          unstructured_log: '#/components/schemas/UnstructuredLogEvent'
        propertyName: type
      oneOf:
      - $ref: '#/components/schemas/UnstructuredLogEvent'
        title: UnstructuredLogEvent
      - $ref: '#/components/schemas/MetricEvent'
        title: MetricEvent
      - $ref: '#/components/schemas/StructuredLogEvent'
        title: StructuredLogEvent
      title: UnstructuredLogEvent | MetricEvent | StructuredLogEvent
    MetricInResponse:
      description: A metric value included in API responses.
      properties:
@ -9069,236 +8860,6 @@ components:
      - logger_config
      title: PostTrainingRLHFRequest
      type: object
    # One operation inside a trace. span_id, trace_id, name and start_time are
    # required; parent_span_id, end_time and attributes may be null.
    # NOTE(review): `nullable: true` repeats what the `type: 'null'` branch already
    # states — confirm which consumer still needs it.
    Span:
      description: A span representing a single operation within a trace.
      properties:
        span_id:
          title: Span Id
          type: string
        trace_id:
          title: Trace Id
          type: string
        parent_span_id:
          anyOf:
          - type: string
          - type: 'null'
          nullable: true
        name:
          title: Name
          type: string
        start_time:
          format: date-time
          title: Start Time
          type: string
        end_time:
          anyOf:
          - format: date-time
            type: string
          - type: 'null'
          nullable: true
        attributes:
          # Free-form object (any value types) or null.
          anyOf:
          - additionalProperties: true
            type: object
          - type: 'null'
      required:
      - span_id
      - trace_id
      - name
      - start_time
      title: Span
      type: object
    # Whole-request trace. trace_id, root_span_id and start_time are required;
    # end_time is optional and may be null.
    Trace:
      description: A trace representing the complete execution path of a request across multiple operations.
      properties:
        trace_id:
          title: Trace Id
          type: string
        root_span_id:
          title: Root Span Id
          type: string
        start_time:
          format: date-time
          title: Start Time
          type: string
        end_time:
          anyOf:
          - format: date-time
            type: string
          - type: 'null'
          nullable: true
      required:
      - trace_id
      - root_span_id
      - start_time
      title: Trace
      type: object
    # String enum of event tags; the values match the keys of the Event
    # discriminator mapping.
    EventType:
      description: The type of telemetry event being logged.
      enum:
      - unstructured_log
      - structured_log
      - metric
      title: EventType
      type: string
    # String enum of structured-log payload kinds: span_start and span_end.
    StructuredLogType:
      description: The type of structured log event payload.
      enum:
      - span_start
      - span_end
      title: StructuredLogType
      type: string
    # Evaluation trace record. All five fields are strings and all are required.
    EvalTrace:
      description: A trace record for evaluation purposes.
      properties:
        session_id:
          title: Session Id
          type: string
        step:
          title: Step
          type: string
        input:
          title: Input
          type: string
        output:
          title: Output
          type: string
        expected_output:
          title: Expected Output
          type: string
      required:
      - session_id
      - step
      - input
      - output
      - expected_output
      title: EvalTrace
      type: object
    # Carries the same properties as Span plus an optional, nullable `status`.
    # The required list is the same as Span's (status is not required).
    SpanWithStatus:
      description: A span that includes status information.
      properties:
        span_id:
          title: Span Id
          type: string
        trace_id:
          title: Trace Id
          type: string
        parent_span_id:
          anyOf:
          - type: string
          - type: 'null'
          nullable: true
        name:
          title: Name
          type: string
        start_time:
          format: date-time
          title: Start Time
          type: string
        end_time:
          anyOf:
          - format: date-time
            type: string
          - type: 'null'
          nullable: true
        attributes:
          anyOf:
          - additionalProperties: true
            type: object
          - type: 'null'
        status:
          # SpanStatus enum value, or null.
          anyOf:
          - $ref: '#/components/schemas/SpanStatus'
            title: SpanStatus
          - type: 'null'
          nullable: true
          title: SpanStatus
      required:
      - span_id
      - trace_id
      - name
      - start_time
      title: SpanWithStatus
      type: object
    # String enum of the comparison operators referenced by QueryCondition.op.
    QueryConditionOp:
      description: Comparison operators for query conditions.
      enum:
      - eq
      - ne
      - gt
      - lt
      title: QueryConditionOp
      type: string
    # Filter clause made of key, op and value; all three are required.
    QueryCondition:
      description: A condition for filtering query results.
      properties:
        key:
          title: Key
          type: string
        op:
          $ref: '#/components/schemas/QueryConditionOp'
        value:
          # No type constraint is declared for `value`.
          title: Value
      required:
      - key
      - op
      - value
      title: QueryCondition
      type: object
    # Name/value pair of strings; both fields are required.
    MetricLabel:
      description: A label associated with a metric.
      properties:
        name:
          title: Name
          type: string
        value:
          title: Value
          type: string
      required:
      - name
      - value
      title: MetricLabel
      type: object
    # One sample of a metric series: timestamp, value and unit, all required.
    MetricDataPoint:
      description: A single data point in a metric time series.
      properties:
        timestamp:
          # Integer here, unlike the date-time string used by the event schemas.
          # NOTE(review): the unit of this integer is not stated — confirm.
          title: Timestamp
          type: integer
        value:
          title: Value
          type: number
        unit:
          title: Unit
          type: string
      required:
      - timestamp
      - value
      - unit
      title: MetricDataPoint
      type: object
    # Metric name together with its labels (MetricLabel items) and data points
    # (MetricDataPoint items); all three fields are required.
    MetricSeries:
      description: A time series of metric data points.
      properties:
        metric:
          title: Metric
          type: string
        labels:
          items:
            $ref: '#/components/schemas/MetricLabel'
          title: Labels
          type: array
        values:
          items:
            $ref: '#/components/schemas/MetricDataPoint'
          title: Values
          type: array
      required:
      - metric
      - labels
      - values
      title: MetricSeries
      type: object
  responses:
    BadRequest400:
      description: The request was invalid or malformed
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@ -1882,216 +1882,6 @@ paths:
        schema:
          type: string
        description: 'Path parameter: identifier'
  # POST with a required JSON body (InvokeToolRequest); a 200 returns a
  # ToolInvocationResult. Error responses reuse the shared response components.
  /v1/tool-runtime/invoke:
    post:
      responses:
        '200':
          description: A ToolInvocationResult.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ToolInvocationResult'
        '400':
          description: Bad Request
          $ref: '#/components/responses/BadRequest400'
        '429':
          description: Too Many Requests
          $ref: '#/components/responses/TooManyRequests429'
        '500':
          description: Internal Server Error
          $ref: '#/components/responses/InternalServerError500'
        default:
          description: Default Response
          $ref: '#/components/responses/DefaultError'
      tags:
      - Tool Runtime
      summary: Invoke Tool
      description: Run a tool with the given arguments.
      operationId: invoke_tool_v1_tool_runtime_invoke_post
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/InvokeToolRequest'
        required: true
  # GET returning a ListToolDefsResponse. All three query parameters are
  # optional and nullable.
  /v1/tool-runtime/list-tools:
    get:
      responses:
        '200':
          description: A ListToolDefsResponse.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ListToolDefsResponse'
        # Error responses: `description` first, then `$ref`, as in the sibling operations.
        '400':
          description: Bad Request
          $ref: '#/components/responses/BadRequest400'
        '429':
          description: Too Many Requests
          $ref: '#/components/responses/TooManyRequests429'
        '500':
          description: Internal Server Error
          $ref: '#/components/responses/InternalServerError500'
        default:
          description: Default Response
          $ref: '#/components/responses/DefaultError'
      tags:
      - Tool Runtime
      summary: List Runtime Tools
      description: List all tools in the runtime.
      operationId: list_runtime_tools_v1_tool_runtime_list_tools_get
      parameters:
      # NOTE(review): `authorization` is accepted in the query string, so the value
      # travels in the URL — confirm this is intended.
      - name: authorization
        in: query
        required: false
        schema:
          anyOf:
          - type: string
          - type: 'null'
          title: Authorization
      - name: tool_group_id
        in: query
        required: false
        schema:
          anyOf:
          - type: string
          - type: 'null'
          title: Tool Group Id
      # The schema references the URL component rather than a plain string.
      - name: mcp_endpoint
        in: query
        required: false
        schema:
          anyOf:
          - $ref: '#/components/schemas/URL'
          - type: 'null'
          title: Mcp Endpoint
  # GET with no declared parameters; a 200 returns a ListToolGroupsResponse.
  /v1/toolgroups:
    get:
      responses:
        '200':
          description: A ListToolGroupsResponse.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ListToolGroupsResponse'
        '400':
          description: Bad Request
          $ref: '#/components/responses/BadRequest400'
        '429':
          description: Too Many Requests
          $ref: '#/components/responses/TooManyRequests429'
        '500':
          description: Internal Server Error
          $ref: '#/components/responses/InternalServerError500'
        default:
          description: Default Response
          $ref: '#/components/responses/DefaultError'
      tags:
      - Tool Groups
      summary: List Tool Groups
      description: List tool groups with optional provider.
      operationId: list_tool_groups_v1_toolgroups_get
  # GET a single ToolGroup, addressed by the required `toolgroup_id` path parameter.
  /v1/toolgroups/{toolgroup_id}:
    get:
      responses:
        '200':
          description: A ToolGroup.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ToolGroup'
        '400':
          description: Bad Request
          $ref: '#/components/responses/BadRequest400'
        '429':
          description: Too Many Requests
          $ref: '#/components/responses/TooManyRequests429'
        '500':
          description: Internal Server Error
          $ref: '#/components/responses/InternalServerError500'
        default:
          description: Default Response
          $ref: '#/components/responses/DefaultError'
      tags:
      - Tool Groups
      summary: Get Tool Group
      description: Get a tool group by its ID.
      operationId: get_tool_group_v1_toolgroups__toolgroup_id__get
      parameters:
      - name: toolgroup_id
        in: path
        required: true
        schema:
          type: string
        description: 'Path parameter: toolgroup_id'
  # GET returning a ListToolDefsResponse; `toolgroup_id` is an optional,
  # nullable query filter.
  /v1/tools:
    get:
      responses:
        '200':
          description: A ListToolDefsResponse.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ListToolDefsResponse'
        # Error responses: `description` first, then `$ref`, as in the sibling operations.
        '400':
          description: Bad Request
          $ref: '#/components/responses/BadRequest400'
        '429':
          description: Too Many Requests
          $ref: '#/components/responses/TooManyRequests429'
        '500':
          description: Internal Server Error
          $ref: '#/components/responses/InternalServerError500'
        default:
          description: Default Response
          $ref: '#/components/responses/DefaultError'
      tags:
      - Tool Groups
      summary: List Tools
      description: List tools with optional tool group.
      operationId: list_tools_v1_tools_get
      parameters:
      - name: toolgroup_id
        in: query
        required: false
        schema:
          anyOf:
          - type: string
          - type: 'null'
          title: Toolgroup Id
  # GET a single ToolDef, addressed by the required `tool_name` path parameter.
  /v1/tools/{tool_name}:
    get:
      responses:
        '200':
          description: A ToolDef.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ToolDef'
        '400':
          description: Bad Request
          $ref: '#/components/responses/BadRequest400'
        '429':
          description: Too Many Requests
          $ref: '#/components/responses/TooManyRequests429'
        '500':
          description: Internal Server Error
          $ref: '#/components/responses/InternalServerError500'
        default:
          description: Default Response
          $ref: '#/components/responses/DefaultError'
      tags:
      - Tool Groups
      summary: Get Tool
      description: Get a tool by its name.
      operationId: get_tool_v1_tools__tool_name__get
      parameters:
      - name: tool_name
        in: path
        required: true
        schema:
          type: string
        description: 'Path parameter: tool_name'
  /v1/vector-io/insert:
    post:
      responses:
@ -5833,6 +5623,12 @@ components:
          anyOf:
          - type: integer
          - type: 'null'
        metadata:
          anyOf:
          - additionalProperties:
              type: string
            type: object
          - type: 'null'
        input:
          items:
            anyOf:
@ -6236,6 +6032,12 @@ components:
          anyOf:
          - type: integer
          - type: 'null'
        metadata:
          anyOf:
          - additionalProperties:
              type: string
            type: object
          - type: 'null'
      type: object
      required:
      - input
@ -6367,6 +6169,12 @@ components:
          anyOf:
          - type: integer
          - type: 'null'
        metadata:
          anyOf:
          - additionalProperties:
              type: string
            type: object
          - type: 'null'
      type: object
      required:
      - created_at
@ -8131,24 +7939,6 @@ components:
      required:
      - data
      title: ListShieldsResponse
    # Request body referenced by POST /v1/tool-runtime/invoke. tool_name and
    # kwargs are required; authorization is optional and may be null.
    InvokeToolRequest:
      properties:
        tool_name:
          type: string
          title: Tool Name
        kwargs:
          # Free-form object; any property names and value types are accepted.
          additionalProperties: true
          type: object
          title: Kwargs
        authorization:
          anyOf:
          - type: string
          - type: 'null'
      type: object
      required:
      - tool_name
      - kwargs
      title: InvokeToolRequest
    ImageContentItem:
      description: A image content item
      properties:
@ -10866,227 +10656,6 @@ components:
      - $ref: '#/components/schemas/OpenAIResponseContentPartReasoningText'
        title: OpenAIResponseContentPartReasoningText
      title: OpenAIResponseContentPartOutputText | OpenAIResponseContentPartRefusal | OpenAIResponseContentPartReasoningText
    # Payload for the end of a span. `status` (SpanStatus) is required; `type` is
    # fixed to span_end and has a default, so it is not listed under `required`.
    SpanEndPayload:
      description: Payload for a span end event.
      properties:
        type:
          const: span_end
          default: span_end
          title: Type
          type: string
        status:
          $ref: '#/components/schemas/SpanStatus'
      required:
      - status
      title: SpanEndPayload
      type: object
    # Payload for the start of a span. `name` is required, parent_span_id may be
    # null, and `type` is fixed to span_start.
    SpanStartPayload:
      description: Payload for a span start event.
      properties:
        type:
          const: span_start
          default: span_start
          title: Type
          type: string
        name:
          title: Name
          type: string
        parent_span_id:
          anyOf:
          - type: string
          - type: 'null'
          nullable: true
      required:
      - name
      title: SpanStartPayload
      type: object
    # String enum with two values: ok and error.
    SpanStatus:
      description: The status of a span indicating whether it completed successfully or with an error.
      enum:
      - ok
      - error
      title: SpanStatus
      type: string
    # Union of SpanStartPayload and SpanEndPayload; the `type` property selects
    # the variant through the discriminator mapping.
    StructuredLogPayload:
      discriminator:
        mapping:
          span_end: '#/components/schemas/SpanEndPayload'
          span_start: '#/components/schemas/SpanStartPayload'
        propertyName: type
      oneOf:
      - $ref: '#/components/schemas/SpanStartPayload'
        title: SpanStartPayload
      - $ref: '#/components/schemas/SpanEndPayload'
        title: SpanEndPayload
      title: SpanStartPayload | SpanEndPayload
    # String enum of log levels, referenced by UnstructuredLogEvent.severity.
    LogSeverity:
      description: The severity level of a log message.
      enum:
      - verbose
      - debug
      - info
      - warn
      - error
      - critical
      title: LogSeverity
      type: string
    # Metric measurement event. trace_id, span_id, timestamp, metric, value and
    # unit are required; `attributes` is optional and may be null.
    MetricEvent:
      description: A metric event containing a measured value.
      properties:
        trace_id:
          title: Trace Id
          type: string
        span_id:
          title: Span Id
          type: string
        timestamp:
          format: date-time
          title: Timestamp
          type: string
        attributes:
          # Either null or an object whose values are string/integer/number/boolean/null.
          anyOf:
          - additionalProperties:
              anyOf:
              - type: string
              - type: integer
              - type: number
              - type: boolean
              - type: 'null'
              title: string | ... (4 variants)
            type: object
          - type: 'null'
        type:
          # Fixed tag with a default; note that it is not listed under `required`.
          const: metric
          default: metric
          title: Type
          type: string
        metric:
          title: Metric
          type: string
        value:
          anyOf:
          - type: integer
          - type: number
          title: integer | number
        unit:
          title: Unit
          type: string
      required:
      - trace_id
      - span_id
      - timestamp
      - metric
      - value
      - unit
      title: MetricEvent
      type: object
    # Log event carrying a typed span payload. trace_id, span_id, timestamp and
    # payload are required; `attributes` is optional and may be null.
    StructuredLogEvent:
      description: A structured log event containing typed payload data.
      properties:
        trace_id:
          title: Trace Id
          type: string
        span_id:
          title: Span Id
          type: string
        timestamp:
          format: date-time
          title: Timestamp
          type: string
        attributes:
          # Either null or an object whose values are string/integer/number/boolean/null.
          anyOf:
          - additionalProperties:
              anyOf:
              - type: string
              - type: integer
              - type: number
              - type: boolean
              - type: 'null'
              title: string | ... (4 variants)
            type: object
          - type: 'null'
        type:
          # Fixed tag with a default; note that it is not listed under `required`.
          const: structured_log
          default: structured_log
          title: Type
          type: string
        payload:
          # Point at the shared span_start/span_end union component instead of
          # repeating its discriminator and oneOf list inline.
          $ref: '#/components/schemas/StructuredLogPayload'
      required:
      - trace_id
      - span_id
      - timestamp
      - payload
      title: StructuredLogEvent
      type: object
    # Free-text log event. trace_id, span_id, timestamp, message and severity are
    # required; `attributes` is optional and may be null.
    UnstructuredLogEvent:
      description: An unstructured log event containing a simple text message.
      properties:
        trace_id:
          title: Trace Id
          type: string
        span_id:
          title: Span Id
          type: string
        timestamp:
          format: date-time
          title: Timestamp
          type: string
        attributes:
          # Either null or an object whose values are string/integer/number/boolean/null.
          anyOf:
          - additionalProperties:
              anyOf:
              - type: string
              - type: integer
              - type: number
              - type: boolean
              - type: 'null'
              title: string | ... (4 variants)
            type: object
          - type: 'null'
        type:
          # Fixed tag with a default; note that it is not listed under `required`.
          const: unstructured_log
          default: unstructured_log
          title: Type
          type: string
        message:
          title: Message
          type: string
        severity:
          $ref: '#/components/schemas/LogSeverity'
      required:
      - trace_id
      - span_id
      - timestamp
      - message
      - severity
      title: UnstructuredLogEvent
      type: object
    # Union of the three event schemas; the `type` property selects the variant
    # through the discriminator mapping below.
    Event:
      discriminator:
        mapping:
          metric: '#/components/schemas/MetricEvent'
          structured_log: '#/components/schemas/StructuredLogEvent'
          unstructured_log: '#/components/schemas/UnstructuredLogEvent'
        propertyName: type
      oneOf:
      - $ref: '#/components/schemas/UnstructuredLogEvent'
        title: UnstructuredLogEvent
      - $ref: '#/components/schemas/MetricEvent'
        title: MetricEvent
      - $ref: '#/components/schemas/StructuredLogEvent'
        title: StructuredLogEvent
      title: UnstructuredLogEvent | MetricEvent | StructuredLogEvent
    MetricInResponse:
      description: A metric value included in API responses.
      properties:
@ -11946,236 +11515,6 @@ components:
      - logger_config
      title: PostTrainingRLHFRequest
      type: object
    # One operation inside a trace. span_id, trace_id, name and start_time are
    # required; parent_span_id, end_time and attributes may be null.
    # NOTE(review): `nullable: true` repeats what the `type: 'null'` branch already
    # states — confirm which consumer still needs it.
    Span:
      description: A span representing a single operation within a trace.
      properties:
        span_id:
          title: Span Id
          type: string
        trace_id:
          title: Trace Id
          type: string
        parent_span_id:
          anyOf:
          - type: string
          - type: 'null'
          nullable: true
        name:
          title: Name
          type: string
        start_time:
          format: date-time
          title: Start Time
          type: string
        end_time:
          anyOf:
          - format: date-time
            type: string
          - type: 'null'
          nullable: true
        attributes:
          # Free-form object (any value types) or null.
          anyOf:
          - additionalProperties: true
            type: object
          - type: 'null'
      required:
      - span_id
      - trace_id
      - name
      - start_time
      title: Span
      type: object
    # Whole-request trace. trace_id, root_span_id and start_time are required;
    # end_time is optional and may be null.
    Trace:
      description: A trace representing the complete execution path of a request across multiple operations.
      properties:
        trace_id:
          title: Trace Id
          type: string
        root_span_id:
          title: Root Span Id
          type: string
        start_time:
          format: date-time
          title: Start Time
          type: string
        end_time:
          anyOf:
          - format: date-time
            type: string
          - type: 'null'
          nullable: true
      required:
      - trace_id
      - root_span_id
      - start_time
      title: Trace
      type: object
    # String enum of event tags; the values match the keys of the Event
    # discriminator mapping.
    EventType:
      description: The type of telemetry event being logged.
      enum:
      - unstructured_log
      - structured_log
      - metric
      title: EventType
      type: string
    # String enum of structured-log payload kinds; the values match the `type`
    # constants of SpanStartPayload and SpanEndPayload.
    StructuredLogType:
      description: The type of structured log event payload.
      enum:
      - span_start
      - span_end
      title: StructuredLogType
      type: string
    # Evaluation trace record. All five fields are strings and all are required.
    EvalTrace:
      description: A trace record for evaluation purposes.
      properties:
        session_id:
          title: Session Id
          type: string
        step:
          title: Step
          type: string
        input:
          title: Input
          type: string
        output:
          title: Output
          type: string
        expected_output:
          title: Expected Output
          type: string
      required:
      - session_id
      - step
      - input
      - output
      - expected_output
      title: EvalTrace
      type: object
    # Carries the same properties as Span plus an optional, nullable `status`.
    # The required list is the same as Span's (status is not required).
    SpanWithStatus:
      description: A span that includes status information.
      properties:
        span_id:
          title: Span Id
          type: string
        trace_id:
          title: Trace Id
          type: string
        parent_span_id:
          anyOf:
          - type: string
          - type: 'null'
          nullable: true
        name:
          title: Name
          type: string
        start_time:
          format: date-time
          title: Start Time
          type: string
        end_time:
          anyOf:
          - format: date-time
            type: string
          - type: 'null'
          nullable: true
        attributes:
          anyOf:
          - additionalProperties: true
            type: object
          - type: 'null'
        status:
          # SpanStatus enum value, or null.
          anyOf:
          - $ref: '#/components/schemas/SpanStatus'
            title: SpanStatus
          - type: 'null'
          nullable: true
          title: SpanStatus
      required:
      - span_id
      - trace_id
      - name
      - start_time
      title: SpanWithStatus
      type: object
    # String enum of the comparison operators referenced by QueryCondition.op.
    QueryConditionOp:
      description: Comparison operators for query conditions.
      enum:
      - eq
      - ne
      - gt
      - lt
      title: QueryConditionOp
      type: string
    # Filter clause made of key, op and value; all three are required.
    QueryCondition:
      description: A condition for filtering query results.
      properties:
        key:
          title: Key
          type: string
        op:
          $ref: '#/components/schemas/QueryConditionOp'
        value:
          # No type constraint is declared for `value`.
          title: Value
      required:
      - key
      - op
      - value
      title: QueryCondition
      type: object
    # Name/value pair of strings; both fields are required.
    MetricLabel:
      description: A label associated with a metric.
      properties:
        name:
          title: Name
          type: string
        value:
          title: Value
          type: string
      required:
      - name
      - value
      title: MetricLabel
      type: object
    # One sample of a metric series: timestamp, value and unit, all required.
    MetricDataPoint:
      description: A single data point in a metric time series.
      properties:
        timestamp:
          # Integer here, unlike the date-time string used by the event schemas.
          # NOTE(review): the unit of this integer is not stated — confirm.
          title: Timestamp
          type: integer
        value:
          title: Value
          type: number
        unit:
          title: Unit
          type: string
      required:
      - timestamp
      - value
      - unit
      title: MetricDataPoint
      type: object
    # Metric name together with its labels (MetricLabel items) and data points
    # (MetricDataPoint items); all three fields are required.
    MetricSeries:
      description: A time series of metric data points.
      properties:
        metric:
          title: Metric
          type: string
        labels:
          items:
            $ref: '#/components/schemas/MetricLabel'
          title: Labels
          type: array
        values:
          items:
            $ref: '#/components/schemas/MetricDataPoint'
          title: Values
          type: array
      required:
      - metric
      - labels
      - values
      title: MetricSeries
      type: object
  responses:
    BadRequest400:
      description: The request was invalid or malformed
--- a/docs/static/openai-spec-2.3.0.yml
+++ b/docs/static/openai-spec-2.3.0.yml
--- a/docs/static/stainless-llama-stack-spec.yaml
+++ b/docs/static/stainless-llama-stack-spec.yaml
@ -2101,6 +2101,7 @@ paths:
            schema:
              $ref: '#/components/schemas/InvokeToolRequest'
        required: true
      deprecated: true
  /v1/tool-runtime/list-tools:
    get:
      responses:
@ -2152,6 +2153,7 @@ paths:
          - $ref: '#/components/schemas/URL'
          - type: 'null'
          title: Mcp Endpoint
      deprecated: true
  /v1/toolgroups:
    get:
      responses:
@ -2178,6 +2180,7 @@ paths:
      summary: List Tool Groups
      description: List tool groups with optional provider.
      operationId: list_tool_groups_v1_toolgroups_get
      deprecated: true
    post:
      responses:
        '400':
@ -2239,6 +2242,7 @@ paths:
        schema:
          type: string
        description: 'Path parameter: toolgroup_id'
      deprecated: true
    delete:
      responses:
        '400':
@ -2303,6 +2307,7 @@ paths:
          - type: string
          - type: 'null'
          title: Toolgroup Id
      deprecated: true
  /v1/tools/{tool_name}:
    get:
      responses:
@ -2336,6 +2341,7 @@ paths:
        schema:
          type: string
        description: 'Path parameter: tool_name'
      deprecated: true
  /v1/vector-io/insert:
    post:
      responses:
@ -6812,6 +6818,12 @@ components:
          anyOf:
          - type: integer
          - type: 'null'
        metadata:
          anyOf:
          - additionalProperties:
              type: string
            type: object
          - type: 'null'
        input:
          items:
            anyOf:
@ -7215,6 +7227,12 @@ components:
          anyOf:
          - type: integer
          - type: 'null'
        metadata:
          anyOf:
          - additionalProperties:
              type: string
            type: object
          - type: 'null'
      type: object
      required:
      - input
@ -7346,6 +7364,12 @@ components:
          anyOf:
          - type: integer
          - type: 'null'
        metadata:
          anyOf:
          - additionalProperties:
              type: string
            type: object
          - type: 'null'
      type: object
      required:
      - created_at
@ -12196,227 +12220,6 @@ components:
      - $ref: '#/components/schemas/OpenAIResponseContentPartReasoningText'
        title: OpenAIResponseContentPartReasoningText
      title: OpenAIResponseContentPartOutputText | OpenAIResponseContentPartRefusal | OpenAIResponseContentPartReasoningText
    # Payload for the end of a span. `status` (SpanStatus) is required; `type` is
    # fixed to span_end and has a default, so it is not listed under `required`.
    SpanEndPayload:
      description: Payload for a span end event.
      properties:
        type:
          const: span_end
          default: span_end
          title: Type
          type: string
        status:
          $ref: '#/components/schemas/SpanStatus'
      required:
      - status
      title: SpanEndPayload
      type: object
    # Payload for the start of a span. `name` is required, parent_span_id may be
    # null, and `type` is fixed to span_start.
    SpanStartPayload:
      description: Payload for a span start event.
      properties:
        type:
          const: span_start
          default: span_start
          title: Type
          type: string
        name:
          title: Name
          type: string
        parent_span_id:
          anyOf:
          - type: string
          - type: 'null'
          nullable: true
      required:
      - name
      title: SpanStartPayload
      type: object
    # String enum with two values: ok and error.
    SpanStatus:
      description: The status of a span indicating whether it completed successfully or with an error.
      enum:
      - ok
      - error
      title: SpanStatus
      type: string
    # Union of SpanStartPayload and SpanEndPayload; the `type` property selects
    # the variant through the discriminator mapping.
    StructuredLogPayload:
      discriminator:
        mapping:
          span_end: '#/components/schemas/SpanEndPayload'
          span_start: '#/components/schemas/SpanStartPayload'
        propertyName: type
      oneOf:
      - $ref: '#/components/schemas/SpanStartPayload'
        title: SpanStartPayload
      - $ref: '#/components/schemas/SpanEndPayload'
        title: SpanEndPayload
      title: SpanStartPayload | SpanEndPayload
    # String enum of log levels, referenced by UnstructuredLogEvent.severity.
    LogSeverity:
      description: The severity level of a log message.
      enum:
      - verbose
      - debug
      - info
      - warn
      - error
      - critical
      title: LogSeverity
      type: string
    # Metric measurement event. trace_id, span_id, timestamp, metric, value and
    # unit are required; `attributes` is optional and may be null.
    MetricEvent:
      description: A metric event containing a measured value.
      properties:
        trace_id:
          title: Trace Id
          type: string
        span_id:
          title: Span Id
          type: string
        timestamp:
          format: date-time
          title: Timestamp
          type: string
        attributes:
          # Either null or an object whose values are string/integer/number/boolean/null.
          anyOf:
          - additionalProperties:
              anyOf:
              - type: string
              - type: integer
              - type: number
              - type: boolean
              - type: 'null'
              title: string | ... (4 variants)
            type: object
          - type: 'null'
        type:
          # Fixed tag with a default; note that it is not listed under `required`.
          const: metric
          default: metric
          title: Type
          type: string
        metric:
          title: Metric
          type: string
        value:
          anyOf:
          - type: integer
          - type: number
          title: integer | number
        unit:
          title: Unit
          type: string
      required:
      - trace_id
      - span_id
      - timestamp
      - metric
      - value
      - unit
      title: MetricEvent
      type: object
    # Log event carrying a typed span payload. trace_id, span_id, timestamp and
    # payload are required; `attributes` is optional and may be null.
    StructuredLogEvent:
      description: A structured log event containing typed payload data.
      properties:
        trace_id:
          title: Trace Id
          type: string
        span_id:
          title: Span Id
          type: string
        timestamp:
          format: date-time
          title: Timestamp
          type: string
        attributes:
          # Either null or an object whose values are string/integer/number/boolean/null.
          anyOf:
          - additionalProperties:
              anyOf:
              - type: string
              - type: integer
              - type: number
              - type: boolean
              - type: 'null'
              title: string | ... (4 variants)
            type: object
          - type: 'null'
        type:
          # Fixed tag with a default; note that it is not listed under `required`.
          const: structured_log
          default: structured_log
          title: Type
          type: string
        payload:
          # Point at the shared span_start/span_end union component instead of
          # repeating its discriminator and oneOf list inline.
          $ref: '#/components/schemas/StructuredLogPayload'
      required:
      - trace_id
      - span_id
      - timestamp
      - payload
      title: StructuredLogEvent
      type: object
    # Free-text log event. trace_id, span_id, timestamp, message and severity are
    # required; `attributes` is optional and may be null.
    UnstructuredLogEvent:
      description: An unstructured log event containing a simple text message.
      properties:
        trace_id:
          title: Trace Id
          type: string
        span_id:
          title: Span Id
          type: string
        timestamp:
          format: date-time
          title: Timestamp
          type: string
        attributes:
          # Either null or an object whose values are string/integer/number/boolean/null.
          anyOf:
          - additionalProperties:
              anyOf:
              - type: string
              - type: integer
              - type: number
              - type: boolean
              - type: 'null'
              title: string | ... (4 variants)
            type: object
          - type: 'null'
        type:
          # Fixed tag with a default; note that it is not listed under `required`.
          const: unstructured_log
          default: unstructured_log
          title: Type
          type: string
        message:
          title: Message
          type: string
        severity:
          $ref: '#/components/schemas/LogSeverity'
      required:
      - trace_id
      - span_id
      - timestamp
      - message
      - severity
      title: UnstructuredLogEvent
      type: object
    # Union of the three event schemas; the `type` property selects the variant
    # through the discriminator mapping below.
    Event:
      discriminator:
        mapping:
          metric: '#/components/schemas/MetricEvent'
          structured_log: '#/components/schemas/StructuredLogEvent'
          unstructured_log: '#/components/schemas/UnstructuredLogEvent'
        propertyName: type
      oneOf:
      - $ref: '#/components/schemas/UnstructuredLogEvent'
        title: UnstructuredLogEvent
      - $ref: '#/components/schemas/MetricEvent'
        title: MetricEvent
      - $ref: '#/components/schemas/StructuredLogEvent'
        title: StructuredLogEvent
      title: UnstructuredLogEvent | MetricEvent | StructuredLogEvent
    MetricInResponse:
      description: A metric value included in API responses.
      properties:
@ -13279,236 +13082,6 @@ components:
      - logger_config
      title: PostTrainingRLHFRequest
      type: object
    Span:
      description: A span representing a single operation within a trace.
      properties:
        span_id:
          title: Span Id
          type: string
        trace_id:
          title: Trace Id
          type: string
        parent_span_id:
          anyOf:
          - type: string
          - type: 'null'
          nullable: true
        name:
          title: Name
          type: string
        start_time:
          format: date-time
          title: Start Time
          type: string
        end_time:
          anyOf:
          - format: date-time
            type: string
          - type: 'null'
          nullable: true
        attributes:
          anyOf:
          - additionalProperties: true
            type: object
          - type: 'null'
      required:
      - span_id
      - trace_id
      - name
      - start_time
      title: Span
      type: object
    Trace:
      description: A trace representing the complete execution path of a request across multiple operations.
      properties:
        trace_id:
          title: Trace Id
          type: string
        root_span_id:
          title: Root Span Id
          type: string
        start_time:
          format: date-time
          title: Start Time
          type: string
        end_time:
          anyOf:
          - format: date-time
            type: string
          - type: 'null'
          nullable: true
      required:
      - trace_id
      - root_span_id
      - start_time
      title: Trace
      type: object
    EventType:
      description: The type of telemetry event being logged.
      enum:
      - unstructured_log
      - structured_log
      - metric
      title: EventType
      type: string
    StructuredLogType:
      description: The type of structured log event payload.
      enum:
      - span_start
      - span_end
      title: StructuredLogType
      type: string
    EvalTrace:
      description: A trace record for evaluation purposes.
      properties:
        session_id:
          title: Session Id
          type: string
        step:
          title: Step
          type: string
        input:
          title: Input
          type: string
        output:
          title: Output
          type: string
        expected_output:
          title: Expected Output
          type: string
      required:
      - session_id
      - step
      - input
      - output
      - expected_output
      title: EvalTrace
      type: object
    SpanWithStatus:
      description: A span that includes status information.
      properties:
        span_id:
          title: Span Id
          type: string
        trace_id:
          title: Trace Id
          type: string
        parent_span_id:
          anyOf:
          - type: string
          - type: 'null'
          nullable: true
        name:
          title: Name
          type: string
        start_time:
          format: date-time
          title: Start Time
          type: string
        end_time:
          anyOf:
          - format: date-time
            type: string
          - type: 'null'
          nullable: true
        attributes:
          anyOf:
          - additionalProperties: true
            type: object
          - type: 'null'
        status:
          anyOf:
          - $ref: '#/components/schemas/SpanStatus'
            title: SpanStatus
          - type: 'null'
          nullable: true
          title: SpanStatus
      required:
      - span_id
      - trace_id
      - name
      - start_time
      title: SpanWithStatus
      type: object
    QueryConditionOp:
      description: Comparison operators for query conditions.
      enum:
      - eq
      - ne
      - gt
      - lt
      title: QueryConditionOp
      type: string
    QueryCondition:
      description: A condition for filtering query results.
      properties:
        key:
          title: Key
          type: string
        op:
          $ref: '#/components/schemas/QueryConditionOp'
        value:
          title: Value
      required:
      - key
      - op
      - value
      title: QueryCondition
      type: object
    MetricLabel:
      description: A label associated with a metric.
      properties:
        name:
          title: Name
          type: string
        value:
          title: Value
          type: string
      required:
      - name
      - value
      title: MetricLabel
      type: object
    MetricDataPoint:
      description: A single data point in a metric time series.
      properties:
        timestamp:
          title: Timestamp
          type: integer
        value:
          title: Value
          type: number
        unit:
          title: Unit
          type: string
      required:
      - timestamp
      - value
      - unit
      title: MetricDataPoint
      type: object
    MetricSeries:
      description: A time series of metric data points.
      properties:
        metric:
          title: Metric
          type: string
        labels:
          items:
            $ref: '#/components/schemas/MetricLabel'
          title: Labels
          type: array
        values:
          items:
            $ref: '#/components/schemas/MetricDataPoint'
          title: Values
          type: array
      required:
      - metric
      - labels
      - values
      title: MetricSeries
      type: object
  responses:
    BadRequest400:
      description: The request was invalid or malformed
--- a/scripts/integration-tests.sh
+++ b/scripts/integration-tests.sh
@ -171,10 +171,18 @@ if [[ "$COLLECT_ONLY" == false ]]; then
    # Set MCP host for in-process MCP server tests
    # - For library client and server mode: localhost (both on same host)
-    # - For docker mode: host.docker.internal (container needs to reach host)
+    # - For docker mode on Linux: localhost (container uses host network, shares network namespace)
    # - For docker mode on macOS/Windows: host.docker.internal (container uses bridge network)
    if [[ "$STACK_CONFIG" == docker:* ]]; then
-        export LLAMA_STACK_TEST_MCP_HOST="host.docker.internal"
+        if [[ "$(uname)" != "Darwin" ]] && [[ "$(uname)" != *"MINGW"* ]]; then
-        echo "Setting MCP host: host.docker.internal (docker mode)"
+            # On Linux with host network mode, container shares host network namespace
            export LLAMA_STACK_TEST_MCP_HOST="localhost"
            echo "Setting MCP host: localhost (docker mode with host network)"
        else
            # On macOS/Windows with bridge network, need special host access
            export LLAMA_STACK_TEST_MCP_HOST="host.docker.internal"
            echo "Setting MCP host: host.docker.internal (docker mode with bridge network)"
        fi
    else
        export LLAMA_STACK_TEST_MCP_HOST="localhost"
        echo "Setting MCP host: localhost (library/server mode)"
--- a/scripts/openapi_generator/schema_collection.py
+++ b/scripts/openapi_generator/schema_collection.py
@ -8,7 +8,6 @@
 Schema discovery and collection for OpenAPI generation.
 """
 import importlib
 from typing import Any
@ -20,23 +19,6 @@ def _ensure_components_schemas(openapi_schema: dict[str, Any]) -> None:
        openapi_schema["components"]["schemas"] = {}
 def _load_extra_schema_modules() -> None:
    """
    Import modules outside llama_stack_api that use schema_utils to register schemas.
    The API package already imports its submodules via __init__, but server-side modules
    like telemetry need to be imported explicitly so their decorator side effects run.
    """
    extra_modules = [
        "llama_stack.core.telemetry.telemetry",
    ]
    for module_name in extra_modules:
        try:
            importlib.import_module(module_name)
        except ImportError:
            continue
 def _extract_and_fix_defs(schema: dict[str, Any], openapi_schema: dict[str, Any]) -> None:
    """
    Extract $defs from a schema, move them to components/schemas, and fix references.
@ -79,9 +61,6 @@ def _ensure_json_schema_types_included(openapi_schema: dict[str, Any]) -> dict[s
        iter_registered_schema_types,
    )
    # Import extra modules (e.g., telemetry) whose schema registrations live outside llama_stack_api
    _load_extra_schema_modules()
    # Handle explicitly registered schemas first (union types, Annotated structs, etc.)
    for registration_info in iter_registered_schema_types():
        schema_type = registration_info.type
--- a/scripts/telemetry/llama-stack-dashboard.json
+++ b/scripts/telemetry/llama-stack-dashboard.json
@ -1,11 +1,24 @@
 {
  "annotations": {
-    "list": []
+    "list": [
      {
        "builtIn": 1,
        "datasource": {
          "type": "grafana",
          "uid": "-- Grafana --"
        },
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "type": "dashboard"
      }
    ]
  },
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 0,
-  "id": null,
+  "id": 1,
  "links": [],
  "liveNow": false,
  "panels": [
@ -16,11 +29,40 @@
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 10,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
-            "fillOpacity": 10
+            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
@ -32,7 +74,8 @@
              }
            ]
          }
-        }
+        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
@ -40,15 +83,16 @@
        "x": 0,
        "y": 0
      },
-      "id": 1,
+      "id": 2,
      "options": {
        "legend": {
          "calcs": [],
-          "displayMode": "table",
+          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "maxHeight": 600,
          "mode": "multi",
          "sort": "none"
        }
@ -59,9 +103,112 @@
            "type": "prometheus",
            "uid": "prometheus"
          },
-          "expr": "llama_stack_completion_tokens_total",
+          "disableTextWrap": false,
-          "legendFormat": "{{model_id}} ({{provider_id}})",
+          "editorMode": "builder",
-          "refId": "A"
+          "expr": "sum by(gen_ai_request_model) (llama_stack_gen_ai_client_token_usage_sum{gen_ai_token_type=\"input\"})",
          "fullMetaSearch": false,
          "includeNullMetadata": true,
          "legendFormat": "__auto",
          "range": true,
          "refId": "A",
          "useBackend": false
        }
      ],
      "title": "Prompt Tokens",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 10,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              }
            ]
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 12,
        "y": 0
      },
      "id": 1,
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "maxHeight": 600,
          "mode": "multi",
          "sort": "none"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "disableTextWrap": false,
          "editorMode": "builder",
          "exemplar": false,
          "expr": "sum by(gen_ai_request_model) (llama_stack_gen_ai_client_token_usage_sum{gen_ai_token_type=\"output\"})",
          "fullMetaSearch": false,
          "includeNullMetadata": true,
          "interval": "",
          "legendFormat": "__auto",
          "range": true,
          "refId": "A",
          "useBackend": false
        }
      ],
      "title": "Completion Tokens",
@ -74,78 +221,40 @@
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 10,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
-            "fillOpacity": 10
+            "spanNulls": false,
-          },
+            "stacking": {
-          "mappings": [],
+              "group": "A",
-          "thresholds": {
+              "mode": "none"
-            "mode": "absolute",
+            },
-            "steps": [
+            "thresholdsStyle": {
-              {
+              "mode": "off"
-                "color": "green",
+            }
                "value": null
              }
            ]
          }
        }
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 12,
        "y": 0
      },
      "id": 2,
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "table",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "multi",
          "sort": "none"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "expr": "llama_stack_prompt_tokens_total",
          "legendFormat": "Prompt - {{model_id}}",
          "refId": "A"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "expr": "llama_stack_tokens_total",
          "legendFormat": "Total - {{model_id}}",
          "refId": "B"
        }
      ],
      "title": "Prompt & Total Tokens",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fieldConfig": {
        "defaults": {
          "custom": {
            "drawStyle": "line",
            "lineInterpolation": "linear",
            "showPoints": "auto",
            "fillOpacity": 10
          },
          "mappings": [],
          "thresholds": {
@ -158,7 +267,8 @@
            ]
          },
          "unit": "ms"
-        }
+        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
@ -175,6 +285,7 @@
          "showLegend": true
        },
        "tooltip": {
          "maxHeight": 600,
          "mode": "multi",
          "sort": "none"
        }
@ -219,7 +330,8 @@
              }
            ]
          }
-        }
+        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
@ -240,8 +352,11 @@
          "fields": "",
          "values": false
        },
-        "textMode": "auto"
+        "showPercentChange": false,
        "textMode": "auto",
        "wideLayout": true
      },
      "pluginVersion": "11.0.0",
      "targets": [
        {
          "datasource": {
@ -272,7 +387,8 @@
              }
            ]
          }
-        }
+        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
@ -293,8 +409,11 @@
          "fields": "",
          "values": false
        },
-        "textMode": "auto"
+        "showPercentChange": false,
        "textMode": "auto",
        "wideLayout": true
      },
      "pluginVersion": "11.0.0",
      "targets": [
        {
          "datasource": {
@ -315,11 +434,40 @@
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 10,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
-            "fillOpacity": 10
+            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
@ -332,7 +480,8 @@
            ]
          },
          "unit": "reqps"
-        }
+        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
@ -349,6 +498,7 @@
          "showLegend": true
        },
        "tooltip": {
          "maxHeight": 600,
          "mode": "multi",
          "sort": "none"
        }
@ -374,11 +524,40 @@
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 10,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
-            "fillOpacity": 10
+            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
@ -391,7 +570,8 @@
            ]
          },
          "unit": "Bps"
-        }
+        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
@ -408,6 +588,7 @@
          "showLegend": true
        },
        "tooltip": {
          "maxHeight": 600,
          "mode": "multi",
          "sort": "none"
        }
@ -437,7 +618,7 @@
    }
  ],
  "refresh": "5s",
-  "schemaVersion": 38,
+  "schemaVersion": 39,
  "tags": [
    "llama-stack"
  ],
@ -445,13 +626,14 @@
    "list": []
  },
  "time": {
-    "from": "now-15m",
+    "from": "now-3h",
    "to": "now"
  },
  "timeRangeUpdatedDuringEditOrView": false,
  "timepicker": {},
  "timezone": "browser",
  "title": "Llama Stack Metrics",
  "uid": "llama-stack-metrics",
-  "version": 0,
+  "version": 17,
  "weekStart": ""
 }
--- a/src/llama_stack/core/datatypes.py
+++ b/src/llama_stack/core/datatypes.py
@ -191,22 +191,6 @@ class DistributionSpec(BaseModel):
    )
 class TelemetryConfig(BaseModel):
    """
    Configuration for telemetry.
    Llama Stack uses OpenTelemetry for telemetry. Please refer to https://opentelemetry.io/docs/languages/sdk-configuration/
    for env variables to configure the OpenTelemetry SDK.
    Example:
    ```bash
    OTEL_SERVICE_NAME=llama-stack OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 uv run llama stack run starter
    ```
    """
    enabled: bool = Field(default=False, description="enable or disable telemetry")
 class OAuth2JWKSConfig(BaseModel):
    # The JWKS URI for collecting public keys
    uri: str
@ -527,8 +511,6 @@ can be instantiated multiple times (with different configs) if necessary.
    logging: LoggingConfig | None = Field(default=None, description="Configuration for Llama Stack Logging")
    telemetry: TelemetryConfig = Field(default_factory=TelemetryConfig, description="Configuration for telemetry")
    server: ServerConfig = Field(
        default_factory=ServerConfig,
        description="Configuration for the HTTP(S) server",
--- a/src/llama_stack/core/library_client.py
+++ b/src/llama_stack/core/library_client.py
@ -46,8 +46,6 @@ from llama_stack.core.request_headers import PROVIDER_DATA_VAR, request_provider
 from llama_stack.core.resolver import ProviderRegistry
 from llama_stack.core.server.routes import RouteImpls, find_matching_route, initialize_route_impls
 from llama_stack.core.stack import Stack, get_stack_run_config_from_distro, replace_env_vars
 from llama_stack.core.telemetry import Telemetry
 from llama_stack.core.telemetry.tracing import CURRENT_TRACE_CONTEXT, end_trace, setup_logger, start_trace
 from llama_stack.core.utils.config import redact_sensitive_fields
 from llama_stack.core.utils.context import preserve_contexts_async_generator
 from llama_stack.core.utils.exec import in_notebook
@ -204,13 +202,6 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
        super().__init__()
        # Initialize logging from environment variables first
        setup_logging()
        # when using the library client, we should not log to console since many
        # of our logs are intended for server-side usage
        if sinks_from_env := os.environ.get("TELEMETRY_SINKS", None):
            current_sinks = sinks_from_env.strip().lower().split(",")
            os.environ["TELEMETRY_SINKS"] = ",".join(sink for sink in current_sinks if sink != "console")
        if in_notebook():
            import nest_asyncio
@ -295,8 +286,6 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
            raise _e
        assert self.impls is not None
        if self.config.telemetry.enabled:
            setup_logger(Telemetry())
        if not os.environ.get("PYTEST_CURRENT_TEST"):
            console = Console()
@ -392,13 +381,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
        body, field_names = self._handle_file_uploads(options, body)
        body = self._convert_body(matched_func, body, exclude_params=set(field_names))
-
+        result = await matched_func(**body)
        trace_path = webmethod.descriptive_name or route_path
        await start_trace(trace_path, {"__location__": "library_client"})
        try:
            result = await matched_func(**body)
        finally:
            await end_trace()
        # Handle FastAPI Response objects (e.g., from file content retrieval)
        if isinstance(result, FastAPIResponse):
@ -457,19 +440,13 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
        # Prepare body for the function call (handles both Pydantic and traditional params)
        body = self._convert_body(func, body)
        trace_path = webmethod.descriptive_name or route_path
        await start_trace(trace_path, {"__location__": "library_client"})
        async def gen():
-            try:
+            async for chunk in await func(**body):
-                async for chunk in await func(**body):
+                data = json.dumps(convert_pydantic_to_json_value(chunk))
-                    data = json.dumps(convert_pydantic_to_json_value(chunk))
+                sse_event = f"data: {data}\n\n"
-                    sse_event = f"data: {data}\n\n"
+                yield sse_event.encode("utf-8")
                    yield sse_event.encode("utf-8")
            finally:
                await end_trace()
-        wrapped_gen = preserve_contexts_async_generator(gen(), [CURRENT_TRACE_CONTEXT, PROVIDER_DATA_VAR])
+        wrapped_gen = preserve_contexts_async_generator(gen(), [PROVIDER_DATA_VAR])
        mock_response = httpx.Response(
            status_code=httpx.codes.OK,
--- a/src/llama_stack/core/resolver.py
+++ b/src/llama_stack/core/resolver.py
@ -392,8 +392,6 @@ async def instantiate_provider(
        args = [config, deps]
        if "policy" in inspect.signature(getattr(module, method)).parameters:
            args.append(policy)
        if "telemetry_enabled" in inspect.signature(getattr(module, method)).parameters and run_config.telemetry:
            args.append(run_config.telemetry.enabled)
    fn = getattr(module, method)
    impl = await fn(*args)
@ -401,18 +399,6 @@ async def instantiate_provider(
    impl.__provider_spec__ = provider_spec
    impl.__provider_config__ = config
    # Apply tracing if telemetry is enabled and any base class has __marked_for_tracing__ marker
    if run_config.telemetry.enabled:
        traced_classes = [
            base for base in reversed(impl.__class__.__mro__) if getattr(base, "__marked_for_tracing__", False)
        ]
        if traced_classes:
            from llama_stack.core.telemetry.trace_protocol import trace_protocol
            for cls in traced_classes:
                trace_protocol(cls)
    protocols = api_protocol_map_for_compliance_check(run_config)
    additional_protocols = additional_protocols_map()
    # TODO: check compliance for special tool groups
--- a/src/llama_stack/core/routers/init.py
+++ b/src/llama_stack/core/routers/init.py
@ -85,8 +85,6 @@ async def get_auto_router_impl(
        )
        await inference_store.initialize()
        api_to_dep_impl["store"] = inference_store
        api_to_dep_impl["telemetry_enabled"] = run_config.telemetry.enabled
    elif api == Api.vector_io:
        api_to_dep_impl["vector_stores_config"] = run_config.vector_stores
    elif api == Api.safety:
--- a/src/llama_stack/core/routers/inference.py
+++ b/src/llama_stack/core/routers/inference.py
@ -7,7 +7,6 @@
 import asyncio
 import time
 from collections.abc import AsyncIterator
 from datetime import UTC, datetime
 from typing import Annotated, Any
 from fastapi import Body
@ -15,11 +14,7 @@ from openai.types.chat import ChatCompletionToolChoiceOptionParam as OpenAIChatC
 from openai.types.chat import ChatCompletionToolParam as OpenAIChatCompletionToolParam
 from pydantic import TypeAdapter
 from llama_stack.core.telemetry.telemetry import MetricEvent
 from llama_stack.core.telemetry.tracing import enqueue_event, get_current_span
 from llama_stack.log import get_logger
 from llama_stack.models.llama.llama3.chat_format import ChatFormat
 from llama_stack.models.llama.llama3.tokenizer import Tokenizer
 from llama_stack.providers.utils.inference.inference_store import InferenceStore
 from llama_stack_api import (
    HealthResponse,
@ -60,15 +55,10 @@ class InferenceRouter(Inference):
        self,
        routing_table: RoutingTable,
        store: InferenceStore | None = None,
        telemetry_enabled: bool = False,
    ) -> None:
        logger.debug("Initializing InferenceRouter")
        self.routing_table = routing_table
        self.telemetry_enabled = telemetry_enabled
        self.store = store
        if self.telemetry_enabled:
            self.tokenizer = Tokenizer.get_instance()
            self.formatter = ChatFormat(self.tokenizer)
    async def initialize(self) -> None:
        logger.debug("InferenceRouter.initialize")
@ -94,54 +84,6 @@ class InferenceRouter(Inference):
        )
        await self.routing_table.register_model(model_id, provider_model_id, provider_id, metadata, model_type)
    def _construct_metrics(
        self,
        prompt_tokens: int,
        completion_tokens: int,
        total_tokens: int,
        fully_qualified_model_id: str,
        provider_id: str,
    ) -> list[MetricEvent]:
        """Constructs a list of MetricEvent objects containing token usage metrics.
        Args:
            prompt_tokens: Number of tokens in the prompt
            completion_tokens: Number of tokens in the completion
            total_tokens: Total number of tokens used
            fully_qualified_model_id:
            provider_id: The provider identifier
        Returns:
            List of MetricEvent objects with token usage metrics
        """
        span = get_current_span()
        if span is None:
            logger.warning("No span found for token usage metrics")
            return []
        metrics = [
            ("prompt_tokens", prompt_tokens),
            ("completion_tokens", completion_tokens),
            ("total_tokens", total_tokens),
        ]
        metric_events = []
        for metric_name, value in metrics:
            metric_events.append(
                MetricEvent(
                    trace_id=span.trace_id,
                    span_id=span.span_id,
                    metric=metric_name,
                    value=value,
                    timestamp=datetime.now(UTC),
                    unit="tokens",
                    attributes={
                        "model_id": fully_qualified_model_id,
                        "provider_id": provider_id,
                    },
                )
            )
        return metric_events
    async def _get_model_provider(self, model_id: str, expected_model_type: str) -> tuple[Inference, str]:
        model = await self.routing_table.get_object_by_identifier("model", model_id)
        if model:
@ -186,26 +128,9 @@ class InferenceRouter(Inference):
        if params.stream:
            return await provider.openai_completion(params)
            # TODO: Metrics do NOT work with openai_completion stream=True due to the fact
            # that we do not return an AsyncIterator, our tests expect a stream of chunks we cannot intercept currently.
        response = await provider.openai_completion(params)
        response.model = request_model_id
        if self.telemetry_enabled and response.usage is not None:
            metrics = self._construct_metrics(
                prompt_tokens=response.usage.prompt_tokens,
                completion_tokens=response.usage.completion_tokens,
                total_tokens=response.usage.total_tokens,
                fully_qualified_model_id=request_model_id,
                provider_id=provider.__provider_id__,
            )
            for metric in metrics:
                enqueue_event(metric)
            # these metrics will show up in the client response.
            response.metrics = (
                metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics
            )
        return response
    async def openai_chat_completion(
@ -254,20 +179,6 @@ class InferenceRouter(Inference):
        if self.store:
            asyncio.create_task(self.store.store_chat_completion(response, params.messages))
        if self.telemetry_enabled and response.usage is not None:
            metrics = self._construct_metrics(
                prompt_tokens=response.usage.prompt_tokens,
                completion_tokens=response.usage.completion_tokens,
                total_tokens=response.usage.total_tokens,
                fully_qualified_model_id=request_model_id,
                provider_id=provider.__provider_id__,
            )
            for metric in metrics:
                enqueue_event(metric)
            # these metrics will show up in the client response.
            response.metrics = (
                metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics
            )
        return response
    async def openai_embeddings(
@ -411,18 +322,6 @@ class InferenceRouter(Inference):
                    for choice_data in choices_data.values():
                        completion_text += "".join(choice_data["content_parts"])
                    # Add metrics to the chunk
                    if self.telemetry_enabled and hasattr(chunk, "usage") and chunk.usage:
                        metrics = self._construct_metrics(
                            prompt_tokens=chunk.usage.prompt_tokens,
                            completion_tokens=chunk.usage.completion_tokens,
                            total_tokens=chunk.usage.total_tokens,
                            fully_qualified_model_id=fully_qualified_model_id,
                            provider_id=provider_id,
                        )
                        for metric in metrics:
                            enqueue_event(metric)
                yield chunk
        finally:
            # Store the final assembled completion
--- a/src/llama_stack/core/routers/safety.py
+++ b/src/llama_stack/core/routers/safety.py
@ -6,11 +6,15 @@
 from typing import Any
 from opentelemetry import trace
 from llama_stack.core.datatypes import SafetyConfig
 from llama_stack.log import get_logger
 from llama_stack.telemetry.helpers import safety_request_span_attributes, safety_span_name
 from llama_stack_api import ModerationObject, OpenAIMessageParam, RoutingTable, RunShieldResponse, Safety, Shield
 logger = get_logger(name=__name__, category="core::routers")
 tracer = trace.get_tracer(__name__)
 class SafetyRouter(Safety):
@ -51,13 +55,17 @@ class SafetyRouter(Safety):
        messages: list[OpenAIMessageParam],
        params: dict[str, Any] = None,
    ) -> RunShieldResponse:
-        logger.debug(f"SafetyRouter.run_shield: {shield_id}")
+        with tracer.start_as_current_span(name=safety_span_name(shield_id)):
-        provider = await self.routing_table.get_provider_impl(shield_id)
+            logger.debug(f"SafetyRouter.run_shield: {shield_id}")
-        return await provider.run_shield(
+            provider = await self.routing_table.get_provider_impl(shield_id)
-            shield_id=shield_id,
+            response = await provider.run_shield(
-            messages=messages,
+                shield_id=shield_id,
-            params=params,
+                messages=messages,
-        )
+                params=params,
            )
            safety_request_span_attributes(shield_id, messages, response)
        return response
    async def run_moderation(self, input: str | list[str], model: str | None = None) -> ModerationObject:
        list_shields_response = await self.routing_table.list_shields()
--- a/src/llama_stack/core/server/server.py
+++ b/src/llama_stack/core/server/server.py
@ -51,8 +51,6 @@ from llama_stack.core.stack import (
    cast_image_name_to_string,
    replace_env_vars,
 )
 from llama_stack.core.telemetry import Telemetry
 from llama_stack.core.telemetry.tracing import CURRENT_TRACE_CONTEXT, setup_logger
 from llama_stack.core.utils.config import redact_sensitive_fields
 from llama_stack.core.utils.config_resolution import Mode, resolve_config_or_distro
 from llama_stack.core.utils.context import preserve_contexts_async_generator
@ -61,7 +59,6 @@ from llama_stack_api import Api, ConflictError, PaginatedResponse, ResourceNotFo
 from .auth import AuthenticationMiddleware
 from .quota import QuotaMiddleware
 from .tracing import TracingMiddleware
 REPO_ROOT = Path(__file__).parent.parent.parent.parent
@ -264,7 +261,7 @@ def create_dynamic_typed_route(func: Any, method: str, route: str) -> Callable:
            try:
                if is_streaming:
-                    context_vars = [CURRENT_TRACE_CONTEXT, PROVIDER_DATA_VAR]
+                    context_vars = [PROVIDER_DATA_VAR]
                    if test_context_var is not None:
                        context_vars.append(test_context_var)
                    gen = preserve_contexts_async_generator(sse_generator(func(**kwargs)), context_vars)
@ -442,9 +439,6 @@ def create_app() -> StackApp:
        if cors_config:
            app.add_middleware(CORSMiddleware, **cors_config.model_dump())
    if config.telemetry.enabled:
        setup_logger(Telemetry())
    # Load external APIs if configured
    external_apis = load_external_apis(config)
    all_routes = get_all_api_routes(external_apis)
@ -516,9 +510,6 @@ def create_app() -> StackApp:
    # Generic Exception handler should be last
    app.exception_handler(Exception)(global_exception_handler)
    if config.telemetry.enabled:
        app.add_middleware(TracingMiddleware, impls=impls, external_apis=external_apis)
    return app
--- a/src/llama_stack/core/server/tracing.py
+++ b/src/llama_stack/core/server/tracing.py
@ -1,80 +0,0 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 from aiohttp import hdrs
 from llama_stack.core.external import ExternalApiSpec
 from llama_stack.core.server.routes import find_matching_route, initialize_route_impls
 from llama_stack.core.telemetry.tracing import end_trace, start_trace
 from llama_stack.log import get_logger
 logger = get_logger(name=__name__, category="core::server")
 class TracingMiddleware:
    """ASGI middleware that wraps each routed request in a trace.

    Lifespan events, FastAPI built-in paths and requests that match no known
    route are forwarded to the wrapped app untouched. For every other request a
    trace is started before the app runs and ended afterwards, and the trace id
    is added to the response as an ``x-trace-id`` header.

    :param app: The wrapped ASGI application
    :param impls: Provider implementations passed to ``initialize_route_impls``
    :param external_apis: External API specs passed to ``initialize_route_impls``
    """

    def __init__(self, app, impls, external_apis: dict[str, ExternalApiSpec]):
        self.app = app
        self.impls = impls
        self.external_apis = external_apis
        # FastAPI built-in paths that should bypass custom routing
        self.fastapi_paths = ("/docs", "/redoc", "/openapi.json", "/favicon.ico", "/static")

    async def __call__(self, scope, receive, send):
        if scope.get("type") == "lifespan":
            return await self.app(scope, receive, send)

        path = scope.get("path", "")

        # Check if the path is a FastAPI built-in path
        if path.startswith(self.fastapi_paths):
            # Pass through to FastAPI's built-in handlers
            logger.debug(f"Bypassing custom routing for FastAPI built-in path: {path}")
            return await self.app(scope, receive, send)

        # The route table is built lazily on the first request that needs it.
        if not hasattr(self, "route_impls"):
            self.route_impls = initialize_route_impls(self.impls, self.external_apis)

        try:
            _, _, route_path, webmethod = find_matching_route(
                scope.get("method", hdrs.METH_GET), path, self.route_impls
            )
        except ValueError:
            # If no matching endpoint is found, pass through to FastAPI
            logger.debug(f"No matching route found for path: {path}, falling back to FastAPI")
            return await self.app(scope, receive, send)

        # Log deprecation warning if route is deprecated
        if getattr(webmethod, "deprecated", False):
            logger.warning(
                f"DEPRECATED ROUTE USED: {scope.get('method', 'GET')} {path} - "
                f"This route is deprecated and may be removed in a future version. "
                f"Please check the docs for the supported version."
            )

        trace_attributes = {"__location__": "server", "raw_path": path}

        # Extract W3C trace context headers and store as trace attributes.
        # The header bytes are client-controlled; latin-1 maps every byte to a
        # character, so a malformed value can no longer raise UnicodeDecodeError
        # here and abort the request.
        headers = dict(scope.get("headers", []))
        traceparent = headers.get(b"traceparent", b"").decode("latin-1")
        if traceparent:
            trace_attributes["traceparent"] = traceparent
        tracestate = headers.get(b"tracestate", b"").decode("latin-1")
        if tracestate:
            trace_attributes["tracestate"] = tracestate

        trace_path = webmethod.descriptive_name or route_path
        trace_context = await start_trace(trace_path, trace_attributes)

        async def send_with_trace_id(message):
            if message["type"] == "http.response.start":
                # Copy into a list so appending works even when the app supplied
                # a tuple or another non-list iterable of header pairs.
                response_headers = list(message.get("headers", []))
                response_headers.append([b"x-trace-id", str(trace_context.trace_id).encode()])
                message["headers"] = response_headers
            await send(message)

        try:
            return await self.app(scope, receive, send_with_trace_id)
        finally:
            # Always close the trace, including when the app raised.
            await end_trace()
--- a/src/llama_stack/core/telemetry/init.py
+++ b/src/llama_stack/core/telemetry/init.py
@ -1,32 +0,0 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 from .telemetry import Telemetry
 from .trace_protocol import serialize_value, trace_protocol
 from .tracing import (
    CURRENT_TRACE_CONTEXT,
    ROOT_SPAN_MARKERS,
    end_trace,
    enqueue_event,
    get_current_span,
    setup_logger,
    span,
    start_trace,
 )
 __all__ = [
    "Telemetry",
    "trace_protocol",
    "serialize_value",
    "CURRENT_TRACE_CONTEXT",
    "ROOT_SPAN_MARKERS",
    "end_trace",
    "enqueue_event",
    "get_current_span",
    "setup_logger",
    "span",
    "start_trace",
 ]
--- a/src/llama_stack/core/telemetry/telemetry.py
+++ b/src/llama_stack/core/telemetry/telemetry.py
@ -1,629 +0,0 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import os
 import threading
 from collections.abc import Mapping, Sequence
 from datetime import datetime
 from enum import Enum
 from typing import (
    Annotated,
    Any,
    Literal,
    cast,
 )
 from opentelemetry import metrics, trace
 from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
 from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
 from opentelemetry.sdk.metrics import MeterProvider
 from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
 from opentelemetry.sdk.trace import TracerProvider
 from opentelemetry.sdk.trace.export import BatchSpanProcessor
 from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
 from pydantic import BaseModel, Field
 from llama_stack.log import get_logger
 from llama_stack.models.llama.datatypes import Primitive
 from llama_stack_api import json_schema_type, register_schema
 ROOT_SPAN_MARKERS = ["__root__", "__root_span__"]
 # Type alias for OpenTelemetry attribute values (excludes None)
 AttributeValue = str | bool | int | float | Sequence[str] | Sequence[bool] | Sequence[int] | Sequence[float]
 Attributes = Mapping[str, AttributeValue]
@json_schema_type
 class SpanStatus(Enum):
    """The status of a span indicating whether it completed successfully or with an error.
    :cvar OK: Span completed successfully without errors
    :cvar ERROR: Span completed with an error or failure
    """
    # When a span ends, Telemetry._log_structured maps OK to trace.StatusCode.OK
    # and every other member to trace.StatusCode.ERROR.
    OK = "ok"
    ERROR = "error"
@json_schema_type
 class Span(BaseModel):
    """A span representing a single operation within a trace.
    :param span_id: Unique identifier for the span
    :param trace_id: Unique identifier for the trace this span belongs to
    :param parent_span_id: (Optional) Unique identifier for the parent span, if this is a child span
    :param name: Human-readable name describing the operation this span represents
    :param start_time: Timestamp when the operation began
    :param end_time: (Optional) Timestamp when the operation finished, if completed
    :param attributes: (Optional) Key-value pairs containing additional metadata about the span
    """

    span_id: str
    trace_id: str
    parent_span_id: str | None = None
    name: str
    start_time: datetime
    end_time: datetime | None = None
    # Each instance gets its own empty dict unless the caller passes one (or None).
    attributes: dict[str, Any] | None = Field(default_factory=dict)

    def set_attribute(self, key: str, value: Any):
        """Store ``value`` under ``key``, creating the attributes dict if it is currently None."""
        if self.attributes is None:
            self.attributes = {key: value}
            return
        self.attributes[key] = value
@json_schema_type
 class Trace(BaseModel):
    """A trace representing the complete execution path of a request across multiple operations.
    :param trace_id: Unique identifier for the trace
    :param root_span_id: Unique identifier for the root span that started this trace
    :param start_time: Timestamp when the trace began
    :param end_time: (Optional) Timestamp when the trace finished, if completed
    """
    trace_id: str
    root_span_id: str
    start_time: datetime
    end_time: datetime | None = None
@json_schema_type
 class EventType(Enum):
    """The type of telemetry event being logged.
    :cvar UNSTRUCTURED_LOG: A simple log message with severity level
    :cvar STRUCTURED_LOG: A structured log event with typed payload data
    :cvar METRIC: A metric measurement with value and unit
    """
    # Each member is the fixed `type` value of one variant of the `Event` union,
    # which discriminates on that field. Telemetry._log_unstructured also uses
    # the string value as the name of the span event it records.
    UNSTRUCTURED_LOG = "unstructured_log"
    STRUCTURED_LOG = "structured_log"
    METRIC = "metric"
@json_schema_type
 class LogSeverity(Enum):
    """The severity level of a log message.
    :cvar VERBOSE: Detailed diagnostic information for troubleshooting
    :cvar DEBUG: Debug information useful during development
    :cvar INFO: General informational messages about normal operation
    :cvar WARN: Warning messages about potentially problematic situations
    :cvar ERROR: Error messages indicating failures that don't stop execution
    :cvar CRITICAL: Critical error messages indicating severe failures
    """
    # Telemetry._log_unstructured writes the string value into the "severity"
    # attribute of the span event it records.
    VERBOSE = "verbose"
    DEBUG = "debug"
    INFO = "info"
    WARN = "warn"
    ERROR = "error"
    CRITICAL = "critical"
 class EventCommon(BaseModel):
    """Common fields shared by all telemetry events.
    :param trace_id: Unique identifier for the trace this event belongs to
    :param span_id: Unique identifier for the span this event belongs to
    :param timestamp: Timestamp when the event occurred
    :param attributes: (Optional) Key-value pairs containing additional metadata about the event
    """
    trace_id: str
    # Telemetry parses span_id with int(span_id, 16), so it is expected to be a hex string.
    span_id: str
    timestamp: datetime
    # default_factory builds a new empty dict per event; None is also accepted.
    attributes: dict[str, Primitive] | None = Field(default_factory=lambda: {})
@json_schema_type
 class UnstructuredLogEvent(EventCommon):
    """An unstructured log event containing a simple text message.
    :param type: Event type identifier set to UNSTRUCTURED_LOG
    :param message: The log message text
    :param severity: The severity level of the log message
    """
    # `type` is pinned to a single literal so the `Event` union can discriminate on it.
    type: Literal[EventType.UNSTRUCTURED_LOG] = EventType.UNSTRUCTURED_LOG
    message: str
    severity: LogSeverity
@json_schema_type
 class MetricEvent(EventCommon):
    """A metric event containing a measured value.
    :param type: Event type identifier set to METRIC
    :param metric: The name of the metric being measured
    :param value: The numeric value of the metric measurement
    :param unit: The unit of measurement for the metric value
    """
    # `type` is pinned to a single literal so the `Event` union can discriminate on it.
    type: Literal[EventType.METRIC] = EventType.METRIC
    # Telemetry._log_metric records "prompt_tokens", "completion_tokens" and
    # "total_tokens" in a histogram; any other metric goes to a counter when the
    # value is an int and to an up-down counter when it is a float.
    metric: str  # this would be an enum
    value: int | float
    unit: str
@json_schema_type
 class StructuredLogType(Enum):
    """The type of structured log event payload.
    :cvar SPAN_START: Event indicating the start of a new span
    :cvar SPAN_END: Event indicating the completion of a span
    """
    # Each member is the fixed `type` value of one variant of the
    # `StructuredLogPayload` union, which discriminates on that field.
    SPAN_START = "span_start"
    SPAN_END = "span_end"
@json_schema_type
 class SpanStartPayload(BaseModel):
    """Payload for a span start event.
    :param type: Payload type identifier set to SPAN_START
    :param name: Human-readable name describing the operation this span represents
    :param parent_span_id: (Optional) Unique identifier for the parent span, if this is a child span
    """
    type: Literal[StructuredLogType.SPAN_START] = StructuredLogType.SPAN_START
    name: str
    # Telemetry._log_structured parses this with int(parent_span_id, 16) and looks
    # the result up among the active spans to parent the new span.
    parent_span_id: str | None = None
@json_schema_type
 class SpanEndPayload(BaseModel):
    """Payload for a span end event.
    :param type: Payload type identifier set to SPAN_END
    :param status: The final status of the span indicating success or failure
    """
    type: Literal[StructuredLogType.SPAN_END] = StructuredLogType.SPAN_END
    # Translated to an OpenTelemetry status code by Telemetry._log_structured
    # just before the span is ended.
    status: SpanStatus
 StructuredLogPayload = Annotated[
    SpanStartPayload | SpanEndPayload,
    Field(discriminator="type"),
 ]
 register_schema(StructuredLogPayload, name="StructuredLogPayload")
@json_schema_type
 class StructuredLogEvent(EventCommon):
    """A structured log event containing typed payload data.
    :param type: Event type identifier set to STRUCTURED_LOG
    :param payload: The structured payload data for the log event
    """
    # `type` is pinned to a single literal so the `Event` union can discriminate on it.
    type: Literal[EventType.STRUCTURED_LOG] = EventType.STRUCTURED_LOG
    # Either a SpanStartPayload or a SpanEndPayload, selected by the payload's own `type`.
    payload: StructuredLogPayload
 Event = Annotated[
    UnstructuredLogEvent | MetricEvent | StructuredLogEvent,
    Field(discriminator="type"),
 ]
 register_schema(Event, name="Event")
@json_schema_type
 class EvalTrace(BaseModel):
    """A trace record for evaluation purposes.
    :param session_id: Unique identifier for the evaluation session
    :param step: The evaluation step or phase identifier
    :param input: The input data for the evaluation
    :param output: The actual output produced during evaluation
    :param expected_output: The expected output for comparison during evaluation
    """
    session_id: str
    step: str
    input: str
    output: str
    expected_output: str
@json_schema_type
 class SpanWithStatus(Span):
    """A span that includes status information.
    :param status: (Optional) The current status of the span
    """
    status: SpanStatus | None = None
@json_schema_type
 class QueryConditionOp(Enum):
    """Comparison operators for query conditions.
    :cvar EQ: Equal to comparison
    :cvar NE: Not equal to comparison
    :cvar GT: Greater than comparison
    :cvar LT: Less than comparison
    """
    EQ = "eq"
    NE = "ne"
    GT = "gt"
    LT = "lt"
@json_schema_type
 class QueryCondition(BaseModel):
    """A condition for filtering query results.
    :param key: The attribute key to filter on
    :param op: The comparison operator to apply
    :param value: The value to compare against
    """
    key: str
    op: QueryConditionOp
    value: Any
 class QueryTracesResponse(BaseModel):
    """Response containing a list of traces.
    :param data: List of traces matching the query criteria
    """
    data: list[Trace]
 class QuerySpansResponse(BaseModel):
    """Response containing a list of spans.
    :param data: List of spans matching the query criteria
    """
    data: list[Span]
 class QuerySpanTreeResponse(BaseModel):
    """Response containing a tree structure of spans.
    :param data: Dictionary mapping span IDs to spans with status information
    """
    data: dict[str, SpanWithStatus]
 class MetricQueryType(Enum):
    """The type of metric query to perform.
    :cvar RANGE: Query metrics over a time range
    :cvar INSTANT: Query metrics at a specific point in time
    """
    RANGE = "range"
    INSTANT = "instant"
 class MetricLabelOperator(Enum):
    """Operators for matching metric labels.
    :cvar EQUALS: Label value must equal the specified value
    :cvar NOT_EQUALS: Label value must not equal the specified value
    :cvar REGEX_MATCH: Label value must match the specified regular expression
    :cvar REGEX_NOT_MATCH: Label value must not match the specified regular expression
    """
    EQUALS = "="
    NOT_EQUALS = "!="
    REGEX_MATCH = "=~"
    REGEX_NOT_MATCH = "!~"
 class MetricLabelMatcher(BaseModel):
    """A matcher for filtering metrics by label values.
    :param name: The name of the label to match
    :param value: The value to match against
    :param operator: The comparison operator to use for matching
    """
    name: str
    value: str
    operator: MetricLabelOperator = MetricLabelOperator.EQUALS
@json_schema_type
 class MetricLabel(BaseModel):
    """A label associated with a metric.
    :param name: The name of the label
    :param value: The value of the label
    """
    name: str
    value: str
@json_schema_type
 class MetricDataPoint(BaseModel):
    """A single data point in a metric time series.
    :param timestamp: Unix timestamp when the metric value was recorded
    :param value: The numeric value of the metric at this timestamp
    :param unit: The unit of measurement for the metric value
    """
    timestamp: int
    value: float
    unit: str
@json_schema_type
 class MetricSeries(BaseModel):
    """A time series of metric data points.
    :param metric: The name of the metric
    :param labels: List of labels associated with this metric series
    :param values: List of data points in chronological order
    """
    metric: str
    labels: list[MetricLabel]
    values: list[MetricDataPoint]
 class QueryMetricsResponse(BaseModel):
    """Response containing metric time series data.
    :param data: List of metric series matching the query criteria
    """
    data: list[MetricSeries]
 _GLOBAL_STORAGE: dict[str, dict[str | int, Any]] = {
    "active_spans": {},
    "counters": {},
    "gauges": {},
    "up_down_counters": {},
    "histograms": {},
 }
 _global_lock = threading.Lock()
 _TRACER_PROVIDER = None
 logger = get_logger(name=__name__, category="telemetry")
 def _clean_attributes(attrs: dict[str, Any] | None) -> Attributes | None:
    """Return a new dict holding only the entries of ``attrs`` whose value is not None.

    ``None`` is passed through unchanged; the input dict is never modified.
    """
    if attrs is None:
        return None
    cleaned = {}
    for key, val in attrs.items():
        if val is not None:
            cleaned[key] = val
    return cleaned
 def is_tracing_enabled(tracer):
    """Open a throwaway "check_tracing" span on ``tracer`` and report whether it is recording."""
    probe = tracer.start_as_current_span("check_tracing")
    with probe as active_span:
        recording = active_span.is_recording()
    return recording
 class Telemetry:
    """Forwards llama-stack telemetry events to OpenTelemetry.

    Active spans and metric instruments live in the module-level
    ``_GLOBAL_STORAGE`` (guarded by ``_global_lock`` for span access) rather than
    on the instance, so creating this object more than once in a process does
    not duplicate them. Exporters are only configured when the
    ``OTEL_EXPORTER_OTLP_ENDPOINT`` environment variable is set.
    """

    def __init__(self) -> None:
        self.meter = None

        global _TRACER_PROVIDER
        # Initialize the correct span processor based on the provider state.
        # This is needed since once the span processor is set, it cannot be unset.
        # Recreating the telemetry adapter multiple times will result in duplicate span processors.
        # Since the library client can be recreated multiple times in a notebook,
        # the kernel will hold on to the span processor and cause duplicate spans to be written.
        if os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT"):
            if _TRACER_PROVIDER is None:
                provider = TracerProvider()
                trace.set_tracer_provider(provider)
                _TRACER_PROVIDER = provider

                # Use single OTLP endpoint for all telemetry signals
                # Let OpenTelemetry SDK handle endpoint construction automatically
                # The SDK will read OTEL_EXPORTER_OTLP_ENDPOINT and construct appropriate URLs
                # https://opentelemetry.io/docs/languages/sdk-configuration/otlp-exporter
                span_exporter = OTLPSpanExporter()
                span_processor = BatchSpanProcessor(span_exporter)
                cast(TracerProvider, trace.get_tracer_provider()).add_span_processor(span_processor)

                metric_reader = PeriodicExportingMetricReader(OTLPMetricExporter())
                metric_provider = MeterProvider(metric_readers=[metric_reader])
                metrics.set_meter_provider(metric_provider)
            self.is_otel_endpoint_set = True
        else:
            logger.warning("OTEL_EXPORTER_OTLP_ENDPOINT is not set, skipping telemetry")
            self.is_otel_endpoint_set = False

        # A meter is obtained in both branches.
        self.meter = metrics.get_meter(__name__)
        self._lock = _global_lock

    async def initialize(self) -> None:
        """No-op; all setup happens in ``__init__``."""
        pass

    async def shutdown(self) -> None:
        """Flush pending spans if an OTLP endpoint was configured."""
        if self.is_otel_endpoint_set:
            cast(TracerProvider, trace.get_tracer_provider()).force_flush()

    async def log_event(self, event: Event, ttl_seconds: int = 604800) -> None:
        """Dispatch ``event`` to the handler for its concrete type.

        :param event: The event to record
        :param ttl_seconds: Stored as the ``__ttl__`` attribute for log events; unused for metrics
        :raises ValueError: If ``event`` is not one of the known event classes
        """
        if isinstance(event, UnstructuredLogEvent):
            self._log_unstructured(event, ttl_seconds)
        elif isinstance(event, MetricEvent):
            self._log_metric(event)
        elif isinstance(event, StructuredLogEvent):
            self._log_structured(event, ttl_seconds)
        else:
            raise ValueError(f"Unknown event type: {event}")

    def _log_unstructured(self, event: UnstructuredLogEvent, ttl_seconds: int) -> None:
        """Attach a log message as an event on its span; the message is dropped if that span is not active."""
        with self._lock:
            # Use global storage instead of instance storage
            span_id = int(event.span_id, 16)
            span = _GLOBAL_STORAGE["active_spans"].get(span_id)

            if span:
                timestamp_ns = int(event.timestamp.timestamp() * 1e9)
                span.add_event(
                    name=event.type.value,
                    attributes={
                        "message": event.message,
                        "severity": event.severity.value,
                        "__ttl__": ttl_seconds,
                        **(event.attributes or {}),
                    },
                    timestamp=timestamp_ns,
                )
            else:
                # Routed through the module logger (not print) so the warning
                # follows the configured log level and handlers.
                logger.warning(f"No active span found for span_id {span_id}. Dropping event: {event}")

    def _get_or_create_counter(self, name: str, unit: str) -> metrics.Counter:
        """Return the process-wide counter called ``name``, creating it on first use."""
        assert self.meter is not None
        if name not in _GLOBAL_STORAGE["counters"]:
            _GLOBAL_STORAGE["counters"][name] = self.meter.create_counter(
                name=name,
                unit=unit,
                description=f"Counter for {name}",
            )
        return cast(metrics.Counter, _GLOBAL_STORAGE["counters"][name])

    def _get_or_create_gauge(self, name: str, unit: str) -> metrics.ObservableGauge:
        """Return the process-wide gauge called ``name``, creating it on first use."""
        # NOTE(review): the instrument comes from meter.create_gauge, not
        # create_observable_gauge - confirm the ObservableGauge annotation is intended.
        assert self.meter is not None
        if name not in _GLOBAL_STORAGE["gauges"]:
            _GLOBAL_STORAGE["gauges"][name] = self.meter.create_gauge(
                name=name,
                unit=unit,
                description=f"Gauge for {name}",
            )
        return cast(metrics.ObservableGauge, _GLOBAL_STORAGE["gauges"][name])

    def _get_or_create_histogram(self, name: str, unit: str) -> metrics.Histogram:
        """Return the process-wide histogram called ``name``, creating it on first use."""
        assert self.meter is not None
        if name not in _GLOBAL_STORAGE["histograms"]:
            _GLOBAL_STORAGE["histograms"][name] = self.meter.create_histogram(
                name=name,
                unit=unit,
                description=f"Histogram for {name}",
            )
        return cast(metrics.Histogram, _GLOBAL_STORAGE["histograms"][name])

    def _log_metric(self, event: MetricEvent) -> None:
        """Record a metric on its span (best effort) and on the matching OpenTelemetry instrument."""
        # Step 1 (best effort): mirror the metric as an event on its span.
        # Nothing in this step may prevent the meter recording below.
        if event.span_id:
            try:
                span_id = int(event.span_id, 16)
            except ValueError:
                # Malformed span id: there is no span to attach to.
                span_id = None

            if span_id is not None:
                try:
                    with self._lock:
                        span = _GLOBAL_STORAGE["active_spans"].get(span_id)
                        if span:
                            timestamp_ns = int(event.timestamp.timestamp() * 1e9)
                            span.add_event(
                                name=f"metric.{event.metric}",
                                attributes={
                                    "value": event.value,
                                    "unit": event.unit,
                                    **(event.attributes or {}),
                                },
                                timestamp=timestamp_ns,
                            )
                except Exception as exc:
                    # Report what actually failed instead of blaming the lock.
                    logger.debug(f"Failed to add metric {event.metric} to span: {exc}")

        # Step 2: log to OpenTelemetry meter if available
        if self.meter is None:
            return

        # Use histograms for token-related metrics (per-request measurements)
        # Use counters for other cumulative metrics
        token_metrics = {"prompt_tokens", "completion_tokens", "total_tokens"}

        if event.metric in token_metrics:
            # Token metrics are per-request measurements, use histogram
            histogram = self._get_or_create_histogram(event.metric, event.unit)
            histogram.record(event.value, attributes=_clean_attributes(event.attributes))
        elif isinstance(event.value, int):
            counter = self._get_or_create_counter(event.metric, event.unit)
            counter.add(event.value, attributes=_clean_attributes(event.attributes))
        elif isinstance(event.value, float):
            up_down_counter = self._get_or_create_up_down_counter(event.metric, event.unit)
            up_down_counter.add(event.value, attributes=_clean_attributes(event.attributes))

    def _get_or_create_up_down_counter(self, name: str, unit: str) -> metrics.UpDownCounter:
        """Return the process-wide up-down counter called ``name``, creating it on first use."""
        assert self.meter is not None
        if name not in _GLOBAL_STORAGE["up_down_counters"]:
            _GLOBAL_STORAGE["up_down_counters"][name] = self.meter.create_up_down_counter(
                name=name,
                unit=unit,
                description=f"UpDownCounter for {name}",
            )
        return cast(metrics.UpDownCounter, _GLOBAL_STORAGE["up_down_counters"][name])

    def _log_structured(self, event: StructuredLogEvent, ttl_seconds: int) -> None:
        """Start or end the OpenTelemetry span described by a structured log event.

        Note that ``event.attributes`` is modified in place: ``__ttl__`` is added
        and the W3C trace-context keys are removed.

        :raises ValueError: If the payload is neither a span start nor a span end
        """
        with self._lock:
            span_id = int(event.span_id, 16)
            tracer = trace.get_tracer(__name__)
            if event.attributes is None:
                event.attributes = {}
            event.attributes["__ttl__"] = ttl_seconds

            # Extract these W3C trace context attributes so they are not written to
            # underlying storage, as we just need them to propagate the trace context.
            traceparent = event.attributes.pop("traceparent", None)
            tracestate = event.attributes.pop("tracestate", None)
            if traceparent:
                # If we have a traceparent header value, we're not the root span.
                for root_attribute in ROOT_SPAN_MARKERS:
                    event.attributes.pop(root_attribute, None)

            if isinstance(event.payload, SpanStartPayload):
                # Check if span already exists to prevent duplicates
                if span_id in _GLOBAL_STORAGE["active_spans"]:
                    return

                # Parent resolution: an explicit parent_span_id is tried first; the
                # incoming traceparent is only consulted when no parent id was given.
                context = None
                if event.payload.parent_span_id:
                    parent_span_id = int(event.payload.parent_span_id, 16)
                    parent_span = _GLOBAL_STORAGE["active_spans"].get(parent_span_id)
                    if parent_span:
                        context = trace.set_span_in_context(parent_span)
                elif traceparent:
                    carrier = {
                        "traceparent": traceparent,
                        "tracestate": tracestate,
                    }
                    context = TraceContextTextMapPropagator().extract(carrier=carrier)

                span = tracer.start_span(
                    name=event.payload.name,
                    context=context,
                    attributes=_clean_attributes(event.attributes),
                )
                _GLOBAL_STORAGE["active_spans"][span_id] = span

            elif isinstance(event.payload, SpanEndPayload):
                span = _GLOBAL_STORAGE["active_spans"].get(span_id)  # type: ignore[assignment]
                # An end event for a span that is not active is ignored.
                if span:
                    if event.attributes:
                        cleaned_attrs = _clean_attributes(event.attributes)
                        if cleaned_attrs:
                            span.set_attributes(cleaned_attrs)

                    status = (
                        trace.Status(status_code=trace.StatusCode.OK)
                        if event.payload.status == SpanStatus.OK
                        else trace.Status(status_code=trace.StatusCode.ERROR)
                    )
                    span.set_status(status)
                    span.end()
                    _GLOBAL_STORAGE["active_spans"].pop(span_id, None)
            else:
                raise ValueError(f"Unknown structured log event: {event}")
--- a/src/llama_stack/core/telemetry/trace_protocol.py
+++ b/src/llama_stack/core/telemetry/trace_protocol.py
@ -1,154 +0,0 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import asyncio
 import inspect
 import json
 from collections.abc import AsyncGenerator, Callable
 from functools import wraps
 from typing import Any, cast
 from pydantic import BaseModel
 from llama_stack.models.llama.datatypes import Primitive
 type JSONValue = Primitive | list["JSONValue"] | dict[str, "JSONValue"]
 def serialize_value(value: Any) -> str:
    """Return the string form of ``value`` after JSON preparation.

    The value is first normalised by ``_prepare_for_json`` and the result is
    then passed through ``str()``.
    """
    prepared = _prepare_for_json(value)
    return str(prepared)
 def _prepare_for_json(value: Any) -> JSONValue:
    """Serialize a single value into JSON-compatible format."""
    if value is None:
        return ""
    elif isinstance(value, str | int | float | bool):
        return value
    elif hasattr(value, "_name_"):
        return cast(str, value._name_)
    elif isinstance(value, BaseModel):
        return cast(JSONValue, json.loads(value.model_dump_json()))
    elif isinstance(value, list | tuple | set):
        return [_prepare_for_json(item) for item in value]
    elif isinstance(value, dict):
        return {str(k): _prepare_for_json(v) for k, v in value.items()}
    else:
        try:
            json.dumps(value)
            return cast(JSONValue, value)
        except Exception:
            return str(value)
 def trace_protocol[T: type[Any]](cls: T) -> T:
    """
    A class decorator that automatically traces all methods in a protocol/base class
    and its inheriting classes.

    Only plain functions whose name does not start with an underscore are
    wrapped. Each wrapped call runs inside ``tracing.span("<Class>.<method>")``
    with the serialized call arguments attached as span attributes. The
    decorated class is modified in place and returned.
    """
    def trace_method(method: Callable[..., Any]) -> Callable[..., Any]:
        # Classify the method once; this selects which wrapper is returned below.
        is_async = asyncio.iscoroutinefunction(method)
        is_async_gen = inspect.isasyncgenfunction(method)
        # Builds (class name, method name, span attributes) for a single call.
        def create_span_context(self: Any, *args: Any, **kwargs: Any) -> tuple[str, str, dict[str, Primitive]]:
            class_name = self.__class__.__name__
            method_name = method.__name__
            span_type = "async_generator" if is_async_gen else "async" if is_async else "sync"
            sig = inspect.signature(method)
            param_names = list(sig.parameters.keys())[1:]  # Skip 'self'
            combined_args: dict[str, str] = {}
            # Positional arguments are keyed by parameter name; any surplus
            # positional gets a synthetic "position_<n>" key (1-based).
            for i, arg in enumerate(args):
                param_name = param_names[i] if i < len(param_names) else f"position_{i + 1}"
                combined_args[param_name] = serialize_value(arg)
            for k, v in kwargs.items():
                combined_args[str(k)] = serialize_value(v)
            span_attributes: dict[str, Primitive] = {
                "__autotraced__": True,
                "__class__": class_name,
                "__method__": method_name,
                "__type__": span_type,
                "__args__": json.dumps(combined_args),
            }
            return class_name, method_name, span_attributes
        # Async-generator variant: re-yields every item and records how many
        # items were yielded as "chunk_count", even if iteration stops early.
        @wraps(method)
        async def async_gen_wrapper(self: Any, *args: Any, **kwargs: Any) -> AsyncGenerator[Any, None]:
            # Imported at call time rather than module level; presumably to
            # avoid an import cycle with the tracing module - TODO confirm.
            from llama_stack.core.telemetry import tracing
            class_name, method_name, span_attributes = create_span_context(self, *args, **kwargs)
            with tracing.span(f"{class_name}.{method_name}", span_attributes) as span:
                count = 0
                try:
                    async for item in method(self, *args, **kwargs):
                        yield item
                        count += 1
                finally:
                    span.set_attribute("chunk_count", count)
        # Coroutine variant: records the serialized result as "output", or the
        # exception text as "error" before re-raising.
        @wraps(method)
        async def async_wrapper(self: Any, *args: Any, **kwargs: Any) -> Any:
            from llama_stack.core.telemetry import tracing
            class_name, method_name, span_attributes = create_span_context(self, *args, **kwargs)
            with tracing.span(f"{class_name}.{method_name}", span_attributes) as span:
                try:
                    result = await method(self, *args, **kwargs)
                    span.set_attribute("output", serialize_value(result))
                    return result
                except Exception as e:
                    span.set_attribute("error", str(e))
                    raise
        # Synchronous variant with the same output/error recording.
        @wraps(method)
        def sync_wrapper(self: Any, *args: Any, **kwargs: Any) -> Any:
            from llama_stack.core.telemetry import tracing
            class_name, method_name, span_attributes = create_span_context(self, *args, **kwargs)
            with tracing.span(f"{class_name}.{method_name}", span_attributes) as span:
                try:
                    result = method(self, *args, **kwargs)
                    span.set_attribute("output", serialize_value(result))
                    return result
                except Exception as e:
                    span.set_attribute("error", str(e))
                    raise
        if is_async_gen:
            return async_gen_wrapper
        elif is_async:
            return async_wrapper
        else:
            return sync_wrapper
    # Wrap methods on the class itself (for classes applied at runtime)
    # Skip if already wrapped (indicated by __wrapped__ attribute)
    for name, method in vars(cls).items():
        if inspect.isfunction(method) and not name.startswith("_"):
            if not hasattr(method, "__wrapped__"):
                wrapped = trace_method(method)
                setattr(cls, name, wrapped)  # noqa: B010
    # Also set up __init_subclass__ for future subclasses
    original_init_subclass = cast(Callable[..., Any] | None, getattr(cls, "__init_subclass__", None))
    # Runs whenever a subclass is created: chains to the previous hook, then
    # wraps the public functions defined directly on the subclass.
    # NOTE(review): unlike the loop above, there is no __wrapped__ check here.
    def __init_subclass__(cls_child: type[Any], **kwargs: Any) -> None:  # noqa: N807
        if original_init_subclass:
            cast(Callable[..., None], original_init_subclass)(**kwargs)
        for name, method in vars(cls_child).items():
            if inspect.isfunction(method) and not name.startswith("_"):
                setattr(cls_child, name, trace_method(method))  # noqa: B010
    # Assign through an Any-typed alias so the attribute write passes type checking.
    cls_any = cast(Any, cls)
    cls_any.__init_subclass__ = classmethod(__init_subclass__)
    return cls
--- a/src/llama_stack/core/telemetry/tracing.py
+++ b/src/llama_stack/core/telemetry/tracing.py
@ -1,388 +0,0 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import asyncio
 import contextvars
 import logging  # allow-direct-logging
 import queue
 import secrets
 import sys
 import threading
 import time
 from collections.abc import Callable
 from datetime import UTC, datetime
 from functools import wraps
 from typing import Any, Self
 from llama_stack.core.telemetry.telemetry import (
    ROOT_SPAN_MARKERS,
    Event,
    LogSeverity,
    Span,
    SpanEndPayload,
    SpanStartPayload,
    SpanStatus,
    StructuredLogEvent,
    Telemetry,
    UnstructuredLogEvent,
 )
 from llama_stack.core.telemetry.trace_protocol import serialize_value
 from llama_stack.log import get_logger
 logger = get_logger(__name__, category="core")
 # Fallback logger that does NOT propagate to TelemetryHandler to avoid recursion
 _fallback_logger = logging.getLogger("llama_stack.telemetry.background")
 if not _fallback_logger.handlers:
    _fallback_logger.propagate = False
    _fallback_logger.setLevel(logging.ERROR)
    _fallback_handler = logging.StreamHandler(sys.stderr)
    _fallback_handler.setLevel(logging.ERROR)
    _fallback_handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s"))
    _fallback_logger.addHandler(_fallback_handler)
 INVALID_SPAN_ID = 0x0000000000000000
 INVALID_TRACE_ID = 0x00000000000000000000000000000000
 # The logical root span may not be visible to this process if a parent context
 # is passed in. The local root span is the first local span in a trace.
 LOCAL_ROOT_SPAN_MARKER = "__local_root_span__"
 def trace_id_to_str(trace_id: int) -> str:
    """Format an integer trace ID as zero-padded lowercase hexadecimal.

    Args:
        trace_id: Trace ID int

    Returns:
        The trace ID as a hexadecimal string, left-padded with zeros to a
        width of 32 characters.
    """
    return f"{trace_id:032x}"
 def span_id_to_str(span_id: int) -> str:
    """Format an integer span ID as zero-padded lowercase hexadecimal.

    Args:
        span_id: Span ID int

    Returns:
        The span ID as a hexadecimal string, left-padded with zeros to a
        width of 16 characters.
    """
    return f"{span_id:016x}"
 def generate_span_id() -> str:
    """Return a random 64-bit span ID as hex, never the invalid (all-zero) ID."""
    while True:
        candidate = secrets.randbits(64)
        # Redraw on the reserved invalid value.
        if candidate != INVALID_SPAN_ID:
            return span_id_to_str(candidate)
 def generate_trace_id() -> str:
    """Return a random 128-bit trace ID as hex, never the invalid (all-zero) ID."""
    while True:
        candidate = secrets.randbits(128)
        # Redraw on the reserved invalid value.
        if candidate != INVALID_TRACE_ID:
            return trace_id_to_str(candidate)
 LOG_QUEUE_FULL_LOG_INTERVAL_SECONDS = 60.0
 class BackgroundLogger:
    """Hands telemetry events to a ``Telemetry`` API from a daemon thread.

    Producers call ``log_event``, which never blocks: events go onto a bounded
    queue and are dropped (with a rate-limited notice) when the queue is full.
    A daemon worker thread runs its own event loop and awaits
    ``api.log_event`` for each queued event.
    """

    def __init__(self, api: Telemetry, capacity: int = 100000):
        """Create the queue and start the worker.

        Args:
            api: Object whose ``log_event`` coroutine receives each event.
            capacity: Maximum number of queued events before drops begin.
        """
        self.api = api
        self.log_queue: queue.Queue[Any] = queue.Queue(maxsize=capacity)
        # Finish building the instance before the worker starts, so the
        # thread can never observe a partially initialised object.
        self._last_queue_full_log_time: float = 0.0
        self._dropped_since_last_notice: int = 0
        self.worker_thread = threading.Thread(target=self._worker, daemon=True)
        self.worker_thread.start()

    def log_event(self, event: Event) -> None:
        """Queue ``event`` without blocking; count it as dropped if the queue is full."""
        try:
            self.log_queue.put_nowait(event)
        except queue.Full:
            # Aggregate drops and emit at most once per interval via fallback logger
            self._dropped_since_last_notice += 1
            current_time = time.time()
            if current_time - self._last_queue_full_log_time >= LOG_QUEUE_FULL_LOG_INTERVAL_SECONDS:
                _fallback_logger.error(
                    "Log queue is full; dropped %d events since last notice",
                    self._dropped_since_last_notice,
                )
                self._last_queue_full_log_time = current_time
                self._dropped_since_last_notice = 0

    def _worker(self):
        """Thread target: run ``_process_logs`` on a private event loop."""
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self._process_logs())

    async def _process_logs(self):
        """Forward queued events to the API forever, surviving per-event failures."""
        while True:
            # Take the item outside the try block so task_done() is only ever
            # paired with an item that was actually dequeued.
            event = self.log_queue.get()
            try:
                await self.api.log_event(event)
            except Exception:
                # Report through the non-propagating fallback logger so the
                # failure cannot be fed back into the telemetry pipeline.
                _fallback_logger.exception("Error processing log event")
            finally:
                self.log_queue.task_done()

    def __del__(self) -> None:
        # NOTE(review): join() blocks until every queued item is marked done;
        # confirm this cannot hang if the worker thread has already exited.
        self.log_queue.join()
 BACKGROUND_LOGGER: BackgroundLogger | None = None
 def enqueue_event(event: Event) -> None:
    """Submit a telemetry event to the background logger.

    This is the non-blocking path for routers and other hot paths: the event
    is queued rather than awaited on the Telemetry API, which reduces
    contention with the main event loop.

    Raises:
        RuntimeError: if no background logger has been set up.
    """
    background = BACKGROUND_LOGGER
    if background is None:
        raise RuntimeError("Telemetry API not initialized")
    background.log_event(event)
 class TraceContext:
    """Stack of open spans for one trace, reporting span starts and ends to a logger."""

    def __init__(self, logger: BackgroundLogger, trace_id: str):
        self.logger = logger
        self.trace_id = trace_id
        # Innermost (current) span is the last element.
        self.spans: list[Span] = []

    def push_span(self, name: str, attributes: dict[str, Any] | None = None) -> Span:
        """Open a span parented to the current one, log its start event and return it."""
        parent = self.get_current_span()
        opened = Span(
            span_id=generate_span_id(),
            trace_id=self.trace_id,
            name=name,
            start_time=datetime.now(UTC),
            parent_span_id=parent.span_id if parent else None,
            attributes=attributes,
        )
        start_payload = SpanStartPayload(
            name=opened.name,
            parent_span_id=opened.parent_span_id,
        )
        start_event = StructuredLogEvent(
            trace_id=opened.trace_id,
            span_id=opened.span_id,
            timestamp=opened.start_time,
            attributes=opened.attributes,
            payload=start_payload,
        )
        self.logger.log_event(start_event)
        self.spans.append(opened)
        return opened

    def pop_span(self, status: SpanStatus = SpanStatus.OK) -> None:
        """Close the innermost span and log its end event carrying ``status``."""
        closed = self.spans.pop()
        if closed is None:
            return
        # NOTE(review): the end event reuses the span's start_time as its
        # timestamp - confirm whether an end time was intended.
        end_event = StructuredLogEvent(
            trace_id=closed.trace_id,
            span_id=closed.span_id,
            timestamp=closed.start_time,
            attributes=closed.attributes,
            payload=SpanEndPayload(
                status=status,
            ),
        )
        self.logger.log_event(end_event)

    def get_current_span(self) -> Span | None:
        """Return the innermost open span, or ``None`` when no span is open."""
        if not self.spans:
            return None
        return self.spans[-1]
 CURRENT_TRACE_CONTEXT: contextvars.ContextVar[TraceContext | None] = contextvars.ContextVar(
    "trace_context", default=None
 )
 def setup_logger(api: Telemetry, level: int = logging.INFO):
    """Install telemetry logging on the root logger.

    Creates the module-wide ``BACKGROUND_LOGGER`` on first use, sets the root
    logger's level, and attaches a ``TelemetryHandler`` to the root logger.

    Args:
        api: Telemetry implementation handed to the background logger.
        level: Level applied to the root logger.
    """
    global BACKGROUND_LOGGER
    if BACKGROUND_LOGGER is None:
        BACKGROUND_LOGGER = BackgroundLogger(api)

    root_logger = logging.getLogger()
    root_logger.setLevel(level)
    # Repeated calls must not stack handlers: every extra TelemetryHandler
    # would enqueue another copy of each log record.
    if not any(isinstance(handler, TelemetryHandler) for handler in root_logger.handlers):
        root_logger.addHandler(TelemetryHandler())
 async def start_trace(name: str, attributes: dict[str, Any] | None = None) -> TraceContext | None:
    """Begin a new trace with a root span and make it the current trace context.

    Returns the new ``TraceContext``, or ``None`` when no background logger
    has been set up.
    """
    background = BACKGROUND_LOGGER
    if background is None:
        logger.debug("No Telemetry implementation set. Skipping trace initialization...")
        return None

    context = TraceContext(background, generate_trace_id())

    # Mark this span as the root of the trace for now. If a traceparent
    # context is supplied it is processed later, and that removes the
    # ROOT_SPAN_MARKERS again. Also mark this span as the 'local' root, i.e.
    # the root of the spans originating in this process; this is needed so
    # that this 'local' root span's id is inserted into the trace record in
    # the sqlite store.
    # Caller-supplied attributes are applied last and win on key clashes.
    root_attributes: dict[str, Any] = dict.fromkeys(ROOT_SPAN_MARKERS, True)
    root_attributes[LOCAL_ROOT_SPAN_MARKER] = True
    root_attributes.update(attributes or {})

    context.push_span(name, root_attributes)
    CURRENT_TRACE_CONTEXT.set(context)
    return context
 async def end_trace(status: SpanStatus = SpanStatus.OK):
    """Close the current trace's innermost span with ``status`` and clear the trace context."""
    active = CURRENT_TRACE_CONTEXT.get()
    if active is not None:
        active.pop_span(status)
        CURRENT_TRACE_CONTEXT.set(None)
        return
    logger.debug("No trace context to end")
 def severity(levelname: str) -> LogSeverity:
    if levelname == "DEBUG":
        return LogSeverity.DEBUG
    elif levelname == "INFO":
        return LogSeverity.INFO
    elif levelname == "WARNING":
        return LogSeverity.WARN
    elif levelname == "ERROR":
        return LogSeverity.ERROR
    elif levelname == "CRITICAL":
        return LogSeverity.CRITICAL
    else:
        raise ValueError(f"Unknown log level: {levelname}")
 # TODO: ideally, the actual emitting should be done inside a separate daemon
 # process completely isolated from the server
 class TelemetryHandler(logging.Handler):
    def emit(self, record: logging.LogRecord) -> None:
        # horrendous hack to avoid logging from asyncio and getting into an infinite loop
        if record.module in ("asyncio", "selector_events"):
            return
        global CURRENT_TRACE_CONTEXT
        context = CURRENT_TRACE_CONTEXT.get()
        if context is None:
            return
        span = context.get_current_span()
        if span is None:
            return
        enqueue_event(
            UnstructuredLogEvent(
                trace_id=span.trace_id,
                span_id=span.span_id,
                timestamp=datetime.now(UTC),
                message=self.format(record),
                severity=severity(record.levelname),
            )
        )
    def close(self) -> None:
        pass
 class SpanContextManager:
    def __init__(self, name: str, attributes: dict[str, Any] | None = None):
        self.name = name
        self.attributes = attributes
        self.span: Span | None = None
    def __enter__(self) -> Self:
        global CURRENT_TRACE_CONTEXT
        context = CURRENT_TRACE_CONTEXT.get()
        if not context:
            logger.debug("No trace context to push span")
            return self
        self.span = context.push_span(self.name, self.attributes)
        return self
    def __exit__(self, exc_type, exc_value, traceback) -> None:
        global CURRENT_TRACE_CONTEXT
        context = CURRENT_TRACE_CONTEXT.get()
        if not context:
            logger.debug("No trace context to pop span")
            return
        context.pop_span()
    def set_attribute(self, key: str, value: Any) -> None:
        if self.span:
            if self.span.attributes is None:
                self.span.attributes = {}
            self.span.attributes[key] = serialize_value(value)
    async def __aenter__(self) -> Self:
        global CURRENT_TRACE_CONTEXT
        context = CURRENT_TRACE_CONTEXT.get()
        if not context:
            logger.debug("No trace context to push span")
            return self
        self.span = context.push_span(self.name, self.attributes)
        return self
    async def __aexit__(self, exc_type, exc_value, traceback) -> None:
        global CURRENT_TRACE_CONTEXT
        context = CURRENT_TRACE_CONTEXT.get()
        if not context:
            logger.debug("No trace context to pop span")
            return
        context.pop_span()
    def __call__(self, func: Callable[..., Any]) -> Callable[..., Any]:
        @wraps(func)
        def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
            with self:
                return func(*args, **kwargs)
        @wraps(func)
        async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
            async with self:
                return await func(*args, **kwargs)
        @wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            if asyncio.iscoroutinefunction(func):
                return async_wrapper(*args, **kwargs)
            else:
                return sync_wrapper(*args, **kwargs)
        return wrapper
 def span(name: str, attributes: dict[str, Any] | None = None) -> SpanContextManager:
    """Create a ``SpanContextManager`` for a span called ``name`` with optional attributes."""
    manager = SpanContextManager(name, attributes)
    return manager
 def get_current_span() -> Span | None:
    """Return the innermost open span of the current trace context, or ``None``."""
    if CURRENT_TRACE_CONTEXT is None:
        logger.debug("No trace context to get current span")
        return None
    active = CURRENT_TRACE_CONTEXT.get()
    return active.get_current_span() if active else None
--- a/src/llama_stack/core/utils/context.py
+++ b/src/llama_stack/core/utils/context.py
@ -7,8 +7,6 @@
 from collections.abc import AsyncGenerator
 from contextvars import ContextVar
 from llama_stack.core.telemetry.tracing import CURRENT_TRACE_CONTEXT
 _MISSING = object()
@ -69,16 +67,12 @@ def preserve_contexts_async_generator[T](
            try:
                yield item
                # Update our tracked values with any changes made during this iteration
-                # Only for non-trace context vars - trace context must persist across yields
+                # This allows context changes to persist across generator iterations
                # to allow nested span tracking for telemetry
                for context_var in context_vars:
-                    if context_var is not CURRENT_TRACE_CONTEXT:
+                    initial_context_values[context_var.name] = context_var.get()
                        initial_context_values[context_var.name] = context_var.get()
            finally:
-                # Restore non-trace context vars after each yield to prevent leaks between requests
+                # Restore context vars after each yield to prevent leaks between requests
                # CURRENT_TRACE_CONTEXT is NOT restored here to preserve telemetry span stack
                for context_var in context_vars:
-                    if context_var is not CURRENT_TRACE_CONTEXT:
+                    _restore_context_var(context_var)
                        _restore_context_var(context_var)
    return wrapper()
--- a/src/llama_stack/distributions/ci-tests/run-with-postgres-store.yaml
+++ b/src/llama_stack/distributions/ci-tests/run-with-postgres-store.yaml
@ -281,8 +281,6 @@ registered_resources:
    provider_id: rag-runtime
 server:
  port: 8321
 telemetry:
  enabled: true
 vector_stores:
  default_provider_id: faiss
  default_embedding_model:
--- a/src/llama_stack/distributions/ci-tests/run.yaml
+++ b/src/llama_stack/distributions/ci-tests/run.yaml
@ -272,8 +272,6 @@ registered_resources:
    provider_id: rag-runtime
 server:
  port: 8321
 telemetry:
  enabled: true
 vector_stores:
  default_provider_id: faiss
  default_embedding_model:
--- a/src/llama_stack/distributions/dell/run-with-safety.yaml
+++ b/src/llama_stack/distributions/dell/run-with-safety.yaml
@ -140,5 +140,3 @@ registered_resources:
    provider_id: rag-runtime
 server:
  port: 8321
 telemetry:
  enabled: true
--- a/src/llama_stack/distributions/dell/run.yaml
+++ b/src/llama_stack/distributions/dell/run.yaml
@ -131,5 +131,3 @@ registered_resources:
    provider_id: rag-runtime
 server:
  port: 8321
 telemetry:
  enabled: true
--- a/src/llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml
+++ b/src/llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml
@ -153,5 +153,3 @@ registered_resources:
    provider_id: rag-runtime
 server:
  port: 8321
 telemetry:
  enabled: true
--- a/src/llama_stack/distributions/meta-reference-gpu/run.yaml
+++ b/src/llama_stack/distributions/meta-reference-gpu/run.yaml
@ -138,5 +138,3 @@ registered_resources:
    provider_id: rag-runtime
 server:
  port: 8321
 telemetry:
  enabled: true
--- a/src/llama_stack/distributions/nvidia/run-with-safety.yaml
+++ b/src/llama_stack/distributions/nvidia/run-with-safety.yaml
@ -135,5 +135,3 @@ registered_resources:
    provider_id: rag-runtime
 server:
  port: 8321
 telemetry:
  enabled: true
--- a/src/llama_stack/distributions/nvidia/run.yaml
+++ b/src/llama_stack/distributions/nvidia/run.yaml
@ -114,5 +114,3 @@ registered_resources:
    provider_id: rag-runtime
 server:
  port: 8321
 telemetry:
  enabled: true
--- a/src/llama_stack/distributions/oci/run.yaml
+++ b/src/llama_stack/distributions/oci/run.yaml
@ -132,5 +132,3 @@ registered_resources:
    provider_id: tavily-search
 server:
  port: 8321
 telemetry:
  enabled: true
--- a/src/llama_stack/distributions/open-benchmark/run.yaml
+++ b/src/llama_stack/distributions/open-benchmark/run.yaml
@ -251,5 +251,3 @@ registered_resources:
    provider_id: rag-runtime
 server:
  port: 8321
 telemetry:
  enabled: true
--- a/src/llama_stack/distributions/postgres-demo/run.yaml
+++ b/src/llama_stack/distributions/postgres-demo/run.yaml
@ -114,5 +114,3 @@ registered_resources:
    provider_id: rag-runtime
 server:
  port: 8321
 telemetry:
  enabled: true
--- a/src/llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml
+++ b/src/llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml
@ -284,8 +284,6 @@ registered_resources:
    provider_id: rag-runtime
 server:
  port: 8321
 telemetry:
  enabled: true
 vector_stores:
  default_provider_id: faiss
  default_embedding_model:
--- a/src/llama_stack/distributions/starter-gpu/run.yaml
+++ b/src/llama_stack/distributions/starter-gpu/run.yaml
@ -275,8 +275,6 @@ registered_resources:
    provider_id: rag-runtime
 server:
  port: 8321
 telemetry:
  enabled: true
 vector_stores:
  default_provider_id: faiss
  default_embedding_model:
--- a/src/llama_stack/distributions/starter/run-with-postgres-store.yaml
+++ b/src/llama_stack/distributions/starter/run-with-postgres-store.yaml
@ -281,8 +281,6 @@ registered_resources:
    provider_id: rag-runtime
 server:
  port: 8321
 telemetry:
  enabled: true
 vector_stores:
  default_provider_id: faiss
  default_embedding_model:
--- a/src/llama_stack/distributions/starter/run.yaml
+++ b/src/llama_stack/distributions/starter/run.yaml
@ -272,8 +272,6 @@ registered_resources:
    provider_id: rag-runtime
 server:
  port: 8321
 telemetry:
  enabled: true
 vector_stores:
  default_provider_id: faiss
  default_embedding_model:
--- a/src/llama_stack/distributions/template.py
+++ b/src/llama_stack/distributions/template.py
@ -24,7 +24,6 @@ from llama_stack.core.datatypes import (
    Provider,
    SafetyConfig,
    ShieldInput,
    TelemetryConfig,
    ToolGroupInput,
    VectorStoresConfig,
 )
@ -189,7 +188,6 @@ class RunConfigSettings(BaseModel):
    default_benchmarks: list[BenchmarkInput] | None = None
    vector_stores_config: VectorStoresConfig | None = None
    safety_config: SafetyConfig | None = None
    telemetry: TelemetryConfig = Field(default_factory=lambda: TelemetryConfig(enabled=True))
    storage_backends: dict[str, Any] | None = None
    storage_stores: dict[str, Any] | None = None
@ -289,7 +287,6 @@ class RunConfigSettings(BaseModel):
            "server": {
                "port": 8321,
            },
            "telemetry": self.telemetry.model_dump(exclude_none=True) if self.telemetry else None,
        }
        if self.vector_stores_config:
--- a/src/llama_stack/distributions/watsonx/run.yaml
+++ b/src/llama_stack/distributions/watsonx/run.yaml
@ -132,5 +132,3 @@ registered_resources:
    provider_id: rag-runtime
 server:
  port: 8321
 telemetry:
  enabled: true
--- a/src/llama_stack/log.py
+++ b/src/llama_stack/log.py
@ -37,7 +37,6 @@ CATEGORIES = [
    "eval",
    "tools",
    "client",
    "telemetry",
    "openai",
    "openai_responses",
    "openai_conversations",
--- a/src/llama_stack/providers/inline/agents/meta_reference/init.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/init.py
@ -15,7 +15,6 @@ async def get_provider_impl(
    config: MetaReferenceAgentsImplConfig,
    deps: dict[Api, Any],
    policy: list[AccessRule],
    telemetry_enabled: bool = False,
 ):
    from .agents import MetaReferenceAgentsImpl
@ -29,7 +28,6 @@ async def get_provider_impl(
        deps[Api.conversations],
        deps[Api.prompts],
        deps[Api.files],
        telemetry_enabled,
        policy,
    )
    await impl.initialize()
--- a/src/llama_stack/providers/inline/agents/meta_reference/agents.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/agents.py
@ -50,7 +50,6 @@ class MetaReferenceAgentsImpl(Agents):
        prompts_api: Prompts,
        files_api: Files,
        policy: list[AccessRule],
        telemetry_enabled: bool = False,
    ):
        self.config = config
        self.inference_api = inference_api
@ -59,7 +58,6 @@ class MetaReferenceAgentsImpl(Agents):
        self.tool_runtime_api = tool_runtime_api
        self.tool_groups_api = tool_groups_api
        self.conversations_api = conversations_api
        self.telemetry_enabled = telemetry_enabled
        self.prompts_api = prompts_api
        self.files_api = files_api
        self.in_memory_store = InmemoryKVStoreImpl()
@ -111,6 +109,7 @@ class MetaReferenceAgentsImpl(Agents):
        max_infer_iters: int | None = 10,
        guardrails: list[ResponseGuardrail] | None = None,
        max_tool_calls: int | None = None,
        metadata: dict[str, str] | None = None,
    ) -> OpenAIResponseObject:
        assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
        result = await self.openai_responses_impl.create_openai_response(
@ -130,6 +129,7 @@ class MetaReferenceAgentsImpl(Agents):
            guardrails,
            parallel_tool_calls,
            max_tool_calls,
            metadata,
        )
        return result  # type: ignore[no-any-return]
--- a/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
@ -336,6 +336,7 @@ class OpenAIResponsesImpl:
        guardrails: list[str | ResponseGuardrailSpec] | None = None,
        parallel_tool_calls: bool | None = None,
        max_tool_calls: int | None = None,
        metadata: dict[str, str] | None = None,
    ):
        stream = bool(stream)
        text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text
@ -390,6 +391,7 @@ class OpenAIResponsesImpl:
            guardrail_ids=guardrail_ids,
            parallel_tool_calls=parallel_tool_calls,
            max_tool_calls=max_tool_calls,
            metadata=metadata,
        )
        if stream:
@ -442,6 +444,7 @@ class OpenAIResponsesImpl:
        guardrail_ids: list[str] | None = None,
        parallel_tool_calls: bool | None = True,
        max_tool_calls: int | None = None,
        metadata: dict[str, str] | None = None,
    ) -> AsyncIterator[OpenAIResponseObjectStream]:
        # These should never be None when called from create_openai_response (which sets defaults)
        # but we assert here to help mypy understand the types
@ -490,6 +493,7 @@ class OpenAIResponsesImpl:
            guardrail_ids=guardrail_ids,
            instructions=instructions,
            max_tool_calls=max_tool_calls,
            metadata=metadata,
        )
        # Stream the response
--- a/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
@ -8,7 +8,8 @@ import uuid
 from collections.abc import AsyncIterator
 from typing import Any
-from llama_stack.core.telemetry import tracing
+from opentelemetry import trace
 from llama_stack.log import get_logger
 from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
 from llama_stack_api import (
@ -79,6 +80,7 @@ from .utils import (
 )
 logger = get_logger(name=__name__, category="agents::meta_reference")
 tracer = trace.get_tracer(__name__)
 def convert_tooldef_to_chat_tool(tool_def):
@ -118,6 +120,7 @@ class StreamingResponseOrchestrator:
        prompt: OpenAIResponsePrompt | None = None,
        parallel_tool_calls: bool | None = None,
        max_tool_calls: int | None = None,
        metadata: dict[str, str] | None = None,
    ):
        self.inference_api = inference_api
        self.ctx = ctx
@ -135,6 +138,7 @@ class StreamingResponseOrchestrator:
        self.parallel_tool_calls = parallel_tool_calls
        # Max number of total calls to built-in tools that can be processed in a response
        self.max_tool_calls = max_tool_calls
        self.metadata = metadata
        self.sequence_number = 0
        # Store MCP tool mapping that gets built during tool processing
        self.mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP] = (
@ -162,6 +166,7 @@ class StreamingResponseOrchestrator:
            model=self.ctx.model,
            status="completed",
            output=[OpenAIResponseMessage(role="assistant", content=[refusal_content], type="message")],
            metadata=self.metadata,
        )
        return OpenAIResponseObjectStreamResponseCompleted(response=refusal_response)
@ -197,6 +202,7 @@ class StreamingResponseOrchestrator:
            prompt=self.prompt,
            parallel_tool_calls=self.parallel_tool_calls,
            max_tool_calls=self.max_tool_calls,
            metadata=self.metadata,
        )
    async def create_response(self) -> AsyncIterator[OpenAIResponseObjectStream]:
@ -1106,8 +1112,10 @@ class StreamingResponseOrchestrator:
                "server_url": mcp_tool.server_url,
                "mcp_list_tools_id": list_id,
            }
-            # List MCP tools with authorization from tool config
+
-            async with tracing.span("list_mcp_tools", attributes):
+            # TODO: follow semantic conventions for Open Telemetry tool spans
            # https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/#execute-tool-span
            with tracer.start_as_current_span("list_mcp_tools", attributes=attributes):
                tool_defs = await list_mcp_tools(
                    endpoint=mcp_tool.server_url,
                    headers=mcp_tool.headers,
@ -1183,9 +1191,9 @@ class StreamingResponseOrchestrator:
        if mcp_server.require_approval == "never":
            return False
        if isinstance(mcp_server, ApprovalFilter):
-            if tool_name in mcp_server.always:
+            if mcp_server.always and tool_name in mcp_server.always:
                return True
-            if tool_name in mcp_server.never:
+            if mcp_server.never and tool_name in mcp_server.never:
                return False
        return True
--- a/src/llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py
@ -9,7 +9,8 @@ import json
 from collections.abc import AsyncIterator
 from typing import Any
-from llama_stack.core.telemetry import tracing
+from opentelemetry import trace
 from llama_stack.log import get_logger
 from llama_stack_api import (
    ImageContentItem,
@ -42,6 +43,7 @@ from llama_stack_api import (
 from .types import ChatCompletionContext, ToolExecutionResult
 logger = get_logger(name=__name__, category="agents::meta_reference")
 tracer = trace.get_tracer(__name__)
 class ToolExecutor:
@ -296,8 +298,9 @@ class ToolExecutor:
                    "server_url": mcp_tool.server_url,
                    "tool_name": function_name,
                }
-                # Invoke MCP tool with authorization from tool config
+                # TODO: follow semantic conventions for Open Telemetry tool spans
-                async with tracing.span("invoke_mcp_tool", attributes):
+                # https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/#execute-tool-span
                with tracer.start_as_current_span("invoke_mcp_tool", attributes=attributes):
                    result = await invoke_mcp_tool(
                        endpoint=mcp_tool.server_url,
                        tool_name=function_name,
@ -318,7 +321,7 @@ class ToolExecutor:
                    # Use vector_stores.search API instead of knowledge_search tool
                    # to support filters and ranking_options
                    query = tool_kwargs.get("query", "")
-                    async with tracing.span("knowledge_search", {}):
+                    with tracer.start_as_current_span("knowledge_search"):
                        result = await self._execute_knowledge_search_via_vector_store(
                            query=query,
                            response_file_search_tool=response_file_search_tool,
@ -327,7 +330,9 @@ class ToolExecutor:
                attributes = {
                    "tool_name": function_name,
                }
-                async with tracing.span("invoke_tool", attributes):
+                # TODO: follow semantic conventions for Open Telemetry tool spans
                # https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/#execute-tool-span
                with tracer.start_as_current_span("invoke_tool", attributes=attributes):
                    result = await self.tool_runtime_api.invoke_tool(
                        tool_name=function_name,
                        kwargs=tool_kwargs,
--- a/src/llama_stack/providers/inline/agents/meta_reference/safety.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/safety.py
@ -6,7 +6,6 @@
 import asyncio
 from llama_stack.core.telemetry import tracing
 from llama_stack.log import get_logger
 from llama_stack_api import OpenAIMessageParam, Safety, SafetyViolation, ViolationLevel
@ -31,15 +30,12 @@ class ShieldRunnerMixin:
        self.output_shields = output_shields
    async def run_multiple_shields(self, messages: list[OpenAIMessageParam], identifiers: list[str]) -> None:
-        async def run_shield_with_span(identifier: str):
+        responses = await asyncio.gather(
-            async with tracing.span(f"run_shield_{identifier}"):
+            *[
-                return await self.safety_api.run_shield(
+                self.safety_api.run_shield(shield_id=identifier, messages=messages, params={})
-                    shield_id=identifier,
+                for identifier in identifiers
-                    messages=messages,
+            ]
-                    params={},
+        )
                )
        responses = await asyncio.gather(*[run_shield_with_span(identifier) for identifier in identifiers])
        for identifier, response in zip(identifiers, responses, strict=False):
            if not response.violation:
                continue
--- a/src/llama_stack/providers/remote/files/s3/files.py
+++ b/src/llama_stack/providers/remote/files/s3/files.py
@ -4,8 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 from __future__ import annotations
 import uuid
 from datetime import UTC, datetime
 from typing import TYPE_CHECKING, Annotated, Any, cast
@ -39,7 +37,7 @@ from .config import S3FilesImplConfig
 # TODO: provider data for S3 credentials
-def _create_s3_client(config: S3FilesImplConfig) -> S3Client:
+def _create_s3_client(config: S3FilesImplConfig) -> "S3Client":
    try:
        s3_config = {
            "region_name": config.region,
@ -66,7 +64,7 @@ def _create_s3_client(config: S3FilesImplConfig) -> S3Client:
        raise RuntimeError(f"Failed to initialize S3 client: {e}") from e
-async def _create_bucket_if_not_exists(client: S3Client, config: S3FilesImplConfig) -> None:
+async def _create_bucket_if_not_exists(client: "S3Client", config: S3FilesImplConfig) -> None:
    try:
        client.head_bucket(Bucket=config.bucket_name)
    except ClientError as e:
@ -192,7 +190,7 @@ class S3FilesImpl(Files):
        pass
    @property
-    def client(self) -> S3Client:
+    def client(self) -> "S3Client":
        assert self._client is not None, "Provider not initialized"
        return self._client
--- a/src/llama_stack/providers/remote/inference/bedrock/bedrock.py
+++ b/src/llama_stack/providers/remote/inference/bedrock/bedrock.py
@ -8,7 +8,6 @@ from collections.abc import AsyncIterator, Iterable
 from openai import AuthenticationError
 from llama_stack.core.telemetry.tracing import get_current_span
 from llama_stack.log import get_logger
 from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
 from llama_stack_api import (
@ -84,7 +83,7 @@ class BedrockInferenceAdapter(OpenAIMixin):
    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
        """Override to enable streaming usage metrics and handle authentication errors."""
        # Request usage metrics by default on every streaming call (no longer gated on an active telemetry span)
-        if params.stream and get_current_span() is not None:
+        if params.stream:
            if params.stream_options is None:
                params.stream_options = {"include_usage": True}
            elif "include_usage" not in params.stream_options:
--- a/src/llama_stack/providers/remote/inference/watsonx/watsonx.py
+++ b/src/llama_stack/providers/remote/inference/watsonx/watsonx.py
@ -10,7 +10,6 @@ from typing import Any
 import litellm
 import requests
 from llama_stack.core.telemetry.tracing import get_current_span
 from llama_stack.log import get_logger
 from llama_stack.providers.remote.inference.watsonx.config import WatsonXConfig
 from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
@ -59,7 +58,7 @@ class WatsonXInferenceAdapter(LiteLLMOpenAIMixin):
        # Add usage tracking by default for every streaming request (no longer gated on an active telemetry span)
        stream_options = params.stream_options
-        if params.stream and get_current_span() is not None:
+        if params.stream:
            if stream_options is None:
                stream_options = {"include_usage": True}
            elif "include_usage" not in stream_options:
--- a/src/llama_stack/providers/utils/inference/litellm_openai_mixin.py
+++ b/src/llama_stack/providers/utils/inference/litellm_openai_mixin.py
@ -217,10 +217,9 @@ class LiteLLMOpenAIMixin(
        params: OpenAIChatCompletionRequestWithExtraBody,
    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
        # Add usage tracking by default for every streaming request (no longer gated on an active telemetry span)
        from llama_stack.core.telemetry.tracing import get_current_span
        stream_options = params.stream_options
-        if params.stream and get_current_span() is not None:
+        if params.stream:
            if stream_options is None:
                stream_options = {"include_usage": True}
            elif "include_usage" not in stream_options:
--- a/src/llama_stack/providers/utils/tools/mcp.py
+++ b/src/llama_stack/providers/utils/tools/mcp.py
@ -89,6 +89,7 @@ async def client_wrapper(endpoint: str, headers: dict[str, str]) -> AsyncGenerat
                # sse_client and streamablehttp_client have different signatures, but both
                # are called the same way here, so we cast to Any to avoid type errors
                client = cast(Any, sse_client)
            async with client(endpoint, headers=headers) as client_streams:
                async with ClientSession(read_stream=client_streams[0], write_stream=client_streams[1]) as session:
                    await session.initialize()
--- a/src/llama_stack/telemetry/init.py
+++ b/src/llama_stack/telemetry/init.py
@ -0,0 +1,5 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
--- a/src/llama_stack/telemetry/constants.py
+++ b/src/llama_stack/telemetry/constants.py
@ -0,0 +1,27 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 """
 This file contains constants used for naming data captured for telemetry.
 This is used to ensure that the data captured for telemetry is consistent and can be used to
 identify and correlate data. If custom telemetry data is added to llama stack, please add
 constants for it here.
 """
 llama_stack_prefix = "llama_stack"
 # Safety Attributes
 RUN_SHIELD_OPERATION_NAME = "run_shield"
 SAFETY_REQUEST_PREFIX = f"{llama_stack_prefix}.safety.request"
 SAFETY_REQUEST_SHIELD_ID_ATTRIBUTE = f"{SAFETY_REQUEST_PREFIX}.shield_id"
 SAFETY_REQUEST_MESSAGES_ATTRIBUTE = f"{SAFETY_REQUEST_PREFIX}.messages"
 SAFETY_RESPONSE_PREFIX = f"{llama_stack_prefix}.safety.response"
 SAFETY_RESPONSE_METADATA_ATTRIBUTE = f"{SAFETY_RESPONSE_PREFIX}.metadata"
 SAFETY_RESPONSE_VIOLATION_LEVEL_ATTRIBUTE = f"{SAFETY_RESPONSE_PREFIX}.violation.level"
 SAFETY_RESPONSE_USER_MESSAGE_ATTRIBUTE = f"{SAFETY_RESPONSE_PREFIX}.violation.user_message"
--- a/src/llama_stack/telemetry/helpers.py
+++ b/src/llama_stack/telemetry/helpers.py
@ -0,0 +1,43 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import json
 from opentelemetry import trace
 from llama_stack_api import OpenAIMessageParam, RunShieldResponse
 from .constants import (
    RUN_SHIELD_OPERATION_NAME,
    SAFETY_REQUEST_MESSAGES_ATTRIBUTE,
    SAFETY_REQUEST_SHIELD_ID_ATTRIBUTE,
    SAFETY_RESPONSE_METADATA_ATTRIBUTE,
    SAFETY_RESPONSE_USER_MESSAGE_ATTRIBUTE,
    SAFETY_RESPONSE_VIOLATION_LEVEL_ATTRIBUTE,
 )
 def safety_span_name(shield_id: str) -> str:
    """Build the span name for a shield run.

    :param shield_id: Identifier of the shield being run.
    :returns: The run-shield operation name followed by a single space and ``shield_id``.
    """
    return f"{RUN_SHIELD_OPERATION_NAME} {shield_id}"
 # TODO: Consider using Wrapt to automatically instrument code
 # This is the industry-standard way to package automatic instrumentation in Python.
 def safety_request_span_attributes(
    shield_id: str, messages: list[OpenAIMessageParam], response: RunShieldResponse
 ) -> None:
    """Record a shield run's request and response details as attributes on the current span.

    Despite the name, this sets both request attributes (shield id, messages) and,
    when the response carries a violation, response attributes (metadata, user
    message, violation level).

    :param shield_id: Identifier of the shield that was run.
    :param messages: Messages that were sent to the shield; each is dumped with
        ``model_dump()`` and the list is stored as one JSON string.
    :param response: Result of the shield run; only its ``violation`` field is read.
    :returns: None. The only effect is setting attributes on the current span.
    """
    # Attributes go on whichever span is current at call time; no new span is started here.
    span = trace.get_current_span()
    span.set_attribute(SAFETY_REQUEST_SHIELD_ID_ATTRIBUTE, shield_id)
    # Serialize the whole message list into a single JSON string attribute.
    messages_json = json.dumps([msg.model_dump() for msg in messages])
    span.set_attribute(SAFETY_REQUEST_MESSAGES_ATTRIBUTE, messages_json)
    # Response attributes are recorded only when the shield reported a violation.
    if response.violation:
        # Metadata and user message are skipped when empty/None.
        if response.violation.metadata:
            metadata_json = json.dumps(response.violation.metadata)
            span.set_attribute(SAFETY_RESPONSE_METADATA_ATTRIBUTE, metadata_json)
        if response.violation.user_message:
            span.set_attribute(SAFETY_RESPONSE_USER_MESSAGE_ATTRIBUTE, response.violation.user_message)
        # The violation level is always recorded for a violation (presumably an enum; its .value is stored).
        span.set_attribute(SAFETY_RESPONSE_VIOLATION_LEVEL_ATTRIBUTE, response.violation.violation_level.value)
--- a/src/llama_stack_api/agents.py
+++ b/src/llama_stack_api/agents.py
@ -89,6 +89,7 @@ class Agents(Protocol):
            ),
        ] = None,
        max_tool_calls: int | None = None,
        metadata: dict[str, str] | None = None,
    ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
        """Create a model response.
@ -100,6 +101,7 @@ class Agents(Protocol):
        :param include: (Optional) Additional fields to include in the response.
        :param guardrails: (Optional) List of guardrails to apply during response generation. Can be guardrail IDs (strings) or guardrail specifications.
        :param max_tool_calls: (Optional) Max number of total calls to built-in tools that can be processed in a response.
        :param metadata: (Optional) Dictionary of metadata key-value pairs to attach to the response.
        :returns: An OpenAIResponseObject.
        """
        ...
--- a/src/llama_stack_api/common/tracing.py
+++ b/src/llama_stack_api/common/tracing.py
@ -1,22 +0,0 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 def telemetry_traceable(cls):
    """
    Mark a protocol for automatic tracing when telemetry is enabled.
    This is a metadata-only decorator with no dependencies on core.
    Actual tracing is applied by core routers at runtime if telemetry is enabled.
    Usage:
        @runtime_checkable
        @telemetry_traceable
        class MyProtocol(Protocol):
            ...

    :param cls: The class to mark.
    :returns: The same class object, with ``__marked_for_tracing__`` set to True.
    """
    # The class is flagged in place and returned as-is; no wrapper or subclass is created.
    cls.__marked_for_tracing__ = True
    return cls
--- a/src/llama_stack_api/conversations.py
+++ b/src/llama_stack_api/conversations.py
@ -9,7 +9,6 @@ from typing import Annotated, Literal, Protocol, runtime_checkable
 from pydantic import BaseModel, Field
 from llama_stack_api.common.tracing import telemetry_traceable
 from llama_stack_api.openai_responses import (
    OpenAIResponseInputFunctionToolCallOutput,
    OpenAIResponseMCPApprovalRequest,
@ -157,7 +156,6 @@ class ConversationItemDeletedResource(BaseModel):
@runtime_checkable
@telemetry_traceable
 class Conversations(Protocol):
    """Conversations
--- a/src/llama_stack_api/files.py
+++ b/src/llama_stack_api/files.py
@ -11,7 +11,6 @@ from fastapi import File, Form, Response, UploadFile
 from pydantic import BaseModel, Field
 from llama_stack_api.common.responses import Order
 from llama_stack_api.common.tracing import telemetry_traceable
 from llama_stack_api.schema_utils import json_schema_type, webmethod
 from llama_stack_api.version import LLAMA_STACK_API_V1
@ -102,7 +101,6 @@ class OpenAIFileDeleteResponse(BaseModel):
@runtime_checkable
@telemetry_traceable
 class Files(Protocol):
    """Files
--- a/src/llama_stack_api/inference.py
+++ b/src/llama_stack_api/inference.py
@ -22,7 +22,6 @@ from llama_stack_api.common.content_types import InterleavedContent
 from llama_stack_api.common.responses import (
    Order,
 )
 from llama_stack_api.common.tracing import telemetry_traceable
 from llama_stack_api.models import Model
 from llama_stack_api.schema_utils import json_schema_type, register_schema, webmethod
 from llama_stack_api.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
@ -989,7 +988,6 @@ class OpenAIEmbeddingsRequestWithExtraBody(BaseModel, extra="allow"):
@runtime_checkable
@telemetry_traceable
 class InferenceProvider(Protocol):
    """
    This protocol defines the interface that should be implemented by all inference providers.
--- a/src/llama_stack_api/models.py
+++ b/src/llama_stack_api/models.py
@ -9,7 +9,6 @@ from typing import Any, Literal, Protocol, runtime_checkable
 from pydantic import BaseModel, ConfigDict, Field, field_validator
 from llama_stack_api.common.tracing import telemetry_traceable
 from llama_stack_api.resource import Resource, ResourceType
 from llama_stack_api.schema_utils import json_schema_type, webmethod
 from llama_stack_api.version import LLAMA_STACK_API_V1
@ -106,7 +105,6 @@ class OpenAIListModelsResponse(BaseModel):
@runtime_checkable
@telemetry_traceable
 class Models(Protocol):
    async def list_models(self) -> ListModelsResponse:
        """List all models.
--- a/src/llama_stack_api/openai_responses.py
+++ b/src/llama_stack_api/openai_responses.py
@ -597,6 +597,7 @@ class OpenAIResponseObject(BaseModel):
    :param usage: (Optional) Token usage information for the response
    :param instructions: (Optional) System message inserted into the model's context
    :param max_tool_calls: (Optional) Max number of total calls to built-in tools that can be processed in a response
    :param metadata: (Optional) Dictionary of metadata key-value pairs
    """
    created_at: int
@ -619,6 +620,7 @@ class OpenAIResponseObject(BaseModel):
    usage: OpenAIResponseUsage | None = None
    instructions: str | None = None
    max_tool_calls: int | None = None
    metadata: dict[str, str] | None = None
@json_schema_type
--- a/src/llama_stack_api/prompts.py
+++ b/src/llama_stack_api/prompts.py
@ -10,7 +10,6 @@ from typing import Protocol, runtime_checkable
 from pydantic import BaseModel, Field, field_validator, model_validator
 from llama_stack_api.common.tracing import telemetry_traceable
 from llama_stack_api.schema_utils import json_schema_type, webmethod
 from llama_stack_api.version import LLAMA_STACK_API_V1
@ -93,7 +92,6 @@ class ListPromptsResponse(BaseModel):
@runtime_checkable
@telemetry_traceable
 class Prompts(Protocol):
    """Prompts
--- a/src/llama_stack_api/safety.py
+++ b/src/llama_stack_api/safety.py
@ -9,7 +9,6 @@ from typing import Any, Protocol, runtime_checkable
 from pydantic import BaseModel, Field
 from llama_stack_api.common.tracing import telemetry_traceable
 from llama_stack_api.inference import OpenAIMessageParam
 from llama_stack_api.schema_utils import json_schema_type, webmethod
 from llama_stack_api.shields import Shield
@ -94,7 +93,6 @@ class ShieldStore(Protocol):
@runtime_checkable
@telemetry_traceable
 class Safety(Protocol):
    """Safety
--- a/src/llama_stack_api/shields.py
+++ b/src/llama_stack_api/shields.py
@ -8,7 +8,6 @@ from typing import Any, Literal, Protocol, runtime_checkable
 from pydantic import BaseModel
 from llama_stack_api.common.tracing import telemetry_traceable
 from llama_stack_api.resource import Resource, ResourceType
 from llama_stack_api.schema_utils import json_schema_type, webmethod
 from llama_stack_api.version import LLAMA_STACK_API_V1
@ -49,7 +48,6 @@ class ListShieldsResponse(BaseModel):
@runtime_checkable
@telemetry_traceable
 class Shields(Protocol):
    @webmethod(route="/shields", method="GET", level=LLAMA_STACK_API_V1)
    async def list_shields(self) -> ListShieldsResponse:
--- a/src/llama_stack_api/tools.py
+++ b/src/llama_stack_api/tools.py
@ -11,7 +11,6 @@ from pydantic import BaseModel
 from typing_extensions import runtime_checkable
 from llama_stack_api.common.content_types import URL, InterleavedContent
 from llama_stack_api.common.tracing import telemetry_traceable
 from llama_stack_api.resource import Resource, ResourceType
 from llama_stack_api.schema_utils import json_schema_type, webmethod
 from llama_stack_api.version import LLAMA_STACK_API_V1
@ -109,7 +108,6 @@ class ListToolDefsResponse(BaseModel):
@runtime_checkable
@telemetry_traceable
 class ToolGroups(Protocol):
    @webmethod(route="/toolgroups", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
    async def register_tool_group(
@ -128,7 +126,7 @@ class ToolGroups(Protocol):
        """
        ...
-    @webmethod(route="/toolgroups/{toolgroup_id:path}", method="GET", level=LLAMA_STACK_API_V1)
+    @webmethod(route="/toolgroups/{toolgroup_id:path}", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
    async def get_tool_group(
        self,
        toolgroup_id: str,
@ -140,7 +138,7 @@ class ToolGroups(Protocol):
        """
        ...
-    @webmethod(route="/toolgroups", method="GET", level=LLAMA_STACK_API_V1)
+    @webmethod(route="/toolgroups", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
    async def list_tool_groups(self) -> ListToolGroupsResponse:
        """List tool groups with optional provider.
@ -148,7 +146,7 @@ class ToolGroups(Protocol):
        """
        ...
-    @webmethod(route="/tools", method="GET", level=LLAMA_STACK_API_V1)
+    @webmethod(route="/tools", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
    async def list_tools(self, toolgroup_id: str | None = None) -> ListToolDefsResponse:
        """List tools with optional tool group.
@ -157,7 +155,7 @@ class ToolGroups(Protocol):
        """
        ...
-    @webmethod(route="/tools/{tool_name:path}", method="GET", level=LLAMA_STACK_API_V1)
+    @webmethod(route="/tools/{tool_name:path}", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
    async def get_tool(
        self,
        tool_name: str,
@ -191,12 +189,11 @@ class SpecialToolGroup(Enum):
@runtime_checkable
@telemetry_traceable
 class ToolRuntime(Protocol):
    tool_store: ToolStore | None = None
    # TODO: This needs to be renamed once OPEN API generator name conflict issue is fixed.
-    @webmethod(route="/tool-runtime/list-tools", method="GET", level=LLAMA_STACK_API_V1)
+    @webmethod(route="/tool-runtime/list-tools", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
    async def list_runtime_tools(
        self,
        tool_group_id: str | None = None,
@ -212,7 +209,7 @@ class ToolRuntime(Protocol):
        """
        ...
-    @webmethod(route="/tool-runtime/invoke", method="POST", level=LLAMA_STACK_API_V1)
+    @webmethod(route="/tool-runtime/invoke", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
    async def invoke_tool(
        self,
        tool_name: str,
--- a/src/llama_stack_api/vector_io.py
+++ b/src/llama_stack_api/vector_io.py
@ -13,7 +13,6 @@ from typing import Annotated, Any, Literal, Protocol, runtime_checkable
 from fastapi import Body, Query
 from pydantic import BaseModel, Field, field_validator
 from llama_stack_api.common.tracing import telemetry_traceable
 from llama_stack_api.inference import InterleavedContent
 from llama_stack_api.schema_utils import json_schema_type, register_schema, webmethod
 from llama_stack_api.vector_stores import VectorStore
@ -572,7 +571,6 @@ class VectorStoreTable(Protocol):
@runtime_checkable
@telemetry_traceable
 class VectorIO(Protocol):
    vector_store_table: VectorStoreTable | None = None
--- a/tests/integration/inference/test_provider_data_routing.py
+++ b/tests/integration/inference/test_provider_data_routing.py
@ -17,7 +17,6 @@ from unittest.mock import AsyncMock, patch
 import pytest
 from llama_stack.core.library_client import LlamaStackAsLibraryClient
 from llama_stack.core.telemetry.telemetry import MetricEvent
 from llama_stack_api import (
    Api,
    OpenAIAssistantMessageParam,
@ -27,10 +26,6 @@ from llama_stack_api import (
 )
 class OpenAIChatCompletionWithMetrics(OpenAIChatCompletion):
    metrics: list[MetricEvent] | None = None
 def test_unregistered_model_routing_with_provider_data(client_with_models):
    """
    Test that a model can be routed using provider_id/model_id format
@ -72,7 +67,7 @@ def test_unregistered_model_routing_with_provider_data(client_with_models):
    # The inference router's routing_table.impls_by_provider_id should have anthropic
    # Let's patch the anthropic provider's openai_chat_completion method
    # to avoid making real API calls
-    mock_response = OpenAIChatCompletionWithMetrics(
+    mock_response = OpenAIChatCompletion(
        id="chatcmpl-test-123",
        created=1234567890,
        model="claude-3-5-sonnet-20241022",
--- a/tests/integration/telemetry/collectors/in_memory.py
+++ b/tests/integration/telemetry/collectors/in_memory.py
@ -15,11 +15,10 @@ from opentelemetry.sdk.trace import TracerProvider
 from opentelemetry.sdk.trace.export import SimpleSpanProcessor
 from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
 import llama_stack.core.telemetry.telemetry as telemetry_module
 from .base import BaseTelemetryCollector, MetricStub, SpanStub
 # TODO: Fix this to work with Automatic Instrumentation
 class InMemoryTelemetryCollector(BaseTelemetryCollector):
    """In-memory telemetry collector for library-client tests.
@ -75,13 +74,10 @@ class InMemoryTelemetryManager:
        meter_provider = MeterProvider(metric_readers=[metric_reader])
        metrics.set_meter_provider(meter_provider)
        telemetry_module._TRACER_PROVIDER = tracer_provider
        self.collector = InMemoryTelemetryCollector(span_exporter, metric_reader)
        self._tracer_provider = tracer_provider
        self._meter_provider = meter_provider
    def shutdown(self) -> None:
        telemetry_module._TRACER_PROVIDER = None
        self._tracer_provider.shutdown()
        self._meter_provider.shutdown()
--- a/tests/integration/telemetry/conftest.py
+++ b/tests/integration/telemetry/conftest.py
@ -15,6 +15,7 @@ from tests.integration.fixtures.common import instantiate_llama_stack_client
 from tests.integration.telemetry.collectors import InMemoryTelemetryManager, OtlpHttpTestCollector
 # TODO: Fix this to work with Automatic Instrumentation
@pytest.fixture(scope="session")
 def telemetry_test_collector():
    stack_mode = os.environ.get("LLAMA_STACK_TEST_STACK_CONFIG_TYPE", "library_client")
@ -48,6 +49,7 @@ def telemetry_test_collector():
            manager.shutdown()
 # TODO: Fix this to work with Automatic Instrumentation
@pytest.fixture(scope="session")
 def llama_stack_client(telemetry_test_collector, request):
    """Ensure telemetry collector is ready before initializing the stack client."""
--- a/tests/unit/cli/test_stack_config.py
+++ b/tests/unit/cli/test_stack_config.py
@ -155,9 +155,6 @@ def old_config():
              provider_type: inline::meta-reference
              config: {{}}
        api_providers:
          telemetry:
            provider_type: noop
            config: {{}}
    """
    )
@ -181,7 +178,7 @@ def test_parse_and_maybe_upgrade_config_up_to_date(up_to_date_config):
 def test_parse_and_maybe_upgrade_config_old_format(old_config):
    result = parse_and_maybe_upgrade_config(old_config)
    assert result.version == LLAMA_STACK_RUN_CONFIG_VERSION
-    assert all(api in result.providers for api in ["inference", "safety", "memory", "telemetry"])
+    assert all(api in result.providers for api in ["inference", "safety", "memory"])
    safety_provider = result.providers["safety"][0]
    assert safety_provider.provider_type == "inline::meta-reference"
    assert "llama_guard_shield" in safety_provider.config
--- a/tests/unit/providers/agents/meta_reference/test_safety_optional.py
+++ b/tests/unit/providers/agents/meta_reference/test_safety_optional.py
@ -83,7 +83,7 @@ class TestProviderInitialization:
            new_callable=AsyncMock,
        ):
            # Should not raise any exception
-            provider = await get_provider_impl(config, mock_deps, policy=[], telemetry_enabled=False)
+            provider = await get_provider_impl(config, mock_deps, policy=[])
            assert provider is not None
    async def test_initialization_without_safety_api(self, mock_persistence_config, mock_deps):
@ -97,7 +97,7 @@ class TestProviderInitialization:
            new_callable=AsyncMock,
        ):
            # Should not raise any exception
-            provider = await get_provider_impl(config, mock_deps, policy=[], telemetry_enabled=False)
+            provider = await get_provider_impl(config, mock_deps, policy=[])
            assert provider is not None
            assert provider.safety_api is None
--- a/tests/unit/server/test_auth.py
+++ b/tests/unit/server/test_auth.py
@ -364,23 +364,6 @@ def test_invalid_auth_header_format_oauth2(oauth2_client):
    assert "Invalid Authorization header format" in response.json()["error"]["message"]
 async def mock_jwks_response(*args, **kwargs):
    return MockResponse(
        200,
        {
            "keys": [
                {
                    "kid": "1234567890",
                    "kty": "oct",
                    "alg": "HS256",
                    "use": "sig",
                    "k": base64.b64encode(b"foobarbaz").decode(),
                }
            ]
        },
    )
@pytest.fixture
 def jwt_token_valid():
    import jwt
@ -421,28 +404,60 @@ def mock_jwks_urlopen():
        yield mock_urlopen
@pytest.fixture
 def mock_jwks_urlopen_with_auth_required():
    """Mock urllib.request.urlopen that requires Bearer token for JWKS requests."""
    with patch("urllib.request.urlopen") as mock_urlopen:
        def side_effect(request, **kwargs):
            # Check if Authorization header is present
            auth_header = request.headers.get("Authorization") if hasattr(request, "headers") else None
            if not auth_header or not auth_header.startswith("Bearer "):
                # Simulate 401 Unauthorized
                import urllib.error
                raise urllib.error.HTTPError(
                    url=request.full_url if hasattr(request, "full_url") else "",
                    code=401,
                    msg="Unauthorized",
                    hdrs={},
                    fp=None,
                )
            # Mock the JWKS response for PyJWKClient
            mock_response = Mock()
            mock_response.read.return_value = json.dumps(
                {
                    "keys": [
                        {
                            "kid": "1234567890",
                            "kty": "oct",
                            "alg": "HS256",
                            "use": "sig",
                            "k": base64.b64encode(b"foobarbaz").decode(),
                        }
                    ]
                }
            ).encode()
            return mock_response
        mock_urlopen.side_effect = side_effect
        yield mock_urlopen
 def test_valid_oauth2_authentication(oauth2_client, jwt_token_valid, mock_jwks_urlopen):
    response = oauth2_client.get("/test", headers={"Authorization": f"Bearer {jwt_token_valid}"})
    assert response.status_code == 200
    assert response.json() == {"message": "Authentication successful"}
-@patch("httpx.AsyncClient.get", new=mock_jwks_response)
+def test_invalid_oauth2_authentication(oauth2_client, invalid_token, mock_jwks_urlopen, suppress_auth_errors):
 def test_invalid_oauth2_authentication(oauth2_client, invalid_token, suppress_auth_errors):
    response = oauth2_client.get("/test", headers={"Authorization": f"Bearer {invalid_token}"})
    assert response.status_code == 401
    assert "Invalid JWT token" in response.json()["error"]["message"]
 async def mock_auth_jwks_response(*args, **kwargs):
    if "headers" not in kwargs or "Authorization" not in kwargs["headers"]:
        return MockResponse(401, {})
    authz = kwargs["headers"]["Authorization"]
    if authz != "Bearer my-jwks-token":
        return MockResponse(401, {})
    return await mock_jwks_response(args, kwargs)
@pytest.fixture
 def oauth2_app_with_jwks_token():
    app = FastAPI()
@ -472,8 +487,9 @@ def oauth2_client_with_jwks_token(oauth2_app_with_jwks_token):
    return TestClient(oauth2_app_with_jwks_token)
-@patch("httpx.AsyncClient.get", new=mock_auth_jwks_response)
+def test_oauth2_with_jwks_token_expected(
-def test_oauth2_with_jwks_token_expected(oauth2_client, jwt_token_valid, suppress_auth_errors):
+    oauth2_client, jwt_token_valid, mock_jwks_urlopen_with_auth_required, suppress_auth_errors
 ):
    response = oauth2_client.get("/test", headers={"Authorization": f"Bearer {jwt_token_valid}"})
    assert response.status_code == 401