Merge remote-tracking branch 'origin/main' into stores

2025-12-12 20:12:33 +00:00 · 2025-10-20 10:49:06 -07:00 · 2025-10-20 10:49:06 -07:00 · 490b212576
commit 490b212576
parent 11bd202f18 359df3a37c
89 changed files with 19353 additions and 8323 deletions
--- a/tests/integration/batches/conftest.py
+++ b/tests/integration/batches/conftest.py
@ -70,10 +70,15 @@ class BatchHelper:
    ):
        """Wait for a batch to reach a terminal status.

+        Uses exponential backoff polling strategy for efficient waiting:
+        - Starts with short intervals (0.1s) for fast batches (e.g., replay mode)
+        - Doubles interval each iteration up to a maximum
+        - Adapts automatically to both fast and slow batch processing
+
        Args:
            batch_id: The batch ID to monitor
            max_wait_time: Maximum time to wait in seconds (default: 60 seconds)
-            sleep_interval: Time to sleep between checks in seconds (default: 1/10th of max_wait_time, min 1s, max 15s)
+            sleep_interval: If provided, uses fixed interval instead of exponential backoff
            expected_statuses: Set of expected terminal statuses (default: {"completed"})
            timeout_action: Action on timeout - "fail" (pytest.fail) or "skip" (pytest.skip)

@ -84,10 +89,6 @@ class BatchHelper:
            pytest.Failed: If batch reaches an unexpected status or timeout_action is "fail"
            pytest.Skipped: If timeout_action is "skip" on timeout or unexpected status
        """
-        if sleep_interval is None:
-            # Default to 1/10th of max_wait_time, with min 1s and max 15s
-            sleep_interval = max(1, min(15, max_wait_time // 10))
-
        if expected_statuses is None:
            expected_statuses = {"completed"}

@ -95,6 +96,15 @@ class BatchHelper:
        unexpected_statuses = terminal_statuses - expected_statuses

        start_time = time.time()
+
+        # Use exponential backoff if no explicit sleep_interval provided
+        if sleep_interval is None:
+            current_interval = 0.1  # Start with 100ms
+            max_interval = 10.0  # Cap at 10 seconds
+        else:
+            current_interval = sleep_interval
+            max_interval = sleep_interval
+
        while time.time() - start_time < max_wait_time:
            current_batch = self.client.batches.retrieve(batch_id)

@ -107,7 +117,11 @@ class BatchHelper:
                else:
                    pytest.fail(error_msg)

-            time.sleep(sleep_interval)
+            time.sleep(current_interval)
+
+            # Exponential backoff: double the interval each time, up to max
+            if sleep_interval is None:
+                current_interval = min(current_interval * 2, max_interval)

        timeout_msg = f"Batch did not reach expected status {expected_statuses} within {max_wait_time} seconds"
        if timeout_action == "skip":
--- a/tests/integration/common/recordings/ab1a32474062bbad640ce43d02d6b61ed9f174c225597f0241cf120c47c7d2fa.json
+++ b/tests/integration/common/recordings/ab1a32474062bbad640ce43d02d6b61ed9f174c225597f0241cf120c47c7d2fa.json
@ -0,0 +1,506 @@
+{
+  "test_id": null,
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b-instruct-fp16",
+      "messages": [
+        {
+          "role": "system",
+          "content": "You are a helpful assistant"
+        },
+        {
+          "role": "user",
+          "content": "What is 2 + 2?"
+        },
+        {
+          "role": "assistant",
+          "content": "The answer to the equation 2 + 2 is 4."
+        },
+        {
+          "role": "user",
+          "content": "Tell me a short joke"
+        }
+      ],
+      "max_tokens": 0,
+      "stream": true
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama3.2:3b-instruct-fp16"
+  },
+  "response": {
+    "body": [
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-ab1a32474062",
+          "choices": [
+            {
+              "delta": {
+                "content": "Why",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-ab1a32474062",
+          "choices": [
+            {
+              "delta": {
+                "content": " did",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-ab1a32474062",
+          "choices": [
+            {
+              "delta": {
+                "content": " the",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-ab1a32474062",
+          "choices": [
+            {
+              "delta": {
+                "content": " scare",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-ab1a32474062",
+          "choices": [
+            {
+              "delta": {
+                "content": "crow",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-ab1a32474062",
+          "choices": [
+            {
+              "delta": {
+                "content": " win",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-ab1a32474062",
+          "choices": [
+            {
+              "delta": {
+                "content": " an",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-ab1a32474062",
+          "choices": [
+            {
+              "delta": {
+                "content": " award",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-ab1a32474062",
+          "choices": [
+            {
+              "delta": {
+                "content": "?\n\n",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-ab1a32474062",
+          "choices": [
+            {
+              "delta": {
+                "content": "Because",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-ab1a32474062",
+          "choices": [
+            {
+              "delta": {
+                "content": " he",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-ab1a32474062",
+          "choices": [
+            {
+              "delta": {
+                "content": " was",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-ab1a32474062",
+          "choices": [
+            {
+              "delta": {
+                "content": " outstanding",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-ab1a32474062",
+          "choices": [
+            {
+              "delta": {
+                "content": " in",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-ab1a32474062",
+          "choices": [
+            {
+              "delta": {
+                "content": " his",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-ab1a32474062",
+          "choices": [
+            {
+              "delta": {
+                "content": " field",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-ab1a32474062",
+          "choices": [
+            {
+              "delta": {
+                "content": "!",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-ab1a32474062",
+          "choices": [
+            {
+              "delta": {
+                "content": "",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": "stop",
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      }
+    ],
+    "is_streaming": true
+  }
+}
--- a/tests/integration/common/recordings/models-64a2277c90f0f42576f60c1030e3a020403d34a95f56931b792d5939f4cebc57-826d44c3.json
+++ b/tests/integration/common/recordings/models-64a2277c90f0f42576f60c1030e3a020403d34a95f56931b792d5939f4cebc57-826d44c3.json
@ -0,0 +1,88 @@
+{
+  "test_id": null,
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/models",
+    "headers": {},
+    "body": {},
+    "endpoint": "/v1/models",
+    "model": ""
+  },
+  "response": {
+    "body": [
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "llama3.2:3b-instruct-fp16",
+          "created": 1760453641,
+          "object": "model",
+          "owned_by": "library"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "qwen3:4b",
+          "created": 1757615302,
+          "object": "model",
+          "owned_by": "library"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "gpt-oss:latest",
+          "created": 1756395223,
+          "object": "model",
+          "owned_by": "library"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "nomic-embed-text:latest",
+          "created": 1756318548,
+          "object": "model",
+          "owned_by": "library"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "llama3.2:3b",
+          "created": 1755191039,
+          "object": "model",
+          "owned_by": "library"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "all-minilm:l6-v2",
+          "created": 1753968177,
+          "object": "model",
+          "owned_by": "library"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "llama3.2:1b",
+          "created": 1746124735,
+          "object": "model",
+          "owned_by": "library"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "llama3.2:latest",
+          "created": 1746044170,
+          "object": "model",
+          "owned_by": "library"
+        }
+      }
+    ],
+    "is_streaming": false
+  }
+}
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@ -42,7 +42,9 @@ def pytest_sessionstart(session):

    # Set test stack config type for api_recorder test isolation
    stack_config = session.config.getoption("--stack-config", default=None)
-    if stack_config and (stack_config.startswith("server:") or stack_config.startswith("http")):
+    if stack_config and (
+        stack_config.startswith("server:") or stack_config.startswith("docker:") or stack_config.startswith("http")
+    ):
        os.environ["LLAMA_STACK_TEST_STACK_CONFIG_TYPE"] = "server"
        logger.info(f"Test stack config type: server (stack_config={stack_config})")
    else:
@ -139,7 +141,9 @@ def pytest_addoption(parser):
            a 'pointer' to the stack. this can be either be:
            (a) a template name like `starter`, or
            (b) a path to a run.yaml file, or
-            (c) an adhoc config spec, e.g. `inference=fireworks,safety=llama-guard,agents=meta-reference`
+            (c) an adhoc config spec, e.g. `inference=fireworks,safety=llama-guard,agents=meta-reference`, or
+            (d) a server config like `server:ci-tests`, or
+            (e) a docker config like `docker:ci-tests` (builds and runs container)
            """
        ),
    )
--- a/tests/integration/telemetry/conftest.py
+++ b/tests/integration/telemetry/conftest.py
@ -0,0 +1,95 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""Telemetry test configuration using OpenTelemetry SDK exporters.
+
+This conftest provides in-memory telemetry collection for library_client mode only.
+Tests using these fixtures should skip in server mode since the in-memory collector
+cannot access spans from a separate server process.
+"""
+
+from typing import Any
+
+import opentelemetry.metrics as otel_metrics
+import opentelemetry.trace as otel_trace
+import pytest
+from opentelemetry import metrics, trace
+from opentelemetry.sdk.metrics import MeterProvider
+from opentelemetry.sdk.metrics.export import InMemoryMetricReader
+from opentelemetry.sdk.trace import ReadableSpan, TracerProvider
+from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
+
+import llama_stack.providers.inline.telemetry.meta_reference.telemetry as telemetry_module
+from llama_stack.testing.api_recorder import patch_httpx_for_test_id
+from tests.integration.fixtures.common import instantiate_llama_stack_client
+
+
+class TestCollector:
+    """Test-side accessor for telemetry captured by a span exporter and a metric reader.
+
+    Holds the two collaborators handed to it and exposes read/reset helpers so
+    tests can inspect the spans and metrics recorded so far.
+    """
+
+    def __init__(self, span_exp, metric_read):
+        """Store the span exporter and metric reader; both must be truthy."""
+        assert span_exp and metric_read
+        self.span_exporter = span_exp
+        self.metric_reader = metric_read
+
+    def get_spans(self) -> tuple[ReadableSpan, ...]:
+        """Return the finished spans reported by the span exporter."""
+        return self.span_exporter.get_finished_spans()
+
+    def get_metrics(self) -> Any | None:
+        """Return the metrics of the first scope of the first resource, or None.
+
+        Only ``resource_metrics[0].scope_metrics[0]`` is read; any further
+        resources or scopes in the reader's data are not returned.
+        """
+        # NOTE: this local name shadows the module-level `metrics` import inside this method.
+        metrics = self.metric_reader.get_metrics_data()
+        if metrics and metrics.resource_metrics:
+            # NOTE(review): scope_metrics[0] is indexed without an emptiness check —
+            # confirm the reader never returns a resource with no scopes.
+            return metrics.resource_metrics[0].scope_metrics[0].metrics
+        return None
+
+    def clear(self) -> None:
+        """Clear the span exporter and read the metric reader once, discarding the result."""
+        self.span_exporter.clear()
+        # Return value intentionally unused; presumably this drains the reader's
+        # pending metrics — TODO confirm against the reader implementation.
+        self.metric_reader.get_metrics_data()
+
+
+@pytest.fixture(scope="session")
+def _telemetry_providers():
+    """Set up in-memory OTEL providers before llama_stack_client initializes.
+
+    Yields:
+        A 4-tuple ``(span_exporter, metric_reader, tracer_provider, meter_provider)``.
+
+    On teardown the module-level tracer provider override is reset to ``None``
+    and both providers are shut down.
+    """
+    # Reset set-once flags to allow re-initialization
+    # NOTE(review): these underscore-prefixed attributes look like OpenTelemetry
+    # internals; the hasattr guards skip the reset when they are absent.
+    if hasattr(otel_trace, "_TRACER_PROVIDER_SET_ONCE"):
+        otel_trace._TRACER_PROVIDER_SET_ONCE._done = False  # type: ignore
+    if hasattr(otel_metrics, "_METER_PROVIDER_SET_ONCE"):
+        otel_metrics._METER_PROVIDER_SET_ONCE._done = False  # type: ignore
+
+    # Create in-memory exporters/readers
+    span_exporter = InMemorySpanExporter()
+    tracer_provider = TracerProvider()
+    tracer_provider.add_span_processor(SimpleSpanProcessor(span_exporter))
+    trace.set_tracer_provider(tracer_provider)
+
+    metric_reader = InMemoryMetricReader()
+    meter_provider = MeterProvider(metric_readers=[metric_reader])
+    metrics.set_meter_provider(meter_provider)
+
+    # Set module-level provider so TelemetryAdapter uses our in-memory providers
+    telemetry_module._TRACER_PROVIDER = tracer_provider
+
+    yield (span_exporter, metric_reader, tracer_provider, meter_provider)
+
+    # Teardown: undo the module-level override, then shut both providers down.
+    telemetry_module._TRACER_PROVIDER = None
+    tracer_provider.shutdown()
+    meter_provider.shutdown()
+
+
+@pytest.fixture(scope="session")
+def llama_stack_client(_telemetry_providers, request):
+    """Override llama_stack_client to ensure in-memory telemetry providers are used.
+
+    Requesting ``_telemetry_providers`` makes pytest set that fixture up before
+    the client is instantiated; its value is not otherwise used here.
+    """
+    # NOTE(review): presumably patches httpx so recorded requests carry the test id —
+    # confirm in llama_stack.testing.api_recorder.
+    patch_httpx_for_test_id()
+    client = instantiate_llama_stack_client(request.session)
+
+    return client
+
+
+@pytest.fixture
+def mock_otlp_collector(_telemetry_providers):
+    """Provides access to telemetry data and clears between tests.
+
+    Yields:
+        A ``TestCollector`` wrapping the span exporter and metric reader created
+        by the session-scoped ``_telemetry_providers`` fixture.
+    """
+    # The tracer and meter providers in the tuple are not needed here.
+    span_exporter, metric_reader, _, _ = _telemetry_providers
+    collector = TestCollector(span_exporter, metric_reader)
+    yield collector
+    # Runs after the test that requested this fixture.
+    collector.clear()
--- a/tests/integration/telemetry/recordings/0de60cd6a6ec3dbfc4a7601e77be8083caf34f49147ad1c25efae1de3f0b25e5.json
+++ b/tests/integration/telemetry/recordings/0de60cd6a6ec3dbfc4a7601e77be8083caf34f49147ad1c25efae1de3f0b25e5.json
@ -0,0 +1,57 @@
+{
+  "test_id": "tests/integration/telemetry/test_openai_telemetry.py::test_openai_completion_creates_telemetry[txt=ollama/llama3.2:3b-instruct-fp16]",
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b-instruct-fp16",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Test OpenAI telemetry creation"
+        }
+      ],
+      "stream": false
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama3.2:3b-instruct-fp16"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "rec-0de60cd6a6ec",
+        "choices": [
+          {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "I'm happy to help you with setting up and testing OpenAI's telemetry creation.\n\nOpenAI provides a feature called \"Telemetry\" which allows developers to collect data about their users' interactions with the model. To test this feature, we need to create a simple application that uses the OpenAI API and sends telemetry data to their servers.\n\nHere's an example code in Python that demonstrates how to create a simple telemetry creator:\n\n```python\nimport os\nfrom openai.api import API\n\n# Initialize the OpenAI API client\napi = API(os.environ['OPENAI_API_KEY'])\n\ndef create_user():\n    # Create a new user entity\n    user_entity = {\n        'id': 'user-123',\n        'name': 'John Doe',\n        'email': 'john.doe@example.com'\n    }\n    \n    # Send the user creation request to OpenAI\n    response = api.users.create(user_entity)\n    print(f\"User created: {response}\")\n\ndef create_transaction():\n    # Create a new transaction entity\n    transaction_entity = {\n        'id': 'tran-123',\n        'user_id': 'user-123',\n        'transaction_type': 'query'\n    }\n    \n    # Send the transaction creation request to OpenAI\n    response = api.transactions.create(transaction_entity)\n    print(f\"Transaction created: {response}\")\n\ndef send_telemetry_data():\n    # Create a new telemetry event entity\n    telemetry_event_entity = {\n        'id': 'telem-123',\n        'transaction_id': 'tran-123',\n        'data': '{ \"event\": \"test\", \"user_id\": 1 }'\n    }\n    \n    # Send the telemetry data to OpenAI\n    response = api.telemetry.create(telemetry_event_entity)\n    print(f\"Telemetry event sent: {response}\")\n\n# Test the telemetry creation\ncreate_user()\ncreate_transaction()\nsend_telemetry_data()\n```\n\nMake sure you replace `OPENAI_API_KEY` with your actual API key. 
Also, ensure that you have the OpenAI API client library installed by running `pip install openai`.\n\nOnce you've created the test code, run it and observe the behavior of the telemetry creation process.\n\nPlease let me know if you need further modifications or assistance!",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": null,
+              "audio": null,
+              "function_call": null,
+              "tool_calls": null
+            }
+          }
+        ],
+        "created": 0,
+        "model": "llama3.2:3b-instruct-fp16",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": "fp_ollama",
+        "usage": {
+          "completion_tokens": 460,
+          "prompt_tokens": 30,
+          "total_tokens": 490,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null
+        }
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/telemetry/recordings/1fcfd86d8111374dc852cfdea6bfdb6a511f92cee84a6325b04ae84878512c30.json
+++ b/tests/integration/telemetry/recordings/1fcfd86d8111374dc852cfdea6bfdb6a511f92cee84a6325b04ae84878512c30.json
@ -0,0 +1,59 @@
+{
+  "test_id": "tests/integration/telemetry/test_completions.py::test_telemetry_format_completeness[txt=ollama/llama3.2:3b-instruct-fp16]",
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b-instruct-fp16",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Test trace openai with temperature 0.7"
+        }
+      ],
+      "max_tokens": 100,
+      "stream": false,
+      "temperature": 0.7
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama3.2:3b-instruct-fp16"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "rec-1fcfd86d8111",
+        "choices": [
+          {
+            "finish_reason": "length",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "import torch\nfrom transformers import AutoModelForCausalLM, AutoTokenizer\n\n# Load the pre-trained model and tokenizer\nmodel_name = \"CompVis/transformers-base-uncased\"\nmodel = AutoModelForCausalLM.from_pretrained(model_name)\ntokenizer = AutoTokenizer.from_pretrained(model_name)\n\n# Set the temperature to 0.7\ntemperature = 0.7\n\n# Define a function to generate text\ndef generate_text(prompt, max_length=100):\n    input",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": null,
+              "audio": null,
+              "function_call": null,
+              "tool_calls": null
+            }
+          }
+        ],
+        "created": 0,
+        "model": "llama3.2:3b-instruct-fp16",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": "fp_ollama",
+        "usage": {
+          "completion_tokens": 100,
+          "prompt_tokens": 35,
+          "total_tokens": 135,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null
+        }
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/telemetry/recordings/d45c9a9229e7e3f50a6eac139508babe21988649eb321b562f74061f58593c25.json
+++ b/tests/integration/telemetry/recordings/d45c9a9229e7e3f50a6eac139508babe21988649eb321b562f74061f58593c25.json
--- a/tests/integration/telemetry/recordings/db8ffad4840512348c215005128557807ffbed0cf6bf11a52c1d1009878886ef.json
+++ b/tests/integration/telemetry/recordings/db8ffad4840512348c215005128557807ffbed0cf6bf11a52c1d1009878886ef.json
--- a/tests/integration/telemetry/recordings/dba5042d6691c2fbc29f2172c0f175e235f2829cb1c3e49781dd2b1850e28775.json
+++ b/tests/integration/telemetry/recordings/dba5042d6691c2fbc29f2172c0f175e235f2829cb1c3e49781dd2b1850e28775.json
@ -0,0 +1,59 @@
+{
+  "test_id": "tests/integration/telemetry/test_completions.py::test_telemetry_format_completeness[txt=llama3.2:3b-instruct-fp16]",
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b-instruct-fp16",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Test trace openai with temperature 0.7"
+        }
+      ],
+      "max_tokens": 100,
+      "stream": false,
+      "temperature": 0.7
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama3.2:3b-instruct-fp16"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "rec-dba5042d6691",
+        "choices": [
+          {
+            "finish_reason": "length",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "To test the \"trace\" functionality of OpenAI's GPT-4 model at a temperature of 0.7, you can follow these steps:\n\n1. First, make sure you have an account with OpenAI and have been granted access to their API.\n\n2. You will need to install the `transformers` library, which is the official library for working with Transformers models like GPT-4:\n\n   ```bash\npip install transformers\n```\n\n3. Next, import the necessary",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": null,
+              "audio": null,
+              "function_call": null,
+              "tool_calls": null
+            }
+          }
+        ],
+        "created": 0,
+        "model": "llama3.2:3b-instruct-fp16",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": "fp_ollama",
+        "usage": {
+          "completion_tokens": 100,
+          "prompt_tokens": 35,
+          "total_tokens": 135,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null
+        }
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/telemetry/test_completions.py
+++ b/tests/integration/telemetry/test_completions.py
@ -0,0 +1,112 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""Telemetry tests verifying @trace_protocol decorator format using in-memory exporter."""
+
+import json
+import os
+
+import pytest
+
+# Module-level marker: pytest applies it to every test in this file. The tests
+# are skipped when LLAMA_STACK_TEST_STACK_CONFIG_TYPE is "server" (see reason).
+pytestmark = pytest.mark.skipif(
+    os.environ.get("LLAMA_STACK_TEST_STACK_CONFIG_TYPE") == "server",
+    reason="In-memory telemetry tests only work in library_client mode (server mode runs in separate process)",
+)
+
+
+def test_streaming_chunk_count(mock_otlp_collector, llama_stack_client, text_model_id):
+    """Check that a streamed completion yields an async_generator span whose chunk_count matches the chunks received."""
+    received = list(
+        llama_stack_client.chat.completions.create(
+            model=text_model_id,
+            messages=[{"role": "user", "content": "Test trace openai 1"}],
+            stream=True,
+        )
+    )
+    assert len(received) > 0
+
+    spans = mock_otlp_collector.get_spans()
+    assert len(spans) > 0
+
+    # Take chunk_count from the first async_generator span that reports a truthy value.
+    chunk_count = None
+    for span in spans:
+        if span.attributes.get("__type__") != "async_generator":
+            continue
+        chunk_count = span.attributes.get("chunk_count")
+        if not chunk_count:
+            continue
+        chunk_count = int(chunk_count)
+        break
+
+    assert chunk_count is not None
+    assert chunk_count == len(received)
+
+
+def test_telemetry_format_completeness(mock_otlp_collector, llama_stack_client, text_model_id):
+    """Validate the telemetry span format emitted for a non-streaming chat completion.
+
+    Checks that the response reports positive token usage, that exactly five spans
+    are collected, that every span carries the expected root or @trace_protocol
+    attributes, and that the logged model_id matches the requested model.
+    The metric checks at the end are disabled until metrics get fixed.
+    """
+    response = llama_stack_client.chat.completions.create(
+        model=text_model_id,
+        messages=[{"role": "user", "content": "Test trace openai with temperature 0.7"}],
+        temperature=0.7,
+        max_tokens=100,
+        stream=False,
+    )
+
+    # Handle both dict and Pydantic model for usage
+    # This occurs due to the replay system returning a dict for usage, but the client returning a Pydantic model
+    # TODO: Fix this by making the replay system return a Pydantic model for usage
+    usage = response.usage if isinstance(response.usage, dict) else response.usage.model_dump()
+    for key in ("prompt_tokens", "completion_tokens", "total_tokens"):
+        assert usage.get(key) and usage[key] > 0, f"usage[{key!r}] should be a positive token count"
+
+    # Verify spans
+    spans = mock_otlp_collector.get_spans()
+    assert len(spans) == 5
+
+    # we only need this captured one time
+    logged_model_id = None
+
+    for span in spans:
+        attrs = span.attributes
+        assert attrs is not None
+
+        # Root span is created manually by tracing middleware, not by @trace_protocol decorator
+        is_root_span = attrs.get("__root__") is True
+
+        if is_root_span:
+            # Root spans have different attributes
+            assert attrs.get("__location__") in ["library_client", "server"]
+        else:
+            # Non-root spans are created by @trace_protocol decorator
+            assert attrs.get("__autotraced__")
+            assert attrs.get("__class__") and attrs.get("__method__")
+            assert attrs.get("__type__") in ["async", "sync", "async_generator"]
+
+            # __args__ holds the JSON-encoded call arguments of the traced method
+            args = json.loads(attrs["__args__"])
+            if "model_id" in args:
+                logged_model_id = args["model_id"]
+
+    assert logged_model_id is not None
+    assert logged_model_id == text_model_id
+
+    # TODO: re-enable this once metrics get fixed
+    """
+    # Verify token usage metrics in response
+    metrics = mock_otlp_collector.get_metrics()
+
+    assert metrics
+    for metric in metrics:
+        assert metric.name in ["completion_tokens", "total_tokens", "prompt_tokens"]
+        assert metric.unit == "tokens"
+        assert metric.data.data_points and len(metric.data.data_points) == 1
+        match metric.name:
+            case "completion_tokens":
+                assert metric.data.data_points[0].value == usage["completion_tokens"]
+            case "total_tokens":
+                assert metric.data.data_points[0].value == usage["total_tokens"]
+            case "prompt_tokens":
+                assert metric.data.data_points[0].value == usage["prompt_tokens"]
+    """
--- a/tests/unit/distribution/test_stack_list_deps.py
+++ b/tests/unit/distribution/test_stack_list_deps.py
@ -0,0 +1,50 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import argparse
+from io import StringIO
+from unittest.mock import patch
+
+from llama_stack.cli.stack._list_deps import (
+    run_stack_list_deps_command,
+)
+
+
+def test_stack_list_deps_basic():
+    """deps-only output omits the install command and header text but lists the expected packages."""
+    options = {
+        "config": None,
+        "env_name": "test-env",
+        "providers": "inference=remote::ollama",
+        "format": "deps-only",
+    }
+
+    with patch("sys.stdout", new_callable=StringIO) as captured:
+        run_stack_list_deps_command(argparse.Namespace(**options))
+        output = captured.getvalue()
+
+    # deps-only format should NOT include "uv pip install" or "Dependencies for"
+    for forbidden in ("uv pip install", "Dependencies for"):
+        assert forbidden not in output
+
+    # Check that expected dependencies are present
+    for dependency in ("ollama", "aiohttp", "fastapi"):
+        assert dependency in output
+
+
+def test_stack_list_deps_with_distro_uv():
+    """The uv format for the "starter" config prints a "uv pip install" command."""
+    options = {
+        "config": "starter",
+        "env_name": None,
+        "providers": None,
+        "format": "uv",
+    }
+
+    with patch("sys.stdout", new_callable=StringIO) as captured:
+        run_stack_list_deps_command(argparse.Namespace(**options))
+        output = captured.getvalue()
+
+    assert "uv pip install" in output