From 6824d23dc903ae16cfb4e4af299150f655a6133c Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Thu, 27 Feb 2025 22:35:52 -0500 Subject: [PATCH 01/22] test: Only run embedding tests for remote::nvidia (#1317) This fixes release build failure https://github.com/meta-llama/llama-stack-ops/actions/runs/13580302250/job/37964972403: ``` =================================== FAILURES =================================== ______ test_embedding_truncation_error[txt=8B:emb=MiniLM-long-text-None] _______ llama-stack/tests/client-sdk/inference/test_embedding.py:166: in test_embedding_truncation_error with pytest.raises(BadRequestError) as excinfo: E Failed: DID NOT RAISE ______ test_embedding_truncation_error[txt=8B:emb=MiniLM-long-text-none] _______ llama-stack/tests/client-sdk/inference/test_embedding.py:166: in test_embedding_truncation_error with pytest.raises(BadRequestError) as excinfo: E Failed: DID NOT RAISE _______ test_embedding_truncation_error[txt=8B:emb=MiniLM-long-str-None] _______ llama-stack/tests/client-sdk/inference/test_embedding.py:166: in test_embedding_truncation_error with pytest.raises(BadRequestError) as excinfo: E Failed: DID NOT RAISE _______ test_embedding_truncation_error[txt=8B:emb=MiniLM-long-str-none] _______ llama-stack/tests/client-sdk/inference/test_embedding.py:166: in test_embedding_truncation_error with pytest.raises(BadRequestError) as excinfo: E Failed: DID NOT RAISE _________ test_embedding_text_truncation_error[txt=8B:emb=MiniLM-NONE] _________ llama-stack/tests/client-sdk/inference/test_embedding.py:223: in test_embedding_text_truncation_error with pytest.raises(BadRequestError) as excinfo: E Failed: DID NOT RAISE _________ test_embedding_text_truncation_error[txt=8B:emb=MiniLM-END] __________ llama-stack/tests/client-sdk/inference/test_embedding.py:223: in test_embedding_text_truncation_error with pytest.raises(BadRequestError) as excinfo: E Failed: DID NOT RAISE ________ test_embedding_text_truncation_error[txt=8B:emb=MiniLM-START] _________ llama-stack/tests/client-sdk/inference/test_embedding.py:223: in test_embedding_text_truncation_error with pytest.raises(BadRequestError) as excinfo: E Failed: DID NOT RAISE _________ test_embedding_text_truncation_error[txt=8B:emb=MiniLM-left] _________ llama-stack/tests/client-sdk/inference/test_embedding.py:223: in test_embedding_text_truncation_error with pytest.raises(BadRequestError) as excinfo: E Failed: DID NOT RAISE ________ test_embedding_text_truncation_error[txt=8B:emb=MiniLM-right] _________ llama-stack/tests/client-sdk/inference/test_embedding.py:223: in test_embedding_text_truncation_error with pytest.raises(BadRequestError) as excinfo: E Failed: DID NOT RAISE =========================== short test summary info ============================ FAILED llama-stack/tests/client-sdk/inference/test_embedding.py::test_embedding_truncation_error[txt=8B:emb=MiniLM-long-text-None] - Failed: DID NOT RAISE FAILED llama-stack/tests/client-sdk/inference/test_embedding.py::test_embedding_truncation_error[txt=8B:emb=MiniLM-long-text-none] - Failed: DID NOT RAISE FAILED llama-stack/tests/client-sdk/inference/test_embedding.py::test_embedding_truncation_error[txt=8B:emb=MiniLM-long-str-None] - Failed: DID NOT RAISE FAILED llama-stack/tests/client-sdk/inference/test_embedding.py::test_embedding_truncation_error[txt=8B:emb=MiniLM-long-str-none] - Failed: DID NOT RAISE FAILED llama-stack/tests/client-sdk/inference/test_embedding.py::test_embedding_text_truncation_error[txt=8B:emb=MiniLM-NONE] - Failed: DID NOT RAISE FAILED 
llama-stack/tests/client-sdk/inference/test_embedding.py::test_embedding_text_truncation_error[txt=8B:emb=MiniLM-END] - Failed: DID NOT RAISE FAILED llama-stack/tests/client-sdk/inference/test_embedding.py::test_embedding_text_truncation_error[txt=8B:emb=MiniLM-START] - Failed: DID NOT RAISE FAILED llama-stack/tests/client-sdk/inference/test_embedding.py::test_embedding_text_truncation_error[txt=8B:emb=MiniLM-left] - Failed: DID NOT RAISE FAILED llama-stack/tests/client-sdk/inference/test_embedding.py::test_embedding_text_truncation_error[txt=8B:emb=MiniLM-right] - Failed: DID NOT RAISE = 9 failed, 48 passed, 2 skipped, 3 deselected, 3 xfailed, 1 xpassed, 121 warnings in 90.16s (0:01:30) = Error: Process completed with exit code 1. ``` Signed-off-by: Yuan Tang --- tests/client-sdk/inference/test_embedding.py | 39 ++++++++++++++++---- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/tests/client-sdk/inference/test_embedding.py b/tests/client-sdk/inference/test_embedding.py index 46a901d62..c46a6517f 100644 --- a/tests/client-sdk/inference/test_embedding.py +++ b/tests/client-sdk/inference/test_embedding.py @@ -75,6 +75,7 @@ DUMMY_IMAGE_URL = ImageContentItem( image=ImageContentItemImage(url=ImageContentItemImageURL(uri="https://example.com/image.jpg")), type="image" ) DUMMY_IMAGE_BASE64 = ImageContentItem(image=ImageContentItemImage(data="base64string"), type="image") +SUPPORTED_PROVIDERS = {"remote::nvidia"} @pytest.mark.parametrize( @@ -88,7 +89,9 @@ DUMMY_IMAGE_BASE64 = ImageContentItem(image=ImageContentItemImage(data="base64st "list[text]", ], ) -def test_embedding_text(llama_stack_client, embedding_model_id, contents): +def test_embedding_text(llama_stack_client, embedding_model_id, contents, inference_provider_type): + if inference_provider_type not in SUPPORTED_PROVIDERS: + pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet") response = llama_stack_client.inference.embeddings(model_id=embedding_model_id, contents=contents) assert isinstance(response, EmbeddingsResponse) assert len(response.embeddings) == sum(len(content) if isinstance(content, list) else 1 for content in contents) @@ -108,7 +111,9 @@ def test_embedding_text(llama_stack_client, embedding_model_id, contents): ], ) @pytest.mark.xfail(reason="Media is not supported") -def test_embedding_image(llama_stack_client, embedding_model_id, contents): +def test_embedding_image(llama_stack_client, embedding_model_id, contents, inference_provider_type): + if inference_provider_type not in SUPPORTED_PROVIDERS: + pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet") response = llama_stack_client.inference.embeddings(model_id=embedding_model_id, contents=contents) assert isinstance(response, EmbeddingsResponse) assert len(response.embeddings) == sum(len(content) if isinstance(content, list) else 1 for content in contents) @@ -134,7 +139,11 @@ def test_embedding_image(llama_stack_client, embedding_model_id, contents): "short", ], ) -def test_embedding_truncation(llama_stack_client, embedding_model_id, text_truncation, contents): +def test_embedding_truncation( + llama_stack_client, embedding_model_id, text_truncation, contents, inference_provider_type +): + if inference_provider_type not in SUPPORTED_PROVIDERS: + pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet") response = llama_stack_client.inference.embeddings( model_id=embedding_model_id, contents=contents, text_truncation=text_truncation ) @@ -162,7 +171,11 @@ def 
test_embedding_truncation(llama_stack_client, embedding_model_id, text_trunc "long-str", ], ) -def test_embedding_truncation_error(llama_stack_client, embedding_model_id, text_truncation, contents): +def test_embedding_truncation_error( + llama_stack_client, embedding_model_id, text_truncation, contents, inference_provider_type +): + if inference_provider_type not in SUPPORTED_PROVIDERS: + pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet") with pytest.raises(BadRequestError) as excinfo: llama_stack_client.inference.embeddings( model_id=embedding_model_id, contents=[DUMMY_LONG_TEXT], text_truncation=text_truncation @@ -170,7 +183,9 @@ def test_embedding_truncation_error(llama_stack_client, embedding_model_id, text @pytest.mark.xfail(reason="Only valid for model supporting dimension reduction") -def test_embedding_output_dimension(llama_stack_client, embedding_model_id): +def test_embedding_output_dimension(llama_stack_client, embedding_model_id, inference_provider_type): + if inference_provider_type not in SUPPORTED_PROVIDERS: + pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet") base_response = llama_stack_client.inference.embeddings(model_id=embedding_model_id, contents=[DUMMY_STRING]) test_response = llama_stack_client.inference.embeddings( model_id=embedding_model_id, contents=[DUMMY_STRING], output_dimension=32 @@ -180,7 +195,9 @@ def test_embedding_output_dimension(llama_stack_client, embedding_model_id): @pytest.mark.xfail(reason="Only valid for model supporting task type") -def test_embedding_task_type(llama_stack_client, embedding_model_id): +def test_embedding_task_type(llama_stack_client, embedding_model_id, inference_provider_type): + if inference_provider_type not in SUPPORTED_PROVIDERS: + pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet") query_embedding = llama_stack_client.inference.embeddings( model_id=embedding_model_id, contents=[DUMMY_STRING], task_type="query" ) @@ -199,7 +216,9 @@ def test_embedding_task_type(llama_stack_client, embedding_model_id): "start", ], ) -def test_embedding_text_truncation(llama_stack_client, embedding_model_id, text_truncation): +def test_embedding_text_truncation(llama_stack_client, embedding_model_id, text_truncation, inference_provider_type): + if inference_provider_type not in SUPPORTED_PROVIDERS: + pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet") response = llama_stack_client.inference.embeddings( model_id=embedding_model_id, contents=[DUMMY_STRING], text_truncation=text_truncation ) @@ -219,7 +238,11 @@ def test_embedding_text_truncation(llama_stack_client, embedding_model_id, text_ "right", ], ) -def test_embedding_text_truncation_error(llama_stack_client, embedding_model_id, text_truncation): +def test_embedding_text_truncation_error( + llama_stack_client, embedding_model_id, text_truncation, inference_provider_type +): + if inference_provider_type not in SUPPORTED_PROVIDERS: + pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet") with pytest.raises(BadRequestError) as excinfo: llama_stack_client.inference.embeddings( model_id=embedding_model_id, contents=[DUMMY_STRING], text_truncation=text_truncation From 7780fc92d593ca2fe783637c665403fdad84de19 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 27 Feb 2025 20:13:00 -0800 Subject: [PATCH 02/22] fix: update getting_started notebook to pass nbeval (#1318) # What does this PR do? 
- See https://github.com/meta-llama/llama-stack-ops/actions/runs/13580825399/job/37966677766 - Together's structured decoding API is flaky, add a skip to the cell - Enable cell 21 so that cells 21-23 pass [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan image [//]: # (## Documentation) --- docs/getting_started.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/getting_started.ipynb b/docs/getting_started.ipynb index 8ae6fed24..21436327e 100644 --- a/docs/getting_started.ipynb +++ b/docs/getting_started.ipynb @@ -1145,6 +1145,7 @@ } ], "source": [ + "# NBVAL_SKIP\n", "from pydantic import BaseModel\n", "\n", "\n", @@ -2885,7 +2886,6 @@ } ], "source": [ - "# NBVAL_SKIP\n", "from llama_stack_client.lib.agents.agent import Agent\n", "from llama_stack_client.lib.agents.event_logger import EventLogger\n", "from llama_stack_client.types.agent_create_params import AgentConfig\n", @@ -4326,7 +4326,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "toolchain", + "display_name": "master", "language": "python", "name": "python3" }, From 999195fe5b6416c092cbf8c5dabc2d221dad33f1 Mon Sep 17 00:00:00 2001 From: Hardik Shah Date: Thu, 27 Feb 2025 20:53:47 -0800 Subject: [PATCH 03/22] fix: [Litellm] Do not swallow first token (#1316) `ChatCompletionResponseEventType: start` is ignored and not yielded in the agent_instance, as we expect it to not have any content. However, litellm sends the first event as `ChatCompletionResponseEventType: start` with content (which was the first token that we were skipping). ``` LLAMA_STACK_CONFIG=dev pytest -s -v tests/client-sdk/agents/test_agents.py --inference-model "openai/gpt-4o-mini" -k test_agent_simple ``` This was failing before (since the word "hello" was not in the final response). --- .../utils/inference/openai_compat.py | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/llama_stack/providers/utils/inference/openai_compat.py b/llama_stack/providers/utils/inference/openai_compat.py index eaf5ad2e1..d0fdf6385 100644 --- a/llama_stack/providers/utils/inference/openai_compat.py +++ b/llama_stack/providers/utils/inference/openai_compat.py @@ -6,7 +6,7 @@ import json import logging import warnings -from typing import AsyncGenerator, Dict, Generator, Iterable, List, Optional, Union +from typing import AsyncGenerator, Dict, Iterable, List, Optional, Union from openai import AsyncStream from openai.types.chat import ( @@ -841,14 +841,13 @@ async def convert_openai_chat_completion_stream( Convert a stream of OpenAI chat completion chunks into a stream of ChatCompletionResponseStreamChunk. """ - - # generate a stream of ChatCompletionResponseEventType: start -> progress -> progress -> ...
- def _event_type_generator() -> Generator[ChatCompletionResponseEventType, None, None]: - yield ChatCompletionResponseEventType.start - while True: - yield ChatCompletionResponseEventType.progress - - event_type = _event_type_generator() + yield ChatCompletionResponseStreamChunk( + event=ChatCompletionResponseEvent( + event_type=ChatCompletionResponseEventType.start, + delta=TextDelta(text=""), + ) + ) + event_type = ChatCompletionResponseEventType.progress stop_reason = None toolcall_buffer = {} @@ -868,7 +867,7 @@ async def convert_openai_chat_completion_stream( if choice.delta.content: yield ChatCompletionResponseStreamChunk( event=ChatCompletionResponseEvent( - event_type=next(event_type), + event_type=event_type, delta=TextDelta(text=choice.delta.content), logprobs=_convert_openai_logprobs(logprobs), ) @@ -909,7 +908,7 @@ async def convert_openai_chat_completion_stream( toolcall_buffer["content"] += delta yield ChatCompletionResponseStreamChunk( event=ChatCompletionResponseEvent( - event_type=next(event_type), + event_type=event_type, delta=ToolCallDelta( tool_call=delta, parse_status=ToolCallParseStatus.in_progress, @@ -920,7 +919,7 @@ async def convert_openai_chat_completion_stream( else: yield ChatCompletionResponseStreamChunk( event=ChatCompletionResponseEvent( - event_type=next(event_type), + event_type=event_type, delta=TextDelta(text=choice.delta.content or ""), logprobs=_convert_openai_logprobs(logprobs), ) @@ -931,7 +930,7 @@ async def convert_openai_chat_completion_stream( toolcall_buffer["content"] += delta yield ChatCompletionResponseStreamChunk( event=ChatCompletionResponseEvent( - event_type=next(event_type), + event_type=event_type, delta=ToolCallDelta( tool_call=delta, parse_status=ToolCallParseStatus.in_progress, From 4c8a0fa8dc1fcb316de64f5ec71521192f6ff11f Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Thu, 27 Feb 2025 22:49:06 -0800 Subject: [PATCH 04/22] fix: ensure ollama embedding model is registered properly in the template --- llama_stack/distribution/routers/routing_tables.py | 9 +-------- llama_stack/templates/ollama/ollama.py | 2 +- llama_stack/templates/ollama/run.yaml | 6 ++++++ 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index c2434e517..80e9ecb7c 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -318,14 +318,7 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs): ) model = await self.get_object_by_identifier("model", embedding_model) if model is None: - if embedding_model == "all-MiniLM-L6-v2": - raise ValueError( - "Embeddings are now served via Inference providers. " - "Please upgrade your run.yaml to include inline::sentence-transformer as an additional inference provider. " - "See https://github.com/meta-llama/llama-stack/blob/main/llama_stack/templates/together/run.yaml for an example." 
- ) - else: - raise ValueError(f"Model {embedding_model} not found") + raise ValueError(f"Model {embedding_model} not found") if model.model_type != ModelType.embedding: raise ValueError(f"Model {embedding_model} is not an embedding model") if "embedding_dimension" not in model.metadata: diff --git a/llama_stack/templates/ollama/ollama.py b/llama_stack/templates/ollama/ollama.py index 83c7b1a63..3c24a41ba 100644 --- a/llama_stack/templates/ollama/ollama.py +++ b/llama_stack/templates/ollama/ollama.py @@ -93,7 +93,7 @@ def get_distribution_template() -> DistributionTemplate: "inference": [inference_provider], "vector_io": [vector_io_provider_sqlite], }, - default_models=[inference_model], + default_models=[inference_model, embedding_model], default_tool_groups=default_tool_groups, ), "run-with-safety.yaml": RunConfigSettings( diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml index 0c82552c6..a2428688e 100644 --- a/llama_stack/templates/ollama/run.yaml +++ b/llama_stack/templates/ollama/run.yaml @@ -90,6 +90,12 @@ models: model_id: ${env.INFERENCE_MODEL} provider_id: ollama model_type: llm +- metadata: + embedding_dimension: 384 + model_id: all-MiniLM-L6-v2 + provider_id: ollama + provider_model_id: all-minilm:latest + model_type: embedding shields: [] vector_dbs: [] datasets: [] From ece354eeddc561910c1950294b3db92d7c2ab43f Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Thu, 27 Feb 2025 22:54:34 -0800 Subject: [PATCH 05/22] test: don't hardcode faiss as provider in the tests please --- tests/client-sdk/agents/test_agents.py | 1 - tests/client-sdk/tool_runtime/test_rag_tool.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/tests/client-sdk/agents/test_agents.py b/tests/client-sdk/agents/test_agents.py index 6e3dc0739..8f68699b2 100644 --- a/tests/client-sdk/agents/test_agents.py +++ b/tests/client-sdk/agents/test_agents.py @@ -458,7 +458,6 @@ def test_rag_agent(llama_stack_client, agent_config, rag_tool_name): vector_db_id=vector_db_id, embedding_model="all-MiniLM-L6-v2", embedding_dimension=384, - provider_id="faiss", ) llama_stack_client.tool_runtime.rag_tool.insert( documents=documents, diff --git a/tests/client-sdk/tool_runtime/test_rag_tool.py b/tests/client-sdk/tool_runtime/test_rag_tool.py index 40940f1ef..e330a10f5 100644 --- a/tests/client-sdk/tool_runtime/test_rag_tool.py +++ b/tests/client-sdk/tool_runtime/test_rag_tool.py @@ -24,7 +24,6 @@ def single_entry_vector_db_registry(llama_stack_client, empty_vector_db_registry vector_db_id=vector_db_id, embedding_model="all-MiniLM-L6-v2", embedding_dimension=384, - provider_id="faiss", ) vector_dbs = [vector_db.identifier for vector_db in llama_stack_client.vector_dbs.list()] return vector_dbs @@ -121,7 +120,6 @@ def test_vector_db_insert_from_url_and_query(llama_stack_client, empty_vector_db vector_db_id=vector_db_id, embedding_model="all-MiniLM-L6-v2", embedding_dimension=384, - provider_id="faiss", ) # list to check memory bank is successfully registered From caffafd101730850085821d89fa8ec5e4cd0fa02 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Thu, 27 Feb 2025 23:05:42 -0800 Subject: [PATCH 06/22] feat: update the default system prompt for 3.2/3.3 models (#1310) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Summary: The current prompt doesn't work well and tends to overindex on tool calling. This PR is not perfect, but should be an improvement over the current prompt. We can keep iterating.
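As a rough illustration of how the tool-call counters quoted in the test plan below (`num_tool_calls`, `num_examples_with_tool_call`) can be tallied, here is a minimal sketch; the turn records and field names are made up for illustration and are not the author's eval harness:

```python
# Hypothetical per-example records: the tool calls each agent turn ended up making.
eval_turns = [
    {"question": "q1", "tool_calls": ["brave_search", "brave_search"]},
    {"question": "q2", "tool_calls": ["brave_search"]},
    {"question": "q3", "tool_calls": []},
]

# Per-example tool-call counts, mirroring the num_tool_calls list in the test plan.
num_tool_calls = [len(turn["tool_calls"]) for turn in eval_turns]
# How many examples made at least one tool call.
num_examples_with_tool_call = sum(1 for n in num_tool_calls if n > 0)

print(f"{num_tool_calls=}")               # num_tool_calls=[2, 1, 0]
print(f"{num_examples_with_tool_call=}")  # num_examples_with_tool_call=2
```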
# Test Plan: Ran on a (small) eval with 20 HotpotQA examples. With current prompt: https://gist.github.com/ehhuang/9f967e62751907165eb13781ea968f5c { │ 'basic::equality': {'accuracy': {'accuracy': 0.2, 'num_correct': 4.0, 'num_total': 20}}, │ 'F1ScoringFn': { │ │ 'f1_average': 0.25333333333333335, │ │ 'precision_average': 0.23301767676767676, │ │ 'recall_average': 0.375 │ } } num_tool_calls=[5, 5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 2, 2, 1, 1, 2, 1, 2, 2] num_examples_with_tool_call=20 num_examples_with_pythontag=0 ######################################################### With new prompt: https://gist.github.com/ehhuang/6e4a8ecf54db68922c2be8700056f962 { │ 'basic::equality': {'accuracy': {'accuracy': 0.25, 'num_correct': 5.0, 'num_total': 20}}, │ 'F1ScoringFn': { │ │ 'f1_average': 0.35579260478321006, │ │ 'precision_average': 0.32030238933180105, │ │ 'recall_average': 0.6091666666666666 │ } } num_tool_calls=[2, 1, 1, 5, 5, 5, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 3, 2] num_examples_with_tool_call=20 num_examples_with_pythontag=0 The answers have higher recall, and make fewer tool calls. Note that these were run with max_infer_iter=5, so the current prompt hits this limit more often, and without the limit, it sometimes goes into an infinite tool-calling loop. The data here is with 3.3-70B. Results are equally poor with either prompt on 3.2-3B (~30 recall). --- .../models/llama/llama3/prompt_templates/system_prompts.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py b/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py index 27b1a3502..74a3ae4f0 100644 --- a/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py +++ b/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py @@ -226,10 +226,9 @@ class FunctionTagCustomToolGenerator(PromptTemplateGeneratorBase): class PythonListCustomToolGenerator(PromptTemplateGeneratorBase):  # noqa: N801 DEFAULT_PROMPT = textwrap.dedent( """ + You are a helpful assistant. You have access to functions, but you should only use them if they are required. You are an expert in composing functions. You are given a question and a set of possible functions. - Based on the question, you will need to make one or more function/tool calls to achieve the purpose. - If none of the function can be used, point it out. If the given question lacks the parameters required by the function, - also point it out. You should only return the function call in tools call sections. + Based on the question, you may or may not need to make one function/tool call to achieve the purpose. {{ function_description }} """.strip("\n") From 8efa53daf1fb69b34fb7a0ba25bee38d8ffed335 Mon Sep 17 00:00:00 2001 From: Hardik Shah Date: Thu, 27 Feb 2025 23:06:37 -0800 Subject: [PATCH 07/22] fix: Agent telemetry inputs/outputs should be structured (#1302) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Original telemetry outputs for agent turns look like this. Note how the output was a `str(message)`, making it difficult to read back for downstream tasks (e.g. building eval datasets): ``` { │ │ 'input': [ │ │ │ '{"role":"system","content":"You are a helpful assistant. Use search tool to answer the questions. 
"}', │ │ │ '{"role":"user","content":"Which teams played in the NBA western conference finals of 2024","context":null}' │ │ ], │ │ 'output': "content: tool_calls: [ToolCall(call_id='8b7294ec-a83f-4798-ad8f-6bed662f08b6', tool_name=, arguments={'query': 'NBA Western Conference Finals 2024 teams'})]" │ }, ``` Updated the outputs to be structured . ## Test ```python import uuid from llama_stack_client.lib.agents.agent import Agent from llama_stack_client.lib.agents.event_logger import EventLogger from llama_stack_client.types.agent_create_params import AgentConfig model_id = "meta-llama/Llama-3.1-8B-Instruct" agent_config = AgentConfig( model=model_id, instructions="You are a helpful assistant who will use the web search tools to help with answering questions.\nOnly provide final answer in short without writing full sentences. Use web search", toolgroups=["builtin::websearch"], enable_session_persistence=True, ) agent = Agent(client, agent_config) session_id = agent.create_session(uuid.uuid4().hex) response = agent.create_turn( messages=[ { "role": "user", "content": "latest news about llama stack", } ], session_id=session_id, stream=False, ) pprint(response) ``` Output: ``` Turn( │ input_messages=[UserMessage(content='latest news about llama stack', role='user', context=None)], │ output_message=CompletionMessage( │ │ content="The latest news about Llama Stack is that Meta has released Llama 3.2, which includes small and medium-sized vision LLMs (11B and 90B) and lightweight, text-only models (1B and 3B) that fit onto select edge and mobile devices. Additionally, Llama Stack distributions have been released to simplify the way developers work with Llama models in different environments. However, a critical vulnerability has been discovered in Meta's Llama-Stack, which puts AI applications at risk.", │ │ role='assistant', │ │ stop_reason='end_of_turn', │ │ tool_calls=[] │ ), │ session_id='77379546-4598-485a-b4f4-84e5da28c513', │ started_at=datetime.datetime(2025, 2, 27, 11, 2, 43, 915243, tzinfo=TzInfo(-08:00)), │ steps=[ │ │ InferenceStep( │ │ │ api_model_response=CompletionMessage( │ │ │ │ content='', │ │ │ │ role='assistant', │ │ │ │ stop_reason='end_of_turn', │ │ │ │ tool_calls=[ │ │ │ │ │ ToolCall( │ │ │ │ │ │ arguments={'query': 'latest news llama stack'}, │ │ │ │ │ │ call_id='84c0fa10-e24a-4f91-a9ff-415a9ec0bb0b', │ │ │ │ │ │ tool_name='brave_search' │ │ │ │ │ ) │ │ │ │ ] │ │ │ ), │ │ │ step_id='81c16bd3-eb00-4721-8edc-f386e07391a3', │ │ │ step_type='inference', │ │ │ turn_id='2c6b5273-4b16-404f-bed2-c0025fd63b45', │ │ │ completed_at=datetime.datetime(2025, 2, 27, 11, 2, 44, 637149, tzinfo=TzInfo(-08:00)), │ │ │ started_at=datetime.datetime(2025, 2, 27, 11, 2, 43, 915831, tzinfo=TzInfo(-08:00)) │ │ ), │ │ ToolExecutionStep( │ │ │ step_id='4782d609-a62e-45f5-8d2a-25a43db46288', │ │ │ step_type='tool_execution', │ │ │ tool_calls=[ │ │ │ │ ToolCall( │ │ │ │ │ arguments={'query': 'latest news llama stack'}, │ │ │ │ │ call_id='84c0fa10-e24a-4f91-a9ff-415a9ec0bb0b', │ │ │ │ │ tool_name='brave_search' │ │ │ │ ) │ │ │ ], │ │ │ tool_responses=[ │ │ │ │ ToolResponse( │ │ │ │ │ call_id='84c0fa10-e24a-4f91-a9ff-415a9ec0bb0b', │ │ │ │ │ content='{"query": "latest news llama stack", "top_k": [{"title": "Llama 3.2: Revol. ....... 
Hacker News.", "score": 0.6186197, "raw_content": null}]}', │ │ │ │ │ tool_name='brave_search', │ │ │ │ │ metadata=None │ │ │ │ ) │ │ │ ], │ │ │ turn_id='2c6b5273-4b16-404f-bed2-c0025fd63b45', │ │ │ completed_at=datetime.datetime(2025, 2, 27, 11, 2, 46, 272176, tzinfo=TzInfo(-08:00)), │ │ │ started_at=datetime.datetime(2025, 2, 27, 11, 2, 44, 640743, tzinfo=TzInfo(-08:00)) │ │ ), │ │ InferenceStep( │ │ │ api_model_response=CompletionMessage( │ │ │ │ content="The latest news about Llama Stack is that Meta has released Llama 3.2, which includes small and medium-sized vision LLMs (11B and 90B) and lightweight, text-only models (1B and 3B) that fit onto select edge and mobile devices. Additionally, Llama Stack distributions have been released to simplify the way developers work with Llama models in different environments. However, a critical vulnerability has been discovered in Meta's Llama-Stack, which puts AI applications at risk.", │ │ │ │ role='assistant', │ │ │ │ stop_reason='end_of_turn', │ │ │ │ tool_calls=[] │ │ │ ), │ │ │ step_id='37994419-5da3-4e84-a010-8d9b85366262', │ │ │ step_type='inference', │ │ │ turn_id='2c6b5273-4b16-404f-bed2-c0025fd63b45', │ │ │ completed_at=datetime.datetime(2025, 2, 27, 11, 2, 48, 961275, tzinfo=TzInfo(-08:00)), │ │ │ started_at=datetime.datetime(2025, 2, 27, 11, 2, 46, 273168, tzinfo=TzInfo(-08:00)) │ │ ) │ ], │ turn_id='2c6b5273-4b16-404f-bed2-c0025fd63b45', │ completed_at=datetime.datetime(2025, 2, 27, 11, 2, 48, 962318, tzinfo=TzInfo(-08:00)), │ output_attachments=[] ) ``` ## Check for Telemetry ```python agent_logs = [] for span in client.telemetry.query_spans( attribute_filters=[ {"key": "session_id", "op": "eq", "value": session_id}, ], attributes_to_return=['input', 'output'], ): agent_logs.append(span.attributes) pprint(json.loads(agent_logs[-1]['output'])) ``` ``` { │ 'content': "The latest news about Llama Stack is that Meta has released Llama 3.2, which includes small and medium-sized vision LLMs (11B and 90B) and lightweight, text-only models (1B and 3B) that fit onto select edge and mobile devices. Additionally, Llama Stack distributions have been released to simplify the way developers work with Llama models in different environments. 
However, a critical vulnerability has been discovered in Meta's Llama-Stack, which puts AI applications at risk.", │ 'tool_calls': [] } ``` --- .../agents/meta_reference/agent_instance.py | 13 ++++++++++-- .../utils/telemetry/trace_protocol.py | 20 ++++++++++++++++--- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 5c492434f..2a93e7b3f 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -610,8 +610,17 @@ class ChatAgent(ShieldRunnerMixin): if event.stop_reason is not None: stop_reason = event.stop_reason span.set_attribute("stop_reason", stop_reason) - span.set_attribute("input", [m.model_dump_json() for m in input_messages]) - span.set_attribute("output", f"content: {content} tool_calls: {tool_calls}") + span.set_attribute( + "input", + json.dumps([json.loads(m.model_dump_json()) for m in input_messages]), + ) + output_attr = json.dumps( + { + "content": content, + "tool_calls": [json.loads(t.model_dump_json()) for t in tool_calls], + } + ) + span.set_attribute("output", output_attr) stop_reason = stop_reason or StopReason.out_of_tokens diff --git a/llama_stack/providers/utils/telemetry/trace_protocol.py b/llama_stack/providers/utils/telemetry/trace_protocol.py index 924274c42..525ade74d 100644 --- a/llama_stack/providers/utils/telemetry/trace_protocol.py +++ b/llama_stack/providers/utils/telemetry/trace_protocol.py @@ -6,6 +6,7 @@ import asyncio import inspect +import json from functools import wraps from typing import Any, AsyncGenerator, Callable, Type, TypeVar @@ -17,6 +18,10 @@ T = TypeVar("T") def serialize_value(value: Any) -> Primitive: + return str(_prepare_for_json(value)) + + +def _prepare_for_json(value: Any) -> str: """Serialize a single value into JSON-compatible format.""" if value is None: return "" @@ -25,9 +30,17 @@ def serialize_value(value: Any) -> Primitive: elif hasattr(value, "_name_"): return value._name_ elif isinstance(value, BaseModel): - return value.model_dump_json() + return json.loads(value.model_dump_json()) + elif isinstance(value, (list, tuple, set)): + return [_prepare_for_json(item) for item in value] + elif isinstance(value, dict): + return {str(k): _prepare_for_json(v) for k, v in value.items()} else: - return str(value) + try: + json.dumps(value) + return value + except Exception: + return str(value) def trace_protocol(cls: Type[T]) -> Type[T]: @@ -104,7 +117,8 @@ def trace_protocol(cls: Type[T]) -> Type[T]: result = method(self, *args, **kwargs) span.set_attribute("output", serialize_value(result)) return result - except Exception as _e: + except Exception as e: + span.set_attribute("error", str(e)) raise if is_async_gen: From 7f9b76727724c926b7efb50c523713cdb692d438 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Thu, 27 Feb 2025 23:07:23 -0800 Subject: [PATCH 08/22] fix: check conda env name using basepath in exec.py (#1301) # What does this PR do? Check the conda env name using its basepath in exec.py. The current logic for finding the conda prefix does an `endswith` check with just the conda env name, but this will cause us to match incorrectly if there is a different conda env whose name ends with the same suffix. In my case, I had `stack` and `llama-stack` as the two conda envs.
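To illustrate the failure mode, here is a minimal sketch with made-up env paths (not the patched function itself): an `endswith` check matches any env whose path merely ends in the target name, while comparing the basename matches only the exact env.

```python
import os

# Hypothetical conda env prefixes; "stack" is a suffix of "llama-stack".
envs = ["/opt/conda/envs/llama-stack", "/opt/conda/envs/stack"]
env_name = "stack"

# Old logic: "llama-stack" also ends with "stack", so the wrong env can win.
print(next(p for p in envs if p.endswith(env_name)))             # /opt/conda/envs/llama-stack
# New logic: compare only the final path component, so only the exact env matches.
print(next(p for p in envs if os.path.basename(p) == env_name))  # /opt/conda/envs/stack
```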
## Test Plan llama stack run ~/.llama/distributions/fireworks/fireworks-run.yaml --- llama_stack/distribution/utils/exec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_stack/distribution/utils/exec.py b/llama_stack/distribution/utils/exec.py index 82bf00e3c..aae6b35d8 100644 --- a/llama_stack/distribution/utils/exec.py +++ b/llama_stack/distribution/utils/exec.py @@ -46,7 +46,7 @@ def formulate_run_args(image_type, image_name, config, template_name) -> list: conda_env_info = json.loads(subprocess.check_output(["conda", "info", "--envs", "--json"]).decode()) envs = conda_env_info["envs"] for envpath in envs: - if envpath.endswith(env_name): + if os.path.basename(envpath) == env_name: return envpath return None From 234408f411609a9463a954e0f5cbdff58fdc5b18 Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Fri, 28 Feb 2025 12:18:02 -0500 Subject: [PATCH 09/22] docs: Add link to distributions guide in quick start guide (#1326) # What does this PR do? A couple of users have asked this question so I thought it would be a good idea to add a link. --- docs/source/getting_started/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/getting_started/index.md b/docs/source/getting_started/index.md index ecef20d55..eb0dcf392 100644 --- a/docs/source/getting_started/index.md +++ b/docs/source/getting_started/index.md @@ -38,7 +38,7 @@ The API is **exactly identical** for both clients. :::{dropdown} Starting up the Llama Stack server The Llama Stack server can be configured flexibly so you can mix-and-match various providers for its individual API components -- beyond Inference, these include Vector IO, Agents, Telemetry, Evals, Post Training, etc. -To get started quickly, we provide various container images for the server component that work with different inference providers out of the box. For this guide, we will use `llamastack/distribution-ollama` as the container image. +To get started quickly, we provide various container images for the server component that work with different inference providers out of the box. For this guide, we will use `llamastack/distribution-ollama` as the container image. If you'd like to build your own image or customize the configurations, please check out [this guide](../references/index.md). Lets setup some environment variables that we will use in the rest of the guide. ```bash From 3b57d8ee882af8fdc5866acc0980ec46a4da9206 Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Sat, 1 Mar 2025 01:27:22 +0800 Subject: [PATCH 10/22] feat: add prompt-format list (#1222) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? [Provide a short summary of what this PR does and why. Link to relevant issues if applicable.] https://github.com/meta-llama/llama-stack/blob/19ae4b35d9d22841ca14f30166d4b317554bd28d/llama_stack/cli/model/prompt_format.py#L47 Per the comment `Only Llama 3.1 and 3.2 are supported`: even within 3.1 and 3.2, not every model can be shown with `prompt-format`, so users cannot simply consult `llama model list`; the valid choices only appear after entering an invalid model. It would be nice to help users check the valid models directly: ``` llama model prompt-format -m Llama3.1-405B-Instruct:bf16-mp8 usage: llama model prompt-format [-h] [-m MODEL_NAME] [-l] llama model prompt-format: error: Llama3.1-405B-Instruct:bf16-mp8 is not a valid Model <<<<---. 
Choose one from -- Llama3.1-8B Llama3.1-70B Llama3.1-405B Llama3.1-8B-Instruct Llama3.1-70B-Instruct Llama3.1-405B-Instruct Llama3.2-1B Llama3.2-3B Llama3.2-1B-Instruct Llama3.2-3B-Instruct Llama3.2-11B-Vision Llama3.2-90B-Vision Llama3.2-11B-Vision-Instruct Llama3.2-90B-Vision-Instruct before: $ llama model prompt-format --help usage: llama model prompt-format [-h] [-m MODEL_NAME] Show llama model message formats options: -h, --help show this help message and exit -m MODEL_NAME, --model-name MODEL_NAME Model Family (llama3_1, llama3_X, etc.) Example: llama model prompt-format after: $ llama model prompt-format --help usage: llama model prompt-format [-h] [-m MODEL_NAME] [-l] Show llama model message formats options: -h, --help show this help message and exit -m MODEL_NAME, --model-name MODEL_NAME Model Family (llama3_1, llama3_X, etc.) -l, --list List the valid supported models Example: llama model prompt-format $ llama model prompt-format -l ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ Model ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ │ Llama3.1-8B │ ├──────────────────────────────┤ │ Llama3.1-70B │ ├──────────────────────────────┤ │ Llama3.1-405B │ ├──────────────────────────────┤ │ Llama3.1-8B-Instruct │ ├──────────────────────────────┤ │ Llama3.1-70B-Instruct │ ├──────────────────────────────┤ │ Llama3.1-405B-Instruct │ ├──────────────────────────────┤ │ Llama3.2-1B │ ├──────────────────────────────┤ │ Llama3.2-3B │ ├──────────────────────────────┤ │ Llama3.2-1B-Instruct │ ├──────────────────────────────┤ │ Llama3.2-3B-Instruct │ ├──────────────────────────────┤ │ Llama3.2-11B-Vision │ ├──────────────────────────────┤ │ Llama3.2-90B-Vision │ ├──────────────────────────────┤ │ Llama3.2-11B-Vision-Instruct │ ├──────────────────────────────┤ │ Llama3.2-90B-Vision-Instruct │ └──────────────────────────────┘ ``` [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan [Describe the tests you ran to verify your changes with result summaries. 
*Provide clear instructions so the plan can be easily re-executed.*] [//]: # (## Documentation) --------- Signed-off-by: reidliu Co-authored-by: reidliu --- llama_stack/cli/model/prompt_format.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/llama_stack/cli/model/prompt_format.py b/llama_stack/cli/model/prompt_format.py index ea9596ba5..516c67634 100644 --- a/llama_stack/cli/model/prompt_format.py +++ b/llama_stack/cli/model/prompt_format.py @@ -9,6 +9,7 @@ import textwrap from io import StringIO from llama_stack.cli.subcommand import Subcommand +from llama_stack.cli.table import print_table from llama_stack.models.llama.datatypes import CoreModelId, ModelFamily, is_multimodal, model_family @@ -48,7 +49,26 @@ class ModelPromptFormat(Subcommand): supported_model_ids = [ m for m in CoreModelId if model_family(m) in {ModelFamily.llama3_1, ModelFamily.llama3_2} ] - model_str = "\n".join([m.value for m in supported_model_ids]) + + model_list = [m.value for m in supported_model_ids] + model_str = "\n".join(model_list) + + if args.list: + headers = ["Model(s)"] + rows = [] + for m in model_list: + rows.append( + [ + m, + ] + ) + print_table( + rows, + headers, + separate_rows=True, + ) + return + try: model_id = CoreModelId(args.model_name) except ValueError: From 6fa257b475d4580c7337130beb5423974282aa2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Fri, 28 Feb 2025 18:36:49 +0100 Subject: [PATCH 11/22] chore(lint): update Ruff ignores for project conventions and maintainability (#1184) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Added new ignores from flake8-bugbear (`B007`, `B008`) - Ignored `C901` (high function complexity) for now, pending review - Maintained PyTorch conventions (`N812`, `N817`) - Allowed `E731` (lambda assignments) for flexibility - Consolidated existing ignores (`E402`, `E501`, `F405`, `C408`, `N812`) - Documented rationale for each ignored rule This keeps our linting aligned with project needs while tracking potential fixes. 
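For readers who do not have the rule codes memorized, here is a small illustrative sketch (hypothetical code, not taken from this repository) of the patterns the two newly ignored flake8-bugbear rules flag:

```python
# B007: the loop control variable `index` is bound but never used in the loop body.
def first_names(rows):
    names = []
    for index, row in enumerate(rows):
        names.append(row["name"])
    return names


# B008: a function call (`list()`) is evaluated once as a default argument,
# so every call without an explicit `bucket` shares the same list object.
def append_item(item, bucket=list()):
    bucket.append(item)
    return bucket


print(first_names([{"name": "alice"}, {"name": "bob"}]))  # ['alice', 'bob']
print(append_item("a"))  # ['a']
print(append_item("b"))  # ['a', 'b'] because the default list is shared across calls
```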
Signed-off-by: Sébastien Han Signed-off-by: Sébastien Han --- llama_stack/cli/stack/_build.py | 2 +- llama_stack/cli/tests/test_stack_config.py | 2 +- llama_stack/distribution/distribution.py | 4 +- llama_stack/distribution/resolver.py | 4 +- .../distribution/ui/page/playground/rag.py | 2 +- .../agents/meta_reference/agent_instance.py | 6 +- .../inline/eval/meta_reference/eval.py | 1 - .../inference/meta_reference/inference.py | 1 - .../meta_reference/parallel_utils.py | 4 +- .../quantization/fp8_txest_disabled.py | 3 + .../recipes/lora_finetuning_single_device.py | 2 +- .../inline/scoring/braintrust/braintrust.py | 2 +- .../remote/inference/nvidia/nvidia.py | 2 +- .../remote/inference/nvidia/openai_utils.py | 4 +- llama_stack/providers/tests/eval/test_eval.py | 9 +- llama_stack/providers/tests/report.py | 90 +++++++++---------- .../providers/tests/scoring/test_scoring.py | 6 -- .../utils/inference/openai_compat.py | 6 +- .../providers/utils/kvstore/redis/redis.py | 2 +- .../utils/scoring/aggregation_utils.py | 2 +- .../utils/scoring/base_scoring_fn.py | 2 +- llama_stack/scripts/distro_codegen.py | 2 +- pyproject.toml | 39 ++------ tests/client-sdk/__init__.py | 1 + tests/client-sdk/agents/__init__.py | 1 + tests/client-sdk/conftest.py | 2 +- tests/client-sdk/inference/__init__.py | 1 + tests/client-sdk/inference/test_embedding.py | 4 +- .../inference/test_text_inference.py | 4 +- tests/client-sdk/report.py | 44 +++++---- tests/client-sdk/safety/__init__.py | 1 + tests/client-sdk/safety/test_safety.py | 2 +- tests/client-sdk/vector_io/__init__.py | 1 + 33 files changed, 113 insertions(+), 145 deletions(-) diff --git a/llama_stack/cli/stack/_build.py b/llama_stack/cli/stack/_build.py index 89db368db..baa7d2e32 100644 --- a/llama_stack/cli/stack/_build.py +++ b/llama_stack/cli/stack/_build.py @@ -141,7 +141,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None: completer=WordCompleter(available_providers), complete_while_typing=True, validator=Validator.from_callable( - lambda x: x in available_providers, + lambda x: x in available_providers, # noqa: B023 - see https://github.com/astral-sh/ruff/issues/7847 error_message="Invalid provider, use to see options", ), ) diff --git a/llama_stack/cli/tests/test_stack_config.py b/llama_stack/cli/tests/test_stack_config.py index 2b7b2b210..333f86e38 100644 --- a/llama_stack/cli/tests/test_stack_config.py +++ b/llama_stack/cli/tests/test_stack_config.py @@ -112,7 +112,7 @@ def test_parse_and_maybe_upgrade_config_old_format(old_config): inference_providers = result.providers["inference"] assert len(inference_providers) == 2 - assert set(x.provider_id for x in inference_providers) == { + assert {x.provider_id for x in inference_providers} == { "remote::ollama-00", "meta-reference-01", } diff --git a/llama_stack/distribution/distribution.py b/llama_stack/distribution/distribution.py index 384e2c3c8..308081415 100644 --- a/llama_stack/distribution/distribution.py +++ b/llama_stack/distribution/distribution.py @@ -13,7 +13,7 @@ from llama_stack.providers.datatypes import Api, ProviderSpec def stack_apis() -> List[Api]: - return [v for v in Api] + return list(Api) class AutoRoutedApiInfo(BaseModel): @@ -55,7 +55,7 @@ def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]: def providable_apis() -> List[Api]: - routing_table_apis = set(x.routing_table_api for x in builtin_automatically_routed_apis()) + routing_table_apis = {x.routing_table_api for x in builtin_automatically_routed_apis()} return [api for api in Api if api not in 
routing_table_apis and api != Api.inspect] diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py index 0bc2e774c..69a096e97 100644 --- a/llama_stack/distribution/resolver.py +++ b/llama_stack/distribution/resolver.py @@ -115,8 +115,8 @@ async def resolve_impls( - flatmaps, sorts and resolves the providers in dependency order - for each API, produces either a (local, passthrough or router) implementation """ - routing_table_apis = set(x.routing_table_api for x in builtin_automatically_routed_apis()) - router_apis = set(x.router_api for x in builtin_automatically_routed_apis()) + routing_table_apis = {x.routing_table_api for x in builtin_automatically_routed_apis()} + router_apis = {x.router_api for x in builtin_automatically_routed_apis()} providers_with_specs = {} diff --git a/llama_stack/distribution/ui/page/playground/rag.py b/llama_stack/distribution/ui/page/playground/rag.py index 202c9322f..4a916321d 100644 --- a/llama_stack/distribution/ui/page/playground/rag.py +++ b/llama_stack/distribution/ui/page/playground/rag.py @@ -134,7 +134,7 @@ def rag_chat_page(): dict( name="builtin::rag/knowledge_search", args={ - "vector_db_ids": [vector_db_id for vector_db_id in selected_vector_dbs], + "vector_db_ids": list(selected_vector_dbs), }, ) ], diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 2a93e7b3f..4d0d8ed45 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -797,10 +797,10 @@ class ChatAgent(ShieldRunnerMixin): self, toolgroups_for_turn: Optional[List[AgentToolGroup]] = None ) -> Tuple[List[ToolDefinition], Dict[str, str]]: # Determine which tools to include - agent_config_toolgroups = set( - (toolgroup.name if isinstance(toolgroup, AgentToolGroupWithArgs) else toolgroup) + agent_config_toolgroups = { + toolgroup.name if isinstance(toolgroup, AgentToolGroupWithArgs) else toolgroup for toolgroup in self.agent_config.toolgroups - ) + } toolgroups_for_turn_set = ( agent_config_toolgroups if toolgroups_for_turn is None diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index 18d408a31..48157b018 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -86,7 +86,6 @@ class MetaReferenceEvalImpl( ) -> Job: task_def = self.benchmarks[benchmark_id] dataset_id = task_def.dataset_id - candidate = task_config.eval_candidate scoring_functions = task_def.scoring_functions dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id) validate_dataset_schema(dataset_def.dataset_schema, get_valid_schemas(Api.eval.value)) diff --git a/llama_stack/providers/inline/inference/meta_reference/inference.py b/llama_stack/providers/inline/inference/meta_reference/inference.py index 763d9664d..516ac1ad8 100644 --- a/llama_stack/providers/inline/inference/meta_reference/inference.py +++ b/llama_stack/providers/inline/inference/meta_reference/inference.py @@ -208,7 +208,6 @@ class MetaReferenceInferenceImpl( logprobs = [] stop_reason = None - tokenizer = self.generator.formatter.tokenizer for token_result in self.generator.completion(request): tokens.append(token_result.token) if token_result.text == "<|eot_id|>": diff --git a/llama_stack/providers/inline/inference/meta_reference/parallel_utils.py 
b/llama_stack/providers/inline/inference/meta_reference/parallel_utils.py index 658267f7f..91d0445ab 100644 --- a/llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +++ b/llama_stack/providers/inline/inference/meta_reference/parallel_utils.py @@ -207,7 +207,7 @@ def maybe_parse_message(maybe_json: Optional[str]) -> Optional[ProcessingMessage return parse_message(maybe_json) except json.JSONDecodeError: return None - except ValueError as e: + except ValueError: return None @@ -352,7 +352,7 @@ class ModelParallelProcessGroup: if isinstance(obj, TaskResponse): yield obj.result - except GeneratorExit as e: + except GeneratorExit: self.request_socket.send(encode_msg(CancelSentinel())) while True: obj_json = self.request_socket.send() diff --git a/llama_stack/providers/inline/inference/meta_reference/quantization/fp8_txest_disabled.py b/llama_stack/providers/inline/inference/meta_reference/quantization/fp8_txest_disabled.py index 014a26f09..cecb66dd3 100644 --- a/llama_stack/providers/inline/inference/meta_reference/quantization/fp8_txest_disabled.py +++ b/llama_stack/providers/inline/inference/meta_reference/quantization/fp8_txest_disabled.py @@ -7,6 +7,9 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement. +# The file gets a special treatment for now? +# ruff: noqa: N803 + import unittest import torch diff --git a/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py b/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py index 41387474f..c88787f18 100644 --- a/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +++ b/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py @@ -264,7 +264,7 @@ class LoraFinetuningSingleDevice: ) self.adapter_params = get_adapter_params(model) - self._is_dora = any(["magnitude" in k for k in self.adapter_params.keys()]) + self._is_dora = any("magnitude" in k for k in self.adapter_params.keys()) set_trainable_params(model, self.adapter_params) diff --git a/llama_stack/providers/inline/scoring/braintrust/braintrust.py b/llama_stack/providers/inline/scoring/braintrust/braintrust.py index be0f023f3..a48b6b58b 100644 --- a/llama_stack/providers/inline/scoring/braintrust/braintrust.py +++ b/llama_stack/providers/inline/scoring/braintrust/braintrust.py @@ -133,7 +133,7 @@ class BraintrustScoringImpl( async def shutdown(self) -> None: ... async def list_scoring_functions(self) -> List[ScoringFn]: - scoring_fn_defs_list = [x for x in self.supported_fn_defs_registry.values()] + scoring_fn_defs_list = list(self.supported_fn_defs_registry.values()) for f in scoring_fn_defs_list: assert f.identifier.startswith("braintrust"), ( "All braintrust scoring fn must have identifier prefixed with 'braintrust'! 
" diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py index 2ca7dd578..db9e176ee 100644 --- a/llama_stack/providers/remote/inference/nvidia/nvidia.py +++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py @@ -198,7 +198,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper): tool_config: Optional[ToolConfig] = None, ) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]: if tool_prompt_format: - warnings.warn("tool_prompt_format is not supported by NVIDIA NIM, ignoring") + warnings.warn("tool_prompt_format is not supported by NVIDIA NIM, ignoring", stacklevel=2) await check_health(self._config) # this raises errors diff --git a/llama_stack/providers/remote/inference/nvidia/openai_utils.py b/llama_stack/providers/remote/inference/nvidia/openai_utils.py index 1849fda6d..0582cb816 100644 --- a/llama_stack/providers/remote/inference/nvidia/openai_utils.py +++ b/llama_stack/providers/remote/inference/nvidia/openai_utils.py @@ -106,7 +106,7 @@ async def convert_chat_completion_request( payload.update(temperature=strategy.temperature) elif isinstance(strategy, TopKSamplingStrategy): if strategy.top_k != -1 and strategy.top_k < 1: - warnings.warn("top_k must be -1 or >= 1") + warnings.warn("top_k must be -1 or >= 1", stacklevel=2) nvext.update(top_k=strategy.top_k) elif isinstance(strategy, GreedySamplingStrategy): nvext.update(top_k=-1) @@ -168,7 +168,7 @@ def convert_completion_request( payload.update(top_p=request.sampling_params.top_p) elif request.sampling_params.strategy == "top_k": if request.sampling_params.top_k != -1 and request.sampling_params.top_k < 1: - warnings.warn("top_k must be -1 or >= 1") + warnings.warn("top_k must be -1 or >= 1", stacklevel=2) nvext.update(top_k=request.sampling_params.top_k) elif request.sampling_params.strategy == "greedy": nvext.update(top_k=-1) diff --git a/llama_stack/providers/tests/eval/test_eval.py b/llama_stack/providers/tests/eval/test_eval.py index ad80b8601..9ce3a972b 100644 --- a/llama_stack/providers/tests/eval/test_eval.py +++ b/llama_stack/providers/tests/eval/test_eval.py @@ -39,12 +39,11 @@ class Testeval: @pytest.mark.asyncio async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model): - eval_impl, benchmarks_impl, datasetio_impl, datasets_impl, models_impl = ( + eval_impl, benchmarks_impl, datasetio_impl, datasets_impl = ( eval_stack[Api.eval], eval_stack[Api.benchmarks], eval_stack[Api.datasetio], eval_stack[Api.datasets], - eval_stack[Api.models], ) await register_dataset(datasets_impl, for_generation=True, dataset_id="test_dataset_for_eval") @@ -92,11 +91,10 @@ class Testeval: @pytest.mark.asyncio async def test_eval_run_eval(self, eval_stack, inference_model, judge_model): - eval_impl, benchmarks_impl, datasets_impl, models_impl = ( + eval_impl, benchmarks_impl, datasets_impl = ( eval_stack[Api.eval], eval_stack[Api.benchmarks], eval_stack[Api.datasets], - eval_stack[Api.models], ) await register_dataset(datasets_impl, for_generation=True, dataset_id="test_dataset_for_eval") @@ -131,11 +129,10 @@ class Testeval: @pytest.mark.asyncio async def test_eval_run_benchmark_eval(self, eval_stack, inference_model): - eval_impl, benchmarks_impl, datasets_impl, models_impl = ( + eval_impl, benchmarks_impl, datasets_impl = ( eval_stack[Api.eval], eval_stack[Api.benchmarks], eval_stack[Api.datasets], - eval_stack[Api.models], ) response = await datasets_impl.list_datasets() diff --git 
a/llama_stack/providers/tests/report.py b/llama_stack/providers/tests/report.py index febd13045..c9a7f69a8 100644 --- a/llama_stack/providers/tests/report.py +++ b/llama_stack/providers/tests/report.py @@ -18,54 +18,48 @@ from llama_stack.models.llama.sku_list import all_registered_models INFERENCE_APIS = ["chat_completion"] FUNCTIONALITIES = ["streaming", "structured_output", "tool_calling"] SUPPORTED_MODELS = { - "ollama": set( - [ - CoreModelId.llama3_1_8b_instruct.value, - CoreModelId.llama3_1_8b_instruct.value, - CoreModelId.llama3_1_70b_instruct.value, - CoreModelId.llama3_1_70b_instruct.value, - CoreModelId.llama3_1_405b_instruct.value, - CoreModelId.llama3_1_405b_instruct.value, - CoreModelId.llama3_2_1b_instruct.value, - CoreModelId.llama3_2_1b_instruct.value, - CoreModelId.llama3_2_3b_instruct.value, - CoreModelId.llama3_2_3b_instruct.value, - CoreModelId.llama3_2_11b_vision_instruct.value, - CoreModelId.llama3_2_11b_vision_instruct.value, - CoreModelId.llama3_2_90b_vision_instruct.value, - CoreModelId.llama3_2_90b_vision_instruct.value, - CoreModelId.llama3_3_70b_instruct.value, - CoreModelId.llama_guard_3_8b.value, - CoreModelId.llama_guard_3_1b.value, - ] - ), - "fireworks": set( - [ - CoreModelId.llama3_1_8b_instruct.value, - CoreModelId.llama3_1_70b_instruct.value, - CoreModelId.llama3_1_405b_instruct.value, - CoreModelId.llama3_2_1b_instruct.value, - CoreModelId.llama3_2_3b_instruct.value, - CoreModelId.llama3_2_11b_vision_instruct.value, - CoreModelId.llama3_2_90b_vision_instruct.value, - CoreModelId.llama3_3_70b_instruct.value, - CoreModelId.llama_guard_3_8b.value, - CoreModelId.llama_guard_3_11b_vision.value, - ] - ), - "together": set( - [ - CoreModelId.llama3_1_8b_instruct.value, - CoreModelId.llama3_1_70b_instruct.value, - CoreModelId.llama3_1_405b_instruct.value, - CoreModelId.llama3_2_3b_instruct.value, - CoreModelId.llama3_2_11b_vision_instruct.value, - CoreModelId.llama3_2_90b_vision_instruct.value, - CoreModelId.llama3_3_70b_instruct.value, - CoreModelId.llama_guard_3_8b.value, - CoreModelId.llama_guard_3_11b_vision.value, - ] - ), + "ollama": { + CoreModelId.llama3_1_8b_instruct.value, + CoreModelId.llama3_1_8b_instruct.value, + CoreModelId.llama3_1_70b_instruct.value, + CoreModelId.llama3_1_70b_instruct.value, + CoreModelId.llama3_1_405b_instruct.value, + CoreModelId.llama3_1_405b_instruct.value, + CoreModelId.llama3_2_1b_instruct.value, + CoreModelId.llama3_2_1b_instruct.value, + CoreModelId.llama3_2_3b_instruct.value, + CoreModelId.llama3_2_3b_instruct.value, + CoreModelId.llama3_2_11b_vision_instruct.value, + CoreModelId.llama3_2_11b_vision_instruct.value, + CoreModelId.llama3_2_90b_vision_instruct.value, + CoreModelId.llama3_2_90b_vision_instruct.value, + CoreModelId.llama3_3_70b_instruct.value, + CoreModelId.llama_guard_3_8b.value, + CoreModelId.llama_guard_3_1b.value, + }, + "fireworks": { + CoreModelId.llama3_1_8b_instruct.value, + CoreModelId.llama3_1_70b_instruct.value, + CoreModelId.llama3_1_405b_instruct.value, + CoreModelId.llama3_2_1b_instruct.value, + CoreModelId.llama3_2_3b_instruct.value, + CoreModelId.llama3_2_11b_vision_instruct.value, + CoreModelId.llama3_2_90b_vision_instruct.value, + CoreModelId.llama3_3_70b_instruct.value, + CoreModelId.llama_guard_3_8b.value, + CoreModelId.llama_guard_3_11b_vision.value, + }, + "together": { + CoreModelId.llama3_1_8b_instruct.value, + CoreModelId.llama3_1_70b_instruct.value, + CoreModelId.llama3_1_405b_instruct.value, + CoreModelId.llama3_2_3b_instruct.value, + 
CoreModelId.llama3_2_11b_vision_instruct.value, + CoreModelId.llama3_2_90b_vision_instruct.value, + CoreModelId.llama3_3_70b_instruct.value, + CoreModelId.llama_guard_3_8b.value, + CoreModelId.llama_guard_3_11b_vision.value, + }, } diff --git a/llama_stack/providers/tests/scoring/test_scoring.py b/llama_stack/providers/tests/scoring/test_scoring.py index e98fd8627..d80b105f4 100644 --- a/llama_stack/providers/tests/scoring/test_scoring.py +++ b/llama_stack/providers/tests/scoring/test_scoring.py @@ -45,13 +45,11 @@ class TestScoring: scoring_functions_impl, datasetio_impl, datasets_impl, - models_impl, ) = ( scoring_stack[Api.scoring], scoring_stack[Api.scoring_functions], scoring_stack[Api.datasetio], scoring_stack[Api.datasets], - scoring_stack[Api.models], ) scoring_fns_list = await scoring_functions_impl.list_scoring_functions() provider_id = scoring_fns_list[0].provider_id @@ -102,13 +100,11 @@ class TestScoring: scoring_functions_impl, datasetio_impl, datasets_impl, - models_impl, ) = ( scoring_stack[Api.scoring], scoring_stack[Api.scoring_functions], scoring_stack[Api.datasetio], scoring_stack[Api.datasets], - scoring_stack[Api.models], ) await register_dataset(datasets_impl, for_rag=True) response = await datasets_impl.list_datasets() @@ -163,13 +159,11 @@ class TestScoring: scoring_functions_impl, datasetio_impl, datasets_impl, - models_impl, ) = ( scoring_stack[Api.scoring], scoring_stack[Api.scoring_functions], scoring_stack[Api.datasetio], scoring_stack[Api.datasets], - scoring_stack[Api.models], ) await register_dataset(datasets_impl, for_rag=True) rows = await datasetio_impl.get_rows_paginated( diff --git a/llama_stack/providers/utils/inference/openai_compat.py b/llama_stack/providers/utils/inference/openai_compat.py index d0fdf6385..98c2bfd2e 100644 --- a/llama_stack/providers/utils/inference/openai_compat.py +++ b/llama_stack/providers/utils/inference/openai_compat.py @@ -605,7 +605,7 @@ def convert_tool_call( tool_name=tool_call.function.name, arguments=json.loads(tool_call.function.arguments), ) - except Exception as e: + except Exception: return UnparseableToolCall( call_id=tool_call.id or "", tool_name=tool_call.function.name or "", @@ -876,7 +876,9 @@ async def convert_openai_chat_completion_stream( # it is possible to have parallel tool calls in stream, but # ChatCompletionResponseEvent only supports one per stream if len(choice.delta.tool_calls) > 1: - warnings.warn("multiple tool calls found in a single delta, using the first, ignoring the rest") + warnings.warn( + "multiple tool calls found in a single delta, using the first, ignoring the rest", stacklevel=2 + ) if not enable_incremental_tool_calls: yield ChatCompletionResponseStreamChunk( diff --git a/llama_stack/providers/utils/kvstore/redis/redis.py b/llama_stack/providers/utils/kvstore/redis/redis.py index f5254198b..a390ea866 100644 --- a/llama_stack/providers/utils/kvstore/redis/redis.py +++ b/llama_stack/providers/utils/kvstore/redis/redis.py @@ -36,7 +36,7 @@ class RedisKVStoreImpl(KVStore): value = await self.redis.get(key) if value is None: return None - ttl = await self.redis.ttl(key) + await self.redis.ttl(key) return value async def delete(self, key: str) -> None: diff --git a/llama_stack/providers/utils/scoring/aggregation_utils.py b/llama_stack/providers/utils/scoring/aggregation_utils.py index 35c4ee180..6686e4ade 100644 --- a/llama_stack/providers/utils/scoring/aggregation_utils.py +++ b/llama_stack/providers/utils/scoring/aggregation_utils.py @@ -32,7 +32,7 @@ def aggregate_categorical_count( 
scoring_results: List[ScoringResultRow], ) -> Dict[str, Any]: scores = [str(r["score"]) for r in scoring_results] - unique_scores = sorted(list(set(scores))) + unique_scores = sorted(set(scores)) return {"categorical_count": {s: scores.count(s) for s in unique_scores}} diff --git a/llama_stack/providers/utils/scoring/base_scoring_fn.py b/llama_stack/providers/utils/scoring/base_scoring_fn.py index a741e5baa..d28c57cc1 100644 --- a/llama_stack/providers/utils/scoring/base_scoring_fn.py +++ b/llama_stack/providers/utils/scoring/base_scoring_fn.py @@ -66,7 +66,7 @@ class RegisteredBaseScoringFn(BaseScoringFn): return self.__class__.__name__ def get_supported_scoring_fn_defs(self) -> List[ScoringFn]: - return [x for x in self.supported_fn_defs_registry.values()] + return list(self.supported_fn_defs_registry.values()) def register_scoring_fn_def(self, scoring_fn: ScoringFn) -> None: if scoring_fn.identifier in self.supported_fn_defs_registry: diff --git a/llama_stack/scripts/distro_codegen.py b/llama_stack/scripts/distro_codegen.py index 76c7283eb..92c82983e 100644 --- a/llama_stack/scripts/distro_codegen.py +++ b/llama_stack/scripts/distro_codegen.py @@ -99,7 +99,7 @@ def collect_template_dependencies(template_dir: Path) -> tuple[str | None, list[ template = template_func() normal_deps, special_deps = get_provider_dependencies(template.providers) # Combine all dependencies in order: normal deps, special deps, server deps - all_deps = sorted(list(set(normal_deps + SERVER_DEPENDENCIES))) + sorted(list(set(special_deps))) + all_deps = sorted(set(normal_deps + SERVER_DEPENDENCIES)) + sorted(set(special_deps)) return template.name, all_deps except Exception: diff --git a/pyproject.toml b/pyproject.toml index dc5659f06..893aa3330 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -123,39 +123,16 @@ select = [ "I", # isort ] ignore = [ - "E203", - "E305", - "E402", - "E501", # line too long - "E721", - "E741", - "F405", - "F841", - "C408", # ignored because we like the dict keyword argument syntax - "E302", - "W291", - "E303", - "N812", # ignored because import torch.nn.functional as F is PyTorch convention - "N817", # ignored because importing using acronyms is convention (DistributedDataParallel as DDP) - "E731", # allow usage of assigning lambda expressions + # The following ignores are desired by the project maintainers. + "E402", # Module level import not at top of file + "E501", # Line too long + "F405", # Maybe undefined or defined from star import + "C408", # Ignored because we like the dict keyword argument syntax + "N812", # Ignored because import torch.nn.functional as F is PyTorch convention + # These are the additional ones we started ignoring after moving to ruff. We should look into each one of them later. - "C901", - "C405", - "C414", - "N803", - "N999", - "C403", - "C416", - "B028", - "C419", - "C401", - "B023", - # shebang has extra meaning in fbcode lints, so I think it's not worth trying - # to line this up with executable bit - "EXE001", - "N802", # random naming hints don't need + "C901", # Complexity of the function is too high # these ignores are from flake8-bugbear; please fix! - "B007", "B008", ] diff --git a/tests/client-sdk/__init__.py b/tests/client-sdk/__init__.py index 756f351d8..ce038c94b 100644 --- a/tests/client-sdk/__init__.py +++ b/tests/client-sdk/__init__.py @@ -3,3 +3,4 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+# ruff: noqa: N999 diff --git a/tests/client-sdk/agents/__init__.py b/tests/client-sdk/agents/__init__.py index 756f351d8..ce038c94b 100644 --- a/tests/client-sdk/agents/__init__.py +++ b/tests/client-sdk/agents/__init__.py @@ -3,3 +3,4 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +# ruff: noqa: N999 diff --git a/tests/client-sdk/conftest.py b/tests/client-sdk/conftest.py index c0f4dca53..3ecf45086 100644 --- a/tests/client-sdk/conftest.py +++ b/tests/client-sdk/conftest.py @@ -117,7 +117,7 @@ def client_with_models(llama_stack_client, text_model_id, vision_model_id, embed assert len(providers) > 0, "No inference providers found" inference_providers = [p.provider_id for p in providers if p.provider_type != "inline::sentence-transformers"] - model_ids = set(m.identifier for m in client.models.list()) + model_ids = {m.identifier for m in client.models.list()} model_ids.update(m.provider_resource_id for m in client.models.list()) if text_model_id and text_model_id not in model_ids: diff --git a/tests/client-sdk/inference/__init__.py b/tests/client-sdk/inference/__init__.py index 756f351d8..ce038c94b 100644 --- a/tests/client-sdk/inference/__init__.py +++ b/tests/client-sdk/inference/__init__.py @@ -3,3 +3,4 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +# ruff: noqa: N999 diff --git a/tests/client-sdk/inference/test_embedding.py b/tests/client-sdk/inference/test_embedding.py index c46a6517f..69d35d05d 100644 --- a/tests/client-sdk/inference/test_embedding.py +++ b/tests/client-sdk/inference/test_embedding.py @@ -176,7 +176,7 @@ def test_embedding_truncation_error( ): if inference_provider_type not in SUPPORTED_PROVIDERS: pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet") - with pytest.raises(BadRequestError) as excinfo: + with pytest.raises(BadRequestError): llama_stack_client.inference.embeddings( model_id=embedding_model_id, contents=[DUMMY_LONG_TEXT], text_truncation=text_truncation ) @@ -243,7 +243,7 @@ def test_embedding_text_truncation_error( ): if inference_provider_type not in SUPPORTED_PROVIDERS: pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet") - with pytest.raises(BadRequestError) as excinfo: + with pytest.raises(BadRequestError): llama_stack_client.inference.embeddings( model_id=embedding_model_id, contents=[DUMMY_STRING], text_truncation=text_truncation ) diff --git a/tests/client-sdk/inference/test_text_inference.py b/tests/client-sdk/inference/test_text_inference.py index 7850d2d57..63813a1cc 100644 --- a/tests/client-sdk/inference/test_text_inference.py +++ b/tests/client-sdk/inference/test_text_inference.py @@ -139,7 +139,7 @@ def test_text_completion_log_probs_streaming(client_with_models, text_model_id, "top_k": 1, }, ) - streamed_content = [chunk for chunk in response] + streamed_content = list(response) for chunk in streamed_content: if chunk.delta: # if there's a token, we expect logprobs assert chunk.logprobs, "Logprobs should not be empty" @@ -405,7 +405,7 @@ def test_text_chat_completion_tool_calling_tools_not_in_request( assert delta.tool_call.tool_name == "get_object_namespace_list" if delta.type == "tool_call" and delta.parse_status == "failed": # expect raw message that failed to parse in tool_call - assert type(delta.tool_call) == str + assert isinstance(delta.tool_call, str) assert len(delta.tool_call) > 0 else: for tc in 
response.completion_message.tool_calls: diff --git a/tests/client-sdk/report.py b/tests/client-sdk/report.py index b946b85ba..0151b3d20 100644 --- a/tests/client-sdk/report.py +++ b/tests/client-sdk/report.py @@ -42,29 +42,27 @@ def featured_models(): SUPPORTED_MODELS = { - "ollama": set( - [ - CoreModelId.llama3_1_8b_instruct.value, - CoreModelId.llama3_1_8b_instruct.value, - CoreModelId.llama3_1_70b_instruct.value, - CoreModelId.llama3_1_70b_instruct.value, - CoreModelId.llama3_1_405b_instruct.value, - CoreModelId.llama3_1_405b_instruct.value, - CoreModelId.llama3_2_1b_instruct.value, - CoreModelId.llama3_2_1b_instruct.value, - CoreModelId.llama3_2_3b_instruct.value, - CoreModelId.llama3_2_3b_instruct.value, - CoreModelId.llama3_2_11b_vision_instruct.value, - CoreModelId.llama3_2_11b_vision_instruct.value, - CoreModelId.llama3_2_90b_vision_instruct.value, - CoreModelId.llama3_2_90b_vision_instruct.value, - CoreModelId.llama3_3_70b_instruct.value, - CoreModelId.llama_guard_3_8b.value, - CoreModelId.llama_guard_3_1b.value, - ] - ), - "tgi": set([model.core_model_id.value for model in all_registered_models() if model.huggingface_repo]), - "vllm": set([model.core_model_id.value for model in all_registered_models() if model.huggingface_repo]), + "ollama": { + CoreModelId.llama3_1_8b_instruct.value, + CoreModelId.llama3_1_8b_instruct.value, + CoreModelId.llama3_1_70b_instruct.value, + CoreModelId.llama3_1_70b_instruct.value, + CoreModelId.llama3_1_405b_instruct.value, + CoreModelId.llama3_1_405b_instruct.value, + CoreModelId.llama3_2_1b_instruct.value, + CoreModelId.llama3_2_1b_instruct.value, + CoreModelId.llama3_2_3b_instruct.value, + CoreModelId.llama3_2_3b_instruct.value, + CoreModelId.llama3_2_11b_vision_instruct.value, + CoreModelId.llama3_2_11b_vision_instruct.value, + CoreModelId.llama3_2_90b_vision_instruct.value, + CoreModelId.llama3_2_90b_vision_instruct.value, + CoreModelId.llama3_3_70b_instruct.value, + CoreModelId.llama_guard_3_8b.value, + CoreModelId.llama_guard_3_1b.value, + }, + "tgi": {model.core_model_id.value for model in all_registered_models() if model.huggingface_repo}, + "vllm": {model.core_model_id.value for model in all_registered_models() if model.huggingface_repo}, } diff --git a/tests/client-sdk/safety/__init__.py b/tests/client-sdk/safety/__init__.py index 756f351d8..ce038c94b 100644 --- a/tests/client-sdk/safety/__init__.py +++ b/tests/client-sdk/safety/__init__.py @@ -3,3 +3,4 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+# ruff: noqa: N999 diff --git a/tests/client-sdk/safety/test_safety.py b/tests/client-sdk/safety/test_safety.py index 1417a9c06..79963e4d4 100644 --- a/tests/client-sdk/safety/test_safety.py +++ b/tests/client-sdk/safety/test_safety.py @@ -42,7 +42,7 @@ def code_scanner_shield_id(available_shields): @pytest.fixture(scope="session") def model_providers(llama_stack_client): - return set([x.provider_id for x in llama_stack_client.providers.list() if x.api == "inference"]) + return {x.provider_id for x in llama_stack_client.providers.list() if x.api == "inference"} def test_unsafe_examples(llama_stack_client, llama_guard_text_shield_id): diff --git a/tests/client-sdk/vector_io/__init__.py b/tests/client-sdk/vector_io/__init__.py index 756f351d8..ce038c94b 100644 --- a/tests/client-sdk/vector_io/__init__.py +++ b/tests/client-sdk/vector_io/__init__.py @@ -3,3 +3,4 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +# ruff: noqa: N999 From 18ab1985da2cb461772bd9a4501a7803555eaa1f Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Fri, 28 Feb 2025 12:48:49 -0500 Subject: [PATCH 12/22] fix: Make remote::vllm compatible with vLLM <= v0.6.3 (#1325) # What does this PR do? This is to be consistent with OpenAI API and support vLLM <= v0.6.3 References: * https://platform.openai.com/docs/api-reference/chat/create#chat-create-tool_choice * https://github.com/vllm-project/vllm/pull/10000 This fixes the error when running older versions of vLLM: ``` 00:50:19.834 [START] /v1/inference/chat-completion INFO 2025-02-28 00:50:20,203 httpx:1025: HTTP Request: POST https://api-xeai-granite-3-1-8b-instruct.apps.int.stc.ai.preprod.us-east-1.aws.paas.redhat.com/v1/chat/completions "HTTP/1.1 400 Bad Request" Traceback (most recent call last): File "/usr/local/lib/python3.10/site-packages/llama_stack/distribution/server/server.py", line 235, in endpoint return await maybe_await(value) File "/usr/local/lib/python3.10/site-packages/llama_stack/distribution/server/server.py", line 201, in maybe_await return await value File "/usr/local/lib/python3.10/site-packages/llama_stack/providers/utils/telemetry/trace_protocol.py", line 89, in async_wrapper result = await method(self, *args, **kwargs) File "/usr/local/lib/python3.10/site-packages/llama_stack/distribution/routers/routers.py", line 193, in chat_completion return await provider.chat_completion(**params) File "/usr/local/lib/python3.10/site-packages/llama_stack/providers/utils/telemetry/trace_protocol.py", line 89, in async_wrapper result = await method(self, *args, **kwargs) File "/usr/local/lib/python3.10/site-packages/llama_stack/providers/remote/inference/vllm/vllm.py", line 286, in chat_completion return await self._nonstream_chat_completion(request, self.client) File "/usr/local/lib/python3.10/site-packages/llama_stack/providers/remote/inference/vllm/vllm.py", line 292, in _nonstream_chat_completion r = client.chat.completions.create(**params) File "/usr/local/lib/python3.10/site-packages/openai/_utils/_utils.py", line 279, in wrapper return func(*args, **kwargs) File "/usr/local/lib/python3.10/site-packages/openai/resources/chat/completions/completions.py", line 879, in create return self._post( File "/usr/local/lib/python3.10/site-packages/openai/_base_client.py", line 1290, in post return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)) File "/usr/local/lib/python3.10/site-packages/openai/_base_client.py", line 967, in request return self._request( File 
"/usr/local/lib/python3.10/site-packages/openai/_base_client.py", line 1071, in _request raise self._make_status_error_from_response(err.response) from None openai.BadRequestError: Error code: 400 - {'object': 'error', 'message': "[{'type': 'value_error', 'loc': ('body',), 'msg': 'Value error, When using `tool_choice`, `tools` must be set.', 'input': {'messages': [{'role': 'user', 'content': [{'type': 'text', 'text': 'What model are you?'}]}], 'model': 'granite-3-1-8b-instruct', 'max_tokens': 4096, 'stream': False, 'temperature': 0.0, 'tools': None, 'tool_choice': 'auto'}, 'ctx': {'error': ValueError('When using `tool_choice`, `tools` must be set.')}}]", 'type': 'BadRequestError', 'param': None, 'code': 400} INFO: 2600:1700:9d20:ac0::49:59736 - "POST /v1/inference/chat-completion HTTP/1.1" 500 Internal Server Error 00:50:20.266 [END] /v1/inference/chat-completion [StatusCode.OK] (431.99ms) ``` ## Test Plan All existing tests pass. --------- Signed-off-by: Yuan Tang --- llama_stack/providers/remote/inference/vllm/vllm.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py index 967a3e44d..8ec23cd90 100644 --- a/llama_stack/providers/remote/inference/vllm/vllm.py +++ b/llama_stack/providers/remote/inference/vllm/vllm.py @@ -270,6 +270,12 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): tool_config: Optional[ToolConfig] = None, ) -> AsyncGenerator: model = await self.model_store.get_model(model_id) + # This is to be consistent with OpenAI API and support vLLM <= v0.6.3 + # References: + # * https://platform.openai.com/docs/api-reference/chat/create#chat-create-tool_choice + # * https://github.com/vllm-project/vllm/pull/10000 + if not tools and tool_config is not None: + tool_config.tool_choice = ToolChoice.none request = ChatCompletionRequest( model=model.provider_resource_id, messages=messages, From c91548fe07ca7ec0fa33cf82443e165594abda9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Fri, 28 Feb 2025 19:01:52 +0100 Subject: [PATCH 13/22] build(container): misc improvements (#1291) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? See individual commit messages. 
[//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan Apply this diff: ``` diff --git a/llama_stack/templates/ollama/build.yaml b/llama_stack/templates/ollama/build.yaml index da33b8d5..4a702f6f 100644 --- a/llama_stack/templates/ollama/build.yaml +++ b/llama_stack/templates/ollama/build.yaml @@ -28,5 +28,5 @@ distribution_spec: - remote::tavily-search - inline::code-interpreter - inline::rag-runtime - - remote::model-context-protocol + container_image: "registry.access.redhat.com/ubi9" image_type: conda ``` Then run: ``` CONTAINER_BINARY=podman llama stack build --template ollama --image-type container --image-name registry.access.redhat.com/ubi9 Containerfile created successfully in /var/folders/mq/rnm5w_7s2d3fxmtkx02knvhm0000gn/T/tmp.I7E5V6zbVI/Containerfile FROM registry.access.redhat.com/ubi9 WORKDIR /app RUN dnf -y update && dnf install -y iputils net-tools wget vim-minimal python3.11 python3.11-pip python3.11-wheel python3.11-setuptools && ln -s /bin/pip3.11 /bin/pip && ln -s /bin/python3.11 /bin/python && dnf clean all ENV UV_SYSTEM_PYTHON=1 RUN pip install uv RUN uv pip install --no-cache ollama nltk opentelemetry-sdk aiosqlite matplotlib datasets sqlite-vec scipy chromadb-client psycopg2-binary numpy scikit-learn openai redis pandas tqdm blobfile sentencepiece aiohttp requests pillow pymongo transformers autoevals opentelemetry-exporter-otlp-proto-http pypdf chardet aiosqlite fastapi fire httpx uvicorn RUN uv pip install --no-cache llama-stack RUN pip uninstall -y uv ENTRYPOINT ["python", "-m", "llama_stack.distribution.server.server", "--template", "ollama"] # Allows running as non-root user RUN mkdir -p /.llama /.cache RUN chmod -R g+rw /app /.llama /.cache PWD: /Users/leseb/Documents/AI/llama-stack Containerfile: /var/folders/mq/rnm5w_7s2d3fxmtkx02knvhm0000gn/T/tmp.I7E5V6zbVI/Containerfile + podman build --platform linux/arm64 -t distribution-ollama:0.1.4 -f /var/folders/mq/rnm5w_7s2d3fxmtkx02knvhm0000gn/T/tmp.I7E5V6zbVI/Containerfile . 
--progress=plain STEP 1/11: FROM registry.access.redhat.com/ubi9 STEP 2/11: WORKDIR /app --> Using cache d73dafd4caddd75bc29242a9031258fea759dc571c5bb53a64b5e6d86b3b1335 --> d73dafd4cadd STEP 3/11: RUN dnf -y update && dnf install -y iputils net-tools wget vim-minimal python3.11 python3.11-pip python3.11-wheel python3.11-setuptools && ln -s /bin/pip3.11 /bin/pip && ln -s /bin/python3.11 /bin/python && dnf clean all --> Using cache b74ad682db149771612a3ea1e4796e0760ab8a4e07c26ad672b46a86d38178c2 --> b74ad682db14 STEP 4/11: ENV UV_SYSTEM_PYTHON=1 --> Using cache 0812a05e6576506aa2fe646cbf239d0cb504cac30a50cb5cf4dc88e49039466d --> 0812a05e6576 STEP 5/11: RUN pip install uv --> Using cache a0ce1705f87e52f70f6eb34e66f67b68ebc7c1a073f4d2a664b189cfa89a4e88 --> a0ce1705f87e STEP 6/11: RUN uv pip install --no-cache ollama nltk opentelemetry-sdk aiosqlite matplotlib datasets sqlite-vec scipy chromadb-client psycopg2-binary numpy scikit-learn openai redis pandas tqdm blobfile sentencepiece aiohttp requests pillow pymongo transformers autoevals opentelemetry-exporter-otlp-proto-http pypdf chardet aiosqlite fastapi fire httpx uvicorn Using Python 3.11.9 environment at: /usr Resolved 107 packages in 1.78s Downloading kiwisolver (1.4MiB) Downloading aiohttp (1.6MiB) Downloading grpcio (5.4MiB) Downloading nltk (1.4MiB) Downloading transformers (9.5MiB) Downloading pydantic-core (1.7MiB) Downloading lxml (4.6MiB) Downloading psycopg2-binary (2.7MiB) Downloading scipy (33.8MiB) Downloading scikit-learn (12.0MiB) Downloading tokenizers (2.8MiB) Downloading fonttools (4.6MiB) Downloading pymongo (1.3MiB) Downloading rapidfuzz (1.4MiB) Downloading sentencepiece (1.2MiB) Downloading pyarrow (38.7MiB) Downloading matplotlib (8.1MiB) Downloading pycryptodomex (2.1MiB) Downloading pillow (4.2MiB) Downloading pandas (14.9MiB) Downloading numpy (13.6MiB) Building fire==0.7.0 Downloaded sentencepiece Downloaded kiwisolver Downloaded pymongo Downloaded rapidfuzz Downloaded nltk Downloaded aiohttp Built fire==0.7.0 Downloaded pydantic-core Downloaded pycryptodomex Downloaded psycopg2-binary Downloaded tokenizers Downloaded pillow Downloaded lxml Downloaded fonttools Downloaded grpcio Downloaded matplotlib Downloaded transformers Downloaded scikit-learn Downloaded numpy Downloaded pandas Downloaded scipy Downloaded pyarrow Prepared 107 packages in 3.03s Installed 107 packages in 62ms + aiohappyeyeballs==2.4.6 + aiohttp==3.11.13 + aiosignal==1.3.2 + aiosqlite==0.21.0 + annotated-types==0.7.0 + anyio==4.8.0 + attrs==25.1.0 + autoevals==0.0.120 + backoff==2.2.1 + blobfile==3.0.0 + braintrust-core==0.0.58 + certifi==2025.1.31 + chardet==5.2.0 + charset-normalizer==3.4.1 + chevron==0.14.0 + chromadb-client==0.6.3 + click==8.1.8 + contourpy==1.3.1 + cycler==0.12.1 + datasets==3.3.2 + deprecated==1.2.18 + dill==0.3.8 + distro==1.9.0 + dnspython==2.7.0 + fastapi==0.115.8 + filelock==3.17.0 + fire==0.7.0 + fonttools==4.56.0 + frozenlist==1.5.0 + fsspec==2024.12.0 + googleapis-common-protos==1.68.0 + grpcio==1.70.0 + h11==0.14.0 + httpcore==1.0.7 + httpx==0.28.1 + huggingface-hub==0.29.1 + idna==3.10 + importlib-metadata==8.5.0 + jiter==0.8.2 + joblib==1.4.2 + jsonschema==4.23.0 + jsonschema-specifications==2024.10.1 + kiwisolver==1.4.8 + levenshtein==0.26.1 + lxml==5.3.1 + matplotlib==3.10.0 + monotonic==1.6 + multidict==6.1.0 + multiprocess==0.70.16 + nltk==3.9.1 + numpy==1.26.4 + ollama==0.4.7 + openai==1.64.0 + opentelemetry-api==1.30.0 + opentelemetry-exporter-otlp-proto-common==1.30.0 + 
opentelemetry-exporter-otlp-proto-grpc==1.30.0 + opentelemetry-exporter-otlp-proto-http==1.30.0 + opentelemetry-proto==1.30.0 + opentelemetry-sdk==1.30.0 + opentelemetry-semantic-conventions==0.51b0 + orjson==3.10.15 + overrides==7.7.0 + packaging==24.2 + pandas==2.2.3 + pillow==11.1.0 + posthog==3.16.0 + propcache==0.3.0 + protobuf==5.29.3 + psycopg2-binary==2.9.10 + pyarrow==19.0.1 + pycryptodomex==3.21.0 + pydantic==2.10.6 + pydantic-core==2.27.2 + pymongo==4.11.1 + pyparsing==3.2.1 + pypdf==5.3.0 + python-dateutil==2.9.0.post0 + pytz==2025.1 + pyyaml==6.0.2 + rapidfuzz==3.12.1 + redis==5.2.1 + referencing==0.36.2 + regex==2024.11.6 + requests==2.32.3 + rpds-py==0.23.1 + safetensors==0.5.3 + scikit-learn==1.6.1 + scipy==1.15.2 + sentencepiece==0.2.0 + six==1.17.0 + sniffio==1.3.1 + sqlite-vec==0.1.6 + starlette==0.45.3 + tenacity==9.0.0 + termcolor==2.5.0 + threadpoolctl==3.5.0 + tokenizers==0.21.0 + tqdm==4.67.1 + transformers==4.49.0 + typing-extensions==4.12.2 + tzdata==2025.1 + urllib3==2.3.0 + uvicorn==0.34.0 + wrapt==1.17.2 + xxhash==3.5.0 + yarl==1.18.3 + zipp==3.21.0 --> 5b5b823605a1 STEP 7/11: RUN uv pip install --no-cache llama-stack Using Python 3.11.9 environment at: /usr Resolved 55 packages in 1.08s Downloading setuptools (1.2MiB) Downloading pygments (1.2MiB) Downloading llama-models (1.5MiB) Downloading tiktoken (1.1MiB) Downloaded tiktoken Downloaded llama-models Downloaded pygments Downloaded setuptools Prepared 15 packages in 402ms Installed 15 packages in 15ms + jinja2==3.1.5 + llama-models==0.1.4 + llama-stack==0.1.4 + llama-stack-client==0.1.4 + markdown-it-py==3.0.0 + markupsafe==3.0.2 + mdurl==0.1.2 + prompt-toolkit==3.0.50 + pyaml==25.1.0 + pygments==2.19.1 + python-dotenv==1.0.1 + rich==13.9.4 + setuptools==75.8.2 + tiktoken==0.9.0 + wcwidth==0.2.13 --> 38a037443807 STEP 8/11: RUN pip uninstall -y uv Found existing installation: uv 0.6.3 Uninstalling uv-0.6.3: Successfully uninstalled uv-0.6.3 WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv --> 54f749dc5ece STEP 9/11: ENTRYPOINT ["python", "-m", "llama_stack.distribution.server.server", "--template", "ollama"] --> 481c138b1982 STEP 10/11: RUN mkdir -p /.llama /.cache --> 0fc174f014a8 STEP 11/11: RUN chmod -R g+rw /app /.llama /.cache COMMIT distribution-ollama:0.1.4 --> d41b4ab4b136 Successfully tagged localhost/distribution-ollama:0.1.4 d41b4ab4b1363bfbaf6239e6f313bcb37873ef4b5f2fd816a4ee55acf2ac54d3 + set +x Success! Build Successful! ``` UBI9 container successfully builds. 
Run the container: ``` podman run d41b4ab4b1363bfbaf6239e6f313bcb37873ef4b5f2fd816a4ee55acf2ac54d3 --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct INFO 2025-02-27 13:08:03,666 llama_stack.distribution.resolver:213: Resolved 30 providers INFO 2025-02-27 13:08:03,666 llama_stack.distribution.resolver:215: inner-inference => ollama INFO 2025-02-27 13:08:03,666 llama_stack.distribution.resolver:215: models => __routing_table__ INFO 2025-02-27 13:08:03,666 llama_stack.distribution.resolver:215: inference => __autorouted__ INFO 2025-02-27 13:08:03,666 llama_stack.distribution.resolver:215: inner-vector_io => sqlite-vec INFO 2025-02-27 13:08:03,666 llama_stack.distribution.resolver:215: inner-safety => llama-guard INFO 2025-02-27 13:08:03,666 llama_stack.distribution.resolver:215: shields => __routing_table__ INFO 2025-02-27 13:08:03,666 llama_stack.distribution.resolver:215: safety => __autorouted__ INFO 2025-02-27 13:08:03,666 llama_stack.distribution.resolver:215: vector_dbs => __routing_table__ INFO 2025-02-27 13:08:03,666 llama_stack.distribution.resolver:215: vector_io => __autorouted__ INFO 2025-02-27 13:08:03,666 llama_stack.distribution.resolver:215: inner-tool_runtime => brave-search INFO 2025-02-27 13:08:03,666 llama_stack.distribution.resolver:215: inner-tool_runtime => tavily-search ``` [//]: # (## Documentation) --------- Signed-off-by: Sébastien Han --- llama_stack/distribution/build.py | 3 - llama_stack/distribution/build_container.sh | 65 +++++++++++++-------- 2 files changed, 40 insertions(+), 28 deletions(-) diff --git a/llama_stack/distribution/build.py b/llama_stack/distribution/build.py index 2b43b8128..3d808a4a4 100644 --- a/llama_stack/distribution/build.py +++ b/llama_stack/distribution/build.py @@ -15,7 +15,6 @@ from termcolor import cprint from llama_stack.distribution.datatypes import BuildConfig, Provider from llama_stack.distribution.distribution import get_provider_registry -from llama_stack.distribution.utils.config_dirs import BUILDS_BASE_DIR from llama_stack.distribution.utils.exec import run_command, run_with_pty from llama_stack.distribution.utils.image_types import ImageType from llama_stack.providers.datatypes import Api @@ -103,8 +102,6 @@ def build_image( template_or_config, image_name, container_base, - str(build_file_path), - str(BUILDS_BASE_DIR / ImageType.container.value), " ".join(normal_deps), ] elif build_config.image_type == ImageType.conda.value: diff --git a/llama_stack/distribution/build_container.sh b/llama_stack/distribution/build_container.sh index 08941a538..9b584a85c 100755 --- a/llama_stack/distribution/build_container.sh +++ b/llama_stack/distribution/build_container.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. 
@@ -20,26 +20,27 @@ UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT:-500} # mounting is not supported by docker buildx, so we use COPY instead USE_COPY_NOT_MOUNT=${USE_COPY_NOT_MOUNT:-} -if [ "$#" -lt 6 ]; then +if [ "$#" -lt 4 ]; then # This only works for templates - echo "Usage: $0 []" >&2 + echo "Usage: $0 []" >&2 exit 1 fi set -euo pipefail template_or_config="$1" -image_name="$2" -container_base="$3" -build_file_path="$4" -host_build_dir="$5" -pip_dependencies="$6" -special_pip_deps="${7:-}" +shift +image_name="$1" +shift +container_base="$1" +shift +pip_dependencies="$1" +shift +special_pip_deps="${1:-}" # Define color codes RED='\033[0;31m' -GREEN='\033[0;32m' NC='\033[0m' # No Color CONTAINER_BINARY=${CONTAINER_BINARY:-docker} @@ -48,7 +49,6 @@ CONTAINER_OPTS=${CONTAINER_OPTS:-} TEMP_DIR=$(mktemp -d) add_to_container() { - local input output_file="$TEMP_DIR/Containerfile" if [ -t 0 ]; then printf '%s\n' "$1" >>"$output_file" @@ -64,9 +64,9 @@ if [[ $container_base == *"registry.access.redhat.com/ubi9"* ]]; then FROM $container_base WORKDIR /app -RUN microdnf -y update && microdnf install -y iputils net-tools wget \ +RUN dnf -y update && dnf install -y iputils net-tools wget \ vim-minimal python3.11 python3.11-pip python3.11-wheel \ - python3.11-setuptools && ln -s /bin/pip3.11 /bin/pip && ln -s /bin/python3.11 /bin/python && microdnf clean all + python3.11-setuptools && ln -s /bin/pip3.11 /bin/pip && ln -s /bin/python3.11 /bin/python && dnf clean all ENV UV_SYSTEM_PYTHON=1 RUN pip install uv @@ -165,6 +165,11 @@ EOF fi fi +# remove uv after installation + add_to_container << EOF +RUN pip uninstall -y uv +EOF + # if template_or_config ends with .yaml, it is not a template and we should not use the --template flag if [[ "$template_or_config" != *.yaml ]]; then add_to_container << EOF @@ -185,26 +190,31 @@ RUN mkdir -p /.llama /.cache RUN chmod -R g+rw /app /.llama /.cache EOF -printf "Containerfile created successfully in $TEMP_DIR/Containerfile\n\n" -cat $TEMP_DIR/Containerfile +printf "Containerfile created successfully in %s/Containerfile\n\n" "$TEMP_DIR" +cat "$TEMP_DIR"/Containerfile printf "\n" -mounts="" +# Start building the CLI arguments +CLI_ARGS=() + +# Read CONTAINER_OPTS and put it in an array +read -ra CLI_ARGS <<< "$CONTAINER_OPTS" + if [ "$USE_COPY_NOT_MOUNT" != "true" ]; then if [ -n "$LLAMA_STACK_DIR" ]; then - mounts="$mounts -v $(readlink -f $LLAMA_STACK_DIR):$stack_mount" + CLI_ARGS+=("-v" "$(readlink -f "$LLAMA_STACK_DIR"):$stack_mount") fi if [ -n "$LLAMA_MODELS_DIR" ]; then - mounts="$mounts -v $(readlink -f $LLAMA_MODELS_DIR):$models_mount" + CLI_ARGS+=("-v" "$(readlink -f "$LLAMA_MODELS_DIR"):$models_mount") fi if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then - mounts="$mounts -v $(readlink -f $LLAMA_STACK_CLIENT_DIR):$client_mount" + CLI_ARGS+=("-v" "$(readlink -f "$LLAMA_STACK_CLIENT_DIR"):$client_mount") fi fi if command -v selinuxenabled &>/dev/null && selinuxenabled; then # Disable SELinux labels -- we don't want to relabel the llama-stack source dir - CONTAINER_OPTS="$CONTAINER_OPTS --security-opt label=disable" + CLI_ARGS+=("--security-opt" "label=disable") fi # Set version tag based on PyPI version @@ -225,11 +235,11 @@ image_tag="$image_name:$version_tag" # Detect platform architecture ARCH=$(uname -m) if [ -n "$BUILD_PLATFORM" ]; then - PLATFORM="--platform $BUILD_PLATFORM" + CLI_ARGS+=("--platform $BUILD_PLATFORM") elif [ "$ARCH" = "arm64" ] || [ "$ARCH" = "aarch64" ]; then - PLATFORM="--platform linux/arm64" + CLI_ARGS+=("--platform" "linux/arm64") elif [ "$ARCH" = 
"x86_64" ]; then - PLATFORM="--platform linux/amd64" + CLI_ARGS+=("--platform" "linux/amd64") else echo "Unsupported architecture: $ARCH" exit 1 @@ -238,8 +248,13 @@ fi echo "PWD: $(pwd)" echo "Containerfile: $TEMP_DIR/Containerfile" set -x -$CONTAINER_BINARY build $CONTAINER_OPTS $PLATFORM -t $image_tag \ - -f "$TEMP_DIR/Containerfile" "." $mounts --progress=plain + +$CONTAINER_BINARY build \ + "${CLI_ARGS[@]}" \ + -t "$image_tag" \ + -f "$TEMP_DIR/Containerfile" \ + "." \ + --progress=plain # clean up tmp/configs set +x From 83dc8fbdffdc673dcdd6392ea9f1138fd0a9f412 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Fri, 28 Feb 2025 12:02:36 -0600 Subject: [PATCH 14/22] test: cleanup embedding model test suite (#1322) # What does this PR do? - skip media tests for models that do not support media - skip output_dimension tests for models that do not support it - skip task_type tests for models that do not support it - provide task_type for models that require it ## Test Plan `LLAMA_STACK_BASE_URL=http://localhost:8321 pytest -v tests/client-sdk/inference/test_embedding.py --embedding-model ...` --- tests/client-sdk/inference/test_embedding.py | 65 ++++++++++++++++---- 1 file changed, 54 insertions(+), 11 deletions(-) diff --git a/tests/client-sdk/inference/test_embedding.py b/tests/client-sdk/inference/test_embedding.py index 69d35d05d..075f927f7 100644 --- a/tests/client-sdk/inference/test_embedding.py +++ b/tests/client-sdk/inference/test_embedding.py @@ -76,6 +76,25 @@ DUMMY_IMAGE_URL = ImageContentItem( ) DUMMY_IMAGE_BASE64 = ImageContentItem(image=ImageContentItemImage(data="base64string"), type="image") SUPPORTED_PROVIDERS = {"remote::nvidia"} +MODELS_SUPPORTING_MEDIA = {} +MODELS_SUPPORTING_OUTPUT_DIMENSION = {"nvidia/llama-3.2-nv-embedqa-1b-v2"} +MODELS_REQUIRING_TASK_TYPE = { + "nvidia/llama-3.2-nv-embedqa-1b-v2", + "nvidia/nv-embedqa-e5-v5", + "nvidia/nv-embedqa-mistral-7b-v2", + "snowflake/arctic-embed-l", +} +MODELS_SUPPORTING_TASK_TYPE = MODELS_REQUIRING_TASK_TYPE + + +def default_task_type(model_id): + """ + Some models require a task type parameter. This provides a default value for + testing those models. 
+ """ + if model_id in MODELS_REQUIRING_TASK_TYPE: + return {"task_type": "query"} + return {} @pytest.mark.parametrize( @@ -92,7 +111,9 @@ SUPPORTED_PROVIDERS = {"remote::nvidia"} def test_embedding_text(llama_stack_client, embedding_model_id, contents, inference_provider_type): if inference_provider_type not in SUPPORTED_PROVIDERS: pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet") - response = llama_stack_client.inference.embeddings(model_id=embedding_model_id, contents=contents) + response = llama_stack_client.inference.embeddings( + model_id=embedding_model_id, contents=contents, **default_task_type(embedding_model_id) + ) assert isinstance(response, EmbeddingsResponse) assert len(response.embeddings) == sum(len(content) if isinstance(content, list) else 1 for content in contents) assert isinstance(response.embeddings[0], list) @@ -110,11 +131,14 @@ def test_embedding_text(llama_stack_client, embedding_model_id, contents, infere "list[url,string,base64,text]", ], ) -@pytest.mark.xfail(reason="Media is not supported") def test_embedding_image(llama_stack_client, embedding_model_id, contents, inference_provider_type): if inference_provider_type not in SUPPORTED_PROVIDERS: pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet") - response = llama_stack_client.inference.embeddings(model_id=embedding_model_id, contents=contents) + if embedding_model_id not in MODELS_SUPPORTING_MEDIA: + pytest.xfail(f"{embedding_model_id} doesn't support media") + response = llama_stack_client.inference.embeddings( + model_id=embedding_model_id, contents=contents, **default_task_type(embedding_model_id) + ) assert isinstance(response, EmbeddingsResponse) assert len(response.embeddings) == sum(len(content) if isinstance(content, list) else 1 for content in contents) assert isinstance(response.embeddings[0], list) @@ -145,7 +169,10 @@ def test_embedding_truncation( if inference_provider_type not in SUPPORTED_PROVIDERS: pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet") response = llama_stack_client.inference.embeddings( - model_id=embedding_model_id, contents=contents, text_truncation=text_truncation + model_id=embedding_model_id, + contents=contents, + text_truncation=text_truncation, + **default_task_type(embedding_model_id), ) assert isinstance(response, EmbeddingsResponse) assert len(response.embeddings) == 1 @@ -178,26 +205,36 @@ def test_embedding_truncation_error( pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet") with pytest.raises(BadRequestError): llama_stack_client.inference.embeddings( - model_id=embedding_model_id, contents=[DUMMY_LONG_TEXT], text_truncation=text_truncation + model_id=embedding_model_id, + contents=[DUMMY_LONG_TEXT], + text_truncation=text_truncation, + **default_task_type(embedding_model_id), ) -@pytest.mark.xfail(reason="Only valid for model supporting dimension reduction") def test_embedding_output_dimension(llama_stack_client, embedding_model_id, inference_provider_type): if inference_provider_type not in SUPPORTED_PROVIDERS: pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet") - base_response = llama_stack_client.inference.embeddings(model_id=embedding_model_id, contents=[DUMMY_STRING]) + if embedding_model_id not in MODELS_SUPPORTING_OUTPUT_DIMENSION: + pytest.xfail(f"{embedding_model_id} doesn't support output_dimension") + base_response = llama_stack_client.inference.embeddings( + model_id=embedding_model_id, contents=[DUMMY_STRING], 
**default_task_type(embedding_model_id) + ) test_response = llama_stack_client.inference.embeddings( - model_id=embedding_model_id, contents=[DUMMY_STRING], output_dimension=32 + model_id=embedding_model_id, + contents=[DUMMY_STRING], + **default_task_type(embedding_model_id), + output_dimension=32, ) assert len(base_response.embeddings[0]) != len(test_response.embeddings[0]) assert len(test_response.embeddings[0]) == 32 -@pytest.mark.xfail(reason="Only valid for model supporting task type") def test_embedding_task_type(llama_stack_client, embedding_model_id, inference_provider_type): if inference_provider_type not in SUPPORTED_PROVIDERS: pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet") + if embedding_model_id not in MODELS_SUPPORTING_TASK_TYPE: + pytest.xfail(f"{embedding_model_id} doesn't support task_type") query_embedding = llama_stack_client.inference.embeddings( model_id=embedding_model_id, contents=[DUMMY_STRING], task_type="query" ) @@ -220,7 +257,10 @@ def test_embedding_text_truncation(llama_stack_client, embedding_model_id, text_ if inference_provider_type not in SUPPORTED_PROVIDERS: pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet") response = llama_stack_client.inference.embeddings( - model_id=embedding_model_id, contents=[DUMMY_STRING], text_truncation=text_truncation + model_id=embedding_model_id, + contents=[DUMMY_STRING], + text_truncation=text_truncation, + **default_task_type(embedding_model_id), ) assert isinstance(response, EmbeddingsResponse) assert len(response.embeddings) == 1 @@ -245,5 +285,8 @@ def test_embedding_text_truncation_error( pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet") with pytest.raises(BadRequestError): llama_stack_client.inference.embeddings( - model_id=embedding_model_id, contents=[DUMMY_STRING], text_truncation=text_truncation + model_id=embedding_model_id, + contents=[DUMMY_STRING], + text_truncation=text_truncation, + **default_task_type(embedding_model_id), ) From 5366dab31e3dfecab455a9a6c5f55cc18c7c7ae6 Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Sat, 1 Mar 2025 02:03:45 +0800 Subject: [PATCH 15/22] docs: update build doc (#1262) # What does this PR do? [Provide a short summary of what this PR does and why. Link to relevant issues if applicable.] https://github.com/meta-llama/llama-stack/blob/55eb257459f5f891d7e570740e816eed950131b3/llama_stack/cli/stack/run.py#L22 https://github.com/meta-llama/llama-stack/blob/55eb257459f5f891d7e570740e816eed950131b3/llama_stack/cli/stack/_build.py#L103 [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan [Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*] [//]: # (## Documentation) Signed-off-by: reidliu Co-authored-by: reidliu --- docs/source/distributions/building_distro.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/distributions/building_distro.md b/docs/source/distributions/building_distro.md index 9cb1a402f..20a835201 100644 --- a/docs/source/distributions/building_distro.md +++ b/docs/source/distributions/building_distro.md @@ -106,7 +106,7 @@ It would be best to start with a template and understand the structure of the co llama stack build > Enter a name for your Llama Stack (e.g. 
my-local-stack): my-stack -> Enter the image type you want your Llama Stack to be built as (container or conda): conda +> Enter the image type you want your Llama Stack to be built as (container or conda or venv): conda Llama Stack is composed of several APIs working together. Let's select the provider types (implementations) you want to use for these APIs. @@ -187,7 +187,7 @@ usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--disable-i [--tls-certfile TLS_CERTFILE] [--image-type {conda,container,venv}] config -start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution. +Start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution. positional arguments: config Path to config file to use for the run From ea4f13cc209e1222aadfab52224a48f687a6d483 Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Sat, 1 Mar 2025 02:07:24 +0800 Subject: [PATCH 16/22] chore: add container cmd check (#1306) # What does this PR do? [Provide a short summary of what this PR does and why. Link to relevant issues if applicable.] [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan [Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*] [//]: # (## Documentation) Signed-off-by: reidliu Co-authored-by: reidliu --- llama_stack/distribution/build_container.sh | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/llama_stack/distribution/build_container.sh b/llama_stack/distribution/build_container.sh index 9b584a85c..68f8a0863 100755 --- a/llama_stack/distribution/build_container.sh +++ b/llama_stack/distribution/build_container.sh @@ -48,6 +48,9 @@ CONTAINER_OPTS=${CONTAINER_OPTS:-} TEMP_DIR=$(mktemp -d) +SCRIPT_DIR=$(dirname "$(readlink -f "$0")") +source "$SCRIPT_DIR/common.sh" + add_to_container() { output_file="$TEMP_DIR/Containerfile" if [ -t 0 ]; then @@ -58,6 +61,12 @@ add_to_container() { fi } +# Check if container command is available +if ! is_command_available $CONTAINER_BINARY; then + printf "${RED}Error: ${CONTAINER_BINARY} command not found. Is ${CONTAINER_BINARY} installed and in your PATH?${NC}" >&2 + exit 1 +fi + # Update and install UBI9 components if UBI9 base image is used if [[ $container_base == *"registry.access.redhat.com/ubi9"* ]]; then add_to_container << EOF @@ -212,7 +221,7 @@ if [ "$USE_COPY_NOT_MOUNT" != "true" ]; then fi fi -if command -v selinuxenabled &>/dev/null && selinuxenabled; then +if is_command_available selinuxenabled && selinuxenabled; then # Disable SELinux labels -- we don't want to relabel the llama-stack source dir CLI_ARGS+=("--security-opt" "label=disable") fi From 14c442f177591ab336414f0017d4da3d1e20a088 Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Sat, 1 Mar 2025 02:08:05 +0800 Subject: [PATCH 17/22] chore: update cmd check (#1293) # What does this PR do? [Provide a short summary of what this PR does and why. Link to relevant issues if applicable.] [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan [Describe the tests you ran to verify your changes with result summaries. 
*Provide clear instructions so the plan can be easily re-executed.*] [//]: # (## Documentation) Signed-off-by: reidliu Co-authored-by: reidliu --- llama_stack/distribution/build_conda_env.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_stack/distribution/build_conda_env.sh b/llama_stack/distribution/build_conda_env.sh index 31b3e1b21..1eac2ee08 100755 --- a/llama_stack/distribution/build_conda_env.sh +++ b/llama_stack/distribution/build_conda_env.sh @@ -52,7 +52,7 @@ ensure_conda_env_python310() { local python_version="3.10" # Check if conda command is available - if ! command -v conda &>/dev/null; then + if ! is_command_available conda; then printf "${RED}Error: conda command not found. Is Conda installed and in your PATH?${NC}" >&2 exit 1 fi From 66cd128ab51aff0b649c8ae59d7ec139a54913c1 Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Sat, 1 Mar 2025 02:10:12 +0800 Subject: [PATCH 18/22] docs: update the downloaded list doc (#1266) # What does this PR do? [Provide a short summary of what this PR does and why. Link to relevant issues if applicable.] Since released the `--downloaded` option, so update the related documents. [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan [Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*] [//]: # (## Documentation) Signed-off-by: reidliu Co-authored-by: reidliu --- .../self_hosted_distro/meta-reference-gpu.md | 27 +++++++++++++--- .../meta-reference-quantized-gpu.md | 27 +++++++++++++--- .../llama_cli_reference/download_models.md | 32 +++++++++++++++++++ .../references/llama_cli_reference/index.md | 32 +++++++++++++++++++ .../meta-reference-gpu/doc_template.md | 27 +++++++++++++--- .../doc_template.md | 27 +++++++++++++--- 6 files changed, 156 insertions(+), 16 deletions(-) diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md index b183757db..b8d1b1714 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md @@ -41,12 +41,31 @@ The following environment variables can be configured: ## Prerequisite: Downloading Models -Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. +Please use `llama model list --downloaded` to check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. 
``` -$ ls ~/.llama/checkpoints -Llama3.1-8B Llama3.2-11B-Vision-Instruct Llama3.2-1B-Instruct Llama3.2-90B-Vision-Instruct Llama-Guard-3-8B -Llama3.1-8B-Instruct Llama3.2-1B Llama3.2-3B-Instruct Llama-Guard-3-1B Prompt-Guard-86M +$ llama model list --downloaded +┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓ +┃ Model ┃ Size ┃ Modified Time ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩ +│ Llama3.2-1B-Instruct:int4-qlora-eo8 │ 1.53 GB │ 2025-02-26 11:22:28 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.2-1B │ 2.31 GB │ 2025-02-18 21:48:52 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Prompt-Guard-86M │ 0.02 GB │ 2025-02-26 11:29:28 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB │ 2025-02-26 11:37:41 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.2-3B │ 5.99 GB │ 2025-02-18 21:51:26 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.1-8B │ 14.97 GB │ 2025-02-16 10:36:37 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB │ 2025-02-26 11:35:02 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama-Guard-3-1B │ 2.80 GB │ 2025-02-26 11:20:46 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama-Guard-3-1B:int4 │ 0.43 GB │ 2025-02-26 11:33:33 │ +└─────────────────────────────────────────┴──────────┴─────────────────────┘ ``` ## Running the Distribution diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md index 9aeb7a88b..a49175e22 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md @@ -41,12 +41,31 @@ The following environment variables can be configured: ## Prerequisite: Downloading Models -Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. +Please use `llama model list --downloaded` to check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. 
``` -$ ls ~/.llama/checkpoints -Llama3.1-8B Llama3.2-11B-Vision-Instruct Llama3.2-1B-Instruct Llama3.2-90B-Vision-Instruct Llama-Guard-3-8B -Llama3.1-8B-Instruct Llama3.2-1B Llama3.2-3B-Instruct Llama-Guard-3-1B Prompt-Guard-86M +$ llama model list --downloaded +┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓ +┃ Model ┃ Size ┃ Modified Time ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩ +│ Llama3.2-1B-Instruct:int4-qlora-eo8 │ 1.53 GB │ 2025-02-26 11:22:28 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.2-1B │ 2.31 GB │ 2025-02-18 21:48:52 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Prompt-Guard-86M │ 0.02 GB │ 2025-02-26 11:29:28 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB │ 2025-02-26 11:37:41 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.2-3B │ 5.99 GB │ 2025-02-18 21:51:26 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.1-8B │ 14.97 GB │ 2025-02-16 10:36:37 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB │ 2025-02-26 11:35:02 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama-Guard-3-1B │ 2.80 GB │ 2025-02-26 11:20:46 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama-Guard-3-1B:int4 │ 0.43 GB │ 2025-02-26 11:33:33 │ +└─────────────────────────────────────────┴──────────┴─────────────────────┘ ``` ## Running the Distribution diff --git a/docs/source/references/llama_cli_reference/download_models.md b/docs/source/references/llama_cli_reference/download_models.md index 6c791bcb7..ca470f8c2 100644 --- a/docs/source/references/llama_cli_reference/download_models.md +++ b/docs/source/references/llama_cli_reference/download_models.md @@ -129,3 +129,35 @@ llama download --source huggingface --model-id Prompt-Guard-86M --ignore-pattern **Important:** Set your environment variable `HF_TOKEN` or pass in `--hf-token` to the command to validate your access. You can find your token at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens). > **Tip:** Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored. 
+ +## List the downloaded models + +To list the downloaded models with the following command: +``` +llama model list --downloaded +``` + +You should see a table like this: +``` +┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓ +┃ Model ┃ Size ┃ Modified Time ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩ +│ Llama3.2-1B-Instruct:int4-qlora-eo8 │ 1.53 GB │ 2025-02-26 11:22:28 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.2-1B │ 2.31 GB │ 2025-02-18 21:48:52 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Prompt-Guard-86M │ 0.02 GB │ 2025-02-26 11:29:28 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB │ 2025-02-26 11:37:41 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.2-3B │ 5.99 GB │ 2025-02-18 21:51:26 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.1-8B │ 14.97 GB │ 2025-02-16 10:36:37 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB │ 2025-02-26 11:35:02 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama-Guard-3-1B │ 2.80 GB │ 2025-02-26 11:20:46 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama-Guard-3-1B:int4 │ 0.43 GB │ 2025-02-26 11:33:33 │ +└─────────────────────────────────────────┴──────────┴─────────────────────┘ +``` diff --git a/docs/source/references/llama_cli_reference/index.md b/docs/source/references/llama_cli_reference/index.md index a43666963..8a38fc3ae 100644 --- a/docs/source/references/llama_cli_reference/index.md +++ b/docs/source/references/llama_cli_reference/index.md @@ -154,6 +154,38 @@ llama download --source huggingface --model-id Prompt-Guard-86M --ignore-pattern > **Tip:** Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored. 
+## List the downloaded models + +To list the downloaded models with the following command: +``` +llama model list --downloaded +``` + +You should see a table like this: +``` +┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓ +┃ Model ┃ Size ┃ Modified Time ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩ +│ Llama3.2-1B-Instruct:int4-qlora-eo8 │ 1.53 GB │ 2025-02-26 11:22:28 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.2-1B │ 2.31 GB │ 2025-02-18 21:48:52 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Prompt-Guard-86M │ 0.02 GB │ 2025-02-26 11:29:28 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB │ 2025-02-26 11:37:41 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.2-3B │ 5.99 GB │ 2025-02-18 21:51:26 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.1-8B │ 14.97 GB │ 2025-02-16 10:36:37 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB │ 2025-02-26 11:35:02 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama-Guard-3-1B │ 2.80 GB │ 2025-02-26 11:20:46 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama-Guard-3-1B:int4 │ 0.43 GB │ 2025-02-26 11:33:33 │ +└─────────────────────────────────────────┴──────────┴─────────────────────┘ +``` + ## Understand the models The `llama model` command helps you explore the model’s interface. diff --git a/llama_stack/templates/meta-reference-gpu/doc_template.md b/llama_stack/templates/meta-reference-gpu/doc_template.md index 60556a6f3..87438fb6d 100644 --- a/llama_stack/templates/meta-reference-gpu/doc_template.md +++ b/llama_stack/templates/meta-reference-gpu/doc_template.md @@ -29,12 +29,31 @@ The following environment variables can be configured: ## Prerequisite: Downloading Models -Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. +Please use `llama model list --downloaded` to check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. 
``` -$ ls ~/.llama/checkpoints -Llama3.1-8B Llama3.2-11B-Vision-Instruct Llama3.2-1B-Instruct Llama3.2-90B-Vision-Instruct Llama-Guard-3-8B -Llama3.1-8B-Instruct Llama3.2-1B Llama3.2-3B-Instruct Llama-Guard-3-1B Prompt-Guard-86M +$ llama model list --downloaded +┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓ +┃ Model ┃ Size ┃ Modified Time ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩ +│ Llama3.2-1B-Instruct:int4-qlora-eo8 │ 1.53 GB │ 2025-02-26 11:22:28 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.2-1B │ 2.31 GB │ 2025-02-18 21:48:52 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Prompt-Guard-86M │ 0.02 GB │ 2025-02-26 11:29:28 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB │ 2025-02-26 11:37:41 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.2-3B │ 5.99 GB │ 2025-02-18 21:51:26 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.1-8B │ 14.97 GB │ 2025-02-16 10:36:37 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB │ 2025-02-26 11:35:02 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama-Guard-3-1B │ 2.80 GB │ 2025-02-26 11:20:46 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama-Guard-3-1B:int4 │ 0.43 GB │ 2025-02-26 11:33:33 │ +└─────────────────────────────────────────┴──────────┴─────────────────────┘ ``` ## Running the Distribution diff --git a/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md b/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md index 2b117120c..e8dfaaf3c 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md +++ b/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md @@ -31,12 +31,31 @@ The following environment variables can be configured: ## Prerequisite: Downloading Models -Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. +Please use `llama model list --downloaded` to check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. 
``` -$ ls ~/.llama/checkpoints -Llama3.1-8B Llama3.2-11B-Vision-Instruct Llama3.2-1B-Instruct Llama3.2-90B-Vision-Instruct Llama-Guard-3-8B -Llama3.1-8B-Instruct Llama3.2-1B Llama3.2-3B-Instruct Llama-Guard-3-1B Prompt-Guard-86M +$ llama model list --downloaded +┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓ +┃ Model ┃ Size ┃ Modified Time ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩ +│ Llama3.2-1B-Instruct:int4-qlora-eo8 │ 1.53 GB │ 2025-02-26 11:22:28 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.2-1B │ 2.31 GB │ 2025-02-18 21:48:52 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Prompt-Guard-86M │ 0.02 GB │ 2025-02-26 11:29:28 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB │ 2025-02-26 11:37:41 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.2-3B │ 5.99 GB │ 2025-02-18 21:51:26 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.1-8B │ 14.97 GB │ 2025-02-16 10:36:37 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB │ 2025-02-26 11:35:02 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama-Guard-3-1B │ 2.80 GB │ 2025-02-26 11:20:46 │ +├─────────────────────────────────────────┼──────────┼─────────────────────┤ +│ Llama-Guard-3-1B:int4 │ 0.43 GB │ 2025-02-26 11:33:33 │ +└─────────────────────────────────────────┴──────────┴─────────────────────┘ ``` ## Running the Distribution From 6520baebed13c1cbf4227f84d0dcd6e77bcf9ba7 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Fri, 28 Feb 2025 11:10:45 -0800 Subject: [PATCH 19/22] fix: replace eval with json decoding (#1327) # What does this PR do? - Using `eval` on server is a security risk - Replace `eval` with `json.loads` [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan ``` pytest -v -s --nbval-lax ./llama-stack/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb ``` image [//]: # (## Documentation) --- .../providers/inline/eval/meta_reference/eval.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index 48157b018..a01f7f1f3 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -3,6 +3,7 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +import json from typing import Any, Dict, List, Optional from tqdm import tqdm @@ -116,7 +117,7 @@ class MetaReferenceEvalImpl( generations = [] for i, x in tqdm(enumerate(input_rows)): assert ColumnName.chat_completion_input.value in x, "Invalid input row" - input_messages = eval(str(x[ColumnName.chat_completion_input.value])) + input_messages = json.loads(x[ColumnName.chat_completion_input.value]) input_messages = [UserMessage(**x) for x in input_messages] # NOTE: only single-turn agent generation is supported. 
Create a new session for each input row @@ -158,7 +159,7 @@ class MetaReferenceEvalImpl( generations = [] for x in tqdm(input_rows): if ColumnName.completion_input.value in x: - input_content = eval(str(x[ColumnName.completion_input.value])) + input_content = json.loads(x[ColumnName.completion_input.value]) response = await self.inference_api.completion( model=candidate.model, content=input_content, @@ -166,9 +167,8 @@ class MetaReferenceEvalImpl( ) generations.append({ColumnName.generated_answer.value: response.completion_message.content}) elif ColumnName.chat_completion_input.value in x: - chat_completion_input_str = str(x[ColumnName.chat_completion_input.value]) - input_messages = eval(chat_completion_input_str) - input_messages = [UserMessage(**x) for x in input_messages] + chat_completion_input_json = json.loads(x[ColumnName.chat_completion_input.value]) + input_messages = [UserMessage(**x) for x in chat_completion_input_json] messages = [] if candidate.system_message: messages.append(candidate.system_message) From 5547ef953c304858d80b1ffa6b0f8226c3aad497 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Fri, 28 Feb 2025 11:16:12 -0800 Subject: [PATCH 20/22] feat: enhance OpenAPI spec to include Error types (#1320) # What does this PR do? An API spec must talk about Error handling. This was a pretty glaring omission so far. This PR begins to address it by adding a set of standard error responses we can attach to all our API calls. At a future point, we can add specific error types where necessary (although we should not hurry to do that; it is best done very late.) ## Test Plan Checked that Stainless SDK generation succeeds. --- docs/_static/llama-stack-spec.html | 1076 ++++++++++++++++- docs/_static/llama-stack-spec.yaml | 894 +++++++++++++- docs/openapi_generator/generate.py | 1 + docs/openapi_generator/pyopenapi/generator.py | 82 ++ docs/openapi_generator/pyopenapi/options.py | 2 + llama_stack/apis/datatypes.py | 20 + 6 files changed, 2073 insertions(+), 2 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 2a9f4b6f7..6b98cad90 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -52,6 +52,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -97,6 +109,18 @@ "responses": { "200": { "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -128,6 +152,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -159,6 +195,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -183,6 +231,18 @@ 
"responses": { "200": { "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -219,6 +279,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -255,6 +327,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -286,6 +370,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -317,6 +413,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -362,6 +470,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -410,6 +530,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -438,6 +570,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -462,6 +606,18 @@ "responses": { "200": { "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -492,6 +648,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -532,6 +700,18 @@ "responses": { "200": { "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": 
"#/components/responses/DefaultError" } }, "tags": [ @@ -570,6 +750,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -608,6 +800,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -648,6 +852,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -679,6 +895,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -719,6 +947,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -773,6 +1013,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -826,6 +1078,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -863,6 +1127,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -884,6 +1160,18 @@ "responses": { "200": { "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -921,6 +1209,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -942,6 +1242,18 @@ "responses": { "200": { "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": 
"#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -979,6 +1291,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1016,6 +1340,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1046,6 +1382,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1084,6 +1432,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1124,6 +1484,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1154,6 +1526,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1175,6 +1559,18 @@ "responses": { "200": { "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1205,6 +1601,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1242,6 +1650,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1279,6 +1699,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1309,6 +1741,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + 
"500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1337,6 +1781,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1373,6 +1829,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1422,6 +1890,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1443,6 +1923,18 @@ "responses": { "200": { "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1473,6 +1965,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1487,6 +1991,18 @@ "responses": { "200": { "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1511,6 +2027,18 @@ "responses": { "200": { "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1542,6 +2070,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1580,6 +2120,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1609,6 +2161,18 @@ "responses": { "200": { "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1647,6 +2211,18 @@ } } } + }, + "400": 
{ + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1685,6 +2261,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1697,6 +2285,18 @@ "responses": { "200": { "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1728,6 +2328,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1740,6 +2352,18 @@ "responses": { "200": { "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1771,6 +2395,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1802,6 +2438,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1821,6 +2469,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1852,6 +2512,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1873,6 +2545,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1894,6 +2578,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, 
"tags": [ @@ -1932,6 +2628,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1944,6 +2652,18 @@ "responses": { "200": { "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1975,6 +2695,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -1994,6 +2726,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -2025,6 +2769,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -2037,6 +2793,18 @@ "responses": { "200": { "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -2068,6 +2836,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -2098,6 +2878,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -2117,6 +2909,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -2141,6 +2945,18 @@ "responses": { "200": { "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -2172,6 +2988,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": 
"#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -2203,6 +3031,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -2234,6 +3074,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -2265,6 +3117,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -2296,6 +3160,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -2332,6 +3208,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -2391,6 +3279,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -2431,6 +3331,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -2455,6 +3367,18 @@ "responses": { "200": { "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -2486,6 +3410,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -2517,6 +3453,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -2548,6 +3496,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + 
"500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -2579,6 +3539,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -2610,6 +3582,18 @@ } } } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" } }, "tags": [ @@ -2623,6 +3607,35 @@ "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema", "components": { "schemas": { + "Error": { + "type": "object", + "properties": { + "status": { + "type": "integer", + "description": "HTTP status code" + }, + "title": { + "type": "string", + "description": "Error title, a short summary of the error which is invariant for an error type" + }, + "detail": { + "type": "string", + "description": "Error detail, a longer human-readable description of the error" + }, + "instance": { + "type": "string", + "description": "(Optional) A URL which can be used to retrieve more information about the specific occurrence of the error" + } + }, + "additionalProperties": false, + "required": [ + "status", + "title", + "detail" + ], + "title": "Error", + "description": "Error response from the API. Roughly follows RFC 7807." + }, "AppendRowsRequest": { "type": "object", "properties": { @@ -8741,7 +9754,68 @@ "title": "VersionInfo" } }, - "responses": {} + "responses": { + "BadRequest400": { + "description": "The request was invalid or malformed", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Error" + }, + "example": { + "status": 400, + "title": "Bad Request", + "detail": "The request was invalid or malformed" + } + } + } + }, + "TooManyRequests429": { + "description": "The client has sent too many requests in a given amount of time", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Error" + }, + "example": { + "status": 429, + "title": "Too Many Requests", + "detail": "You have exceeded the rate limit. Please try again later." + } + } + } + }, + "InternalServerError500": { + "description": "The server encountered an unexpected error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Error" + }, + "example": { + "status": 500, + "title": "Internal Server Error", + "detail": "An unexpected error occurred. Our team has been notified." 
+ } + } + } + }, + "DefaultError": { + "description": "An unexpected error occurred", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Error" + }, + "example": { + "status": 0, + "title": "Error", + "detail": "An unexpected error occurred" + } + } + } + } + } }, "security": [ { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index a2329e47a..13f7edc4b 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -19,6 +19,16 @@ paths: application/json: schema: $ref: '#/components/schemas/PaginatedRowsResult' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - DatasetIO description: '' @@ -47,6 +57,16 @@ paths: responses: '200': description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - DatasetIO description: '' @@ -66,6 +86,16 @@ paths: application/json: schema: $ref: '#/components/schemas/BatchChatCompletionResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - BatchInference (Coming Soon) description: '' @@ -85,6 +115,16 @@ paths: application/json: schema: $ref: '#/components/schemas/BatchCompletionResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - BatchInference (Coming Soon) description: '' @@ -100,6 +140,16 @@ paths: responses: '200': description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - PostTraining (Coming Soon) description: '' @@ -124,6 +174,16 @@ paths: text/event-stream: schema: $ref: '#/components/schemas/ChatCompletionResponseStreamChunk' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Inference description: >- @@ -149,6 +209,16 @@ paths: text/event-stream: schema: $ref: '#/components/schemas/CompletionResponseStreamChunk' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Inference description: >- @@ -169,6 +239,16 @@ paths: application/json: schema: $ref: '#/components/schemas/AgentCreateResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - 
Agents description: '' @@ -188,6 +268,16 @@ paths: application/json: schema: $ref: '#/components/schemas/AgentSessionCreateResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Agents description: '' @@ -217,6 +307,16 @@ paths: text/event-stream: schema: $ref: '#/components/schemas/AgentTurnResponseStreamChunk' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Agents description: '' @@ -246,6 +346,16 @@ paths: application/json: schema: $ref: '#/components/schemas/ListBucketResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Files (Coming Soon) description: List all buckets. @@ -263,6 +373,16 @@ paths: application/json: schema: $ref: '#/components/schemas/FileUploadResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Files (Coming Soon) description: >- @@ -279,6 +399,16 @@ paths: responses: '200': description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Agents description: '' @@ -297,6 +427,16 @@ paths: application/json: schema: $ref: '#/components/schemas/Session' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Agents description: '' @@ -322,6 +462,16 @@ paths: responses: '200': description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Agents description: '' @@ -345,6 +495,16 @@ paths: application/json: schema: $ref: '#/components/schemas/FileResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Files (Coming Soon) description: >- @@ -371,6 +531,16 @@ paths: application/json: schema: $ref: '#/components/schemas/FileResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Files (Coming Soon) description: >- @@ -401,6 +571,16 @@ paths: application/json: schema: $ref: '#/components/schemas/EmbeddingsResponse' + '400': + $ref: 
'#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Inference description: >- @@ -421,6 +601,16 @@ paths: application/json: schema: $ref: '#/components/schemas/EvaluateResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Eval description: '' @@ -445,6 +635,16 @@ paths: application/json: schema: $ref: '#/components/schemas/AgentStepResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Agents description: '' @@ -478,6 +678,16 @@ paths: application/json: schema: $ref: '#/components/schemas/Turn' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Agents description: '' @@ -508,6 +718,16 @@ paths: oneOf: - $ref: '#/components/schemas/Benchmark' - type: 'null' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Benchmarks description: '' @@ -528,6 +748,16 @@ paths: oneOf: - $ref: '#/components/schemas/Dataset' - type: 'null' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Datasets description: '' @@ -541,6 +771,16 @@ paths: responses: '200': description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Datasets description: '' @@ -561,6 +801,16 @@ paths: oneOf: - $ref: '#/components/schemas/Model' - type: 'null' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Models description: '' @@ -574,6 +824,16 @@ paths: responses: '200': description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Models description: '' @@ -594,6 +854,16 @@ paths: oneOf: - $ref: '#/components/schemas/ScoringFn' - type: 'null' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - ScoringFunctions description: '' @@ -614,6 +884,16 @@ 
paths: oneOf: - $ref: '#/components/schemas/Shield' - type: 'null' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Shields description: '' @@ -632,6 +912,16 @@ paths: application/json: schema: $ref: '#/components/schemas/Span' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Telemetry description: '' @@ -655,6 +945,16 @@ paths: application/json: schema: $ref: '#/components/schemas/QuerySpanTreeResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Telemetry description: '' @@ -679,6 +979,16 @@ paths: application/json: schema: $ref: '#/components/schemas/Tool' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - ToolGroups description: '' @@ -697,6 +1007,16 @@ paths: application/json: schema: $ref: '#/components/schemas/ToolGroup' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - ToolGroups description: '' @@ -710,6 +1030,16 @@ paths: responses: '200': description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - ToolGroups description: Unregister a tool group @@ -728,6 +1058,16 @@ paths: application/json: schema: $ref: '#/components/schemas/Trace' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Telemetry description: '' @@ -748,6 +1088,16 @@ paths: oneOf: - $ref: '#/components/schemas/PostTrainingJobArtifactsResponse' - type: 'null' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - PostTraining (Coming Soon) description: '' @@ -768,6 +1118,16 @@ paths: oneOf: - $ref: '#/components/schemas/PostTrainingJobStatusResponse' - type: 'null' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - PostTraining (Coming Soon) description: '' @@ -786,6 +1146,16 @@ paths: application/json: schema: $ref: '#/components/schemas/ListPostTrainingJobsResponse' + '400': + $ref: 
'#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - PostTraining (Coming Soon) description: '' @@ -801,6 +1171,16 @@ paths: oneOf: - $ref: '#/components/schemas/FileUploadResponse' - type: 'null' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Files (Coming Soon) description: >- @@ -822,6 +1202,16 @@ paths: oneOf: - $ref: '#/components/schemas/FileResponse' - type: 'null' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Files (Coming Soon) description: >- @@ -852,6 +1242,16 @@ paths: oneOf: - $ref: '#/components/schemas/VectorDB' - type: 'null' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - VectorDBs description: '' @@ -865,6 +1265,16 @@ paths: responses: '200': description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - VectorDBs description: '' @@ -883,6 +1293,16 @@ paths: application/json: schema: $ref: '#/components/schemas/HealthInfo' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Inspect description: '' @@ -892,6 +1312,16 @@ paths: responses: '200': description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - ToolRuntime description: >- @@ -908,6 +1338,16 @@ paths: responses: '200': description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - VectorIO description: '' @@ -927,6 +1367,16 @@ paths: application/json: schema: $ref: '#/components/schemas/ToolInvocationResult' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - ToolRuntime description: Run a tool with the given arguments @@ -948,6 +1398,16 @@ paths: oneOf: - $ref: '#/components/schemas/JobStatus' - type: 'null' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: 
'#/components/responses/DefaultError' tags: - Eval description: '' @@ -966,6 +1426,16 @@ paths: responses: '200': description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Eval description: '' @@ -989,6 +1459,16 @@ paths: application/json: schema: $ref: '#/components/schemas/EvaluateResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Eval description: '' @@ -1012,6 +1492,16 @@ paths: application/json: schema: $ref: '#/components/schemas/ListBenchmarksResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Benchmarks description: '' @@ -1020,6 +1510,16 @@ paths: responses: '200': description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Benchmarks description: '' @@ -1039,6 +1539,16 @@ paths: application/json: schema: $ref: '#/components/schemas/ListDatasetsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Datasets description: '' @@ -1047,6 +1557,16 @@ paths: responses: '200': description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Datasets description: '' @@ -1066,6 +1586,16 @@ paths: application/json: schema: $ref: '#/components/schemas/ListFileResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Files (Coming Soon) description: List all files in a bucket. 
@@ -1085,6 +1615,16 @@ paths: application/json: schema: $ref: '#/components/schemas/ListModelsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Models description: '' @@ -1097,6 +1637,16 @@ paths: application/json: schema: $ref: '#/components/schemas/Model' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Models description: '' @@ -1116,6 +1666,16 @@ paths: application/json: schema: $ref: '#/components/schemas/ListProvidersResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Inspect description: '' @@ -1129,6 +1689,16 @@ paths: application/json: schema: $ref: '#/components/schemas/ListRoutesResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Inspect description: '' @@ -1142,6 +1712,16 @@ paths: application/jsonl: schema: $ref: '#/components/schemas/ToolDef' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - ToolRuntime description: '' @@ -1165,6 +1745,16 @@ paths: application/json: schema: $ref: '#/components/schemas/ListScoringFunctionsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - ScoringFunctions description: '' @@ -1173,6 +1763,16 @@ paths: responses: '200': description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - ScoringFunctions description: '' @@ -1192,6 +1792,16 @@ paths: application/json: schema: $ref: '#/components/schemas/ListShieldsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Shields description: '' @@ -1204,6 +1814,16 @@ paths: application/json: schema: $ref: '#/components/schemas/Shield' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Shields description: '' @@ -1223,6 +1843,16 @@ paths: application/json: schema: $ref: '#/components/schemas/ListToolGroupsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + 
'429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - ToolGroups description: List tool groups with optional provider @@ -1231,6 +1861,16 @@ paths: responses: '200': description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - ToolGroups description: Register a tool group @@ -1250,6 +1890,16 @@ paths: application/json: schema: $ref: '#/components/schemas/ListToolsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - ToolGroups description: List tools with optional tool group @@ -1268,6 +1918,16 @@ paths: application/json: schema: $ref: '#/components/schemas/ListVectorDBsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - VectorDBs description: '' @@ -1280,6 +1940,16 @@ paths: application/json: schema: $ref: '#/components/schemas/VectorDB' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - VectorDBs description: '' @@ -1295,6 +1965,16 @@ paths: responses: '200': description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Telemetry description: '' @@ -1314,6 +1994,16 @@ paths: application/json: schema: $ref: '#/components/schemas/PostTrainingJob' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - PostTraining (Coming Soon) description: '' @@ -1333,6 +2023,16 @@ paths: application/json: schema: $ref: '#/components/schemas/RAGQueryResult' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - ToolRuntime description: >- @@ -1353,6 +2053,16 @@ paths: application/json: schema: $ref: '#/components/schemas/QueryChunksResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - VectorIO description: '' @@ -1372,6 +2082,16 @@ paths: application/json: schema: $ref: '#/components/schemas/QuerySpansResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + 
#/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Telemetry description: '' @@ -1391,6 +2111,16 @@ paths: application/json: schema: $ref: '#/components/schemas/QueryTracesResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Telemetry description: '' @@ -1415,6 +2145,16 @@ paths: text/event-stream: schema: $ref: '#/components/schemas/AgentTurnResponseStreamChunk' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Agents description: >- @@ -1457,6 +2197,16 @@ paths: application/json: schema: $ref: '#/components/schemas/Job' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Eval description: '' @@ -1481,6 +2231,16 @@ paths: application/json: schema: $ref: '#/components/schemas/RunShieldResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Safety description: '' @@ -1496,6 +2256,16 @@ paths: responses: '200': description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Telemetry description: '' @@ -1515,6 +2285,16 @@ paths: application/json: schema: $ref: '#/components/schemas/ScoreResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Scoring description: '' @@ -1534,6 +2314,16 @@ paths: application/json: schema: $ref: '#/components/schemas/ScoreBatchResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Scoring description: '' @@ -1553,6 +2343,16 @@ paths: application/json: schema: $ref: '#/components/schemas/PostTrainingJob' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - PostTraining (Coming Soon) description: '' @@ -1572,6 +2372,16 @@ paths: application/json: schema: $ref: '#/components/schemas/SyntheticDataGenerationResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - SyntheticDataGeneration (Coming 
Soon) description: '' @@ -1591,6 +2401,16 @@ paths: application/json: schema: $ref: '#/components/schemas/VersionInfo' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' tags: - Inspect description: '' @@ -1599,6 +2419,34 @@ jsonSchemaDialect: >- https://json-schema.org/draft/2020-12/schema components: schemas: + Error: + type: object + properties: + status: + type: integer + description: HTTP status code + title: + type: string + description: >- + Error title, a short summary of the error which is invariant for an error + type + detail: + type: string + description: >- + Error detail, a longer human-readable description of the error + instance: + type: string + description: >- + (Optional) A URL which can be used to retrieve more information about + the specific occurrence of the error + additionalProperties: false + required: + - status + - title + - detail + title: Error + description: >- + Error response from the API. Roughly follows RFC 7807. AppendRowsRequest: type: object properties: @@ -5626,7 +6474,51 @@ components: required: - version title: VersionInfo - responses: {} + responses: + BadRequest400: + description: The request was invalid or malformed + content: + application/json: + schema: + $ref: '#/components/schemas/Error' + example: + status: 400 + title: Bad Request + detail: The request was invalid or malformed + TooManyRequests429: + description: >- + The client has sent too many requests in a given amount of time + content: + application/json: + schema: + $ref: '#/components/schemas/Error' + example: + status: 429 + title: Too Many Requests + detail: >- + You have exceeded the rate limit. Please try again later. + InternalServerError500: + description: >- + The server encountered an unexpected error + content: + application/json: + schema: + $ref: '#/components/schemas/Error' + example: + status: 500 + title: Internal Server Error + detail: >- + An unexpected error occurred. Our team has been notified. 
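All of the 400/429/500 responses added above (and the default fallback that follows) point at the same RFC 7807-style `Error` shape (`status`, `title`, `detail`, optional `instance`), which later in this patch is also exposed as a pydantic model in `llama_stack/apis/datatypes.py`. As a rough sketch of what a client could do with such a body — the `Error` class below simply mirrors that schema, and `handle_error_body` plus the sample payload are illustrative, not part of the patch:

```python
import json
from typing import Optional

from pydantic import BaseModel


class Error(BaseModel):
    """Mirror of the Error schema above: RFC 7807-ish status/title/detail/instance."""

    status: int
    title: str
    detail: str
    instance: Optional[str] = None


def handle_error_body(raw_body: str) -> None:
    # Illustrative helper: validate a JSON error body against the model,
    # then branch on the HTTP status it reports.
    err = Error(**json.loads(raw_body))
    if err.status == 429:
        print(f"rate limited: {err.detail}")  # back off and retry later
    elif err.status >= 500:
        print(f"server error: {err.title} - {err.detail}")
    else:
        print(f"request failed ({err.status}): {err.detail}")


handle_error_body(
    '{"status": 429, "title": "Too Many Requests",'
    ' "detail": "You have exceeded the rate limit. Please try again later."}'
)
```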
+ DefaultError: + description: An unexpected error occurred + content: + application/json: + schema: + $ref: '#/components/schemas/Error' + example: + status: 0 + title: Error + detail: An unexpected error occurred security: - Default: [] tags: diff --git a/docs/openapi_generator/generate.py b/docs/openapi_generator/generate.py index dcbee7d2f..a2553f905 100644 --- a/docs/openapi_generator/generate.py +++ b/docs/openapi_generator/generate.py @@ -55,6 +55,7 @@ def main(output_dir: str): a set of endpoints and their corresponding interfaces that are tailored to best leverage Llama Models.""", ), + include_standard_error_responses=True, ), ) diff --git a/docs/openapi_generator/pyopenapi/generator.py b/docs/openapi_generator/pyopenapi/generator.py index 4220cfc05..91f32e6c8 100644 --- a/docs/openapi_generator/pyopenapi/generator.py +++ b/docs/openapi_generator/pyopenapi/generator.py @@ -10,6 +10,7 @@ import typing from dataclasses import make_dataclass from typing import Any, Dict, Set, Union +from llama_stack.apis.datatypes import Error from llama_stack.strong_typing.core import JsonType from llama_stack.strong_typing.docstring import Docstring, parse_type from llama_stack.strong_typing.inspection import ( @@ -434,6 +435,75 @@ class Generator: ) self.schema_builder = SchemaBuilder(schema_generator) self.responses = {} + + # Create standard error responses + self._create_standard_error_responses() + + def _create_standard_error_responses(self) -> None: + """ + Creates standard error responses that can be reused across operations. + These will be added to the components.responses section of the OpenAPI document. + """ + # Get the Error schema + error_schema = self.schema_builder.classdef_to_ref(Error) + + # Create standard error responses + self.responses["BadRequest400"] = Response( + description="The request was invalid or malformed", + content={ + "application/json": MediaType( + schema=error_schema, + example={ + "status": 400, + "title": "Bad Request", + "detail": "The request was invalid or malformed", + } + ) + } + ) + + self.responses["TooManyRequests429"] = Response( + description="The client has sent too many requests in a given amount of time", + content={ + "application/json": MediaType( + schema=error_schema, + example={ + "status": 429, + "title": "Too Many Requests", + "detail": "You have exceeded the rate limit. Please try again later.", + } + ) + } + ) + + self.responses["InternalServerError500"] = Response( + description="The server encountered an unexpected error", + content={ + "application/json": MediaType( + schema=error_schema, + example={ + "status": 500, + "title": "Internal Server Error", + "detail": "An unexpected error occurred. 
Our team has been notified.", + } + ) + } + ) + + # Add a default error response for any unhandled error cases + self.responses["DefaultError"] = Response( + description="An unexpected error occurred", + content={ + "application/json": MediaType( + schema=error_schema, + example={ + "status": 0, + "title": "Error", + "detail": "An unexpected error occurred", + } + ) + } + ) def _build_type_tag(self, ref: str, schema: Schema) -> Tag: # Don't include schema definition in the tag description because for one, @@ -649,6 +719,18 @@ class Generator: responses.update(response_builder.build_response(response_options)) assert len(responses.keys()) > 0, f"No responses found for {op.name}" + + # Add standard error response references + if self.options.include_standard_error_responses: + if "400" not in responses: + responses["400"] = ResponseRef("BadRequest400") + if "429" not in responses: + responses["429"] = ResponseRef("TooManyRequests429") + if "500" not in responses: + responses["500"] = ResponseRef("InternalServerError500") + if "default" not in responses: + responses["default"] = ResponseRef("DefaultError") + if op.event_type is not None: builder = ContentBuilder(self.schema_builder) callbacks = { diff --git a/docs/openapi_generator/pyopenapi/options.py b/docs/openapi_generator/pyopenapi/options.py index f80da453b..edc861ad5 100644 --- a/docs/openapi_generator/pyopenapi/options.py +++ b/docs/openapi_generator/pyopenapi/options.py @@ -35,6 +35,7 @@ class Options: :param error_wrapper: True if errors are encapsulated in an error object wrapper. :param property_description_fun: Custom transformation function to apply to class property documentation strings. :param captions: User-defined captions for sections such as "Operations" or "Types", and (if applicable) groups of extra types. + :param include_standard_error_responses: Whether to include standard error responses (400, 429, 500, 503) in all operations. """ server: Server @@ -52,6 +53,7 @@ class Options: error_wrapper: bool = False property_description_fun: Optional[Callable[[type, str, str], str]] = None captions: Optional[Dict[str, str]] = None + include_standard_error_responses: bool = True default_captions: ClassVar[Dict[str, str]] = { "Operations": "Operations", diff --git a/llama_stack/apis/datatypes.py b/llama_stack/apis/datatypes.py index 6df93052c..842a2b63d 100644 --- a/llama_stack/apis/datatypes.py +++ b/llama_stack/apis/datatypes.py @@ -5,6 +5,9 @@ # the root directory of this source tree. from enum import Enum +from typing import Optional + +from pydantic import BaseModel from llama_stack.schema_utils import json_schema_type @@ -33,3 +36,20 @@ class Api(Enum): # built-in API inspect = "inspect" + + +@json_schema_type +class Error(BaseModel): + """ + Error response from the API. Roughly follows RFC 7807. + + :param status: HTTP status code + :param title: Error title, a short summary of the error which is invariant for an error type + :param detail: Error detail, a longer human-readable description of the error + :param instance: (Optional) A URL which can be used to retrieve more information about the specific occurrence of the error + """ + + status: int + title: str + detail: str + instance: Optional[str] = None From 15f69e75ffaf07c79edf1cdcef1c31d0b67bbc3d Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Fri, 28 Feb 2025 11:25:23 -0800 Subject: [PATCH 21/22] fix: replace eval with json decoding for format_adapter (#1328) # What does this PR do? 
- using `eval` is a security risk [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan - see https://github.com/meta-llama/llama-stack/pull/1327 cc @SLR722 we will need to update the corresponding dataset via ```python def update_to_json_str(): dataset = datasets.load_dataset(...) processed_dataset = dataset[split].map( lambda x: { "column": json.dumps(eval(x["column"])) } ) processed_dataset.push_to_hub(...) ``` [//]: # (## Documentation) --- .../post_training/torchtune/datasets/format_adapter.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py b/llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py index 884977803..6b607f1c7 100644 --- a/llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py +++ b/llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py @@ -10,16 +10,19 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import json from typing import Any, Mapping from llama_stack.providers.utils.common.data_schema_validator import ColumnName -def llama_stack_instruct_to_torchtune_instruct(sample: Mapping[str, Any]) -> Mapping[str, Any]: +def llama_stack_instruct_to_torchtune_instruct( + sample: Mapping[str, Any], +) -> Mapping[str, Any]: assert ColumnName.chat_completion_input.value in sample and ColumnName.expected_answer.value in sample, ( "Invalid input row" ) - input_messages = eval(str(sample[ColumnName.chat_completion_input.value])) + input_messages = json.loads(sample[ColumnName.chat_completion_input.value]) assert len(input_messages) == 1, "llama stack intruct dataset format only supports 1 user message" input_message = input_messages[0] @@ -37,7 +40,7 @@ def llama_stack_instruct_to_torchtune_instruct(sample: Mapping[str, Any]) -> Map def llama_stack_chat_to_torchtune_chat(sample: Mapping[str, Any]) -> Mapping[str, Any]: assert ColumnName.dialog.value in sample, "Invalid input row" role_map = {"user": "human", "assistant": "gpt"} - dialog = eval(str(sample[ColumnName.dialog.value])) + dialog = json.loads(sample[ColumnName.dialog.value]) assert len(dialog) > 1, "dialog must have at least 2 messagse" roles = [] From 82fa0803faee41ae0e74a5e97066cdb78bfee294 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Fri, 28 Feb 2025 12:29:50 -0800 Subject: [PATCH 22/22] chore: refactor client tool in test (#1331) # What does this PR do? Use @client_tool decorator instead of ClientTool [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan ``` LLAMA_STACK_CONFIG=fireworks pytest -v tests/client-sdk/agents/test_agents.py --inference-model "meta-llama/Llama-3.3-70B-Instruct" ``` image [//]: # (## Documentation) --- tests/client-sdk/agents/test_agents.py | 82 ++++++-------------------- 1 file changed, 18 insertions(+), 64 deletions(-) diff --git a/tests/client-sdk/agents/test_agents.py b/tests/client-sdk/agents/test_agents.py index 8f68699b2..9690a8139 100644 --- a/tests/client-sdk/agents/test_agents.py +++ b/tests/client-sdk/agents/test_agents.py @@ -4,20 +4,15 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
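As background on the `eval` → `json.loads` swap in #1328 above: `json.loads` only ever yields data, whereas `eval` executes whatever string it is handed, so a crafted dataset row could run arbitrary code inside the training job. The snippet below is a standalone illustration (the sample rows are made up, not taken from any real dataset); it also hints at why the dataset needs the `json.dumps(eval(...))` re-encoding mentioned in that PR description, presumably because existing rows are stored as Python reprs that are not valid JSON.

```python
import json

python_repr = "[{'role': 'user', 'content': 'hi'}]"  # how a repr-encoded row might look
json_row = '[{"role": "user", "content": "hi"}]'     # the same row as proper JSON

print(json.loads(json_row))  # plain data in, plain data out

try:
    json.loads(python_repr)  # single quotes are not valid JSON ...
except json.JSONDecodeError as exc:
    print(f"rejected non-JSON row: {exc}")  # ... so the row is rejected, nothing is executed

# eval, by contrast, runs whatever the row contains, e.g. a row holding
# "__import__('os').system('...')" would execute during dataset loading.
```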
-import json -from typing import Dict, List from uuid import uuid4 import pytest from llama_stack_client.lib.agents.agent import Agent -from llama_stack_client.lib.agents.client_tool import ClientTool +from llama_stack_client.lib.agents.client_tool import client_tool from llama_stack_client.lib.agents.event_logger import EventLogger -from llama_stack_client.types import ToolResponseMessage from llama_stack_client.types.agents.turn_create_params import Document as AgentDocument from llama_stack_client.types.memory_insert_params import Document -from llama_stack_client.types.shared.completion_message import CompletionMessage from llama_stack_client.types.shared_params.agent_config import AgentConfig, ToolConfig -from llama_stack_client.types.tool_def_param import Parameter from llama_stack.apis.agents.agents import ( AgentConfig as Server__AgentConfig, @@ -27,63 +22,22 @@ from llama_stack.apis.agents.agents import ( ) -class TestClientTool(ClientTool): - """Tool to give boiling point of a liquid - Returns the correct value for polyjuice in Celcius and Fahrenheit - and returns -1 for other liquids +@client_tool +def get_boiling_point(liquid_name: str, celcius: bool = True) -> int: """ + Returns the boiling point of a liquid in Celcius or Fahrenheit - def run(self, messages: List[CompletionMessage]) -> List[ToolResponseMessage]: - assert len(messages) == 1, "Expected single message" - - message = messages[0] - - tool_call = message.tool_calls[0] - - try: - response = self.run_impl(**tool_call.arguments) - response_str = json.dumps(response, ensure_ascii=False) - except Exception as e: - response_str = f"Error when running tool: {e}" - - message = ToolResponseMessage( - role="tool", - call_id=tool_call.call_id, - tool_name=tool_call.tool_name, - content=response_str, - ) - return message - - def get_name(self) -> str: - return "get_boiling_point" - - def get_description(self) -> str: - return "Get the boiling point of imaginary liquids (eg. 
polyjuice)" - - def get_params_definition(self) -> Dict[str, Parameter]: - return { - "liquid_name": Parameter( - name="liquid_name", - parameter_type="string", - description="The name of the liquid", - required=True, - ), - "celcius": Parameter( - name="celcius", - parameter_type="boolean", - description="Whether to return the boiling point in Celcius", - required=False, - ), - } - - def run_impl(self, liquid_name: str, celcius: bool = True) -> int: - if liquid_name.lower() == "polyjuice": - if celcius: - return -100 - else: - return -212 + :param liquid_name: The name of the liquid + :param celcius: Whether to return the boiling point in Celcius + :return: The boiling point of the liquid in Celcius or Fahrenheit + """ + if liquid_name.lower() == "polyjuice": + if celcius: + return -100 else: - return -1 + return -212 + else: + return -1 @pytest.fixture(scope="session") @@ -298,7 +252,7 @@ def test_code_interpreter_for_attachments(llama_stack_client, agent_config): def test_custom_tool(llama_stack_client, agent_config): - client_tool = TestClientTool() + client_tool = get_boiling_point agent_config = { **agent_config, "toolgroups": ["builtin::websearch"], @@ -326,7 +280,7 @@ def test_custom_tool(llama_stack_client, agent_config): def test_tool_choice(llama_stack_client, agent_config): def run_agent(tool_choice): - client_tool = TestClientTool() + client_tool = get_boiling_point test_agent_config = { **agent_config, @@ -362,7 +316,7 @@ def test_tool_choice(llama_stack_client, agent_config): # TODO: fix this flaky test def xtest_override_system_message_behavior(llama_stack_client, agent_config): - client_tool = TestClientTool() + client_tool = get_boiling_point agent_config = { **agent_config, "instructions": "You are a pirate", @@ -586,7 +540,7 @@ def test_rag_and_code_agent(llama_stack_client, agent_config): def test_create_turn_response(llama_stack_client, agent_config): - client_tool = TestClientTool() + client_tool = get_boiling_point agent_config = { **agent_config, "input_shields": [],