From 43e623eea6c893a06bb03f1decdeadf07167bb67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?=
Date: Mon, 12 May 2025 19:54:43 +0200
Subject: [PATCH 01/11] chore: remove last instances of code-interpreter
 provider (#2143)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Was removed in https://github.com/meta-llama/llama-stack/pull/2087

Signed-off-by: Sébastien Han
---
 docs/getting_started.ipynb                    |  3 --
 .../Llama_Stack_Benchmark_Evals.ipynb         | 14 ----------
 docs/source/building_applications/tools.md    | 28 -------------------
 .../distributions/self_hosted_distro/dell.md  |  2 +-
 .../llama_stack_client_cli_reference.md       |  2 --
 .../llama-stack-provider-ollama/run.yaml      |  5 ----
 .../openai-api-verification-run.yaml          |  5 ----
 7 files changed, 1 insertion(+), 58 deletions(-)

diff --git a/docs/getting_started.ipynb b/docs/getting_started.ipynb
index b764d4d34..cdaf074b8 100644
--- a/docs/getting_started.ipynb
+++ b/docs/getting_started.ipynb
@@ -1050,8 +1050,6 @@
             "text/html": [
               "
ToolGroup(\n",
               "identifier='builtin::code_interpreter',\n",
-              "provider_id='code-interpreter',\n",
-              "provider_resource_id='builtin::code_interpreter',\n",
               "type='tool_group',\n",
               "args=None,\n",
               "mcp_endpoint=None\n",
@@ -1061,7 +1059,6 @@
             "text/plain": [
               "\u001b[1;35mToolGroup\u001b[0m\u001b[1m(\u001b[0m\n",
               "\u001b[2;32m│   \u001b[0m\u001b[33midentifier\u001b[0m=\u001b[32m'builtin::code_interpreter'\u001b[0m,\n",
-              "\u001b[2;32m│   \u001b[0m\u001b[33mprovider_id\u001b[0m=\u001b[32m'code-interpreter'\u001b[0m,\n",
               "\u001b[2;32m│   \u001b[0m\u001b[33mprovider_resource_id\u001b[0m=\u001b[32m'builtin::code_interpreter'\u001b[0m,\n",
               "\u001b[2;32m│   \u001b[0m\u001b[33mtype\u001b[0m=\u001b[32m'tool_group'\u001b[0m,\n",
               "\u001b[2;32m│   \u001b[0m\u001b[33margs\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
index 5de7f715e..413b693d1 100644
--- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
+++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
@@ -337,9 +337,6 @@
               "    provider_id: tavily-search\n",
               "    provider_type: remote::tavily-search\n",
               "  - config: {}\n",
-              "    provider_id: code-interpreter\n",
-              "    provider_type: inline::code-interpreter\n",
-              "  - config: {}\n",
               "    provider_id: rag-runtime\n",
               "    provider_type: inline::rag-runtime\n",
               "  - config: {}\n",
@@ -378,10 +375,6 @@
               "  toolgroup_id: builtin::rag\n",
               "- args: null\n",
               "  mcp_endpoint: null\n",
-              "  provider_id: code-interpreter\n",
-              "  toolgroup_id: builtin::code_interpreter\n",
-              "- args: null\n",
-              "  mcp_endpoint: null\n",
               "  provider_id: wolfram-alpha\n",
               "  toolgroup_id: builtin::wolfram_alpha\n",
               "vector_dbs: []\n",
@@ -617,9 +610,6 @@
               "    provider_id: tavily-search\n",
               "    provider_type: remote::tavily-search\n",
               "  - config: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
-              "    provider_id: code-interpreter\n",
-              "    provider_type: inlin\u001b[1;92me::c\u001b[0mode-interpreter\n",
-              "  - config: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
               "    provider_id: rag-runtime\n",
               "    provider_type: inline::rag-runtime\n",
               "  - config: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
@@ -658,10 +648,6 @@
               "  toolgroup_id: builtin::rag\n",
               "- args: null\n",
               "  mcp_endpoint: null\n",
-              "  provider_id: code-interpreter\n",
-              "  toolgroup_id: builtin::code_interpreter\n",
-              "- args: null\n",
-              "  mcp_endpoint: null\n",
               "  provider_id: wolfram-alpha\n",
               "  toolgroup_id: builtin::wolfram_alpha\n",
               "vector_dbs: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
diff --git a/docs/source/building_applications/tools.md b/docs/source/building_applications/tools.md
index 95c69ffa3..c7af17bfa 100644
--- a/docs/source/building_applications/tools.md
+++ b/docs/source/building_applications/tools.md
@@ -165,34 +165,6 @@ all_tools = client.tools.list_tools()
 group_tools = client.tools.list_tools(toolgroup_id="search_tools")
 ```
 
-## Simple Example: Using an Agent with the Code-Interpreter Tool
-
-```python
-from llama_stack_client import Agent
-
-# Instantiate the AI agent with the given configuration
-agent = Agent(
-    client,
-    name="code-interpreter",
-    description="A code interpreter agent for executing Python code snippets",
-    instructions="""
-    You are a highly reliable, concise, and precise assistant.
-    Always show the generated code, never generate your own code, and never anticipate results.
-    """,
-    model="meta-llama/Llama-3.2-3B-Instruct",
-    tools=["builtin::code_interpreter"],
-    max_infer_iters=5,
-)
-
-# Start a session
-session_id = agent.create_session("tool_session")
-
-# Send a query to the AI agent for code execution
-response = agent.create_turn(
-    messages=[{"role": "user", "content": "Run this code: print(3 ** 4 - 5 * 2)"}],
-    session_id=session_id,
-)
-```
 ## Simple Example 2: Using an Agent with the Web Search Tool
 1. Start by registering a Tavily API key at [Tavily](https://tavily.com/).
 2. [Optional] Provide the API key directly to the Llama Stack server
diff --git a/docs/source/distributions/self_hosted_distro/dell.md b/docs/source/distributions/self_hosted_distro/dell.md
index 96b0ef478..2e987985c 100644
--- a/docs/source/distributions/self_hosted_distro/dell.md
+++ b/docs/source/distributions/self_hosted_distro/dell.md
@@ -23,7 +23,7 @@ The `llamastack/distribution-dell` distribution consists of the following provid
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime` |
 | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 
 
diff --git a/docs/source/references/llama_stack_client_cli_reference.md b/docs/source/references/llama_stack_client_cli_reference.md
index 0b84027f0..cd4dd4cd7 100644
--- a/docs/source/references/llama_stack_client_cli_reference.md
+++ b/docs/source/references/llama_stack_client_cli_reference.md
@@ -253,8 +253,6 @@ llama-stack-client toolgroups list
 +---------------------------+------------------+------+---------------+
 | identifier                | provider_id      | args | mcp_endpoint  |
 +===========================+==================+======+===============+
-| builtin::code_interpreter | code-interpreter | None | None          |
-+---------------------------+------------------+------+---------------+
 | builtin::rag              | rag-runtime      | None | None          |
 +---------------------------+------------------+------+---------------+
 | builtin::websearch        | tavily-search    | None | None          |
diff --git a/tests/external-provider/llama-stack-provider-ollama/run.yaml b/tests/external-provider/llama-stack-provider-ollama/run.yaml
index 5afeb1448..666189f03 100644
--- a/tests/external-provider/llama-stack-provider-ollama/run.yaml
+++ b/tests/external-provider/llama-stack-provider-ollama/run.yaml
@@ -53,9 +53,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -90,8 +87,6 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 - toolgroup_id: builtin::wolfram_alpha
   provider_id: wolfram-alpha
 server:
diff --git a/tests/verifications/openai-api-verification-run.yaml b/tests/verifications/openai-api-verification-run.yaml
index d80aa3c75..4c322af28 100644
--- a/tests/verifications/openai-api-verification-run.yaml
+++ b/tests/verifications/openai-api-verification-run.yaml
@@ -74,9 +74,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -156,8 +153,6 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 - toolgroup_id: builtin::wolfram_alpha
   provider_id: wolfram-alpha
 server:

From 53b7f50828f410940552c5cd2efee5420f35761c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?= 
Date: Mon, 12 May 2025 19:55:39 +0200
Subject: [PATCH 02/11] chore: force ellipsis in API webmethods (#2141)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# What does this PR do?

This new check will fail if some webmethods are missing the ellipsis:

```
API Method Return Type Validation Errors:

Method Api.eval.job_result does not contain ellipsis (...) in its implementation
Method Api.agents.create_agent_turn does not contain ellipsis (...) in its implementation
Method Api.agents.create_openai_response does not contain ellipsis (...) in its implementation
Method Api.eval.evaluate_rows does not contain ellipsis (...) in its implementation
Method Api.eval.run_eval does not contain ellipsis (...) in its implementation
```

The check is skipped when a method is explicitly not implemented, i.e. when
its body raises NotImplementedError.
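
For illustration, here is a minimal sketch of the pattern the validator
accepts and rejects (the class and method names are made up; the real check
runs against the `Api.*` protocol methods listed above):

```python
from typing import Protocol


class ExampleAPI(Protocol):
    # Passes: the body is just an ellipsis, as the check expects.
    async def run_eval(self, benchmark_id: str) -> dict: ...

    # Also passes: the method is explicitly not implemented.
    async def export_eval(self, benchmark_id: str) -> str:
        raise NotImplementedError()

    # Would fail: a concrete body containing neither "..." nor NotImplementedError.
    async def job_status(self, job_id: str) -> str:
        return "completed"
```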

Signed-off-by: Sébastien Han 
---
 docs/openapi_generator/generate.py          |  2 +-
 docs/openapi_generator/pyopenapi/utility.py | 11 +++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/docs/openapi_generator/generate.py b/docs/openapi_generator/generate.py
index caa4f17ff..9fc375175 100644
--- a/docs/openapi_generator/generate.py
+++ b/docs/openapi_generator/generate.py
@@ -44,7 +44,7 @@ def main(output_dir: str):
     if return_type_errors:
         print("\nAPI Method Return Type Validation Errors:\n")
         for error in return_type_errors:
-            print(error)
+            print(error, file=sys.stderr)
         sys.exit(1)
     now = str(datetime.now())
     print(
diff --git a/docs/openapi_generator/pyopenapi/utility.py b/docs/openapi_generator/pyopenapi/utility.py
index db18e8430..9bd3cd2dd 100644
--- a/docs/openapi_generator/pyopenapi/utility.py
+++ b/docs/openapi_generator/pyopenapi/utility.py
@@ -174,14 +174,25 @@ def _validate_list_parameters_contain_data(method) -> str | None:
         return "does not have a mandatory data attribute containing the list of objects"
 
 
+def _validate_has_ellipsis(method) -> str | None:
+    source = inspect.getsource(method)
+    if "..." not in source and not "NotImplementedError" in source:
+        return "does not contain ellipsis (...) in its implementation"
+
+
 _VALIDATORS = {
     "GET": [
         _validate_api_method_return_type,
         _validate_list_parameters_contain_data,
         _validate_api_method_doesnt_return_list,
+        _validate_has_ellipsis,
     ],
     "DELETE": [
         _validate_api_delete_method_returns_none,
+        _validate_has_ellipsis,
+    ],
+    "POST": [
+        _validate_has_ellipsis,
     ],
 }
 

From 80c349965f1e434976d8425c41636ce0d4713a98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?= 
Date: Mon, 12 May 2025 19:56:14 +0200
Subject: [PATCH 03/11] chore(refact): move paginate_records fn outside of
 datasetio (#2137)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# What does this PR do?

Move `paginate_records` from `llama_stack/providers/utils/datasetio/` up to
`llama_stack/providers/utils/`, since it is also used outside of datasetio
(for example by the meta-reference agents provider).
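
Call sites switch to the new import path accordingly (see the hunks below):

```python
from llama_stack.providers.utils.pagination import paginate_records
```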

Signed-off-by: Sébastien Han 
---
 llama_stack/providers/inline/agents/meta_reference/agents.py    | 2 +-
 llama_stack/providers/inline/datasetio/localfs/datasetio.py     | 2 +-
 .../providers/remote/datasetio/huggingface/huggingface.py       | 2 +-
 llama_stack/providers/utils/{datasetio => }/pagination.py       | 0
 4 files changed, 3 insertions(+), 3 deletions(-)
 rename llama_stack/providers/utils/{datasetio => }/pagination.py (100%)

diff --git a/llama_stack/providers/inline/agents/meta_reference/agents.py b/llama_stack/providers/inline/agents/meta_reference/agents.py
index 19d60c816..db10972de 100644
--- a/llama_stack/providers/inline/agents/meta_reference/agents.py
+++ b/llama_stack/providers/inline/agents/meta_reference/agents.py
@@ -37,8 +37,8 @@ from llama_stack.apis.inference import (
 from llama_stack.apis.safety import Safety
 from llama_stack.apis.tools import ToolGroups, ToolRuntime
 from llama_stack.apis.vector_io import VectorIO
-from llama_stack.providers.utils.datasetio.pagination import paginate_records
 from llama_stack.providers.utils.kvstore import InmemoryKVStoreImpl, kvstore_impl
+from llama_stack.providers.utils.pagination import paginate_records
 
 from .agent_instance import ChatAgent
 from .config import MetaReferenceAgentsImplConfig
diff --git a/llama_stack/providers/inline/datasetio/localfs/datasetio.py b/llama_stack/providers/inline/datasetio/localfs/datasetio.py
index 5640df6ab..da71ecb17 100644
--- a/llama_stack/providers/inline/datasetio/localfs/datasetio.py
+++ b/llama_stack/providers/inline/datasetio/localfs/datasetio.py
@@ -11,9 +11,9 @@ from llama_stack.apis.common.responses import PaginatedResponse
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Dataset
 from llama_stack.providers.datatypes import DatasetsProtocolPrivate
-from llama_stack.providers.utils.datasetio.pagination import paginate_records
 from llama_stack.providers.utils.datasetio.url_utils import get_dataframe_from_uri
 from llama_stack.providers.utils.kvstore import kvstore_impl
+from llama_stack.providers.utils.pagination import paginate_records
 
 from .config import LocalFSDatasetIOConfig
 
diff --git a/llama_stack/providers/remote/datasetio/huggingface/huggingface.py b/llama_stack/providers/remote/datasetio/huggingface/huggingface.py
index e329c88a7..fafd1d8ff 100644
--- a/llama_stack/providers/remote/datasetio/huggingface/huggingface.py
+++ b/llama_stack/providers/remote/datasetio/huggingface/huggingface.py
@@ -12,8 +12,8 @@ from llama_stack.apis.common.responses import PaginatedResponse
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Dataset
 from llama_stack.providers.datatypes import DatasetsProtocolPrivate
-from llama_stack.providers.utils.datasetio.pagination import paginate_records
 from llama_stack.providers.utils.kvstore import kvstore_impl
+from llama_stack.providers.utils.pagination import paginate_records
 
 from .config import HuggingfaceDatasetIOConfig
 
diff --git a/llama_stack/providers/utils/datasetio/pagination.py b/llama_stack/providers/utils/pagination.py
similarity index 100%
rename from llama_stack/providers/utils/datasetio/pagination.py
rename to llama_stack/providers/utils/pagination.py

From 136e6b3cf71f4da30979cb770fc956435717301a Mon Sep 17 00:00:00 2001
From: Ben Browning 
Date: Mon, 12 May 2025 13:57:53 -0400
Subject: [PATCH 04/11] fix: ollama openai completion and chat completion
 params (#2125)

# What does this PR do?

The ollama provider was using an older variant of the code to convert
incoming parameters from the OpenAI API completions and chat completion
endpoints into requests that get sent to the backend provider over its
own OpenAI client. This updates it to use the common
`prepare_openai_completion_params` method used elsewhere, which takes
care of removing stray `None` values even for nested structures.

Without this, parameters whose value is `None` still make their way to
Ollama and can influence its inference output, unlike when those
parameters are not sent at all.
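
To illustrate the behaviour (a rough sketch only, not the implementation of
`prepare_openai_completion_params`):

```python
def drop_nones(value):
    """Recursively drop None values from dicts and lists (illustrative only)."""
    if isinstance(value, dict):
        return {k: drop_nones(v) for k, v in value.items() if v is not None}
    if isinstance(value, list):
        return [drop_nones(v) for v in value if v is not None]
    return value


# Prints {'stream_options': {}} -- the None values are stripped instead of being
# forwarded to Ollama as explicit nulls that change its behaviour.
print(drop_nones({"temperature": None, "stream_options": {"include_usage": None}}))
```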

## Test Plan

This passes tests/integration/inference/test_openai_completion.py and
fixes the issue found in #2098, which was tested via manual curl
requests crafted in a particular way.

Closes #2098

Signed-off-by: Ben Browning 
---
 .../remote/inference/ollama/ollama.py         | 97 +++++++++----------
 1 file changed, 45 insertions(+), 52 deletions(-)

diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py
index 32e2b17d0..72cf0d129 100644
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@@ -61,6 +61,7 @@ from llama_stack.providers.utils.inference.openai_compat import (
     OpenAICompatCompletionChoice,
     OpenAICompatCompletionResponse,
     get_sampling_options,
+    prepare_openai_completion_params,
     process_chat_completion_response,
     process_chat_completion_stream_response,
     process_completion_response,
@@ -395,29 +396,25 @@ class OllamaInferenceAdapter(
             raise ValueError("Ollama does not support non-string prompts for completion")
 
         model_obj = await self._get_model(model)
-        params = {
-            k: v
-            for k, v in {
-                "model": model_obj.provider_resource_id,
-                "prompt": prompt,
-                "best_of": best_of,
-                "echo": echo,
-                "frequency_penalty": frequency_penalty,
-                "logit_bias": logit_bias,
-                "logprobs": logprobs,
-                "max_tokens": max_tokens,
-                "n": n,
-                "presence_penalty": presence_penalty,
-                "seed": seed,
-                "stop": stop,
-                "stream": stream,
-                "stream_options": stream_options,
-                "temperature": temperature,
-                "top_p": top_p,
-                "user": user,
-            }.items()
-            if v is not None
-        }
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id,
+            prompt=prompt,
+            best_of=best_of,
+            echo=echo,
+            frequency_penalty=frequency_penalty,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_tokens=max_tokens,
+            n=n,
+            presence_penalty=presence_penalty,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            top_p=top_p,
+            user=user,
+        )
         return await self.openai_client.completions.create(**params)  # type: ignore
 
     async def openai_chat_completion(
@@ -447,35 +444,31 @@ class OllamaInferenceAdapter(
         user: str | None = None,
     ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
         model_obj = await self._get_model(model)
-        params = {
-            k: v
-            for k, v in {
-                "model": model_obj.provider_resource_id,
-                "messages": messages,
-                "frequency_penalty": frequency_penalty,
-                "function_call": function_call,
-                "functions": functions,
-                "logit_bias": logit_bias,
-                "logprobs": logprobs,
-                "max_completion_tokens": max_completion_tokens,
-                "max_tokens": max_tokens,
-                "n": n,
-                "parallel_tool_calls": parallel_tool_calls,
-                "presence_penalty": presence_penalty,
-                "response_format": response_format,
-                "seed": seed,
-                "stop": stop,
-                "stream": stream,
-                "stream_options": stream_options,
-                "temperature": temperature,
-                "tool_choice": tool_choice,
-                "tools": tools,
-                "top_logprobs": top_logprobs,
-                "top_p": top_p,
-                "user": user,
-            }.items()
-            if v is not None
-        }
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id,
+            messages=messages,
+            frequency_penalty=frequency_penalty,
+            function_call=function_call,
+            functions=functions,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_completion_tokens=max_completion_tokens,
+            max_tokens=max_tokens,
+            n=n,
+            parallel_tool_calls=parallel_tool_calls,
+            presence_penalty=presence_penalty,
+            response_format=response_format,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            tool_choice=tool_choice,
+            tools=tools,
+            top_logprobs=top_logprobs,
+            top_p=top_p,
+            user=user,
+        )
         return await self.openai_client.chat.completions.create(**params)  # type: ignore
 
     async def batch_completion(

From c985ea6326a9b429057d0e12b21ec0691074da35 Mon Sep 17 00:00:00 2001
From: Divya <117009486+divyaruhil@users.noreply.github.com>
Date: Mon, 12 May 2025 23:28:22 +0530
Subject: [PATCH 05/11] fix: Adding Embedding model to watsonx inference
 (#2118)

# What does this PR do?
Issue link: https://github.com/meta-llama/llama-stack/issues/2117

## Test Plan
Once added, users will be able to use the Sentence Transformers model
`all-MiniLM-L6-v2`.
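
Once the server is running with this template, the embedding model can be
exercised through the client. A minimal sketch, assuming the stock
`llama-stack-client` Python API and the default server port:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # adjust to your server

response = client.inference.embeddings(
    model_id="all-MiniLM-L6-v2",
    contents=["Llama Stack now serves embeddings via sentence-transformers."],
)
# Each embedding should have 384 dimensions, per the metadata registered below.
print(len(response.embeddings[0]))
```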
---
 .../remote_hosted_distro/watsonx.md           |  2 +-
 llama_stack/templates/dependencies.json       |  4 ++-
 llama_stack/templates/watsonx/build.yaml      |  1 +
 llama_stack/templates/watsonx/run.yaml        |  8 ++++++
 llama_stack/templates/watsonx/watsonx.py      | 27 ++++++++++++++++---
 5 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/docs/source/distributions/remote_hosted_distro/watsonx.md b/docs/source/distributions/remote_hosted_distro/watsonx.md
index b7c89e9b0..d8d327bb5 100644
--- a/docs/source/distributions/remote_hosted_distro/watsonx.md
+++ b/docs/source/distributions/remote_hosted_distro/watsonx.md
@@ -18,7 +18,7 @@ The `llamastack/distribution-watsonx` distribution consists of the following pro
 | agents | `inline::meta-reference` |
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
-| inference | `remote::watsonx` |
+| inference | `remote::watsonx`, `inline::sentence-transformers` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
diff --git a/llama_stack/templates/dependencies.json b/llama_stack/templates/dependencies.json
index 31f2b93f1..35cbc8878 100644
--- a/llama_stack/templates/dependencies.json
+++ b/llama_stack/templates/dependencies.json
@@ -833,6 +833,8 @@
     "tqdm",
     "transformers",
     "tree_sitter",
-    "uvicorn"
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
   ]
 }
diff --git a/llama_stack/templates/watsonx/build.yaml b/llama_stack/templates/watsonx/build.yaml
index 23a1ffa74..638b16029 100644
--- a/llama_stack/templates/watsonx/build.yaml
+++ b/llama_stack/templates/watsonx/build.yaml
@@ -4,6 +4,7 @@ distribution_spec:
   providers:
     inference:
     - remote::watsonx
+    - inline::sentence-transformers
     vector_io:
     - inline::faiss
     safety:
diff --git a/llama_stack/templates/watsonx/run.yaml b/llama_stack/templates/watsonx/run.yaml
index 82d3b2c6e..50904b7e9 100644
--- a/llama_stack/templates/watsonx/run.yaml
+++ b/llama_stack/templates/watsonx/run.yaml
@@ -18,6 +18,9 @@ providers:
       url: ${env.WATSONX_BASE_URL:https://us-south.ml.cloud.ibm.com}
       api_key: ${env.WATSONX_API_KEY:}
       project_id: ${env.WATSONX_PROJECT_ID:}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   vector_io:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -191,6 +194,11 @@ models:
   provider_id: watsonx
   provider_model_id: meta-llama/llama-guard-3-11b-vision
   model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  model_type: embedding
 shields: []
 vector_dbs: []
 datasets: []
diff --git a/llama_stack/templates/watsonx/watsonx.py b/llama_stack/templates/watsonx/watsonx.py
index f16593051..802aaf8f1 100644
--- a/llama_stack/templates/watsonx/watsonx.py
+++ b/llama_stack/templates/watsonx/watsonx.py
@@ -6,7 +6,11 @@
 
 from pathlib import Path
 
-from llama_stack.distribution.datatypes import Provider, ToolGroupInput
+from llama_stack.apis.models.models import ModelType
+from llama_stack.distribution.datatypes import ModelInput, Provider, ToolGroupInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.remote.inference.watsonx import WatsonXConfig
 from llama_stack.providers.remote.inference.watsonx.models import MODEL_ENTRIES
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry
@@ -14,7 +18,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin
 
 def get_distribution_template() -> DistributionTemplate:
     providers = {
-        "inference": ["remote::watsonx"],
+        "inference": ["remote::watsonx", "inline::sentence-transformers"],
         "vector_io": ["inline::faiss"],
         "safety": ["inline::llama-guard"],
         "agents": ["inline::meta-reference"],
@@ -36,6 +40,12 @@ def get_distribution_template() -> DistributionTemplate:
         config=WatsonXConfig.sample_run_config(),
     )
 
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
+
     available_models = {
         "watsonx": MODEL_ENTRIES,
     }
@@ -50,6 +60,15 @@ def get_distribution_template() -> DistributionTemplate:
         ),
     ]
 
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )
+
     default_models = get_model_registry(available_models)
     return DistributionTemplate(
         name="watsonx",
@@ -62,9 +81,9 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                 },
-                default_models=default_models,
+                default_models=default_models + [embedding_model],
                 default_tool_groups=default_tool_groups,
             ),
         },

From 23d9f3b1fb495761ebf2811c97c0a02d21a24d64 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" 
Date: Mon, 12 May 2025 18:02:05 +0000
Subject: [PATCH 06/11] build: Bump version to 0.2.6

---
 pyproject.toml   |   6 +-
 requirements.txt | 139 ++++++++++++++++++++++++++++++++++++++++++++++-
 uv.lock          |  76 +++++++++++++-------------
 3 files changed, 179 insertions(+), 42 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index d3cc819be..ee180c4c9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "llama_stack"
-version = "0.2.5"
+version = "0.2.6"
 authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }]
 description = "Llama Stack"
 readme = "README.md"
@@ -27,7 +27,7 @@ dependencies = [
     "huggingface-hub",
     "jinja2>=3.1.6",
     "jsonschema",
-    "llama-stack-client>=0.2.5",
+    "llama-stack-client>=0.2.6",
     "openai>=1.66",
     "prompt-toolkit",
     "python-dotenv",
@@ -106,7 +106,7 @@ codegen = ["rich", "pydantic", "jinja2>=3.1.6"]
 ui = [
     "streamlit",
     "pandas",
-    "llama-stack-client>=0.2.5",
+    "llama-stack-client>=0.2.6",
     "streamlit-option-menu",
 ]
 
diff --git a/requirements.txt b/requirements.txt
index 194af774b..d5f69cd45 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,69 +1,206 @@
 # This file was autogenerated by uv via the following command:
 #    uv export --frozen --no-hashes --no-emit-project --output-file=requirements.txt
 annotated-types==0.7.0
+    # via pydantic
 anyio==4.8.0
+    # via
+    #   httpx
+    #   llama-stack-client
+    #   openai
 attrs==25.1.0
+    # via
+    #   jsonschema
+    #   referencing
 blobfile==3.0.0
+    # via llama-stack
 cachetools==5.5.2
+    # via google-auth
 certifi==2025.1.31
+    # via
+    #   httpcore
+    #   httpx
+    #   kubernetes
+    #   requests
 charset-normalizer==3.4.1
+    # via requests
 click==8.1.8
+    # via llama-stack-client
 colorama==0.4.6 ; sys_platform == 'win32'
+    # via
+    #   click
+    #   tqdm
 distro==1.9.0
+    # via
+    #   llama-stack-client
+    #   openai
 durationpy==0.9
+    # via kubernetes
 exceptiongroup==1.2.2 ; python_full_version < '3.11'
+    # via anyio
 filelock==3.17.0
+    # via
+    #   blobfile
+    #   huggingface-hub
 fire==0.7.0
+    # via llama-stack
 fsspec==2024.12.0
+    # via huggingface-hub
 google-auth==2.38.0
+    # via kubernetes
 h11==0.16.0
+    # via
+    #   httpcore
+    #   llama-stack
 httpcore==1.0.9
+    # via httpx
 httpx==0.28.1
+    # via
+    #   llama-stack
+    #   llama-stack-client
+    #   openai
 huggingface-hub==0.29.0
+    # via llama-stack
 idna==3.10
+    # via
+    #   anyio
+    #   httpx
+    #   requests
 jinja2==3.1.6
+    # via llama-stack
 jiter==0.8.2
+    # via openai
 jsonschema==4.23.0
+    # via llama-stack
 jsonschema-specifications==2024.10.1
+    # via jsonschema
 kubernetes==32.0.1
-llama-stack-client==0.2.5
+    # via llama-stack
+llama-stack-client==0.2.6
+    # via llama-stack
 lxml==5.3.1
+    # via blobfile
 markdown-it-py==3.0.0
+    # via rich
 markupsafe==3.0.2
+    # via jinja2
 mdurl==0.1.2
+    # via markdown-it-py
 numpy==2.2.3
+    # via pandas
 oauthlib==3.2.2
+    # via
+    #   kubernetes
+    #   requests-oauthlib
 openai==1.71.0
+    # via llama-stack
 packaging==24.2
+    # via huggingface-hub
 pandas==2.2.3
+    # via llama-stack-client
 pillow==11.1.0
+    # via llama-stack
 prompt-toolkit==3.0.50
+    # via
+    #   llama-stack
+    #   llama-stack-client
 pyaml==25.1.0
+    # via llama-stack-client
 pyasn1==0.6.1
+    # via
+    #   pyasn1-modules
+    #   rsa
 pyasn1-modules==0.4.2
+    # via google-auth
 pycryptodomex==3.21.0
+    # via blobfile
 pydantic==2.10.6
+    # via
+    #   llama-stack
+    #   llama-stack-client
+    #   openai
 pydantic-core==2.27.2
+    # via pydantic
 pygments==2.19.1
+    # via rich
 python-dateutil==2.9.0.post0
+    # via
+    #   kubernetes
+    #   pandas
 python-dotenv==1.0.1
+    # via llama-stack
 pytz==2025.1
+    # via pandas
 pyyaml==6.0.2
+    # via
+    #   huggingface-hub
+    #   kubernetes
+    #   pyaml
 referencing==0.36.2
+    # via
+    #   jsonschema
+    #   jsonschema-specifications
 regex==2024.11.6
+    # via tiktoken
 requests==2.32.3
+    # via
+    #   huggingface-hub
+    #   kubernetes
+    #   llama-stack
+    #   requests-oauthlib
+    #   tiktoken
 requests-oauthlib==2.0.0
+    # via kubernetes
 rich==13.9.4
+    # via
+    #   llama-stack
+    #   llama-stack-client
 rpds-py==0.22.3
+    # via
+    #   jsonschema
+    #   referencing
 rsa==4.9
+    # via google-auth
 setuptools==75.8.0
+    # via llama-stack
 six==1.17.0
+    # via
+    #   kubernetes
+    #   python-dateutil
 sniffio==1.3.1
+    # via
+    #   anyio
+    #   llama-stack-client
+    #   openai
 termcolor==2.5.0
+    # via
+    #   fire
+    #   llama-stack
+    #   llama-stack-client
 tiktoken==0.9.0
+    # via llama-stack
 tqdm==4.67.1
+    # via
+    #   huggingface-hub
+    #   llama-stack-client
+    #   openai
 typing-extensions==4.12.2
+    # via
+    #   anyio
+    #   huggingface-hub
+    #   llama-stack-client
+    #   openai
+    #   pydantic
+    #   pydantic-core
+    #   referencing
+    #   rich
 tzdata==2025.1
+    # via pandas
 urllib3==2.3.0
+    # via
+    #   blobfile
+    #   kubernetes
+    #   requests
 wcwidth==0.2.13
+    # via prompt-toolkit
 websocket-client==1.8.0
+    # via kubernetes
diff --git a/uv.lock b/uv.lock
index 3b953377d..048e6e202 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1419,7 +1419,7 @@ wheels = [
 
 [[package]]
 name = "llama-stack"
-version = "0.2.5"
+version = "0.2.6"
 source = { editable = "." }
 dependencies = [
     { name = "blobfile" },
@@ -1533,8 +1533,8 @@ requires-dist = [
     { name = "jinja2", marker = "extra == 'codegen'", specifier = ">=3.1.6" },
     { name = "jsonschema" },
     { name = "kubernetes" },
-    { name = "llama-stack-client", specifier = ">=0.2.5" },
-    { name = "llama-stack-client", marker = "extra == 'ui'", specifier = ">=0.2.5" },
+    { name = "llama-stack-client", specifier = ">=0.2.6" },
+    { name = "llama-stack-client", marker = "extra == 'ui'", specifier = ">=0.2.6" },
     { name = "mcp", marker = "extra == 'test'" },
     { name = "myst-parser", marker = "extra == 'docs'" },
     { name = "nbval", marker = "extra == 'dev'" },
@@ -1591,7 +1591,7 @@ provides-extras = ["dev", "unit", "test", "docs", "codegen", "ui"]
 
 [[package]]
 name = "llama-stack-client"
-version = "0.2.5"
+version = "0.2.6"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "anyio" },
@@ -1608,9 +1608,9 @@ dependencies = [
     { name = "tqdm" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/38/4b/d4758a95a5eed8ad821dd782a3f34843d79dc9adc6bd6e01b13cbec904ca/llama_stack_client-0.2.5.tar.gz", hash = "sha256:509c6336027a13b8b89780a85bd1cd45b3659de3929357fd9d8113ea0d6a3f05", size = 259262, upload-time = "2025-05-03T21:30:29.177Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/14/1e/14e549b5fb7ac09347686f6f1c28ee2bcd16cf575aab934687f20b5cec12/llama_stack_client-0.2.6.tar.gz", hash = "sha256:a03a2b0bd43bdb0083378f481614bb65592d7f669a821d0b618b1dfc7d1c8325", size = 259270, upload-time = "2025-05-12T18:01:30.537Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/e8/ac/50eb130fa6126a0a3a1f945b61dc5b3bb11f7badbe877ce0112321851d32/llama_stack_client-0.2.5-py3-none-any.whl", hash = "sha256:504abe51bdd1da658e00f720997e7845d10ca1eb74de52af90f82bfbce8e2e67", size = 292727, upload-time = "2025-05-03T21:30:27.388Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/af/93895ce23d3c8e676004e7b69deaea726e81acaaf9cf00090baace904c03/llama_stack_client-0.2.6-py3-none-any.whl", hash = "sha256:9f39dea2dba6767b654d5119f99dfc2b89f838470a547bc5d8def5a230decfcd", size = 292726, upload-time = "2025-05-12T18:01:29.14Z" },
 ]
 
 [[package]]
@@ -3423,22 +3423,22 @@ wheels = [
 name = "safetensors"
 version = "0.5.3"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/71/7e/2d5d6ee7b40c0682315367ec7475693d110f512922d582fef1bd4a63adc3/safetensors-0.5.3.tar.gz", hash = "sha256:b6b0d6ecacec39a4fdd99cc19f4576f5219ce858e6fd8dbe7609df0b8dc56965", size = 67210 }
+sdist = { url = "https://files.pythonhosted.org/packages/71/7e/2d5d6ee7b40c0682315367ec7475693d110f512922d582fef1bd4a63adc3/safetensors-0.5.3.tar.gz", hash = "sha256:b6b0d6ecacec39a4fdd99cc19f4576f5219ce858e6fd8dbe7609df0b8dc56965", size = 67210, upload-time = "2025-02-26T09:15:13.155Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/18/ae/88f6c49dbd0cc4da0e08610019a3c78a7d390879a919411a410a1876d03a/safetensors-0.5.3-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:bd20eb133db8ed15b40110b7c00c6df51655a2998132193de2f75f72d99c7073", size = 436917 },
-    { url = "https://files.pythonhosted.org/packages/b8/3b/11f1b4a2f5d2ab7da34ecc062b0bc301f2be024d110a6466726bec8c055c/safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:21d01c14ff6c415c485616b8b0bf961c46b3b343ca59110d38d744e577f9cce7", size = 418419 },
-    { url = "https://files.pythonhosted.org/packages/5d/9a/add3e6fef267658075c5a41573c26d42d80c935cdc992384dfae435feaef/safetensors-0.5.3-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:11bce6164887cd491ca75c2326a113ba934be596e22b28b1742ce27b1d076467", size = 459493 },
-    { url = "https://files.pythonhosted.org/packages/df/5c/bf2cae92222513cc23b3ff85c4a1bb2811a2c3583ac0f8e8d502751de934/safetensors-0.5.3-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4a243be3590bc3301c821da7a18d87224ef35cbd3e5f5727e4e0728b8172411e", size = 472400 },
-    { url = "https://files.pythonhosted.org/packages/58/11/7456afb740bd45782d0f4c8e8e1bb9e572f1bf82899fb6ace58af47b4282/safetensors-0.5.3-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8bd84b12b1670a6f8e50f01e28156422a2bc07fb16fc4e98bded13039d688a0d", size = 522891 },
-    { url = "https://files.pythonhosted.org/packages/57/3d/fe73a9d2ace487e7285f6e157afee2383bd1ddb911b7cb44a55cf812eae3/safetensors-0.5.3-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:391ac8cab7c829452175f871fcaf414aa1e292b5448bd02620f675a7f3e7abb9", size = 537694 },
-    { url = "https://files.pythonhosted.org/packages/a6/f8/dae3421624fcc87a89d42e1898a798bc7ff72c61f38973a65d60df8f124c/safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cead1fa41fc54b1e61089fa57452e8834f798cb1dc7a09ba3524f1eb08e0317a", size = 471642 },
-    { url = "https://files.pythonhosted.org/packages/ce/20/1fbe16f9b815f6c5a672f5b760951e20e17e43f67f231428f871909a37f6/safetensors-0.5.3-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1077f3e94182d72618357b04b5ced540ceb71c8a813d3319f1aba448e68a770d", size = 502241 },
-    { url = "https://files.pythonhosted.org/packages/5f/18/8e108846b506487aa4629fe4116b27db65c3dde922de2c8e0cc1133f3f29/safetensors-0.5.3-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:799021e78287bac619c7b3f3606730a22da4cda27759ddf55d37c8db7511c74b", size = 638001 },
-    { url = "https://files.pythonhosted.org/packages/82/5a/c116111d8291af6c8c8a8b40628fe833b9db97d8141c2a82359d14d9e078/safetensors-0.5.3-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:df26da01aaac504334644e1b7642fa000bfec820e7cef83aeac4e355e03195ff", size = 734013 },
-    { url = "https://files.pythonhosted.org/packages/7d/ff/41fcc4d3b7de837963622e8610d998710705bbde9a8a17221d85e5d0baad/safetensors-0.5.3-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:32c3ef2d7af8b9f52ff685ed0bc43913cdcde135089ae322ee576de93eae5135", size = 670687 },
-    { url = "https://files.pythonhosted.org/packages/40/ad/2b113098e69c985a3d8fbda4b902778eae4a35b7d5188859b4a63d30c161/safetensors-0.5.3-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:37f1521be045e56fc2b54c606d4455573e717b2d887c579ee1dbba5f868ece04", size = 643147 },
-    { url = "https://files.pythonhosted.org/packages/0a/0c/95aeb51d4246bd9a3242d3d8349c1112b4ee7611a4b40f0c5c93b05f001d/safetensors-0.5.3-cp38-abi3-win32.whl", hash = "sha256:cfc0ec0846dcf6763b0ed3d1846ff36008c6e7290683b61616c4b040f6a54ace", size = 296677 },
-    { url = "https://files.pythonhosted.org/packages/69/e2/b011c38e5394c4c18fb5500778a55ec43ad6106126e74723ffaee246f56e/safetensors-0.5.3-cp38-abi3-win_amd64.whl", hash = "sha256:836cbbc320b47e80acd40e44c8682db0e8ad7123209f69b093def21ec7cafd11", size = 308878 },
+    { url = "https://files.pythonhosted.org/packages/18/ae/88f6c49dbd0cc4da0e08610019a3c78a7d390879a919411a410a1876d03a/safetensors-0.5.3-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:bd20eb133db8ed15b40110b7c00c6df51655a2998132193de2f75f72d99c7073", size = 436917, upload-time = "2025-02-26T09:15:03.702Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/3b/11f1b4a2f5d2ab7da34ecc062b0bc301f2be024d110a6466726bec8c055c/safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:21d01c14ff6c415c485616b8b0bf961c46b3b343ca59110d38d744e577f9cce7", size = 418419, upload-time = "2025-02-26T09:15:01.765Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/9a/add3e6fef267658075c5a41573c26d42d80c935cdc992384dfae435feaef/safetensors-0.5.3-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:11bce6164887cd491ca75c2326a113ba934be596e22b28b1742ce27b1d076467", size = 459493, upload-time = "2025-02-26T09:14:51.812Z" },
+    { url = "https://files.pythonhosted.org/packages/df/5c/bf2cae92222513cc23b3ff85c4a1bb2811a2c3583ac0f8e8d502751de934/safetensors-0.5.3-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4a243be3590bc3301c821da7a18d87224ef35cbd3e5f5727e4e0728b8172411e", size = 472400, upload-time = "2025-02-26T09:14:53.549Z" },
+    { url = "https://files.pythonhosted.org/packages/58/11/7456afb740bd45782d0f4c8e8e1bb9e572f1bf82899fb6ace58af47b4282/safetensors-0.5.3-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8bd84b12b1670a6f8e50f01e28156422a2bc07fb16fc4e98bded13039d688a0d", size = 522891, upload-time = "2025-02-26T09:14:55.717Z" },
+    { url = "https://files.pythonhosted.org/packages/57/3d/fe73a9d2ace487e7285f6e157afee2383bd1ddb911b7cb44a55cf812eae3/safetensors-0.5.3-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:391ac8cab7c829452175f871fcaf414aa1e292b5448bd02620f675a7f3e7abb9", size = 537694, upload-time = "2025-02-26T09:14:57.036Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/f8/dae3421624fcc87a89d42e1898a798bc7ff72c61f38973a65d60df8f124c/safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cead1fa41fc54b1e61089fa57452e8834f798cb1dc7a09ba3524f1eb08e0317a", size = 471642, upload-time = "2025-02-26T09:15:00.544Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/20/1fbe16f9b815f6c5a672f5b760951e20e17e43f67f231428f871909a37f6/safetensors-0.5.3-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1077f3e94182d72618357b04b5ced540ceb71c8a813d3319f1aba448e68a770d", size = 502241, upload-time = "2025-02-26T09:14:58.303Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/18/8e108846b506487aa4629fe4116b27db65c3dde922de2c8e0cc1133f3f29/safetensors-0.5.3-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:799021e78287bac619c7b3f3606730a22da4cda27759ddf55d37c8db7511c74b", size = 638001, upload-time = "2025-02-26T09:15:05.79Z" },
+    { url = "https://files.pythonhosted.org/packages/82/5a/c116111d8291af6c8c8a8b40628fe833b9db97d8141c2a82359d14d9e078/safetensors-0.5.3-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:df26da01aaac504334644e1b7642fa000bfec820e7cef83aeac4e355e03195ff", size = 734013, upload-time = "2025-02-26T09:15:07.892Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/ff/41fcc4d3b7de837963622e8610d998710705bbde9a8a17221d85e5d0baad/safetensors-0.5.3-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:32c3ef2d7af8b9f52ff685ed0bc43913cdcde135089ae322ee576de93eae5135", size = 670687, upload-time = "2025-02-26T09:15:09.979Z" },
+    { url = "https://files.pythonhosted.org/packages/40/ad/2b113098e69c985a3d8fbda4b902778eae4a35b7d5188859b4a63d30c161/safetensors-0.5.3-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:37f1521be045e56fc2b54c606d4455573e717b2d887c579ee1dbba5f868ece04", size = 643147, upload-time = "2025-02-26T09:15:11.185Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/0c/95aeb51d4246bd9a3242d3d8349c1112b4ee7611a4b40f0c5c93b05f001d/safetensors-0.5.3-cp38-abi3-win32.whl", hash = "sha256:cfc0ec0846dcf6763b0ed3d1846ff36008c6e7290683b61616c4b040f6a54ace", size = 296677, upload-time = "2025-02-26T09:15:16.554Z" },
+    { url = "https://files.pythonhosted.org/packages/69/e2/b011c38e5394c4c18fb5500778a55ec43ad6106126e74723ffaee246f56e/safetensors-0.5.3-cp38-abi3-win_amd64.whl", hash = "sha256:836cbbc320b47e80acd40e44c8682db0e8ad7123209f69b093def21ec7cafd11", size = 308878, upload-time = "2025-02-26T09:15:14.99Z" },
 ]
 
 [[package]]
@@ -3864,22 +3864,22 @@ source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "huggingface-hub" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/92/76/5ac0c97f1117b91b7eb7323dcd61af80d72f790b4df71249a7850c195f30/tokenizers-0.21.1.tar.gz", hash = "sha256:a1bb04dc5b448985f86ecd4b05407f5a8d97cb2c0532199b2a302a604a0165ab", size = 343256 }
+sdist = { url = "https://files.pythonhosted.org/packages/92/76/5ac0c97f1117b91b7eb7323dcd61af80d72f790b4df71249a7850c195f30/tokenizers-0.21.1.tar.gz", hash = "sha256:a1bb04dc5b448985f86ecd4b05407f5a8d97cb2c0532199b2a302a604a0165ab", size = 343256, upload-time = "2025-03-13T10:51:18.189Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/a5/1f/328aee25f9115bf04262e8b4e5a2050b7b7cf44b59c74e982db7270c7f30/tokenizers-0.21.1-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:e78e413e9e668ad790a29456e677d9d3aa50a9ad311a40905d6861ba7692cf41", size = 2780767 },
-    { url = "https://files.pythonhosted.org/packages/ae/1a/4526797f3719b0287853f12c5ad563a9be09d446c44ac784cdd7c50f76ab/tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:cd51cd0a91ecc801633829fcd1fda9cf8682ed3477c6243b9a095539de4aecf3", size = 2650555 },
-    { url = "https://files.pythonhosted.org/packages/4d/7a/a209b29f971a9fdc1da86f917fe4524564924db50d13f0724feed37b2a4d/tokenizers-0.21.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28da6b72d4fb14ee200a1bd386ff74ade8992d7f725f2bde2c495a9a98cf4d9f", size = 2937541 },
-    { url = "https://files.pythonhosted.org/packages/3c/1e/b788b50ffc6191e0b1fc2b0d49df8cff16fe415302e5ceb89f619d12c5bc/tokenizers-0.21.1-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:34d8cfde551c9916cb92014e040806122295a6800914bab5865deb85623931cf", size = 2819058 },
-    { url = "https://files.pythonhosted.org/packages/36/aa/3626dfa09a0ecc5b57a8c58eeaeb7dd7ca9a37ad9dd681edab5acd55764c/tokenizers-0.21.1-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aaa852d23e125b73d283c98f007e06d4595732104b65402f46e8ef24b588d9f8", size = 3133278 },
-    { url = "https://files.pythonhosted.org/packages/a4/4d/8fbc203838b3d26269f944a89459d94c858f5b3f9a9b6ee9728cdcf69161/tokenizers-0.21.1-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a21a15d5c8e603331b8a59548bbe113564136dc0f5ad8306dd5033459a226da0", size = 3144253 },
-    { url = "https://files.pythonhosted.org/packages/d8/1b/2bd062adeb7c7511b847b32e356024980c0ffcf35f28947792c2d8ad2288/tokenizers-0.21.1-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2fdbd4c067c60a0ac7eca14b6bd18a5bebace54eb757c706b47ea93204f7a37c", size = 3398225 },
-    { url = "https://files.pythonhosted.org/packages/8a/63/38be071b0c8e06840bc6046991636bcb30c27f6bb1e670f4f4bc87cf49cc/tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2dd9a0061e403546f7377df940e866c3e678d7d4e9643d0461ea442b4f89e61a", size = 3038874 },
-    { url = "https://files.pythonhosted.org/packages/ec/83/afa94193c09246417c23a3c75a8a0a96bf44ab5630a3015538d0c316dd4b/tokenizers-0.21.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:db9484aeb2e200c43b915a1a0150ea885e35f357a5a8fabf7373af333dcc8dbf", size = 9014448 },
-    { url = "https://files.pythonhosted.org/packages/ae/b3/0e1a37d4f84c0f014d43701c11eb8072704f6efe8d8fc2dcdb79c47d76de/tokenizers-0.21.1-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:ed248ab5279e601a30a4d67bdb897ecbe955a50f1e7bb62bd99f07dd11c2f5b6", size = 8937877 },
-    { url = "https://files.pythonhosted.org/packages/ac/33/ff08f50e6d615eb180a4a328c65907feb6ded0b8f990ec923969759dc379/tokenizers-0.21.1-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:9ac78b12e541d4ce67b4dfd970e44c060a2147b9b2a21f509566d556a509c67d", size = 9186645 },
-    { url = "https://files.pythonhosted.org/packages/5f/aa/8ae85f69a9f6012c6f8011c6f4aa1c96154c816e9eea2e1b758601157833/tokenizers-0.21.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:e5a69c1a4496b81a5ee5d2c1f3f7fbdf95e90a0196101b0ee89ed9956b8a168f", size = 9384380 },
-    { url = "https://files.pythonhosted.org/packages/e8/5b/a5d98c89f747455e8b7a9504910c865d5e51da55e825a7ae641fb5ff0a58/tokenizers-0.21.1-cp39-abi3-win32.whl", hash = "sha256:1039a3a5734944e09de1d48761ade94e00d0fa760c0e0551151d4dd851ba63e3", size = 2239506 },
-    { url = "https://files.pythonhosted.org/packages/e6/b6/072a8e053ae600dcc2ac0da81a23548e3b523301a442a6ca900e92ac35be/tokenizers-0.21.1-cp39-abi3-win_amd64.whl", hash = "sha256:0f0dcbcc9f6e13e675a66d7a5f2f225a736745ce484c1a4e07476a89ccdad382", size = 2435481 },
+    { url = "https://files.pythonhosted.org/packages/a5/1f/328aee25f9115bf04262e8b4e5a2050b7b7cf44b59c74e982db7270c7f30/tokenizers-0.21.1-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:e78e413e9e668ad790a29456e677d9d3aa50a9ad311a40905d6861ba7692cf41", size = 2780767, upload-time = "2025-03-13T10:51:09.459Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/1a/4526797f3719b0287853f12c5ad563a9be09d446c44ac784cdd7c50f76ab/tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:cd51cd0a91ecc801633829fcd1fda9cf8682ed3477c6243b9a095539de4aecf3", size = 2650555, upload-time = "2025-03-13T10:51:07.692Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/7a/a209b29f971a9fdc1da86f917fe4524564924db50d13f0724feed37b2a4d/tokenizers-0.21.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28da6b72d4fb14ee200a1bd386ff74ade8992d7f725f2bde2c495a9a98cf4d9f", size = 2937541, upload-time = "2025-03-13T10:50:56.679Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/1e/b788b50ffc6191e0b1fc2b0d49df8cff16fe415302e5ceb89f619d12c5bc/tokenizers-0.21.1-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:34d8cfde551c9916cb92014e040806122295a6800914bab5865deb85623931cf", size = 2819058, upload-time = "2025-03-13T10:50:59.525Z" },
+    { url = "https://files.pythonhosted.org/packages/36/aa/3626dfa09a0ecc5b57a8c58eeaeb7dd7ca9a37ad9dd681edab5acd55764c/tokenizers-0.21.1-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aaa852d23e125b73d283c98f007e06d4595732104b65402f46e8ef24b588d9f8", size = 3133278, upload-time = "2025-03-13T10:51:04.678Z" },
+    { url = "https://files.pythonhosted.org/packages/a4/4d/8fbc203838b3d26269f944a89459d94c858f5b3f9a9b6ee9728cdcf69161/tokenizers-0.21.1-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a21a15d5c8e603331b8a59548bbe113564136dc0f5ad8306dd5033459a226da0", size = 3144253, upload-time = "2025-03-13T10:51:01.261Z" },
+    { url = "https://files.pythonhosted.org/packages/d8/1b/2bd062adeb7c7511b847b32e356024980c0ffcf35f28947792c2d8ad2288/tokenizers-0.21.1-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2fdbd4c067c60a0ac7eca14b6bd18a5bebace54eb757c706b47ea93204f7a37c", size = 3398225, upload-time = "2025-03-13T10:51:03.243Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/63/38be071b0c8e06840bc6046991636bcb30c27f6bb1e670f4f4bc87cf49cc/tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2dd9a0061e403546f7377df940e866c3e678d7d4e9643d0461ea442b4f89e61a", size = 3038874, upload-time = "2025-03-13T10:51:06.235Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/83/afa94193c09246417c23a3c75a8a0a96bf44ab5630a3015538d0c316dd4b/tokenizers-0.21.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:db9484aeb2e200c43b915a1a0150ea885e35f357a5a8fabf7373af333dcc8dbf", size = 9014448, upload-time = "2025-03-13T10:51:10.927Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/b3/0e1a37d4f84c0f014d43701c11eb8072704f6efe8d8fc2dcdb79c47d76de/tokenizers-0.21.1-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:ed248ab5279e601a30a4d67bdb897ecbe955a50f1e7bb62bd99f07dd11c2f5b6", size = 8937877, upload-time = "2025-03-13T10:51:12.688Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/33/ff08f50e6d615eb180a4a328c65907feb6ded0b8f990ec923969759dc379/tokenizers-0.21.1-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:9ac78b12e541d4ce67b4dfd970e44c060a2147b9b2a21f509566d556a509c67d", size = 9186645, upload-time = "2025-03-13T10:51:14.723Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/aa/8ae85f69a9f6012c6f8011c6f4aa1c96154c816e9eea2e1b758601157833/tokenizers-0.21.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:e5a69c1a4496b81a5ee5d2c1f3f7fbdf95e90a0196101b0ee89ed9956b8a168f", size = 9384380, upload-time = "2025-03-13T10:51:16.526Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/5b/a5d98c89f747455e8b7a9504910c865d5e51da55e825a7ae641fb5ff0a58/tokenizers-0.21.1-cp39-abi3-win32.whl", hash = "sha256:1039a3a5734944e09de1d48761ade94e00d0fa760c0e0551151d4dd851ba63e3", size = 2239506, upload-time = "2025-03-13T10:51:20.643Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/b6/072a8e053ae600dcc2ac0da81a23548e3b523301a442a6ca900e92ac35be/tokenizers-0.21.1-cp39-abi3-win_amd64.whl", hash = "sha256:0f0dcbcc9f6e13e675a66d7a5f2f225a736745ce484c1a4e07476a89ccdad382", size = 2435481, upload-time = "2025-03-13T10:51:19.243Z" },
 ]
 
 [[package]]
@@ -4108,9 +4108,9 @@ dependencies = [
     { name = "tokenizers" },
     { name = "tqdm" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/c0/29/37877123d6633a188997d75dc17d6f526745d63361794348ce748db23d49/transformers-4.50.3.tar.gz", hash = "sha256:1d795d24925e615a8e63687d077e4f7348c2702eb87032286eaa76d83cdc684f", size = 8774363 }
+sdist = { url = "https://files.pythonhosted.org/packages/c0/29/37877123d6633a188997d75dc17d6f526745d63361794348ce748db23d49/transformers-4.50.3.tar.gz", hash = "sha256:1d795d24925e615a8e63687d077e4f7348c2702eb87032286eaa76d83cdc684f", size = 8774363, upload-time = "2025-03-28T18:21:02.878Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/aa/22/733a6fc4a6445d835242f64c490fdd30f4a08d58f2b788613de3f9170692/transformers-4.50.3-py3-none-any.whl", hash = "sha256:6111610a43dec24ef32c3df0632c6b25b07d9711c01d9e1077bdd2ff6b14a38c", size = 10180411 },
+    { url = "https://files.pythonhosted.org/packages/aa/22/733a6fc4a6445d835242f64c490fdd30f4a08d58f2b788613de3f9170692/transformers-4.50.3-py3-none-any.whl", hash = "sha256:6111610a43dec24ef32c3df0632c6b25b07d9711c01d9e1077bdd2ff6b14a38c", size = 10180411, upload-time = "2025-03-28T18:20:59.265Z" },
 ]
 
 [[package]]

From a5d14749a57d103518b5b82188f69eb5e3ea101d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?= 
Date: Mon, 12 May 2025 21:45:35 +0200
Subject: [PATCH 07/11] chore: rehydrate requirements.txt (#2146)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# What does this PR do?

Hiccup with 0.2.6 bot release?

Signed-off-by: Sébastien Han 
---
 requirements.txt | 137 -----------------------------------------------
 1 file changed, 137 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index d5f69cd45..1a755bae0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,206 +1,69 @@
 # This file was autogenerated by uv via the following command:
 #    uv export --frozen --no-hashes --no-emit-project --output-file=requirements.txt
 annotated-types==0.7.0
-    # via pydantic
 anyio==4.8.0
-    # via
-    #   httpx
-    #   llama-stack-client
-    #   openai
 attrs==25.1.0
-    # via
-    #   jsonschema
-    #   referencing
 blobfile==3.0.0
-    # via llama-stack
 cachetools==5.5.2
-    # via google-auth
 certifi==2025.1.31
-    # via
-    #   httpcore
-    #   httpx
-    #   kubernetes
-    #   requests
 charset-normalizer==3.4.1
-    # via requests
 click==8.1.8
-    # via llama-stack-client
 colorama==0.4.6 ; sys_platform == 'win32'
-    # via
-    #   click
-    #   tqdm
 distro==1.9.0
-    # via
-    #   llama-stack-client
-    #   openai
 durationpy==0.9
-    # via kubernetes
 exceptiongroup==1.2.2 ; python_full_version < '3.11'
-    # via anyio
 filelock==3.17.0
-    # via
-    #   blobfile
-    #   huggingface-hub
 fire==0.7.0
-    # via llama-stack
 fsspec==2024.12.0
-    # via huggingface-hub
 google-auth==2.38.0
-    # via kubernetes
 h11==0.16.0
-    # via
-    #   httpcore
-    #   llama-stack
 httpcore==1.0.9
-    # via httpx
 httpx==0.28.1
-    # via
-    #   llama-stack
-    #   llama-stack-client
-    #   openai
 huggingface-hub==0.29.0
-    # via llama-stack
 idna==3.10
-    # via
-    #   anyio
-    #   httpx
-    #   requests
 jinja2==3.1.6
-    # via llama-stack
 jiter==0.8.2
-    # via openai
 jsonschema==4.23.0
-    # via llama-stack
 jsonschema-specifications==2024.10.1
-    # via jsonschema
 kubernetes==32.0.1
-    # via llama-stack
 llama-stack-client==0.2.6
-    # via llama-stack
 lxml==5.3.1
-    # via blobfile
 markdown-it-py==3.0.0
-    # via rich
 markupsafe==3.0.2
-    # via jinja2
 mdurl==0.1.2
-    # via markdown-it-py
 numpy==2.2.3
-    # via pandas
 oauthlib==3.2.2
-    # via
-    #   kubernetes
-    #   requests-oauthlib
 openai==1.71.0
-    # via llama-stack
 packaging==24.2
-    # via huggingface-hub
 pandas==2.2.3
-    # via llama-stack-client
 pillow==11.1.0
-    # via llama-stack
 prompt-toolkit==3.0.50
-    # via
-    #   llama-stack
-    #   llama-stack-client
 pyaml==25.1.0
-    # via llama-stack-client
 pyasn1==0.6.1
-    # via
-    #   pyasn1-modules
-    #   rsa
 pyasn1-modules==0.4.2
-    # via google-auth
 pycryptodomex==3.21.0
-    # via blobfile
 pydantic==2.10.6
-    # via
-    #   llama-stack
-    #   llama-stack-client
-    #   openai
 pydantic-core==2.27.2
-    # via pydantic
 pygments==2.19.1
-    # via rich
 python-dateutil==2.9.0.post0
-    # via
-    #   kubernetes
-    #   pandas
 python-dotenv==1.0.1
-    # via llama-stack
 pytz==2025.1
-    # via pandas
 pyyaml==6.0.2
-    # via
-    #   huggingface-hub
-    #   kubernetes
-    #   pyaml
 referencing==0.36.2
-    # via
-    #   jsonschema
-    #   jsonschema-specifications
 regex==2024.11.6
-    # via tiktoken
 requests==2.32.3
-    # via
-    #   huggingface-hub
-    #   kubernetes
-    #   llama-stack
-    #   requests-oauthlib
-    #   tiktoken
 requests-oauthlib==2.0.0
-    # via kubernetes
 rich==13.9.4
-    # via
-    #   llama-stack
-    #   llama-stack-client
 rpds-py==0.22.3
-    # via
-    #   jsonschema
-    #   referencing
 rsa==4.9
-    # via google-auth
 setuptools==75.8.0
-    # via llama-stack
 six==1.17.0
-    # via
-    #   kubernetes
-    #   python-dateutil
 sniffio==1.3.1
-    # via
-    #   anyio
-    #   llama-stack-client
-    #   openai
 termcolor==2.5.0
-    # via
-    #   fire
-    #   llama-stack
-    #   llama-stack-client
 tiktoken==0.9.0
-    # via llama-stack
 tqdm==4.67.1
-    # via
-    #   huggingface-hub
-    #   llama-stack-client
-    #   openai
 typing-extensions==4.12.2
-    # via
-    #   anyio
-    #   huggingface-hub
-    #   llama-stack-client
-    #   openai
-    #   pydantic
-    #   pydantic-core
-    #   referencing
-    #   rich
 tzdata==2025.1
-    # via pandas
 urllib3==2.3.0
-    # via
-    #   blobfile
-    #   kubernetes
-    #   requests
 wcwidth==0.2.13
-    # via prompt-toolkit
 websocket-client==1.8.0
-    # via kubernetes

From e3ad17ec5e2e10b668aa9288d02b92319085ffaa Mon Sep 17 00:00:00 2001
From: grs 
Date: Mon, 12 May 2025 17:08:36 -0400
Subject: [PATCH 08/11] feat: enable mutual tls (#2140)

# What does this PR do?
This adds a config option for specifying a CA against which client
certificates are verified. If a CA is specified, client certificates are
required. This offers a simple way of securing access to the server.

(Note: at present it is not possible to access the details of the client
certificate using uvicorn unless it is monkey-patched. Though there
is a defined TLS extension for ASGI, it is not implemented in uvicorn
pending a review and likely change to the specification. See
https://github.com/encode/uvicorn/pull/1119 and
https://github.com/django/asgiref/issues/466. Without access to the DN
it isn't possible to set user access attributes for a mutually
authenticated TLS connection, so more fine-grained access control is
not yet possible.)

[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])

## Test Plan
Used the proposed config option to specify a CA and verified that the server
can only be accessed with a valid client certificate.
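
For reference, a minimal sketch of exercising the new option from the client
side. This assumes the server was started with `tls_certfile`, `tls_keyfile`,
and the new `tls_cafile` set; the file names, port, and route below are
illustrative, not part of this change.

```
import httpx

# Hypothetical paths and port; adjust to your run.yaml server settings.
client = httpx.Client(
    base_url="https://localhost:8321",
    verify="ca.crt",                    # CA used to verify the server certificate
    cert=("client.crt", "client.key"),  # client certificate/key for mutual TLS
)

# With a valid client certificate the request succeeds; without the `cert`
# pair the TLS handshake is rejected, since the server now sets
# ssl_cert_reqs=CERT_REQUIRED whenever tls_cafile is configured.
resp = client.get("/v1/models")
print(resp.status_code)
```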

[//]: # (## Documentation)

Signed-off-by: Gordon Sim 
---
 llama_stack/distribution/datatypes.py     |  4 ++++
 llama_stack/distribution/server/server.py | 10 +++++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py
index 7de009b87..d36e21c6d 100644
--- a/llama_stack/distribution/datatypes.py
+++ b/llama_stack/distribution/datatypes.py
@@ -249,6 +249,10 @@ class ServerConfig(BaseModel):
         default=None,
         description="Path to TLS key file for HTTPS",
     )
+    tls_cafile: str | None = Field(
+        default=None,
+        description="Path to TLS CA file for HTTPS with mutual TLS authentication",
+    )
     auth: AuthenticationConfig | None = Field(
         default=None,
         description="Authentication configuration for the server",
diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py
index e34a62b00..32046d2b1 100644
--- a/llama_stack/distribution/server/server.py
+++ b/llama_stack/distribution/server/server.py
@@ -9,6 +9,7 @@ import asyncio
 import inspect
 import json
 import os
+import ssl
 import sys
 import traceback
 import warnings
@@ -484,7 +485,14 @@ def main(args: argparse.Namespace | None = None):
             "ssl_keyfile": keyfile,
             "ssl_certfile": certfile,
         }
-        logger.info(f"HTTPS enabled with certificates:\n  Key: {keyfile}\n  Cert: {certfile}")
+        if config.server.tls_cafile:
+            ssl_config["ssl_ca_certs"] = config.server.tls_cafile
+            ssl_config["ssl_cert_reqs"] = ssl.CERT_REQUIRED
+            logger.info(
+                f"HTTPS enabled with certificates:\n  Key: {keyfile}\n  Cert: {certfile}\n  CA: {config.server.tls_cafile}"
+            )
+        else:
+            logger.info(f"HTTPS enabled with certificates:\n  Key: {keyfile}\n  Cert: {certfile}")
 
     listen_host = ["::", "0.0.0.0"] if not config.server.disable_ipv6 else "0.0.0.0"
     logger.info(f"Listening on {listen_host}:{port}")

From 62476a5373e096d5e795866088ed756cb75b70b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?= 
Date: Tue, 13 May 2025 20:27:29 +0200
Subject: [PATCH 09/11] fix: pytest reports (#2152)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# What does this PR do?

While adding other tests, I came across this and wasn’t sure how useful
it is. It doesn’t seem to be exercised anywhere in CI, but I figured I’d
fix it anyway. Happy to remove it if preferred. :)

## Test Plan

Run:

```
uv run pytest tests/integration/inference --stack-config=ollama --report=test_report.md -v --text-model="llama3.2:3b" --embedding-model=all-MiniLM-L6-v2
```

Look at the produced `test_report.md`.

Signed-off-by: Sébastien Han 
---
 tests/integration/report.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tests/integration/report.py b/tests/integration/report.py
index a50f51d3f..97543fa9d 100644
--- a/tests/integration/report.py
+++ b/tests/integration/report.py
@@ -6,6 +6,7 @@
 
 
 from collections import defaultdict
+from pathlib import Path
 
 import pytest
 from pytest import CollectReport
@@ -65,6 +66,7 @@ class Report:
     def __init__(self, config):
         self.distro_name = None
         self.config = config
+        self.output_path = Path(config.getoption("--report")) if config.getoption("--report") else None
 
         stack_config = self.config.getoption("--stack-config")
         if stack_config:
@@ -161,7 +163,7 @@ class Report:
                 "|:-----|:-----|:-----|:-----|:-----|",
             ]
             provider = [p for p in providers if p.api == str(api_group.name)]
-            provider_str = ",".join(provider) if provider else ""
+            provider_str = ",".join(str(p) for p in provider) if provider else ""
             for api, capa_map in API_MAPS[api_group].items():
                 for capa, tests in capa_map.items():
                     for test_name in tests:
@@ -184,10 +186,12 @@ class Report:
 
         # Get values from fixtures for report output
         if model_id := item.funcargs.get("text_model_id"):
-            text_model = model_id.split("/")[1]
+            parts = model_id.split("/")
+            text_model = parts[1] if len(parts) > 1 else model_id
             self.text_model_id = self.text_model_id or text_model
         elif model_id := item.funcargs.get("vision_model_id"):
-            vision_model = model_id.split("/")[1]
+            parts = model_id.split("/")
+            vision_model = parts[1] if len(parts) > 1 else model_id
             self.vision_model_id = self.vision_model_id or vision_model
 
         if not self.client:

From e0d10dd0b1e865119497ca134664224f2e3538af Mon Sep 17 00:00:00 2001
From: Nathan Weinberg <31703736+nathan-weinberg@users.noreply.github.com>
Date: Tue, 13 May 2025 14:28:29 -0400
Subject: [PATCH 10/11] docs: revamp testing documentation (#2155)

# What does this PR do?
Reduces duplication and centralizes testing information so it is easier for
contributors to find.

Signed-off-by: Nathan Weinberg 
---
 CONTRIBUTING.md                              | 26 ++------------------
 docs/source/contributing/new_api_provider.md |  3 ++-
 tests/README.md                              |  9 +++++++
 tests/integration/README.md                  |  2 +-
 tests/unit/README.md                         | 21 ++++++++++++++++
 tests/verifications/README.md                |  2 +-
 6 files changed, 36 insertions(+), 27 deletions(-)
 create mode 100644 tests/README.md
 create mode 100644 tests/unit/README.md

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index bd63b31d4..d7c3e3e2f 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -110,31 +110,9 @@ uv run pre-commit run --all-files
 > [!CAUTION]
 > Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
 
-## Running unit tests
+## Running tests
 
-You can run the unit tests by running:
-
-```bash
-source .venv/bin/activate
-./scripts/unit-tests.sh [PYTEST_ARGS]
-```
-
-Any additional arguments are passed to pytest. For example, you can specify a test directory, a specific test file, or any pytest flags (e.g., -vvv for verbosity). If no test directory is specified, it defaults to "tests/unit", e.g:
-
-```bash
-./scripts/unit-tests.sh tests/unit/registry/test_registry.py -vvv
-```
-
-If you'd like to run for a non-default version of Python (currently 3.10), pass `PYTHON_VERSION` variable as follows:
-
-```
-source .venv/bin/activate
-PYTHON_VERSION=3.13 ./scripts/unit-tests.sh
-```
-
-## Running integration tests
-
-You can run integration tests following the instructions [here](tests/integration/README.md).
+You can find the Llama Stack testing documentation [here](tests/README.md).
 
 ## Adding a new dependency to the project
 
diff --git a/docs/source/contributing/new_api_provider.md b/docs/source/contributing/new_api_provider.md
index c412a350b..83058896a 100644
--- a/docs/source/contributing/new_api_provider.md
+++ b/docs/source/contributing/new_api_provider.md
@@ -6,7 +6,7 @@ This guide will walk you through the process of adding a new API provider to Lla
 - Begin by reviewing the [core concepts](../concepts/index.md) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.)
 - Determine the provider type ({repopath}`Remote::llama_stack/providers/remote` or {repopath}`Inline::llama_stack/providers/inline`). Remote providers make requests to external services, while inline providers execute implementation locally.
 - Add your provider to the appropriate {repopath}`Registry::llama_stack/providers/registry/`. Specify pip dependencies necessary.
-- Update any distribution {repopath}`Templates::llama_stack/templates/` build.yaml and run.yaml files if they should include your provider by default. Run {repopath}`./scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.
+- Update any distribution {repopath}`Templates::llama_stack/templates/` `build.yaml` and `run.yaml` files if they should include your provider by default. Run {repopath}`./scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.
 
 
 Here are some example PRs to help you get started:
@@ -33,6 +33,7 @@ Note that each provider's `sample_run_config()` method (in the configuration cla
 
 Unit tests are located in {repopath}`tests/unit`. Provider-specific unit tests are located in {repopath}`tests/unit/providers`. These tests are all run automatically as part of the CI process.
 
+Consult {repopath}`tests/unit/README.md` for more details on how to run the tests manually.
 
 ### 3. Additional end-to-end testing
 
diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 000000000..ed7064bfb
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,9 @@
+# Llama Stack Tests
+
+Llama Stack has multiple layers of testing done to ensure continuous functionality and prevent regressions to the codebase.
+
+| Testing Type | Details |
+|--------------|---------|
+| Unit | [unit/README.md](unit/README.md) |
+| Integration | [integration/README.md](integration/README.md) |
+| Verification | [verifications/README.md](verifications/README.md) |
diff --git a/tests/integration/README.md b/tests/integration/README.md
index 92bcf7c51..8c1ee6355 100644
--- a/tests/integration/README.md
+++ b/tests/integration/README.md
@@ -11,7 +11,7 @@ pytest --help
 Here are the most important options:
 - `--stack-config`: specify the stack config to use. You have three ways to point to a stack:
   - a URL which points to a Llama Stack distribution server
-  - a template (e.g., `fireworks`, `together`) or a path to a run.yaml file
+  - a template (e.g., `fireworks`, `together`) or a path to a `run.yaml` file
   - a comma-separated list of api=provider pairs, e.g. `inference=fireworks,safety=llama-guard,agents=meta-reference`. This is most useful for testing a single API surface.
 - `--env`: set environment variables, e.g. --env KEY=value. this is a utility option to set environment variables required by various providers.
 
diff --git a/tests/unit/README.md b/tests/unit/README.md
new file mode 100644
index 000000000..db2114049
--- /dev/null
+++ b/tests/unit/README.md
@@ -0,0 +1,21 @@
+# Llama Stack Unit Tests
+
+You can run the unit tests by running:
+
+```bash
+source .venv/bin/activate
+./scripts/unit-tests.sh [PYTEST_ARGS]
+```
+
+Any additional arguments are passed to pytest. For example, you can specify a test directory, a specific test file, or any pytest flags (e.g., -vvv for verbosity). If no test directory is specified, it defaults to "tests/unit", e.g.:
+
+```bash
+./scripts/unit-tests.sh tests/unit/registry/test_registry.py -vvv
+```
+
+If you'd like to run for a non-default version of Python (currently 3.10), pass `PYTHON_VERSION` variable as follows:
+
+```
+source .venv/bin/activate
+PYTHON_VERSION=3.13 ./scripts/unit-tests.sh
+```
diff --git a/tests/verifications/README.md b/tests/verifications/README.md
index 88762e0ba..19d122bec 100644
--- a/tests/verifications/README.md
+++ b/tests/verifications/README.md
@@ -4,7 +4,7 @@ Llama Stack Verifications provide standardized test suites to ensure API compati
 
 ## Overview
 
-This framework allows you to run the same set of verification tests against different LLM providers'  OpenAI-compatible endpoints (Fireworks, Together, Groq, Cerebras, etc., and OpenAI itself) to ensure they meet the expected behavior and interface standards.
+This framework allows you to run the same set of verification tests against different LLM providers' OpenAI-compatible endpoints (Fireworks, Together, Groq, Cerebras, etc., and OpenAI itself) to ensure they meet the expected behavior and interface standards.
 
 ## Features
 

From 8e316c9b1ed3f12117eb33eeab1e342b897230e9 Mon Sep 17 00:00:00 2001
From: Ben Browning 
Date: Tue, 13 May 2025 14:29:15 -0400
Subject: [PATCH 11/11] feat: function tools in OpenAI Responses (#2094)

# What does this PR do?

This is a combination of what was previously 3 separate PRs - #2069,
#2075, and #2083. It turns out all 3 of those are needed to land a
working function calling Responses implementation. The web search
builtin tool was already working, but this wires in support for custom
function calling.

I ended up combining all three into one PR because they all had lots of
merge conflicts, both with each other and with #1806, which just landed,
and because landing any of them individually would have left only a
partially working implementation merged.

The new things added here are:
* Storing of input items from previous responses and restoring of those
input items when adding previous responses to the conversation state
* Handling of multiple input item message roles, not just "user"
messages.
* Support for custom tools passed into the Responses API to enable
function calling beyond the builtin web search tool (a sketch of the flow
follows this list).
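
To make the new function-calling flow concrete, here is a minimal sketch
against the `/openai/v1/responses` route using the OpenAI Python client. The
base URL, model id, and the `get_weather` tool are illustrative assumptions,
not part of this PR.

```
from openai import OpenAI

# Hypothetical endpoint and model; point these at your own Llama Stack server.
client = OpenAI(base_url="http://localhost:8321/openai/v1", api_key="none")

tools = [
    {
        "type": "function",
        "name": "get_weather",
        "description": "Get the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    }
]

# First turn: the model is expected to emit a "function_call" output item.
response = client.responses.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    input="What is the weather in Paris?",
    tools=tools,
)
call = response.output[0]  # expected type: "function_call"

# Second turn: return the function result as a "function_call_output" input
# item, chained to the previous response so its stored input items are replayed.
followup = client.responses.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    previous_response_id=response.id,
    input=[
        {
            "type": "function_call_output",
            "call_id": call.call_id,
            "output": '{"temperature_c": 21, "conditions": "sunny"}',
        }
    ],
    tools=tools,
)
print(followup.output_text)
```

The second call leans on the input-item storage added here: the items from
`previous_response_id` are restored into the conversation before the new
`function_call_output` item is appended.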

Closes #2074
Closes #2080

## Test Plan

### Unit Tests

Several new unit tests were added, and they all pass. Ran via:

```
python -m pytest -s -v tests/unit/providers/agents/meta_reference/test_openai_responses.py
```

### Responses API Verification Tests

I ran our verification run.yaml against multiple providers to ensure we
were getting a decent pass rate. Specifically, I ensured the new custom
tool verification test passed across multiple providers and that the
multi-turn examples passed across at least some of the providers (some
providers struggle with the multi-turn workflows still).

Running the stack setup for verification testing:

```
llama stack run --image-type venv tests/verifications/openai-api-verification-run.yaml
```

For example, running against Together passes 100%:

```
pytest -s -v 'tests/verifications/openai_api/test_responses.py' --provider=together-llama-stack
```

## Documentation

We will need to start documenting the OpenAI APIs, but the Responses
support is still evolving rapidly, so that is deferred for now.

---------

Signed-off-by: Derek Higgins 
Signed-off-by: Ben Browning 
Co-authored-by: Derek Higgins 
Co-authored-by: Ashwin Bharambe 
---
 docs/_static/llama-stack-spec.html            | 408 +++++++++++++-----
 docs/_static/llama-stack-spec.yaml            | 280 ++++++++----
 llama_stack/apis/agents/agents.py             |   4 +-
 llama_stack/apis/agents/openai_responses.py   | 127 ++++--
 llama_stack/distribution/server/server.py     |  20 +
 .../inline/agents/meta_reference/agents.py    |   4 +-
 .../agents/meta_reference/openai_responses.py | 292 ++++++++++---
 .../meta_reference/fixtures/__init__.py       |  23 +
 .../fixtures/simple_chat_completion.yaml      |   9 +
 .../fixtures/tool_call_completion.yaml        |  14 +
 .../meta_reference/test_openai_responses.py   | 246 ++++++++---
 .../fixtures/test_cases/responses.yaml        |  20 +
 .../openai_api/test_responses.py              |  22 +
 13 files changed, 1099 insertions(+), 370 deletions(-)
 create mode 100644 tests/unit/providers/agents/meta_reference/fixtures/__init__.py
 create mode 100644 tests/unit/providers/agents/meta_reference/fixtures/simple_chat_completion.yaml
 create mode 100644 tests/unit/providers/agents/meta_reference/fixtures/tool_call_completion.yaml

diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 4020dc4cd..f1bde880b 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -6466,54 +6466,51 @@
                 ],
                 "title": "AgentTurnResponseTurnStartPayload"
             },
-            "OpenAIResponseInputMessage": {
+            "OpenAIResponseInput": {
+                "oneOf": [
+                    {
+                        "$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall"
+                    },
+                    {
+                        "$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall"
+                    },
+                    {
+                        "$ref": "#/components/schemas/OpenAIResponseInputFunctionToolCallOutput"
+                    },
+                    {
+                        "$ref": "#/components/schemas/OpenAIResponseMessage"
+                    }
+                ]
+            },
+            "OpenAIResponseInputFunctionToolCallOutput": {
                 "type": "object",
                 "properties": {
-                    "content": {
-                        "oneOf": [
-                            {
-                                "type": "string"
-                            },
-                            {
-                                "type": "array",
-                                "items": {
-                                    "$ref": "#/components/schemas/OpenAIResponseInputMessageContent"
-                                }
-                            }
-                        ]
+                    "call_id": {
+                        "type": "string"
                     },
-                    "role": {
-                        "oneOf": [
-                            {
-                                "type": "string",
-                                "const": "system"
-                            },
-                            {
-                                "type": "string",
-                                "const": "developer"
-                            },
-                            {
-                                "type": "string",
-                                "const": "user"
-                            },
-                            {
-                                "type": "string",
-                                "const": "assistant"
-                            }
-                        ]
+                    "output": {
+                        "type": "string"
                     },
                     "type": {
                         "type": "string",
-                        "const": "message",
-                        "default": "message"
+                        "const": "function_call_output",
+                        "default": "function_call_output"
+                    },
+                    "id": {
+                        "type": "string"
+                    },
+                    "status": {
+                        "type": "string"
                     }
                 },
                 "additionalProperties": false,
                 "required": [
-                    "content",
-                    "role"
+                    "call_id",
+                    "output",
+                    "type"
                 ],
-                "title": "OpenAIResponseInputMessage"
+                "title": "OpenAIResponseInputFunctionToolCallOutput",
+                "description": "This represents the output of a function call that gets passed back to the model."
             },
             "OpenAIResponseInputMessageContent": {
                 "oneOf": [
@@ -6588,6 +6585,113 @@
                 "title": "OpenAIResponseInputMessageContentText"
             },
             "OpenAIResponseInputTool": {
+                "oneOf": [
+                    {
+                        "$ref": "#/components/schemas/OpenAIResponseInputToolWebSearch"
+                    },
+                    {
+                        "$ref": "#/components/schemas/OpenAIResponseInputToolFileSearch"
+                    },
+                    {
+                        "$ref": "#/components/schemas/OpenAIResponseInputToolFunction"
+                    }
+                ],
+                "discriminator": {
+                    "propertyName": "type",
+                    "mapping": {
+                        "web_search": "#/components/schemas/OpenAIResponseInputToolWebSearch",
+                        "file_search": "#/components/schemas/OpenAIResponseInputToolFileSearch",
+                        "function": "#/components/schemas/OpenAIResponseInputToolFunction"
+                    }
+                }
+            },
+            "OpenAIResponseInputToolFileSearch": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "file_search",
+                        "default": "file_search"
+                    },
+                    "vector_store_id": {
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        }
+                    },
+                    "ranking_options": {
+                        "type": "object",
+                        "properties": {
+                            "ranker": {
+                                "type": "string"
+                            },
+                            "score_threshold": {
+                                "type": "number",
+                                "default": 0.0
+                            }
+                        },
+                        "additionalProperties": false,
+                        "title": "FileSearchRankingOptions"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "vector_store_id"
+                ],
+                "title": "OpenAIResponseInputToolFileSearch"
+            },
+            "OpenAIResponseInputToolFunction": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "function",
+                        "default": "function"
+                    },
+                    "name": {
+                        "type": "string"
+                    },
+                    "description": {
+                        "type": "string"
+                    },
+                    "parameters": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "oneOf": [
+                                {
+                                    "type": "null"
+                                },
+                                {
+                                    "type": "boolean"
+                                },
+                                {
+                                    "type": "number"
+                                },
+                                {
+                                    "type": "string"
+                                },
+                                {
+                                    "type": "array"
+                                },
+                                {
+                                    "type": "object"
+                                }
+                            ]
+                        }
+                    },
+                    "strict": {
+                        "type": "boolean"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "name"
+                ],
+                "title": "OpenAIResponseInputToolFunction"
+            },
+            "OpenAIResponseInputToolWebSearch": {
                 "type": "object",
                 "properties": {
                     "type": {
@@ -6614,6 +6718,146 @@
                 ],
                 "title": "OpenAIResponseInputToolWebSearch"
             },
+            "OpenAIResponseMessage": {
+                "type": "object",
+                "properties": {
+                    "content": {
+                        "oneOf": [
+                            {
+                                "type": "string"
+                            },
+                            {
+                                "type": "array",
+                                "items": {
+                                    "$ref": "#/components/schemas/OpenAIResponseInputMessageContent"
+                                }
+                            },
+                            {
+                                "type": "array",
+                                "items": {
+                                    "$ref": "#/components/schemas/OpenAIResponseOutputMessageContent"
+                                }
+                            }
+                        ]
+                    },
+                    "role": {
+                        "oneOf": [
+                            {
+                                "type": "string",
+                                "const": "system"
+                            },
+                            {
+                                "type": "string",
+                                "const": "developer"
+                            },
+                            {
+                                "type": "string",
+                                "const": "user"
+                            },
+                            {
+                                "type": "string",
+                                "const": "assistant"
+                            }
+                        ]
+                    },
+                    "type": {
+                        "type": "string",
+                        "const": "message",
+                        "default": "message"
+                    },
+                    "id": {
+                        "type": "string"
+                    },
+                    "status": {
+                        "type": "string"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "content",
+                    "role",
+                    "type"
+                ],
+                "title": "OpenAIResponseMessage",
+                "description": "Corresponds to the various Message types in the Responses API. They are all under one type because the Responses API gives them all the same \"type\" value, and there is no way to tell them apart in certain scenarios."
+            },
+            "OpenAIResponseOutputMessageContent": {
+                "type": "object",
+                "properties": {
+                    "text": {
+                        "type": "string"
+                    },
+                    "type": {
+                        "type": "string",
+                        "const": "output_text",
+                        "default": "output_text"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "text",
+                    "type"
+                ],
+                "title": "OpenAIResponseOutputMessageContentOutputText"
+            },
+            "OpenAIResponseOutputMessageFunctionToolCall": {
+                "type": "object",
+                "properties": {
+                    "arguments": {
+                        "type": "string"
+                    },
+                    "call_id": {
+                        "type": "string"
+                    },
+                    "name": {
+                        "type": "string"
+                    },
+                    "type": {
+                        "type": "string",
+                        "const": "function_call",
+                        "default": "function_call"
+                    },
+                    "id": {
+                        "type": "string"
+                    },
+                    "status": {
+                        "type": "string"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "arguments",
+                    "call_id",
+                    "name",
+                    "type",
+                    "id",
+                    "status"
+                ],
+                "title": "OpenAIResponseOutputMessageFunctionToolCall"
+            },
+            "OpenAIResponseOutputMessageWebSearchToolCall": {
+                "type": "object",
+                "properties": {
+                    "id": {
+                        "type": "string"
+                    },
+                    "status": {
+                        "type": "string"
+                    },
+                    "type": {
+                        "type": "string",
+                        "const": "web_search_call",
+                        "default": "web_search_call"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "id",
+                    "status",
+                    "type"
+                ],
+                "title": "OpenAIResponseOutputMessageWebSearchToolCall"
+            },
             "CreateOpenaiResponseRequest": {
                 "type": "object",
                 "properties": {
@@ -6625,7 +6869,7 @@
                             {
                                 "type": "array",
                                 "items": {
-                                    "$ref": "#/components/schemas/OpenAIResponseInputMessage"
+                                    "$ref": "#/components/schemas/OpenAIResponseInput"
                                 }
                             }
                         ],
@@ -6743,98 +6987,24 @@
             "OpenAIResponseOutput": {
                 "oneOf": [
                     {
-                        "$ref": "#/components/schemas/OpenAIResponseOutputMessage"
+                        "$ref": "#/components/schemas/OpenAIResponseMessage"
                     },
                     {
                         "$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall"
+                    },
+                    {
+                        "$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall"
                     }
                 ],
                 "discriminator": {
                     "propertyName": "type",
                     "mapping": {
-                        "message": "#/components/schemas/OpenAIResponseOutputMessage",
-                        "web_search_call": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall"
+                        "message": "#/components/schemas/OpenAIResponseMessage",
+                        "web_search_call": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall",
+                        "function_call": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall"
                     }
                 }
             },
-            "OpenAIResponseOutputMessage": {
-                "type": "object",
-                "properties": {
-                    "id": {
-                        "type": "string"
-                    },
-                    "content": {
-                        "type": "array",
-                        "items": {
-                            "$ref": "#/components/schemas/OpenAIResponseOutputMessageContent"
-                        }
-                    },
-                    "role": {
-                        "type": "string",
-                        "const": "assistant",
-                        "default": "assistant"
-                    },
-                    "status": {
-                        "type": "string"
-                    },
-                    "type": {
-                        "type": "string",
-                        "const": "message",
-                        "default": "message"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "id",
-                    "content",
-                    "role",
-                    "status",
-                    "type"
-                ],
-                "title": "OpenAIResponseOutputMessage"
-            },
-            "OpenAIResponseOutputMessageContent": {
-                "type": "object",
-                "properties": {
-                    "text": {
-                        "type": "string"
-                    },
-                    "type": {
-                        "type": "string",
-                        "const": "output_text",
-                        "default": "output_text"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "text",
-                    "type"
-                ],
-                "title": "OpenAIResponseOutputMessageContentOutputText"
-            },
-            "OpenAIResponseOutputMessageWebSearchToolCall": {
-                "type": "object",
-                "properties": {
-                    "id": {
-                        "type": "string"
-                    },
-                    "status": {
-                        "type": "string"
-                    },
-                    "type": {
-                        "type": "string",
-                        "const": "web_search_call",
-                        "default": "web_search_call"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "id",
-                    "status",
-                    "type"
-                ],
-                "title": "OpenAIResponseOutputMessageWebSearchToolCall"
-            },
             "OpenAIResponseObjectStream": {
                 "oneOf": [
                     {
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 62e3ca85c..10b5deec2 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -4534,34 +4534,37 @@ components:
         - event_type
         - turn_id
       title: AgentTurnResponseTurnStartPayload
-    OpenAIResponseInputMessage:
+    OpenAIResponseInput:
+      oneOf:
+        - $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
+        - $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
+        - $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput'
+        - $ref: '#/components/schemas/OpenAIResponseMessage'
+    "OpenAIResponseInputFunctionToolCallOutput":
       type: object
       properties:
-        content:
-          oneOf:
-            - type: string
-            - type: array
-              items:
-                $ref: '#/components/schemas/OpenAIResponseInputMessageContent'
-        role:
-          oneOf:
-            - type: string
-              const: system
-            - type: string
-              const: developer
-            - type: string
-              const: user
-            - type: string
-              const: assistant
+        call_id:
+          type: string
+        output:
+          type: string
         type:
           type: string
-          const: message
-          default: message
+          const: function_call_output
+          default: function_call_output
+        id:
+          type: string
+        status:
+          type: string
       additionalProperties: false
       required:
-        - content
-        - role
-      title: OpenAIResponseInputMessage
+        - call_id
+        - output
+        - type
+      title: >-
+        OpenAIResponseInputFunctionToolCallOutput
+      description: >-
+        This represents the output of a function call that gets passed back to the
+        model.
     OpenAIResponseInputMessageContent:
       oneOf:
         - $ref: '#/components/schemas/OpenAIResponseInputMessageContentText'
@@ -4609,6 +4612,71 @@ components:
         - type
       title: OpenAIResponseInputMessageContentText
     OpenAIResponseInputTool:
+      oneOf:
+        - $ref: '#/components/schemas/OpenAIResponseInputToolWebSearch'
+        - $ref: '#/components/schemas/OpenAIResponseInputToolFileSearch'
+        - $ref: '#/components/schemas/OpenAIResponseInputToolFunction'
+      discriminator:
+        propertyName: type
+        mapping:
+          web_search: '#/components/schemas/OpenAIResponseInputToolWebSearch'
+          file_search: '#/components/schemas/OpenAIResponseInputToolFileSearch'
+          function: '#/components/schemas/OpenAIResponseInputToolFunction'
+    OpenAIResponseInputToolFileSearch:
+      type: object
+      properties:
+        type:
+          type: string
+          const: file_search
+          default: file_search
+        vector_store_id:
+          type: array
+          items:
+            type: string
+        ranking_options:
+          type: object
+          properties:
+            ranker:
+              type: string
+            score_threshold:
+              type: number
+              default: 0.0
+          additionalProperties: false
+          title: FileSearchRankingOptions
+      additionalProperties: false
+      required:
+        - type
+        - vector_store_id
+      title: OpenAIResponseInputToolFileSearch
+    OpenAIResponseInputToolFunction:
+      type: object
+      properties:
+        type:
+          type: string
+          const: function
+          default: function
+        name:
+          type: string
+        description:
+          type: string
+        parameters:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+        strict:
+          type: boolean
+      additionalProperties: false
+      required:
+        - type
+        - name
+      title: OpenAIResponseInputToolFunction
+    OpenAIResponseInputToolWebSearch:
       type: object
       properties:
         type:
@@ -4625,6 +4693,106 @@ components:
       required:
         - type
       title: OpenAIResponseInputToolWebSearch
+    OpenAIResponseMessage:
+      type: object
+      properties:
+        content:
+          oneOf:
+            - type: string
+            - type: array
+              items:
+                $ref: '#/components/schemas/OpenAIResponseInputMessageContent'
+            - type: array
+              items:
+                $ref: '#/components/schemas/OpenAIResponseOutputMessageContent'
+        role:
+          oneOf:
+            - type: string
+              const: system
+            - type: string
+              const: developer
+            - type: string
+              const: user
+            - type: string
+              const: assistant
+        type:
+          type: string
+          const: message
+          default: message
+        id:
+          type: string
+        status:
+          type: string
+      additionalProperties: false
+      required:
+        - content
+        - role
+        - type
+      title: OpenAIResponseMessage
+      description: >-
+        Corresponds to the various Message types in the Responses API. They are all
+        under one type because the Responses API gives them all the same "type" value,
+        and there is no way to tell them apart in certain scenarios.
+    OpenAIResponseOutputMessageContent:
+      type: object
+      properties:
+        text:
+          type: string
+        type:
+          type: string
+          const: output_text
+          default: output_text
+      additionalProperties: false
+      required:
+        - text
+        - type
+      title: >-
+        OpenAIResponseOutputMessageContentOutputText
+    "OpenAIResponseOutputMessageFunctionToolCall":
+      type: object
+      properties:
+        arguments:
+          type: string
+        call_id:
+          type: string
+        name:
+          type: string
+        type:
+          type: string
+          const: function_call
+          default: function_call
+        id:
+          type: string
+        status:
+          type: string
+      additionalProperties: false
+      required:
+        - arguments
+        - call_id
+        - name
+        - type
+        - id
+        - status
+      title: >-
+        OpenAIResponseOutputMessageFunctionToolCall
+    "OpenAIResponseOutputMessageWebSearchToolCall":
+      type: object
+      properties:
+        id:
+          type: string
+        status:
+          type: string
+        type:
+          type: string
+          const: web_search_call
+          default: web_search_call
+      additionalProperties: false
+      required:
+        - id
+        - status
+        - type
+      title: >-
+        OpenAIResponseOutputMessageWebSearchToolCall
     CreateOpenaiResponseRequest:
       type: object
       properties:
@@ -4633,7 +4801,7 @@ components:
             - type: string
             - type: array
               items:
-                $ref: '#/components/schemas/OpenAIResponseInputMessage'
+                $ref: '#/components/schemas/OpenAIResponseInput'
           description: Input message(s) to create the response.
         model:
           type: string
@@ -4717,73 +4885,15 @@ components:
       title: OpenAIResponseObject
     OpenAIResponseOutput:
       oneOf:
-        - $ref: '#/components/schemas/OpenAIResponseOutputMessage'
+        - $ref: '#/components/schemas/OpenAIResponseMessage'
         - $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
+        - $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
       discriminator:
         propertyName: type
         mapping:
-          message: '#/components/schemas/OpenAIResponseOutputMessage'
+          message: '#/components/schemas/OpenAIResponseMessage'
           web_search_call: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
-    OpenAIResponseOutputMessage:
-      type: object
-      properties:
-        id:
-          type: string
-        content:
-          type: array
-          items:
-            $ref: '#/components/schemas/OpenAIResponseOutputMessageContent'
-        role:
-          type: string
-          const: assistant
-          default: assistant
-        status:
-          type: string
-        type:
-          type: string
-          const: message
-          default: message
-      additionalProperties: false
-      required:
-        - id
-        - content
-        - role
-        - status
-        - type
-      title: OpenAIResponseOutputMessage
-    OpenAIResponseOutputMessageContent:
-      type: object
-      properties:
-        text:
-          type: string
-        type:
-          type: string
-          const: output_text
-          default: output_text
-      additionalProperties: false
-      required:
-        - text
-        - type
-      title: >-
-        OpenAIResponseOutputMessageContentOutputText
-    "OpenAIResponseOutputMessageWebSearchToolCall":
-      type: object
-      properties:
-        id:
-          type: string
-        status:
-          type: string
-        type:
-          type: string
-          const: web_search_call
-          default: web_search_call
-      additionalProperties: false
-      required:
-        - id
-        - status
-        - type
-      title: >-
-        OpenAIResponseOutputMessageWebSearchToolCall
+          function_call: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
     OpenAIResponseObjectStream:
       oneOf:
         - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated'
diff --git a/llama_stack/apis/agents/agents.py b/llama_stack/apis/agents/agents.py
index f4367d09b..2a37f27c0 100644
--- a/llama_stack/apis/agents/agents.py
+++ b/llama_stack/apis/agents/agents.py
@@ -31,7 +31,7 @@ from llama_stack.apis.tools import ToolDef
 from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
 
 from .openai_responses import (
-    OpenAIResponseInputMessage,
+    OpenAIResponseInput,
     OpenAIResponseInputTool,
     OpenAIResponseObject,
     OpenAIResponseObjectStream,
@@ -593,7 +593,7 @@ class Agents(Protocol):
     @webmethod(route="/openai/v1/responses", method="POST")
     async def create_openai_response(
         self,
-        input: str | list[OpenAIResponseInputMessage],
+        input: str | list[OpenAIResponseInput],
         model: str,
         previous_response_id: str | None = None,
         store: bool | None = True,
diff --git a/llama_stack/apis/agents/openai_responses.py b/llama_stack/apis/agents/openai_responses.py
index 8e11b2123..dcf0c7f9c 100644
--- a/llama_stack/apis/agents/openai_responses.py
+++ b/llama_stack/apis/agents/openai_responses.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Annotated, Literal
+from typing import Annotated, Any, Literal
 
 from pydantic import BaseModel, Field
 
@@ -17,6 +17,28 @@ class OpenAIResponseError(BaseModel):
     message: str
 
 
+@json_schema_type
+class OpenAIResponseInputMessageContentText(BaseModel):
+    text: str
+    type: Literal["input_text"] = "input_text"
+
+
+@json_schema_type
+class OpenAIResponseInputMessageContentImage(BaseModel):
+    detail: Literal["low"] | Literal["high"] | Literal["auto"] = "auto"
+    type: Literal["input_image"] = "input_image"
+    # TODO: handle file_id
+    image_url: str | None = None
+
+
+# TODO: handle file content types
+OpenAIResponseInputMessageContent = Annotated[
+    OpenAIResponseInputMessageContentText | OpenAIResponseInputMessageContentImage,
+    Field(discriminator="type"),
+]
+register_schema(OpenAIResponseInputMessageContent, name="OpenAIResponseInputMessageContent")
+
+
 @json_schema_type
 class OpenAIResponseOutputMessageContentOutputText(BaseModel):
     text: str
@@ -31,13 +53,22 @@ register_schema(OpenAIResponseOutputMessageContent, name="OpenAIResponseOutputMe
 
 
 @json_schema_type
-class OpenAIResponseOutputMessage(BaseModel):
-    id: str
-    content: list[OpenAIResponseOutputMessageContent]
-    role: Literal["assistant"] = "assistant"
-    status: str
+class OpenAIResponseMessage(BaseModel):
+    """
+    Corresponds to the various Message types in the Responses API.
+    They are all under one type because the Responses API gives them all
+    the same "type" value, and there is no way to tell them apart in certain
+    scenarios.
+    """
+
+    content: str | list[OpenAIResponseInputMessageContent] | list[OpenAIResponseOutputMessageContent]
+    role: Literal["system"] | Literal["developer"] | Literal["user"] | Literal["assistant"]
     type: Literal["message"] = "message"
 
+    # The fields below are not used in all scenarios, but are required in others.
+    id: str | None = None
+    status: str | None = None
+
 
 @json_schema_type
 class OpenAIResponseOutputMessageWebSearchToolCall(BaseModel):
@@ -46,8 +77,18 @@ class OpenAIResponseOutputMessageWebSearchToolCall(BaseModel):
     type: Literal["web_search_call"] = "web_search_call"
 
 
+@json_schema_type
+class OpenAIResponseOutputMessageFunctionToolCall(BaseModel):
+    arguments: str
+    call_id: str
+    name: str
+    type: Literal["function_call"] = "function_call"
+    id: str
+    status: str
+
+
 OpenAIResponseOutput = Annotated[
-    OpenAIResponseOutputMessage | OpenAIResponseOutputMessageWebSearchToolCall,
+    OpenAIResponseMessage | OpenAIResponseOutputMessageWebSearchToolCall | OpenAIResponseOutputMessageFunctionToolCall,
     Field(discriminator="type"),
 ]
 register_schema(OpenAIResponseOutput, name="OpenAIResponseOutput")
@@ -90,32 +131,29 @@ register_schema(OpenAIResponseObjectStream, name="OpenAIResponseObjectStream")
 
 
 @json_schema_type
-class OpenAIResponseInputMessageContentText(BaseModel):
-    text: str
-    type: Literal["input_text"] = "input_text"
+class OpenAIResponseInputFunctionToolCallOutput(BaseModel):
+    """
+    This represents the output of a function call that gets passed back to the model.
+    """
+
+    call_id: str
+    output: str
+    type: Literal["function_call_output"] = "function_call_output"
+    id: str | None = None
+    status: str | None = None
 
 
-@json_schema_type
-class OpenAIResponseInputMessageContentImage(BaseModel):
-    detail: Literal["low"] | Literal["high"] | Literal["auto"] = "auto"
-    type: Literal["input_image"] = "input_image"
-    # TODO: handle file_id
-    image_url: str | None = None
-
-
-# TODO: handle file content types
-OpenAIResponseInputMessageContent = Annotated[
-    OpenAIResponseInputMessageContentText | OpenAIResponseInputMessageContentImage,
-    Field(discriminator="type"),
+OpenAIResponseInput = Annotated[
+    # Responses API allows output messages to be passed in as input
+    OpenAIResponseOutputMessageWebSearchToolCall
+    | OpenAIResponseOutputMessageFunctionToolCall
+    | OpenAIResponseInputFunctionToolCallOutput
+    |
+    # Fallback to the generic message type as a last resort
+    OpenAIResponseMessage,
+    Field(union_mode="left_to_right"),
 ]
-register_schema(OpenAIResponseInputMessageContent, name="OpenAIResponseInputMessageContent")
-
-
-@json_schema_type
-class OpenAIResponseInputMessage(BaseModel):
-    content: str | list[OpenAIResponseInputMessageContent]
-    role: Literal["system"] | Literal["developer"] | Literal["user"] | Literal["assistant"]
-    type: Literal["message"] | None = "message"
+register_schema(OpenAIResponseInput, name="OpenAIResponseInput")
 
 
 @json_schema_type
@@ -126,8 +164,35 @@ class OpenAIResponseInputToolWebSearch(BaseModel):
     # TODO: add user_location
 
 
+@json_schema_type
+class OpenAIResponseInputToolFunction(BaseModel):
+    type: Literal["function"] = "function"
+    name: str
+    description: str | None = None
+    parameters: dict[str, Any] | None
+    strict: bool | None = None
+
+
+class FileSearchRankingOptions(BaseModel):
+    ranker: str | None = None
+    score_threshold: float | None = Field(default=0.0, ge=0.0, le=1.0)
+
+
+@json_schema_type
+class OpenAIResponseInputToolFileSearch(BaseModel):
+    type: Literal["file_search"] = "file_search"
+    vector_store_id: list[str]
+    ranking_options: FileSearchRankingOptions | None = None
+    # TODO: add filters
+
+
 OpenAIResponseInputTool = Annotated[
-    OpenAIResponseInputToolWebSearch,
+    OpenAIResponseInputToolWebSearch | OpenAIResponseInputToolFileSearch | OpenAIResponseInputToolFunction,
     Field(discriminator="type"),
 ]
 register_schema(OpenAIResponseInputTool, name="OpenAIResponseInputTool")
+
+
+class OpenAIResponseInputItemList(BaseModel):
+    data: list[OpenAIResponseInput]
+    object: Literal["list"] = "list"
diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py
index 32046d2b1..f4d323607 100644
--- a/llama_stack/distribution/server/server.py
+++ b/llama_stack/distribution/server/server.py
@@ -18,6 +18,7 @@ from importlib.metadata import version as parse_version
 from pathlib import Path
 from typing import Annotated, Any
 
+import rich.pretty
 import yaml
 from fastapi import Body, FastAPI, HTTPException, Request
 from fastapi import Path as FastapiPath
@@ -187,11 +188,30 @@ async def sse_generator(event_gen_coroutine):
         )
 
 
+async def log_request_pre_validation(request: Request):
+    if request.method in ("POST", "PUT", "PATCH"):
+        try:
+            body_bytes = await request.body()
+            if body_bytes:
+                try:
+                    parsed_body = json.loads(body_bytes.decode())
+                    log_output = rich.pretty.pretty_repr(parsed_body)
+                except (json.JSONDecodeError, UnicodeDecodeError):
+                    log_output = repr(body_bytes)
+                logger.debug(f"Incoming raw request body for {request.method} {request.url.path}:\n{log_output}")
+            else:
+                logger.debug(f"Incoming {request.method} {request.url.path} request with empty body.")
+        except Exception as e:
+            logger.warning(f"Could not read or log request body for {request.method} {request.url.path}: {e}")
+
+
 def create_dynamic_typed_route(func: Any, method: str, route: str):
     async def endpoint(request: Request, **kwargs):
         # Get auth attributes from the request scope
         user_attributes = request.scope.get("user_attributes", {})
 
+        await log_request_pre_validation(request)
+
         # Use context manager with both provider data and auth attributes
         with request_provider_data_context(request.headers, user_attributes):
             is_streaming = is_streaming_request(func.__name__, request, **kwargs)
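
The new `log_request_pre_validation` hook only reads the body for mutating methods and degrades gracefully when the payload is not JSON. A framework-free sketch of the same formatting logic (assumes the `rich` package; the helper name here is illustrative, not part of the server code):

```python
import json

import rich.pretty


def format_body_for_log(body_bytes: bytes) -> str:
    # Prefer a pretty-printed parse of the JSON body; fall back to a raw repr.
    if not body_bytes:
        return "<empty body>"
    try:
        return rich.pretty.pretty_repr(json.loads(body_bytes.decode()))
    except (json.JSONDecodeError, UnicodeDecodeError):
        return repr(body_bytes)


print(format_body_for_log(b'{"model": "llama", "input": "hello"}'))
print(format_body_for_log(b"\xff\xfe"))  # not valid UTF-8, so repr() is used
```
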
diff --git a/llama_stack/providers/inline/agents/meta_reference/agents.py b/llama_stack/providers/inline/agents/meta_reference/agents.py
index db10972de..86780fd61 100644
--- a/llama_stack/providers/inline/agents/meta_reference/agents.py
+++ b/llama_stack/providers/inline/agents/meta_reference/agents.py
@@ -20,7 +20,7 @@ from llama_stack.apis.agents import (
     AgentTurnCreateRequest,
     AgentTurnResumeRequest,
     Document,
-    OpenAIResponseInputMessage,
+    OpenAIResponseInput,
     OpenAIResponseInputTool,
     OpenAIResponseObject,
     Session,
@@ -311,7 +311,7 @@ class MetaReferenceAgentsImpl(Agents):
 
     async def create_openai_response(
         self,
-        input: str | list[OpenAIResponseInputMessage],
+        input: str | list[OpenAIResponseInput],
         model: str,
         previous_response_id: str | None = None,
         store: bool | None = True,
diff --git a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
index 24a99dd6e..3ead083ef 100644
--- a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
+++ b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
@@ -10,19 +10,26 @@ from collections.abc import AsyncIterator
 from typing import cast
 
 from openai.types.chat import ChatCompletionToolParam
+from pydantic import BaseModel
 
 from llama_stack.apis.agents.openai_responses import (
-    OpenAIResponseInputMessage,
+    OpenAIResponseInput,
+    OpenAIResponseInputFunctionToolCallOutput,
+    OpenAIResponseInputItemList,
+    OpenAIResponseInputMessageContent,
     OpenAIResponseInputMessageContentImage,
     OpenAIResponseInputMessageContentText,
     OpenAIResponseInputTool,
+    OpenAIResponseInputToolFunction,
+    OpenAIResponseMessage,
     OpenAIResponseObject,
     OpenAIResponseObjectStream,
     OpenAIResponseObjectStreamResponseCompleted,
     OpenAIResponseObjectStreamResponseCreated,
     OpenAIResponseOutput,
-    OpenAIResponseOutputMessage,
+    OpenAIResponseOutputMessageContent,
     OpenAIResponseOutputMessageContentOutputText,
+    OpenAIResponseOutputMessageFunctionToolCall,
     OpenAIResponseOutputMessageWebSearchToolCall,
 )
 from llama_stack.apis.inference.inference import (
@@ -32,10 +39,13 @@ from llama_stack.apis.inference.inference import (
     OpenAIChatCompletionContentPartImageParam,
     OpenAIChatCompletionContentPartParam,
     OpenAIChatCompletionContentPartTextParam,
+    OpenAIChatCompletionToolCall,
     OpenAIChatCompletionToolCallFunction,
     OpenAIChoice,
+    OpenAIDeveloperMessageParam,
     OpenAIImageURL,
     OpenAIMessageParam,
+    OpenAISystemMessageParam,
     OpenAIToolMessageParam,
     OpenAIUserMessageParam,
 )
@@ -50,31 +60,110 @@ logger = get_logger(name=__name__, category="openai_responses")
 OPENAI_RESPONSES_PREFIX = "openai_responses:"
 
 
-async def _previous_response_to_messages(previous_response: OpenAIResponseObject) -> list[OpenAIMessageParam]:
+async def _convert_response_content_to_chat_content(
+    content: str | list[OpenAIResponseInputMessageContent] | list[OpenAIResponseOutputMessageContent],
+) -> str | list[OpenAIChatCompletionContentPartParam]:
+    """
+    Convert the content parts from an OpenAI Response API request into OpenAI Chat Completion content parts.
+
+    The content schemas of each API look similar, but are not exactly the same.
+    """
+    if isinstance(content, str):
+        return content
+
+    converted_parts = []
+    for content_part in content:
+        if isinstance(content_part, OpenAIResponseInputMessageContentText):
+            converted_parts.append(OpenAIChatCompletionContentPartTextParam(text=content_part.text))
+        elif isinstance(content_part, OpenAIResponseOutputMessageContentOutputText):
+            converted_parts.append(OpenAIChatCompletionContentPartTextParam(text=content_part.text))
+        elif isinstance(content_part, OpenAIResponseInputMessageContentImage):
+            if content_part.image_url:
+                image_url = OpenAIImageURL(url=content_part.image_url, detail=content_part.detail)
+                converted_parts.append(OpenAIChatCompletionContentPartImageParam(image_url=image_url))
+        elif isinstance(content_part, str):
+            converted_parts.append(OpenAIChatCompletionContentPartTextParam(text=content_part))
+        else:
+            raise ValueError(
+                f"Llama Stack OpenAI Responses does not yet support content type '{type(content_part)}' in this context"
+            )
+    return converted_parts
+
+
+async def _convert_response_input_to_chat_messages(
+    input: str | list[OpenAIResponseInput],
+) -> list[OpenAIMessageParam]:
+    """
+    Convert the input from an OpenAI Response API request into OpenAI Chat Completion messages.
+    """
     messages: list[OpenAIMessageParam] = []
-    for output_message in previous_response.output:
-        if isinstance(output_message, OpenAIResponseOutputMessage):
-            messages.append(OpenAIAssistantMessageParam(content=output_message.content[0].text))
+    if isinstance(input, list):
+        for input_item in input:
+            if isinstance(input_item, OpenAIResponseInputFunctionToolCallOutput):
+                messages.append(
+                    OpenAIToolMessageParam(
+                        content=input_item.output,
+                        tool_call_id=input_item.call_id,
+                    )
+                )
+            elif isinstance(input_item, OpenAIResponseOutputMessageFunctionToolCall):
+                tool_call = OpenAIChatCompletionToolCall(
+                    index=0,
+                    id=input_item.call_id,
+                    function=OpenAIChatCompletionToolCallFunction(
+                        name=input_item.name,
+                        arguments=input_item.arguments,
+                    ),
+                )
+                messages.append(OpenAIAssistantMessageParam(tool_calls=[tool_call]))
+            else:
+                content = await _convert_response_content_to_chat_content(input_item.content)
+                message_type = await _get_message_type_by_role(input_item.role)
+                if message_type is None:
+                    raise ValueError(
+                        f"Llama Stack OpenAI Responses does not yet support message role '{input_item.role}' in this context"
+                    )
+                messages.append(message_type(content=content))
+    else:
+        messages.append(OpenAIUserMessageParam(content=input))
     return messages
 
 
-async def _openai_choices_to_output_messages(choices: list[OpenAIChoice]) -> list[OpenAIResponseOutputMessage]:
-    output_messages = []
-    for choice in choices:
-        output_content = ""
-        if isinstance(choice.message.content, str):
-            output_content = choice.message.content
-        elif isinstance(choice.message.content, OpenAIChatCompletionContentPartTextParam):
-            output_content = choice.message.content.text
-        # TODO: handle image content
-        output_messages.append(
-            OpenAIResponseOutputMessage(
-                id=f"msg_{uuid.uuid4()}",
-                content=[OpenAIResponseOutputMessageContentOutputText(text=output_content)],
-                status="completed",
-            )
+async def _convert_chat_choice_to_response_message(choice: OpenAIChoice) -> OpenAIResponseMessage:
+    """
+    Convert an OpenAI Chat Completion choice into an OpenAI Response output message.
+    """
+    output_content = ""
+    if isinstance(choice.message.content, str):
+        output_content = choice.message.content
+    elif isinstance(choice.message.content, OpenAIChatCompletionContentPartTextParam):
+        output_content = choice.message.content.text
+    else:
+        raise ValueError(
+            f"Llama Stack OpenAI Responses does not yet support output content type: {type(choice.message.content)}"
         )
-    return output_messages
+
+    return OpenAIResponseMessage(
+        id=f"msg_{uuid.uuid4()}",
+        content=[OpenAIResponseOutputMessageContentOutputText(text=output_content)],
+        status="completed",
+        role="assistant",
+    )
+
+
+async def _get_message_type_by_role(role: str):
+    role_to_type = {
+        "user": OpenAIUserMessageParam,
+        "system": OpenAISystemMessageParam,
+        "assistant": OpenAIAssistantMessageParam,
+        "developer": OpenAIDeveloperMessageParam,
+    }
+    return role_to_type.get(role)
+
+
+class OpenAIResponsePreviousResponseWithInputItems(BaseModel):
+    input_items: OpenAIResponseInputItemList
+    response: OpenAIResponseObject
 
 
 class OpenAIResponsesImpl:
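
Taken together, the helpers in the hunk above map Responses input items onto chat-completion messages: function-call outputs become tool messages keyed by call id, and everything else is routed by role. A simplified, dict-based sketch of that mapping (the shapes below are illustrative stand-ins, not the pydantic models above):

```python
def to_chat_messages(items: str | list[dict]) -> list[dict]:
    # A bare string is treated as a single user message.
    if isinstance(items, str):
        return [{"role": "user", "content": items}]
    messages = []
    for item in items:
        if item.get("type") == "function_call_output":
            # Tool results are threaded back via the originating call id.
            messages.append({"role": "tool",
                             "tool_call_id": item["call_id"],
                             "content": item["output"]})
        else:
            messages.append({"role": item["role"], "content": item["content"]})
    return messages


print(to_chat_messages("What is the capital of Ireland?"))
print(to_chat_messages([
    {"type": "message", "role": "user", "content": "Add 2 and 2"},
    {"type": "function_call_output", "call_id": "c1", "output": "4"},
]))
```
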
@@ -90,19 +179,45 @@ class OpenAIResponsesImpl:
         self.tool_groups_api = tool_groups_api
         self.tool_runtime_api = tool_runtime_api
 
-    async def get_openai_response(
-        self,
-        id: str,
-    ) -> OpenAIResponseObject:
+    async def _get_previous_response_with_input(self, id: str) -> OpenAIResponsePreviousResponseWithInputItems:
         key = f"{OPENAI_RESPONSES_PREFIX}{id}"
         response_json = await self.persistence_store.get(key=key)
         if response_json is None:
             raise ValueError(f"OpenAI response with id '{id}' not found")
-        return OpenAIResponseObject.model_validate_json(response_json)
+        return OpenAIResponsePreviousResponseWithInputItems.model_validate_json(response_json)
+
+    async def _prepend_previous_response(
+        self, input: str | list[OpenAIResponseInput], previous_response_id: str | None = None
+    ):
+        if previous_response_id:
+            previous_response_with_input = await self._get_previous_response_with_input(previous_response_id)
+
+            # previous response input items
+            new_input_items = previous_response_with_input.input_items.data
+
+            # previous response output items
+            new_input_items.extend(previous_response_with_input.response.output)
+
+            # new input items from the current request
+            if isinstance(input, str):
+                new_input_items.append(OpenAIResponseMessage(content=input, role="user"))
+            else:
+                new_input_items.extend(input)
+
+            input = new_input_items
+
+        return input
+
+    async def get_openai_response(
+        self,
+        id: str,
+    ) -> OpenAIResponseObject:
+        response_with_input = await self._get_previous_response_with_input(id)
+        return response_with_input.response
 
     async def create_openai_response(
         self,
-        input: str | list[OpenAIResponseInputMessage],
+        input: str | list[OpenAIResponseInput],
         model: str,
         previous_response_id: str | None = None,
         store: bool | None = True,
@@ -112,31 +227,8 @@ class OpenAIResponsesImpl:
     ):
         stream = False if stream is None else stream
 
-        messages: list[OpenAIMessageParam] = []
-        if previous_response_id:
-            previous_response = await self.get_openai_response(previous_response_id)
-            messages.extend(await _previous_response_to_messages(previous_response))
-        # TODO: refactor this user_content parsing out into a separate method
-        user_content: str | list[OpenAIChatCompletionContentPartParam] = ""
-        if isinstance(input, list):
-            user_content = []
-            for user_input in input:
-                if isinstance(user_input.content, list):
-                    for user_input_content in user_input.content:
-                        if isinstance(user_input_content, OpenAIResponseInputMessageContentText):
-                            user_content.append(OpenAIChatCompletionContentPartTextParam(text=user_input_content.text))
-                        elif isinstance(user_input_content, OpenAIResponseInputMessageContentImage):
-                            if user_input_content.image_url:
-                                image_url = OpenAIImageURL(
-                                    url=user_input_content.image_url, detail=user_input_content.detail
-                                )
-                                user_content.append(OpenAIChatCompletionContentPartImageParam(image_url=image_url))
-                else:
-                    user_content.append(OpenAIChatCompletionContentPartTextParam(text=user_input.content))
-        else:
-            user_content = input
-        messages.append(OpenAIUserMessageParam(content=user_content))
-
+        input = await self._prepend_previous_response(input, previous_response_id)
+        messages = await _convert_response_input_to_chat_messages(input)
         chat_tools = await self._convert_response_tools_to_chat_tools(tools) if tools else None
         chat_response = await self.inference_api.openai_chat_completion(
             model=model,
@@ -150,6 +242,7 @@ class OpenAIResponsesImpl:
             # TODO: refactor this into a separate method that handles streaming
             chat_response_id = ""
             chat_response_content = []
+            chat_response_tool_calls: dict[int, OpenAIChatCompletionToolCall] = {}
             # TODO: these chunk_ fields are hacky and only take the last chunk into account
             chunk_created = 0
             chunk_model = ""
@@ -163,7 +256,26 @@ class OpenAIResponsesImpl:
                     chat_response_content.append(chunk_choice.delta.content or "")
                     if chunk_choice.finish_reason:
                         chunk_finish_reason = chunk_choice.finish_reason
-            assistant_message = OpenAIAssistantMessageParam(content="".join(chat_response_content))
+
+                    # Aggregate tool call arguments across chunks, using their index as the aggregation key
+                    if chunk_choice.delta.tool_calls:
+                        for tool_call in chunk_choice.delta.tool_calls:
+                            response_tool_call = chat_response_tool_calls.get(tool_call.index, None)
+                            if response_tool_call:
+                                response_tool_call.function.arguments += tool_call.function.arguments
+                            else:
+                                response_tool_call = OpenAIChatCompletionToolCall(**tool_call.model_dump())
+                            chat_response_tool_calls[tool_call.index] = response_tool_call
+
+            # Convert the dict of tool calls by index to a list of tool calls to pass back in our response
+            if chat_response_tool_calls:
+                tool_calls = [chat_response_tool_calls[i] for i in sorted(chat_response_tool_calls.keys())]
+            else:
+                tool_calls = None
+            assistant_message = OpenAIAssistantMessageParam(
+                content="".join(chat_response_content),
+                tool_calls=tool_calls,
+            )
             chat_response = OpenAIChatCompletion(
                 id=chat_response_id,
                 choices=[
@@ -181,12 +293,26 @@ class OpenAIResponsesImpl:
             chat_response = OpenAIChatCompletion(**chat_response.model_dump())
 
         output_messages: list[OpenAIResponseOutput] = []
-        if chat_response.choices[0].message.tool_calls:
-            output_messages.extend(
-                await self._execute_tool_and_return_final_output(model, stream, chat_response, messages, temperature)
-            )
-        else:
-            output_messages.extend(await _openai_choices_to_output_messages(chat_response.choices))
+        for choice in chat_response.choices:
+            if choice.message.tool_calls and tools:
+                # Assume if the first tool is a function, all tools are functions
+                if isinstance(tools[0], OpenAIResponseInputToolFunction):
+                    for tool_call in choice.message.tool_calls:
+                        output_messages.append(
+                            OpenAIResponseOutputMessageFunctionToolCall(
+                                arguments=tool_call.function.arguments or "",
+                                call_id=tool_call.id,
+                                name=tool_call.function.name or "",
+                                id=f"fc_{uuid.uuid4()}",
+                                status="completed",
+                            )
+                        )
+                else:
+                    output_messages.extend(
+                        await self._execute_tool_and_return_final_output(model, stream, choice, messages, temperature)
+                    )
+            else:
+                output_messages.append(await _convert_chat_choice_to_response_message(choice))
         response = OpenAIResponseObject(
             created_at=chat_response.created,
             id=f"resp-{uuid.uuid4()}",
@@ -195,13 +321,43 @@ class OpenAIResponsesImpl:
             status="completed",
             output=output_messages,
         )
+        logger.debug(f"OpenAI Responses response: {response}")
 
         if store:
             # Store in kvstore
+
+            new_input_id = f"msg_{uuid.uuid4()}"
+            if isinstance(input, str):
+                # synthesize a message from the input string
+                input_content = OpenAIResponseInputMessageContentText(text=input)
+                input_content_item = OpenAIResponseMessage(
+                    role="user",
+                    content=[input_content],
+                    id=new_input_id,
+                )
+                input_items_data = [input_content_item]
+            else:
+                # we already have a list of messages
+                input_items_data = []
+                for input_item in input:
+                    if isinstance(input_item, OpenAIResponseMessage):
+                        # These may or may not already have an id, so dump to dict, check for id, and add if missing
+                        input_item_dict = input_item.model_dump()
+                        if "id" not in input_item_dict:
+                            input_item_dict["id"] = new_input_id
+                        input_items_data.append(OpenAIResponseMessage(**input_item_dict))
+                    else:
+                        input_items_data.append(input_item)
+
+            input_items = OpenAIResponseInputItemList(data=input_items_data)
+            prev_response = OpenAIResponsePreviousResponseWithInputItems(
+                input_items=input_items,
+                response=response,
+            )
             key = f"{OPENAI_RESPONSES_PREFIX}{response.id}"
             await self.persistence_store.set(
                 key=key,
-                value=response.model_dump_json(),
+                value=prev_response.model_dump_json(),
             )
 
         if stream:
@@ -221,7 +377,9 @@ class OpenAIResponsesImpl:
         chat_tools: list[ChatCompletionToolParam] = []
         for input_tool in tools:
             # TODO: Handle other tool types
-            if input_tool.type == "web_search":
+            if input_tool.type == "function":
+                chat_tools.append(ChatCompletionToolParam(type="function", function=input_tool.model_dump()))
+            elif input_tool.type == "web_search":
                 tool_name = "web_search"
                 tool = await self.tool_groups_api.get_tool(tool_name)
                 tool_def = ToolDefinition(
@@ -247,12 +405,11 @@ class OpenAIResponsesImpl:
         self,
         model_id: str,
         stream: bool,
-        chat_response: OpenAIChatCompletion,
+        choice: OpenAIChoice,
         messages: list[OpenAIMessageParam],
         temperature: float,
     ) -> list[OpenAIResponseOutput]:
         output_messages: list[OpenAIResponseOutput] = []
-        choice = chat_response.choices[0]
 
         # If the choice is not an assistant message, we don't need to execute any tools
         if not isinstance(choice.message, OpenAIAssistantMessageParam):
@@ -262,6 +419,9 @@ class OpenAIResponsesImpl:
         if not choice.message.tool_calls:
             return output_messages
 
+        # Copy the messages list to avoid mutating the original list
+        messages = messages.copy()
+
         # Add the assistant message with tool_calls response to the messages list
         messages.append(choice.message)
 
@@ -307,7 +467,9 @@ class OpenAIResponsesImpl:
         )
         # type cast to appease mypy
         tool_results_chat_response = cast(OpenAIChatCompletion, tool_results_chat_response)
-        tool_final_outputs = await _openai_choices_to_output_messages(tool_results_chat_response.choices)
+        tool_final_outputs = [
+            await _convert_chat_choice_to_response_message(choice) for choice in tool_results_chat_response.choices
+        ]
         # TODO: Wire in annotations with URLs, titles, etc to these output messages
         output_messages.extend(tool_final_outputs)
         return output_messages
diff --git a/tests/unit/providers/agents/meta_reference/fixtures/__init__.py b/tests/unit/providers/agents/meta_reference/fixtures/__init__.py
new file mode 100644
index 000000000..e112bb6e5
--- /dev/null
+++ b/tests/unit/providers/agents/meta_reference/fixtures/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import os
+
+import yaml
+
+from llama_stack.apis.inference.inference import (
+    OpenAIChatCompletion,
+)
+
+FIXTURES_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+def load_chat_completion_fixture(filename: str) -> OpenAIChatCompletion:
+    fixture_path = os.path.join(FIXTURES_DIR, filename)
+
+    with open(fixture_path) as f:
+        data = yaml.safe_load(f)
+    return OpenAIChatCompletion(**data)
diff --git a/tests/unit/providers/agents/meta_reference/fixtures/simple_chat_completion.yaml b/tests/unit/providers/agents/meta_reference/fixtures/simple_chat_completion.yaml
new file mode 100644
index 000000000..4959349a0
--- /dev/null
+++ b/tests/unit/providers/agents/meta_reference/fixtures/simple_chat_completion.yaml
@@ -0,0 +1,9 @@
+id: chat-completion-123
+choices:
+  - message:
+      content: "Dublin"
+      role: assistant
+    finish_reason: stop
+    index: 0
+created: 1234567890
+model: meta-llama/Llama-3.1-8B-Instruct
diff --git a/tests/unit/providers/agents/meta_reference/fixtures/tool_call_completion.yaml b/tests/unit/providers/agents/meta_reference/fixtures/tool_call_completion.yaml
new file mode 100644
index 000000000..f6532e3a9
--- /dev/null
+++ b/tests/unit/providers/agents/meta_reference/fixtures/tool_call_completion.yaml
@@ -0,0 +1,14 @@
+id: chat-completion-123
+choices:
+  - message:
+      tool_calls:
+        - id: tool_call_123
+          type: function
+          function:
+            name: web_search
+            arguments: '{"query":"What is the capital of Ireland?"}'
+      role: assistant
+    finish_reason: stop
+    index: 0
+created: 1234567890
+model: meta-llama/Llama-3.1-8B-Instruct
diff --git a/tests/unit/providers/agents/meta_reference/test_openai_responses.py b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
index cd6b0fc55..3fe68cb9d 100644
--- a/tests/unit/providers/agents/meta_reference/test_openai_responses.py
+++ b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
@@ -4,27 +4,32 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from unittest.mock import AsyncMock
+from unittest.mock import AsyncMock, patch
 
 import pytest
 
 from llama_stack.apis.agents.openai_responses import (
+    OpenAIResponseInputItemList,
+    OpenAIResponseInputMessageContentText,
     OpenAIResponseInputToolWebSearch,
-    OpenAIResponseOutputMessage,
+    OpenAIResponseMessage,
+    OpenAIResponseObject,
+    OpenAIResponseOutputMessageContentOutputText,
+    OpenAIResponseOutputMessageWebSearchToolCall,
 )
 from llama_stack.apis.inference.inference import (
     OpenAIAssistantMessageParam,
-    OpenAIChatCompletion,
-    OpenAIChatCompletionToolCall,
-    OpenAIChatCompletionToolCallFunction,
-    OpenAIChoice,
+    OpenAIChatCompletionContentPartTextParam,
+    OpenAIDeveloperMessageParam,
     OpenAIUserMessageParam,
 )
 from llama_stack.apis.tools.tools import Tool, ToolGroups, ToolInvocationResult, ToolParameter, ToolRuntime
 from llama_stack.providers.inline.agents.meta_reference.openai_responses import (
+    OpenAIResponsePreviousResponseWithInputItems,
     OpenAIResponsesImpl,
 )
 from llama_stack.providers.utils.kvstore import KVStore
+from tests.unit.providers.agents.meta_reference.fixtures import load_chat_completion_fixture
 
 
 @pytest.fixture
@@ -65,21 +70,11 @@ def openai_responses_impl(mock_kvstore, mock_inference_api, mock_tool_groups_api
 async def test_create_openai_response_with_string_input(openai_responses_impl, mock_inference_api):
     """Test creating an OpenAI response with a simple string input."""
     # Setup
-    input_text = "Hello, world!"
+    input_text = "What is the capital of Ireland?"
     model = "meta-llama/Llama-3.1-8B-Instruct"
 
-    mock_chat_completion = OpenAIChatCompletion(
-        id="chat-completion-123",
-        choices=[
-            OpenAIChoice(
-                message=OpenAIAssistantMessageParam(content="Hello! How can I help you?"),
-                finish_reason="stop",
-                index=0,
-            )
-        ],
-        created=1234567890,
-        model=model,
-    )
+    # Load the chat completion fixture
+    mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
     mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
 
     # Execute
@@ -92,7 +87,7 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m
     # Verify
     mock_inference_api.openai_chat_completion.assert_called_once_with(
         model=model,
-        messages=[OpenAIUserMessageParam(role="user", content="Hello, world!", name=None)],
+        messages=[OpenAIUserMessageParam(role="user", content="What is the capital of Ireland?", name=None)],
         tools=None,
         stream=False,
         temperature=0.1,
@@ -100,55 +95,25 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m
     openai_responses_impl.persistence_store.set.assert_called_once()
     assert result.model == model
     assert len(result.output) == 1
-    assert isinstance(result.output[0], OpenAIResponseOutputMessage)
-    assert result.output[0].content[0].text == "Hello! How can I help you?"
+    assert isinstance(result.output[0], OpenAIResponseMessage)
+    assert result.output[0].content[0].text == "Dublin"
 
 
 @pytest.mark.asyncio
 async def test_create_openai_response_with_string_input_with_tools(openai_responses_impl, mock_inference_api):
     """Test creating an OpenAI response with a simple string input and tools."""
     # Setup
-    input_text = "What was the score of todays game?"
+    input_text = "What is the capital of Ireland?"
     model = "meta-llama/Llama-3.1-8B-Instruct"
 
-    mock_chat_completions = [
-        OpenAIChatCompletion(
-            id="chat-completion-123",
-            choices=[
-                OpenAIChoice(
-                    message=OpenAIAssistantMessageParam(
-                        tool_calls=[
-                            OpenAIChatCompletionToolCall(
-                                id="tool_call_123",
-                                type="function",
-                                function=OpenAIChatCompletionToolCallFunction(
-                                    name="web_search", arguments='{"query":"What was the score of todays game?"}'
-                                ),
-                            )
-                        ],
-                    ),
-                    finish_reason="stop",
-                    index=0,
-                )
-            ],
-            created=1234567890,
-            model=model,
-        ),
-        OpenAIChatCompletion(
-            id="chat-completion-123",
-            choices=[
-                OpenAIChoice(
-                    message=OpenAIAssistantMessageParam(content="The score of todays game was 10-12"),
-                    finish_reason="stop",
-                    index=0,
-                )
-            ],
-            created=1234567890,
-            model=model,
-        ),
-    ]
+    # Load the chat completion fixtures
+    tool_call_completion = load_chat_completion_fixture("tool_call_completion.yaml")
+    tool_response_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
 
-    mock_inference_api.openai_chat_completion.side_effect = mock_chat_completions
+    mock_inference_api.openai_chat_completion.side_effect = [
+        tool_call_completion,
+        tool_response_completion,
+    ]
 
     openai_responses_impl.tool_groups_api.get_tool.return_value = Tool(
         identifier="web_search",
@@ -163,7 +128,7 @@ async def test_create_openai_response_with_string_input_with_tools(openai_respon
 
     openai_responses_impl.tool_runtime_api.invoke_tool.return_value = ToolInvocationResult(
         status="completed",
-        content="The score of todays game was 10-12",
+        content="Dublin",
     )
 
     # Execute
@@ -180,23 +145,172 @@ async def test_create_openai_response_with_string_input_with_tools(openai_respon
 
     # Verify
     first_call = mock_inference_api.openai_chat_completion.call_args_list[0]
-    assert first_call.kwargs["messages"][0].content == "What was the score of todays game?"
+    assert first_call.kwargs["messages"][0].content == "What is the capital of Ireland?"
     assert first_call.kwargs["tools"] is not None
     assert first_call.kwargs["temperature"] == 0.1
 
     second_call = mock_inference_api.openai_chat_completion.call_args_list[1]
-    assert second_call.kwargs["messages"][-1].content == "The score of todays game was 10-12"
+    assert second_call.kwargs["messages"][-1].content == "Dublin"
     assert second_call.kwargs["temperature"] == 0.1
 
     openai_responses_impl.tool_groups_api.get_tool.assert_called_once_with("web_search")
     openai_responses_impl.tool_runtime_api.invoke_tool.assert_called_once_with(
         tool_name="web_search",
-        kwargs={"query": "What was the score of todays game?"},
+        kwargs={"query": "What is the capital of Ireland?"},
     )
 
     openai_responses_impl.persistence_store.set.assert_called_once()
 
     # Check that we got the content from our mocked tool execution result
     assert len(result.output) >= 1
-    assert isinstance(result.output[1], OpenAIResponseOutputMessage)
-    assert result.output[1].content[0].text == "The score of todays game was 10-12"
+    assert isinstance(result.output[1], OpenAIResponseMessage)
+    assert result.output[1].content[0].text == "Dublin"
+
+
+@pytest.mark.asyncio
+async def test_create_openai_response_with_multiple_messages(openai_responses_impl, mock_inference_api):
+    """Test creating an OpenAI response with multiple messages."""
+    # Setup
+    input_messages = [
+        OpenAIResponseMessage(role="developer", content="You are a helpful assistant", name=None),
+        OpenAIResponseMessage(role="user", content="Name some towns in Ireland", name=None),
+        OpenAIResponseMessage(
+            role="assistant",
+            content=[
+                OpenAIResponseInputMessageContentText(text="Galway, Longford, Sligo"),
+                OpenAIResponseInputMessageContentText(text="Dublin"),
+            ],
+            name=None,
+        ),
+        OpenAIResponseMessage(role="user", content="Which is the largest town in Ireland?", name=None),
+    ]
+    model = "meta-llama/Llama-3.1-8B-Instruct"
+
+    mock_inference_api.openai_chat_completion.return_value = load_chat_completion_fixture("simple_chat_completion.yaml")
+
+    # Execute
+    await openai_responses_impl.create_openai_response(
+        input=input_messages,
+        model=model,
+        temperature=0.1,
+    )
+
+    # Verify that the correct messages were sent to the inference API, i.e.
+    # all of the response messages were converted to chat completion message objects
+    inference_messages = mock_inference_api.openai_chat_completion.call_args_list[0].kwargs["messages"]
+    for i, m in enumerate(input_messages):
+        if isinstance(m.content, str):
+            assert inference_messages[i].content == m.content
+        else:
+            assert inference_messages[i].content[0].text == m.content[0].text
+            assert isinstance(inference_messages[i].content[0], OpenAIChatCompletionContentPartTextParam)
+        assert inference_messages[i].role == m.role
+        if m.role == "user":
+            assert isinstance(inference_messages[i], OpenAIUserMessageParam)
+        elif m.role == "assistant":
+            assert isinstance(inference_messages[i], OpenAIAssistantMessageParam)
+        else:
+            assert isinstance(inference_messages[i], OpenAIDeveloperMessageParam)
+
+
+@pytest.mark.asyncio
+async def test_prepend_previous_response_none(openai_responses_impl):
+    """Test prepending no previous response to a new response."""
+
+    input = await openai_responses_impl._prepend_previous_response("fake_input", None)
+    assert input == "fake_input"
+
+
+@pytest.mark.asyncio
+@patch.object(OpenAIResponsesImpl, "_get_previous_response_with_input")
+async def test_prepend_previous_response_basic(get_previous_response_with_input, openai_responses_impl):
+    """Test prepending a basic previous response to a new response."""
+
+    input_item_message = OpenAIResponseMessage(
+        id="123",
+        content=[OpenAIResponseInputMessageContentText(text="fake_previous_input")],
+        role="user",
+    )
+    input_items = OpenAIResponseInputItemList(data=[input_item_message])
+    response_output_message = OpenAIResponseMessage(
+        id="123",
+        content=[OpenAIResponseOutputMessageContentOutputText(text="fake_response")],
+        status="completed",
+        role="assistant",
+    )
+    response = OpenAIResponseObject(
+        created_at=1,
+        id="resp_123",
+        model="fake_model",
+        output=[response_output_message],
+        status="completed",
+    )
+    previous_response = OpenAIResponsePreviousResponseWithInputItems(
+        input_items=input_items,
+        response=response,
+    )
+    get_previous_response_with_input.return_value = previous_response
+
+    input = await openai_responses_impl._prepend_previous_response("fake_input", "resp_123")
+
+    assert len(input) == 3
+    # Check for previous input
+    assert isinstance(input[0], OpenAIResponseMessage)
+    assert input[0].content[0].text == "fake_previous_input"
+    # Check for previous output
+    assert isinstance(input[1], OpenAIResponseMessage)
+    assert input[1].content[0].text == "fake_response"
+    # Check for new input
+    assert isinstance(input[2], OpenAIResponseMessage)
+    assert input[2].content == "fake_input"
+
+
+@pytest.mark.asyncio
+@patch.object(OpenAIResponsesImpl, "_get_previous_response_with_input")
+async def test_prepend_previous_response_web_search(get_previous_response_with_input, openai_responses_impl):
+    """Test prepending a web search previous response to a new response."""
+
+    input_item_message = OpenAIResponseMessage(
+        id="123",
+        content=[OpenAIResponseInputMessageContentText(text="fake_previous_input")],
+        role="user",
+    )
+    input_items = OpenAIResponseInputItemList(data=[input_item_message])
+    output_web_search = OpenAIResponseOutputMessageWebSearchToolCall(
+        id="ws_123",
+        status="completed",
+    )
+    output_message = OpenAIResponseMessage(
+        id="123",
+        content=[OpenAIResponseOutputMessageContentOutputText(text="fake_web_search_response")],
+        status="completed",
+        role="assistant",
+    )
+    response = OpenAIResponseObject(
+        created_at=1,
+        id="resp_123",
+        model="fake_model",
+        output=[output_web_search, output_message],
+        status="completed",
+    )
+    previous_response = OpenAIResponsePreviousResponseWithInputItems(
+        input_items=input_items,
+        response=response,
+    )
+    get_previous_response_with_input.return_value = previous_response
+
+    input_messages = [OpenAIResponseMessage(content="fake_input", role="user")]
+    input = await openai_responses_impl._prepend_previous_response(input_messages, "resp_123")
+
+    assert len(input) == 4
+    # Check for previous input
+    assert isinstance(input[0], OpenAIResponseMessage)
+    assert input[0].content[0].text == "fake_previous_input"
+    # Check for previous output web search tool call
+    assert isinstance(input[1], OpenAIResponseOutputMessageWebSearchToolCall)
+    # Check for previous output web search response
+    assert isinstance(input[2], OpenAIResponseMessage)
+    assert input[2].content[0].text == "fake_web_search_response"
+    # Check for new input
+    assert isinstance(input[3], OpenAIResponseMessage)
+    assert input[3].content == "fake_input"
diff --git a/tests/verifications/openai_api/fixtures/test_cases/responses.yaml b/tests/verifications/openai_api/fixtures/test_cases/responses.yaml
index f235b2ea8..ed5f571e8 100644
--- a/tests/verifications/openai_api/fixtures/test_cases/responses.yaml
+++ b/tests/verifications/openai_api/fixtures/test_cases/responses.yaml
@@ -31,6 +31,26 @@ test_response_web_search:
         search_context_size: "low"
       output: "128"
 
+test_response_custom_tool:
+  test_name: test_response_custom_tool
+  test_params:
+    case:
+    - case_id: "sf_weather"
+      input: "What's the weather like in San Francisco?"
+      tools:
+      - type: function
+        name: get_weather
+        description: Get current temperature for a given location.
+        parameters:
+          additionalProperties: false
+          properties:
+            location:
+              description: "City and country e.g. Bogot\xE1, Colombia"
+              type: string
+          required:
+          - location
+          type: object
+
 test_response_image:
   test_name: test_response_image
   test_params:
diff --git a/tests/verifications/openai_api/test_responses.py b/tests/verifications/openai_api/test_responses.py
index cc7ec320c..e279b9b38 100644
--- a/tests/verifications/openai_api/test_responses.py
+++ b/tests/verifications/openai_api/test_responses.py
@@ -124,6 +124,28 @@ def test_response_non_streaming_web_search(request, openai_client, model, provid
     assert case["output"].lower() in response.output_text.lower().strip()
 
 
+@pytest.mark.parametrize(
+    "case",
+    responses_test_cases["test_response_custom_tool"]["test_params"]["case"],
+    ids=case_id_generator,
+)
+def test_response_non_streaming_custom_tool(request, openai_client, model, provider, verification_config, case):
+    test_name_base = get_base_test_name(request)
+    if should_skip_test(verification_config, provider, model, test_name_base):
+        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
+
+    response = openai_client.responses.create(
+        model=model,
+        input=case["input"],
+        tools=case["tools"],
+        stream=False,
+    )
+    assert len(response.output) == 1
+    assert response.output[0].type == "function_call"
+    assert response.output[0].status == "completed"
+    assert response.output[0].name == "get_weather"
+
+
 @pytest.mark.parametrize(
     "case",
     responses_test_cases["test_response_image"]["test_params"]["case"],