From 1261bc93bf91e648b19ad8705d04c2d77614d780 Mon Sep 17 00:00:00 2001 From: Nathan Weinberg <31703736+nathan-weinberg@users.noreply.github.com> Date: Mon, 17 Mar 2025 20:22:26 -0400 Subject: [PATCH 01/16] docs: fixed broken tip in distro build docs (#1673) # What does this PR do? fixed broken tip in distro build docs ## Test Plan Local docs build Signed-off-by: Nathan Weinberg --- docs/source/distributions/building_distro.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/source/distributions/building_distro.md b/docs/source/distributions/building_distro.md index 37a7e7974..9b8c1b9ad 100644 --- a/docs/source/distributions/building_distro.md +++ b/docs/source/distributions/building_distro.md @@ -185,8 +185,12 @@ llama stack build --config llama_stack/templates/ollama/build.yaml ::: :::{tab-item} Building Container -> [!TIP] -> Podman is supported as an alternative to Docker. Set `CONTAINER_BINARY` to `podman` in your environment to use Podman. + +```{admonition} Podman Alternative +:class: tip + +Podman is supported as an alternative to Docker. Set `CONTAINER_BINARY` to `podman` in your environment to use Podman. +``` To build a container image, you may start off from a template and use the `--image-type container` flag to specify `container` as the build image type. From e14f69eb7e300ce75e5d7fd71a9eb0724d54fd9c Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Mon, 17 Mar 2025 23:19:37 -0400 Subject: [PATCH 02/16] chore: Remove unused cursor rules (#1653) # What does this PR do? I think this was included accidentally via https://github.com/meta-llama/llama-stack/pull/1475. @raghotham @ashwinb let me know if it's intentional to include this. Signed-off-by: Yuan Tang --- .cursor/rules/general.mdc | 9 --------- 1 file changed, 9 deletions(-) delete mode 100644 .cursor/rules/general.mdc diff --git a/.cursor/rules/general.mdc b/.cursor/rules/general.mdc deleted file mode 100644 index 24daef2ba..000000000 --- a/.cursor/rules/general.mdc +++ /dev/null @@ -1,9 +0,0 @@ ---- -description: General rules always applicable across the project -globs: -alwaysApply: true ---- -# Style - -- Comments must add value to code. Don't write filler comments explaining what you are doing next; they just add noise. -- Add a comment to clarify surprising behavior which would not be obvious. Good variable naming and clear code organization is more important. From 2d2bb701fab562e10ee3410b3bedfcec8edb7505 Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Mon, 17 Mar 2025 23:20:31 -0400 Subject: [PATCH 03/16] ci: Add dependabot scans for Python deps (#1618) # What does this PR do? This PR adds dependabot updates for Python dependencies. In addition: * Consistent weekly schedule on a specific day * Specific commit messages * `open-pull-requests-limit` is intentional to avoid upgrading dependencies that will likely cause regressions. 
We want to keep the focus here on security updates only Signed-off-by: Yuan Tang --- .github/dependabot.yml | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 4aba604dd..d68af5615 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -5,4 +5,19 @@ updates: - package-ecosystem: "github-actions" directory: "/" # Will use the default workflow location of `.github/workflows` schedule: - interval: "daily" + interval: "weekly" + day: "saturday" + commit-message: + prefix: chore(github-deps) + - package-ecosystem: "uv" + directory: "/" + schedule: + interval: "weekly" + day: "saturday" + # ignore all non-security updates: https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file#open-pull-requests-limit + open-pull-requests-limit: 0 + labels: + - type/dependencies + - python + commit-message: + prefix: chore(python-deps) From 0bdfc71f8da402c2cdbaa96ac91de2afd71b617e Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Tue, 18 Mar 2025 00:33:04 -0400 Subject: [PATCH 04/16] test: Bump slow_callback_duration to 200ms to avoid flaky remote vLLM unit tests (#1675) # What does this PR do? This avoids flaky timeout issue observed in CI builds, e.g. https://github.com/meta-llama/llama-stack/actions/runs/13907179329/job/38912865968?pr=1273 ## Test Plan Ran multiple times and pass consistently. Signed-off-by: Yuan Tang --- tests/unit/providers/inference/test_remote_vllm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/providers/inference/test_remote_vllm.py b/tests/unit/providers/inference/test_remote_vllm.py index 3afe1389e..cb0997e1a 100644 --- a/tests/unit/providers/inference/test_remote_vllm.py +++ b/tests/unit/providers/inference/test_remote_vllm.py @@ -187,8 +187,8 @@ def test_chat_completion_doesnt_block_event_loop(caplog): loop.set_debug(True) caplog.set_level(logging.WARNING) - # Log when event loop is blocked for more than 100ms - loop.slow_callback_duration = 0.1 + # Log when event loop is blocked for more than 200ms + loop.slow_callback_duration = 0.2 # Sleep for 500ms in our delayed http response sleep_time = 0.5 From c23a7af5d6538e7c2ad1dc5ba51e67871b3e5e35 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Mon, 17 Mar 2025 22:11:06 -0700 Subject: [PATCH 05/16] fix: agents with non-llama model (#1550) # Summary: Includes fixes to get test_agents working with openAI model, e.g. tool parsing and message conversion # Test Plan: ``` LLAMA_STACK_CONFIG=dev pytest -s -v tests/integration/agents/test_agents.py --safety-shield meta-llama/Llama-Guard-3-8B --text-model openai/gpt-4o-mini ``` --- [//]: # (BEGIN SAPLING FOOTER) Stack created with [Sapling](https://sapling-scm.com). Best reviewed with [ReviewStack](https://reviewstack.dev/meta-llama/llama-stack/pull/1550). 
* #1556 * __->__ #1550 --- .../agents/meta_reference/agent_instance.py | 23 +-- .../utils/inference/litellm_openai_mixin.py | 6 +- .../utils/inference/openai_compat.py | 184 ++++++++++-------- tests/integration/agents/test_agents.py | 73 ++++--- 4 files changed, 161 insertions(+), 125 deletions(-) diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 03692bcc7..aa27f421c 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -891,16 +891,14 @@ class ChatAgent(ShieldRunnerMixin): if memory_tool and code_interpreter_tool: # if both memory and code_interpreter are available, we download the URLs # and attach the data to the last message. - msg = await attachment_message(self.tempdir, url_items) - input_messages.append(msg) + await attachment_message(self.tempdir, url_items, input_messages[-1]) # Since memory is present, add all the data to the memory bank await self.add_to_session_vector_db(session_id, documents) elif code_interpreter_tool: # if only code_interpreter is available, we download the URLs to a tempdir # and attach the path to them as a message to inference with the # assumption that the model invokes the code_interpreter tool with the path - msg = await attachment_message(self.tempdir, url_items) - input_messages.append(msg) + await attachment_message(self.tempdir, url_items, input_messages[-1]) elif memory_tool: # if only memory is available, we load the data from the URLs and content items to the memory bank await self.add_to_session_vector_db(session_id, documents) @@ -967,8 +965,8 @@ async def load_data_from_urls(urls: List[URL]) -> List[str]: return data -async def attachment_message(tempdir: str, urls: List[URL]) -> ToolResponseMessage: - content = [] +async def attachment_message(tempdir: str, urls: List[URL], message: UserMessage) -> None: + contents = [] for url in urls: uri = url.uri @@ -988,16 +986,19 @@ async def attachment_message(tempdir: str, urls: List[URL]) -> ToolResponseMessa else: raise ValueError(f"Unsupported URL {url}") - content.append( + contents.append( TextContentItem( text=f'# User provided a file accessible to you at "{filepath}"\nYou can use code_interpreter to load and inspect it.' 
) ) - return ToolResponseMessage( - call_id="", - content=content, - ) + if isinstance(message.content, list): + message.content.extend(contents) + else: + if isinstance(message.content, str): + message.content = [TextContentItem(text=message.content)] + contents + else: + message.content = [message.content] + contents def _interpret_content_as_attachment( diff --git a/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/llama_stack/providers/utils/inference/litellm_openai_mixin.py index d88dc5a9e..f99883990 100644 --- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py +++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py @@ -192,7 +192,11 @@ class LiteLLMOpenAIMixin( if request.tools: input_dict["tools"] = [convert_tooldef_to_openai_tool(tool) for tool in request.tools] if request.tool_config.tool_choice: - input_dict["tool_choice"] = request.tool_config.tool_choice.value + input_dict["tool_choice"] = ( + request.tool_config.tool_choice.value + if isinstance(request.tool_config.tool_choice, ToolChoice) + else request.tool_config.tool_choice + ) provider_data = self.get_request_provider_data() key_field = self.provider_data_api_key_field diff --git a/llama_stack/providers/utils/inference/openai_compat.py b/llama_stack/providers/utils/inference/openai_compat.py index ac37171c9..2a362f8cb 100644 --- a/llama_stack/providers/utils/inference/openai_compat.py +++ b/llama_stack/providers/utils/inference/openai_compat.py @@ -527,26 +527,30 @@ async def convert_message_to_openai_dict_new( async def _convert_message_content( content: InterleavedContent, ) -> Union[str, Iterable[OpenAIChatCompletionContentPartParam]]: - async def impl(): + async def impl( + content_: InterleavedContent, + ) -> Union[str, OpenAIChatCompletionContentPartParam, List[OpenAIChatCompletionContentPartParam]]: # Llama Stack and OpenAI spec match for str and text input - if isinstance(content, str): - return content - elif isinstance(content, TextContentItem): + if isinstance(content_, str): + return content_ + elif isinstance(content_, TextContentItem): return OpenAIChatCompletionContentPartTextParam( type="text", - text=content.text, + text=content_.text, ) - elif isinstance(content, ImageContentItem): + elif isinstance(content_, ImageContentItem): return OpenAIChatCompletionContentPartImageParam( type="image_url", - image_url=OpenAIImageURL(url=await convert_image_content_to_url(content)), + image_url=OpenAIImageURL(url=await convert_image_content_to_url(content_)), ) - elif isinstance(content, list): - return [await _convert_message_content(item) for item in content] + elif isinstance(content_, list): + return [await impl(item) for item in content_] else: - raise ValueError(f"Unsupported content type: {type(content)}") + raise ValueError(f"Unsupported content type: {type(content_)}") - ret = await impl() + ret = await impl(content) + + # OpenAI*Message expects a str or list if isinstance(ret, str) or isinstance(ret, list): return ret else: @@ -566,13 +570,14 @@ async def convert_message_to_openai_dict_new( OpenAIChatCompletionMessageToolCall( id=tool.call_id, function=OpenAIFunction( - name=tool.tool_name, + name=tool.tool_name if not isinstance(tool.tool_name, BuiltinTool) else tool.tool_name.value, arguments=json.dumps(tool.arguments), ), type="function", ) for tool in message.tool_calls - ], + ] + or None, ) elif isinstance(message, ToolResponseMessage): out = OpenAIChatCompletionToolMessage( @@ -858,7 +863,8 @@ async def convert_openai_chat_completion_stream( event_type = 
ChatCompletionResponseEventType.progress stop_reason = None - toolcall_buffer = {} + tool_call_idx_to_buffer = {} + async for chunk in stream: choice = chunk.choices[0] # assuming only one choice per chunk @@ -868,7 +874,6 @@ async def convert_openai_chat_completion_stream( # if there's a tool call, emit an event for each tool in the list # if tool call and content, emit both separately - if choice.delta.tool_calls: # the call may have content and a tool call. ChatCompletionResponseEvent # does not support both, so we emit the content first @@ -889,44 +894,53 @@ async def convert_openai_chat_completion_stream( ) if not enable_incremental_tool_calls: - yield ChatCompletionResponseStreamChunk( - event=ChatCompletionResponseEvent( - event_type=next(event_type), - delta=ToolCallDelta( - tool_call=_convert_openai_tool_calls(choice.delta.tool_calls)[0], - parse_status=ToolCallParseStatus.succeeded, - ), - logprobs=_convert_openai_logprobs(logprobs), + for tool_call in choice.delta.tool_calls: + yield ChatCompletionResponseStreamChunk( + event=ChatCompletionResponseEvent( + event_type=event_type, + delta=ToolCallDelta( + tool_call=_convert_openai_tool_calls([tool_call])[0], + parse_status=ToolCallParseStatus.succeeded, + ), + logprobs=_convert_openai_logprobs(logprobs), + ) ) - ) else: - tool_call = choice.delta.tool_calls[0] - if "name" not in toolcall_buffer: - toolcall_buffer["call_id"] = tool_call.id - toolcall_buffer["name"] = None - toolcall_buffer["content"] = "" - if "arguments" not in toolcall_buffer: - toolcall_buffer["arguments"] = "" + for tool_call in choice.delta.tool_calls: + idx = tool_call.index if hasattr(tool_call, "index") else 0 - if tool_call.function.name: - toolcall_buffer["name"] = tool_call.function.name - delta = f"{toolcall_buffer['name']}(" - if tool_call.function.arguments: - toolcall_buffer["arguments"] += tool_call.function.arguments - delta = toolcall_buffer["arguments"] + if idx not in tool_call_idx_to_buffer: + tool_call_idx_to_buffer[idx] = { + "call_id": tool_call.id, + "name": None, + "arguments": "", + "content": "", + } - toolcall_buffer["content"] += delta - yield ChatCompletionResponseStreamChunk( - event=ChatCompletionResponseEvent( - event_type=event_type, - delta=ToolCallDelta( - tool_call=delta, - parse_status=ToolCallParseStatus.in_progress, - ), - logprobs=_convert_openai_logprobs(logprobs), - ) - ) - else: + buffer = tool_call_idx_to_buffer[idx] + + if tool_call.function: + if tool_call.function.name: + buffer["name"] = tool_call.function.name + delta = f"{buffer['name']}(" + buffer["content"] += delta + + if tool_call.function.arguments: + delta = tool_call.function.arguments + buffer["arguments"] += delta + buffer["content"] += delta + + yield ChatCompletionResponseStreamChunk( + event=ChatCompletionResponseEvent( + event_type=event_type, + delta=ToolCallDelta( + tool_call=delta, + parse_status=ToolCallParseStatus.in_progress, + ), + logprobs=_convert_openai_logprobs(logprobs), + ) + ) + elif choice.delta.content: yield ChatCompletionResponseStreamChunk( event=ChatCompletionResponseEvent( event_type=event_type, @@ -935,47 +949,51 @@ async def convert_openai_chat_completion_stream( ) ) - if toolcall_buffer: - delta = ")" - toolcall_buffer["content"] += delta - yield ChatCompletionResponseStreamChunk( - event=ChatCompletionResponseEvent( - event_type=event_type, - delta=ToolCallDelta( - tool_call=delta, - parse_status=ToolCallParseStatus.in_progress, - ), - logprobs=_convert_openai_logprobs(logprobs), - ) - ) - try: - arguments = 
json.loads(toolcall_buffer["arguments"]) - tool_call = ToolCall( - call_id=toolcall_buffer["call_id"], - tool_name=toolcall_buffer["name"], - arguments=arguments, - ) + for idx, buffer in tool_call_idx_to_buffer.items(): + logger.debug(f"toolcall_buffer[{idx}]: {buffer}") + if buffer["name"]: + delta = ")" + buffer["content"] += delta yield ChatCompletionResponseStreamChunk( event=ChatCompletionResponseEvent( - event_type=ChatCompletionResponseEventType.progress, + event_type=event_type, delta=ToolCallDelta( - tool_call=tool_call, - parse_status=ToolCallParseStatus.succeeded, + tool_call=delta, + parse_status=ToolCallParseStatus.in_progress, ), - stop_reason=stop_reason, + logprobs=None, ) ) - except json.JSONDecodeError: - yield ChatCompletionResponseStreamChunk( - event=ChatCompletionResponseEvent( - event_type=ChatCompletionResponseEventType.complete, - delta=ToolCallDelta( - tool_call=toolcall_buffer["content"], - parse_status=ToolCallParseStatus.failed, - ), - stop_reason=stop_reason, + + try: + arguments = json.loads(buffer["arguments"]) + tool_call = ToolCall( + call_id=buffer["call_id"], + tool_name=buffer["name"], + arguments=arguments, + ) + yield ChatCompletionResponseStreamChunk( + event=ChatCompletionResponseEvent( + event_type=ChatCompletionResponseEventType.progress, + delta=ToolCallDelta( + tool_call=tool_call, + parse_status=ToolCallParseStatus.succeeded, + ), + stop_reason=stop_reason, + ) + ) + except json.JSONDecodeError as e: + print(f"Failed to parse arguments: {e}") + yield ChatCompletionResponseStreamChunk( + event=ChatCompletionResponseEvent( + event_type=ChatCompletionResponseEventType.progress, + delta=ToolCallDelta( + tool_call=buffer["content"], + parse_status=ToolCallParseStatus.failed, + ), + stop_reason=stop_reason, + ) ) - ) yield ChatCompletionResponseStreamChunk( event=ChatCompletionResponseEvent( diff --git a/tests/integration/agents/test_agents.py b/tests/integration/agents/test_agents.py index 61249ad17..ef0e8c05e 100644 --- a/tests/integration/agents/test_agents.py +++ b/tests/integration/agents/test_agents.py @@ -271,7 +271,7 @@ def test_custom_tool(llama_stack_client_with_mocked_inference, agent_config): client_tool = get_boiling_point agent_config = { **agent_config, - "tools": ["builtin::websearch", client_tool], + "tools": [client_tool], } agent = Agent(llama_stack_client_with_mocked_inference, **agent_config) @@ -320,42 +320,55 @@ def test_custom_tool_infinite_loop(llama_stack_client_with_mocked_inference, age assert num_tool_calls <= 5 -def test_tool_choice(llama_stack_client_with_mocked_inference, agent_config): - def run_agent(tool_choice): - client_tool = get_boiling_point - - test_agent_config = { - **agent_config, - "tool_config": {"tool_choice": tool_choice}, - "tools": [client_tool], - } - - agent = Agent(llama_stack_client_with_mocked_inference, **test_agent_config) - session_id = agent.create_session(f"test-session-{uuid4()}") - - response = agent.create_turn( - messages=[ - { - "role": "user", - "content": "What is the boiling point of polyjuice?", - }, - ], - session_id=session_id, - stream=False, - ) - - return [step for step in response.steps if step.step_type == "tool_execution"] - - tool_execution_steps = run_agent("required") +def test_tool_choice_required(llama_stack_client_with_mocked_inference, agent_config): + tool_execution_steps = run_agent_with_tool_choice( + llama_stack_client_with_mocked_inference, agent_config, "required" + ) assert len(tool_execution_steps) > 0 - tool_execution_steps = run_agent("none") + +def 
test_tool_choice_none(llama_stack_client_with_mocked_inference, agent_config): + tool_execution_steps = run_agent_with_tool_choice(llama_stack_client_with_mocked_inference, agent_config, "none") assert len(tool_execution_steps) == 0 - tool_execution_steps = run_agent("get_boiling_point") + +def test_tool_choice_get_boiling_point(llama_stack_client_with_mocked_inference, agent_config): + if "llama" not in agent_config["model"].lower(): + pytest.xfail("NotImplemented for non-llama models") + + tool_execution_steps = run_agent_with_tool_choice( + llama_stack_client_with_mocked_inference, agent_config, "get_boiling_point" + ) assert len(tool_execution_steps) >= 1 and tool_execution_steps[0].tool_calls[0].tool_name == "get_boiling_point" +def run_agent_with_tool_choice(client, agent_config, tool_choice): + client_tool = get_boiling_point + + test_agent_config = { + **agent_config, + "tool_config": {"tool_choice": tool_choice}, + "tools": [client_tool], + "max_infer_iters": 2, + } + + agent = Agent(client, **test_agent_config) + session_id = agent.create_session(f"test-session-{uuid4()}") + + response = agent.create_turn( + messages=[ + { + "role": "user", + "content": "What is the boiling point of polyjuice?", + }, + ], + session_id=session_id, + stream=False, + ) + + return [step for step in response.steps if step.step_type == "tool_execution"] + + @pytest.mark.parametrize("rag_tool_name", ["builtin::rag/knowledge_search", "builtin::rag"]) def test_rag_agent(llama_stack_client_with_mocked_inference, agent_config, rag_tool_name): urls = ["chat.rst", "llama3.rst", "memory_optimizations.rst", "lora_finetune.rst"] From 37f155e41dae2891ef1c06c37674d961c5cb040a Mon Sep 17 00:00:00 2001 From: ehhuang Date: Mon, 17 Mar 2025 22:13:09 -0700 Subject: [PATCH 06/16] feat(agent): support multiple tool groups (#1556) Summary: closes #1488 Test Plan: added new integration test ``` LLAMA_STACK_CONFIG=dev pytest -s -v tests/integration/agents/test_agents.py --safety-shield meta-llama/Llama-Guard-3-8B --text-model openai/gpt-4o-mini ``` --- [//]: # (BEGIN SAPLING FOOTER) Stack created with [Sapling](https://sapling-scm.com). Best reviewed with [ReviewStack](https://reviewstack.dev/meta-llama/llama-stack/pull/1556). * __->__ #1556 * #1550 --- .../agents/meta_reference/agent_instance.py | 213 ++++++++++-------- .../inline/safety/llama_guard/llama_guard.py | 7 - tests/integration/agents/test_agents.py | 45 +++- 3 files changed, 157 insertions(+), 108 deletions(-) diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index aa27f421c..88b6e9697 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -614,118 +614,133 @@ class ChatAgent(ShieldRunnerMixin): logger.debug(f"completion message with EOM (iter: {n_iter}): {str(message)}") input_messages = input_messages + [message] else: - logger.debug(f"completion message (iter: {n_iter}) from the model: {str(message)}") - # 1. 
Start the tool execution step and progress - step_id = str(uuid.uuid4()) - yield AgentTurnResponseStreamChunk( - event=AgentTurnResponseEvent( - payload=AgentTurnResponseStepStartPayload( - step_type=StepType.tool_execution.value, - step_id=step_id, - ) - ) - ) - tool_call = message.tool_calls[0] - yield AgentTurnResponseStreamChunk( - event=AgentTurnResponseEvent( - payload=AgentTurnResponseStepProgressPayload( - step_type=StepType.tool_execution.value, - step_id=step_id, - tool_call=tool_call, - delta=ToolCallDelta( - parse_status=ToolCallParseStatus.in_progress, - tool_call=tool_call, - ), - ) - ) - ) + input_messages = input_messages + [message] - # If tool is a client tool, yield CompletionMessage and return - if tool_call.tool_name in client_tools: - # NOTE: mark end_of_message to indicate to client that it may - # call the tool and continue the conversation with the tool's response. - message.stop_reason = StopReason.end_of_message + # Process tool calls in the message + client_tool_calls = [] + non_client_tool_calls = [] + + # Separate client and non-client tool calls + for tool_call in message.tool_calls: + if tool_call.tool_name in client_tools: + client_tool_calls.append(tool_call) + else: + non_client_tool_calls.append(tool_call) + + # Process non-client tool calls first + for tool_call in non_client_tool_calls: + step_id = str(uuid.uuid4()) + yield AgentTurnResponseStreamChunk( + event=AgentTurnResponseEvent( + payload=AgentTurnResponseStepStartPayload( + step_type=StepType.tool_execution.value, + step_id=step_id, + ) + ) + ) + + yield AgentTurnResponseStreamChunk( + event=AgentTurnResponseEvent( + payload=AgentTurnResponseStepProgressPayload( + step_type=StepType.tool_execution.value, + step_id=step_id, + delta=ToolCallDelta( + parse_status=ToolCallParseStatus.in_progress, + tool_call=tool_call, + ), + ) + ) + ) + + # Execute the tool call + async with tracing.span( + "tool_execution", + { + "tool_name": tool_call.tool_name, + "input": message.model_dump_json(), + }, + ) as span: + tool_execution_start_time = datetime.now(timezone.utc).isoformat() + tool_result = await self.execute_tool_call_maybe( + session_id, + tool_call, + ) + if tool_result.content is None: + raise ValueError( + f"Tool call result (id: {tool_call.call_id}, name: {tool_call.tool_name}) does not have any content" + ) + result_message = ToolResponseMessage( + call_id=tool_call.call_id, + content=tool_result.content, + ) + span.set_attribute("output", result_message.model_dump_json()) + + # Store tool execution step + tool_execution_step = ToolExecutionStep( + step_id=step_id, + turn_id=turn_id, + tool_calls=[tool_call], + tool_responses=[ + ToolResponse( + call_id=tool_call.call_id, + tool_name=tool_call.tool_name, + content=tool_result.content, + metadata=tool_result.metadata, + ) + ], + started_at=tool_execution_start_time, + completed_at=datetime.now(timezone.utc).isoformat(), + ) + + # Yield the step completion event + yield AgentTurnResponseStreamChunk( + event=AgentTurnResponseEvent( + payload=AgentTurnResponseStepCompletePayload( + step_type=StepType.tool_execution.value, + step_id=step_id, + step_details=tool_execution_step, + ) + ) + ) + + # Add the result message to input_messages for the next iteration + input_messages.append(result_message) + + # TODO: add tool-input touchpoint and a "start" event for this step also + # but that needs a lot more refactoring of Tool code potentially + if (type(result_message.content) is str) and ( + out_attachment := 
_interpret_content_as_attachment(result_message.content) + ): + # NOTE: when we push this message back to the model, the model may ignore the + # attached file path etc. since the model is trained to only provide a user message + # with the summary. We keep all generated attachments and then attach them to final message + output_attachments.append(out_attachment) + + # If there are client tool calls, yield a message with only those tool calls + if client_tool_calls: await self.storage.set_in_progress_tool_call_step( session_id, turn_id, ToolExecutionStep( step_id=step_id, turn_id=turn_id, - tool_calls=[tool_call], + tool_calls=client_tool_calls, tool_responses=[], started_at=datetime.now(timezone.utc).isoformat(), ), ) - yield message + + # Create a copy of the message with only client tool calls + client_message = message.model_copy(deep=True) + client_message.tool_calls = client_tool_calls + # NOTE: mark end_of_message to indicate to client that it may + # call the tool and continue the conversation with the tool's response. + client_message.stop_reason = StopReason.end_of_message + + # Yield the message with client tool calls + yield client_message return - # If tool is a builtin server tool, execute it - tool_name = tool_call.tool_name - if isinstance(tool_name, BuiltinTool): - tool_name = tool_name.value - async with tracing.span( - "tool_execution", - { - "tool_name": tool_name, - "input": message.model_dump_json(), - }, - ) as span: - tool_execution_start_time = datetime.now(timezone.utc).isoformat() - tool_call = message.tool_calls[0] - tool_result = await self.execute_tool_call_maybe( - session_id, - tool_call, - ) - if tool_result.content is None: - raise ValueError( - f"Tool call result (id: {tool_call.call_id}, name: {tool_call.tool_name}) does not have any content" - ) - result_messages = [ - ToolResponseMessage( - call_id=tool_call.call_id, - content=tool_result.content, - ) - ] - assert len(result_messages) == 1, "Currently not supporting multiple messages" - result_message = result_messages[0] - span.set_attribute("output", result_message.model_dump_json()) - - yield AgentTurnResponseStreamChunk( - event=AgentTurnResponseEvent( - payload=AgentTurnResponseStepCompletePayload( - step_type=StepType.tool_execution.value, - step_id=step_id, - step_details=ToolExecutionStep( - step_id=step_id, - turn_id=turn_id, - tool_calls=[tool_call], - tool_responses=[ - ToolResponse( - call_id=result_message.call_id, - tool_name=tool_call.tool_name, - content=result_message.content, - metadata=tool_result.metadata, - ) - ], - started_at=tool_execution_start_time, - completed_at=datetime.now(timezone.utc).isoformat(), - ), - ) - ) - ) - - # TODO: add tool-input touchpoint and a "start" event for this step also - # but that needs a lot more refactoring of Tool code potentially - if (type(result_message.content) is str) and ( - out_attachment := _interpret_content_as_attachment(result_message.content) - ): - # NOTE: when we push this message back to the model, the model may ignore the - # attached file path etc. since the model is trained to only provide a user message - # with the summary. 
We keep all generated attachments and then attach them to final message - output_attachments.append(out_attachment) - - input_messages = input_messages + [message, result_message] - async def _initialize_tools( self, toolgroups_for_turn: Optional[List[AgentToolGroup]] = None, diff --git a/llama_stack/providers/inline/safety/llama_guard/llama_guard.py b/llama_stack/providers/inline/safety/llama_guard/llama_guard.py index af0987fa8..e514e3781 100644 --- a/llama_stack/providers/inline/safety/llama_guard/llama_guard.py +++ b/llama_stack/providers/inline/safety/llama_guard/llama_guard.py @@ -227,13 +227,6 @@ class LlamaGuardShield: if len(messages) >= 2 and (messages[0].role == Role.user.value and messages[1].role == Role.user.value): messages = messages[1:] - for i in range(1, len(messages)): - if messages[i].role == messages[i - 1].role: - for i, m in enumerate(messages): - print(f"{i}: {m.role}: {m.content}") - raise ValueError( - f"Messages must alternate between user and assistant. Message {i} has the same role as message {i - 1}" - ) return messages async def run(self, messages: List[Message]) -> RunShieldResponse: diff --git a/tests/integration/agents/test_agents.py b/tests/integration/agents/test_agents.py index ef0e8c05e..581cc9f45 100644 --- a/tests/integration/agents/test_agents.py +++ b/tests/integration/agents/test_agents.py @@ -584,7 +584,7 @@ def test_rag_and_code_agent(llama_stack_client_with_mocked_inference, agent_conf [(get_boiling_point, False), (get_boiling_point_with_metadata, True)], ) def test_create_turn_response(llama_stack_client_with_mocked_inference, agent_config, client_tools): - client_tool, expectes_metadata = client_tools + client_tool, expects_metadata = client_tools agent_config = { **agent_config, "input_shields": [], @@ -610,7 +610,7 @@ def test_create_turn_response(llama_stack_client_with_mocked_inference, agent_co assert steps[0].step_type == "inference" assert steps[1].step_type == "tool_execution" assert steps[1].tool_calls[0].tool_name.startswith("get_boiling_point") - if expectes_metadata: + if expects_metadata: assert steps[1].tool_responses[0].metadata["source"] == "https://www.google.com" assert steps[2].step_type == "inference" @@ -622,3 +622,44 @@ def test_create_turn_response(llama_stack_client_with_mocked_inference, agent_co assert last_step_completed_at < step.started_at assert step.started_at < step.completed_at last_step_completed_at = step.completed_at + + +def test_multi_tool_calls(llama_stack_client_with_mocked_inference, agent_config): + if "gpt" not in agent_config["model"]: + pytest.xfail("Only tested on GPT models") + + agent_config = { + **agent_config, + "tools": [get_boiling_point], + } + + agent = Agent(llama_stack_client_with_mocked_inference, **agent_config) + session_id = agent.create_session(f"test-session-{uuid4()}") + + response = agent.create_turn( + messages=[ + { + "role": "user", + "content": "Call get_boiling_point twice to answer: What is the boiling point of polyjuice in both celsius and fahrenheit?", + }, + ], + session_id=session_id, + stream=False, + ) + steps = response.steps + assert len(steps) == 7 + assert steps[0].step_type == "shield_call" + assert steps[1].step_type == "inference" + assert steps[2].step_type == "shield_call" + assert steps[3].step_type == "tool_execution" + assert steps[4].step_type == "shield_call" + assert steps[5].step_type == "inference" + assert steps[6].step_type == "shield_call" + + tool_execution_step = steps[3] + assert len(tool_execution_step.tool_calls) == 2 + assert 
tool_execution_step.tool_calls[0].tool_name.startswith("get_boiling_point") + assert tool_execution_step.tool_calls[1].tool_name.startswith("get_boiling_point") + + output = response.output_message.content.lower() + assert "-100" in output and "-212" in output From 168cbcbb92c2956475f0b1ee5434cd4b8416e8b2 Mon Sep 17 00:00:00 2001 From: Luis Tomas Bolivar Date: Tue, 18 Mar 2025 14:33:35 +0100 Subject: [PATCH 07/16] fix: Add the option to not verify SSL at remote-vllm provider (#1585) # What does this PR do? Add the option to not verify SSL certificates for the remote-vllm provider. This allows llama stack server to talk to remote LLMs which have self-signed certificates Partially addresses #1545 --- llama_stack/providers/remote/inference/vllm/config.py | 5 +++++ llama_stack/providers/remote/inference/vllm/vllm.py | 7 ++++++- llama_stack/templates/remote-vllm/run-with-safety.yaml | 2 ++ llama_stack/templates/remote-vllm/run.yaml | 1 + 4 files changed, 14 insertions(+), 1 deletion(-) diff --git a/llama_stack/providers/remote/inference/vllm/config.py b/llama_stack/providers/remote/inference/vllm/config.py index c75cc8926..762cffde3 100644 --- a/llama_stack/providers/remote/inference/vllm/config.py +++ b/llama_stack/providers/remote/inference/vllm/config.py @@ -25,6 +25,10 @@ class VLLMInferenceAdapterConfig(BaseModel): default="fake", description="The API token", ) + tls_verify: bool = Field( + default=True, + description="Whether to verify TLS certificates", + ) @classmethod def sample_run_config( @@ -36,4 +40,5 @@ class VLLMInferenceAdapterConfig(BaseModel): "url": url, "max_tokens": "${env.VLLM_MAX_TOKENS:4096}", "api_token": "${env.VLLM_API_TOKEN:fake}", + "tls_verify": "${env.VLLM_TLS_VERIFY:true}", } diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py index 4d7e66d78..f940de7ba 100644 --- a/llama_stack/providers/remote/inference/vllm/vllm.py +++ b/llama_stack/providers/remote/inference/vllm/vllm.py @@ -7,6 +7,7 @@ import json import logging from typing import AsyncGenerator, List, Optional, Union +import httpx from openai import AsyncOpenAI from openai.types.chat.chat_completion_chunk import ( ChatCompletionChunk as OpenAIChatCompletionChunk, @@ -229,7 +230,11 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): async def initialize(self) -> None: log.info(f"Initializing VLLM client with base_url={self.config.url}") - self.client = AsyncOpenAI(base_url=self.config.url, api_key=self.config.api_token) + self.client = AsyncOpenAI( + base_url=self.config.url, + api_key=self.config.api_token, + http_client=None if self.config.tls_verify else httpx.AsyncClient(verify=False), + ) async def shutdown(self) -> None: pass diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml index 9741f5302..3830ffcdb 100644 --- a/llama_stack/templates/remote-vllm/run-with-safety.yaml +++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml @@ -18,12 +18,14 @@ providers: url: ${env.VLLM_URL} max_tokens: ${env.VLLM_MAX_TOKENS:4096} api_token: ${env.VLLM_API_TOKEN:fake} + tls_verify: ${env.VLLM_TLS_VERIFY:true} - provider_id: vllm-safety provider_type: remote::vllm config: url: ${env.SAFETY_VLLM_URL} max_tokens: ${env.VLLM_MAX_TOKENS:4096} api_token: ${env.VLLM_API_TOKEN:fake} + tls_verify: ${env.VLLM_TLS_VERIFY:true} - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} diff --git a/llama_stack/templates/remote-vllm/run.yaml 
b/llama_stack/templates/remote-vllm/run.yaml index e26b20e88..b6bba1252 100644 --- a/llama_stack/templates/remote-vllm/run.yaml +++ b/llama_stack/templates/remote-vllm/run.yaml @@ -18,6 +18,7 @@ providers: url: ${env.VLLM_URL} max_tokens: ${env.VLLM_MAX_TOKENS:4096} api_token: ${env.VLLM_API_TOKEN:fake} + tls_verify: ${env.VLLM_TLS_VERIFY:true} - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} From ffe9b3b278fa726dece81a1ea150db7ed1705754 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Tue, 18 Mar 2025 16:54:42 +0100 Subject: [PATCH 08/16] ci(ollama): run more integration tests (#1636) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Run additional tests in a matrix to accelerate the process and clearly identify failing providers. Signed-off-by: Sébastien Han --- .github/workflows/integration-tests.yml | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index ec782c331..86adf8a15 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -15,8 +15,14 @@ on: - '.github/workflows/integration-tests.yml' # This workflow jobs: - ollama: + test-matrix: runs-on: ubuntu-latest + strategy: + matrix: + # Listing tests manually since some of them currently fail + # TODO: generate matrix list from tests/integration when fixed + test-type: [inference, datasets, inspect, scoring, post_training, providers] + fail-fast: false # we want to run all tests regardless of failure steps: - name: Checkout repository @@ -43,6 +49,8 @@ jobs: run: | uv sync --extra dev --extra test uv pip install ollama faiss-cpu + # always test against the latest version of the client + uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main uv pip install -e . - name: Wait for Ollama to start @@ -72,17 +80,17 @@ jobs: echo "Waiting for Llama Stack server..." for i in {1..30}; do if curl -s http://localhost:8321/v1/health | grep -q "OK"; then - echo " Llama Stack server is up!" + echo "Llama Stack server is up!" exit 0 fi sleep 1 done - echo " Llama Stack server failed to start" + echo "Llama Stack server failed to start" cat server.log exit 1 - - name: Run Inference Integration Tests + - name: Run Integration Tests env: INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" run: | - uv run pytest -v tests/integration/inference --stack-config=ollama --text-model="meta-llama/Llama-3.2-3B-Instruct" --embedding-model=all-MiniLM-L6-v2 + uv run pytest -v tests/integration/${{ matrix.test-type }} --stack-config=ollama --text-model="meta-llama/Llama-3.2-3B-Instruct" --embedding-model=all-MiniLM-L6-v2 From f4dc29070546c59392d83f84c94d9b59191871ae Mon Sep 17 00:00:00 2001 From: Jamie Land <38305141+jland-redhat@users.noreply.github.com> Date: Tue, 18 Mar 2025 12:26:49 -0400 Subject: [PATCH 09/16] feat: Created Playground Containerfile and Image Workflow (#1256) # What does this PR do? Adds a container file that can be used to build the playground UI. This file will be built by this PR in the stack-ops repo: https://github.com/meta-llama/llama-stack-ops/pull/9 Docker command in the docs will need to change once I know the address of the official repository. 
## Test Plan Tested image on my local Openshift Instance using this helm chart: https://github.com/Jaland/llama-stack-helm/tree/main/llama-stack [//]: # (## Documentation) --------- Co-authored-by: Jamie Land --- docs/source/playground/index.md | 27 +++++++++++++++++++++++ llama_stack/distribution/ui/Containerfile | 11 +++++++++ llama_stack/distribution/ui/README.md | 10 +++++++++ 3 files changed, 48 insertions(+) create mode 100644 llama_stack/distribution/ui/Containerfile diff --git a/docs/source/playground/index.md b/docs/source/playground/index.md index 9691609ab..1d52de73f 100644 --- a/docs/source/playground/index.md +++ b/docs/source/playground/index.md @@ -92,6 +92,8 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie ## Starting the Llama Stack Playground +### Llama CLI + To start the Llama Stack Playground, run the following commands: 1. Start up the Llama Stack API server @@ -107,3 +109,28 @@ cd llama_stack/distribution/ui pip install -r requirements.txt streamlit run app.py ``` + +### Docker + +Playground can also be started in a docker image: + +```sh +export LLAMA_STACK_URL=http://localhost:11434 + +docker run \ + -p 8501:8501 \ + -e LLAMA_STACK_ENDPOINT=$LLAMA_STACK_URL \ + quay.io/jland/llama-stack-playground +``` + +## Configurable Environment Variables + +## Environment Variables + +| Environment Variable | Description | Default Value | +|----------------------------|------------------------------------|---------------------------| +| LLAMA_STACK_ENDPOINT | The endpoint for the Llama Stack | http://localhost:8321 | +| FIREWORKS_API_KEY | API key for Fireworks provider | (empty string) | +| TOGETHER_API_KEY | API key for Together provider | (empty string) | +| SAMBANOVA_API_KEY | API key for SambaNova provider | (empty string) | +| OPENAI_API_KEY | API key for OpenAI provider | (empty string) | diff --git a/llama_stack/distribution/ui/Containerfile b/llama_stack/distribution/ui/Containerfile new file mode 100644 index 000000000..a97f25753 --- /dev/null +++ b/llama_stack/distribution/ui/Containerfile @@ -0,0 +1,11 @@ +# More info on playground configuration can be found here: +# https://llama-stack.readthedocs.io/en/latest/playground + +FROM python:3.9-slim +WORKDIR /app +COPY . 
/app/ +RUN /usr/local/bin/python -m pip install --upgrade pip && \ + /usr/local/bin/pip3 install -r requirements.txt +EXPOSE 8501 + +ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"] diff --git a/llama_stack/distribution/ui/README.md b/llama_stack/distribution/ui/README.md index f3df3f07a..fe660544f 100644 --- a/llama_stack/distribution/ui/README.md +++ b/llama_stack/distribution/ui/README.md @@ -40,3 +40,13 @@ cd llama_stack/distribution/ui pip install -r requirements.txt streamlit run app.py ``` + +## Environment Variables + +| Environment Variable | Description | Default Value | +|----------------------------|------------------------------------|---------------------------| +| LLAMA_STACK_ENDPOINT | The endpoint for the Llama Stack | http://localhost:8321 | +| FIREWORKS_API_KEY | API key for Fireworks provider | (empty string) | +| TOGETHER_API_KEY | API key for Together provider | (empty string) | +| SAMBANOVA_API_KEY | API key for SambaNova provider | (empty string) | +| OPENAI_API_KEY | API key for OpenAI provider | (empty string) | From 706b4ca651928941ec778bccbf1b9f5751025b79 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Tue, 18 Mar 2025 13:54:10 -0500 Subject: [PATCH 10/16] feat: support nvidia hosted vision models (llama 3.2 11b/90b) (#1278) # What does this PR do? support nvidia hosted 3.2 11b/90b vision models. they are not hosted on the common https://integrate.api.nvidia.com/v1. they are hosted on their own individual urls. ## Test Plan `LLAMA_STACK_BASE_URL=http://localhost:8321 pytest -s -v tests/client-sdk/inference/test_vision_inference.py --inference-model=meta/llama-3.2-11b-vision-instruct -k image` --- .../remote/inference/nvidia/nvidia.py | 51 +++++++++++++++---- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py index b59da79eb..69e6335c6 100644 --- a/llama_stack/providers/remote/inference/nvidia/nvidia.py +++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py @@ -6,6 +6,7 @@ import logging import warnings +from functools import lru_cache from typing import AsyncIterator, List, Optional, Union from openai import APIConnectionError, AsyncOpenAI, BadRequestError @@ -82,12 +83,42 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper): # ) self._config = config - # make sure the client lives longer than any async calls - self._client = AsyncOpenAI( - base_url=f"{self._config.url}/v1", - api_key=(self._config.api_key.get_secret_value() if self._config.api_key else "NO KEY"), - timeout=self._config.timeout, - ) + + @lru_cache # noqa: B019 + def _get_client(self, provider_model_id: str) -> AsyncOpenAI: + """ + For hosted models, https://integrate.api.nvidia.com/v1 is the primary base_url. However, + some models are hosted on different URLs. This function returns the appropriate client + for the given provider_model_id. + + This relies on lru_cache and self._default_client to avoid creating a new client for each request + or for each model that is hosted on https://integrate.api.nvidia.com/v1. + + :param provider_model_id: The provider model ID + :return: An OpenAI client + """ + + @lru_cache # noqa: B019 + def _get_client_for_base_url(base_url: str) -> AsyncOpenAI: + """ + Maintain a single OpenAI client per base_url. 
+ """ + return AsyncOpenAI( + base_url=base_url, + api_key=(self._config.api_key.get_secret_value() if self._config.api_key else "NO KEY"), + timeout=self._config.timeout, + ) + + special_model_urls = { + "meta/llama-3.2-11b-vision-instruct": "https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-11b-vision-instruct", + "meta/llama-3.2-90b-vision-instruct": "https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-90b-vision-instruct", + } + + base_url = f"{self._config.url}/v1" + if _is_nvidia_hosted(self._config) and provider_model_id in special_model_urls: + base_url = special_model_urls[provider_model_id] + + return _get_client_for_base_url(base_url) async def completion( self, @@ -105,9 +136,10 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper): await check_health(self._config) # this raises errors + provider_model_id = self.get_provider_model_id(model_id) request = convert_completion_request( request=CompletionRequest( - model=self.get_provider_model_id(model_id), + model=provider_model_id, content=content, sampling_params=sampling_params, response_format=response_format, @@ -118,7 +150,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper): ) try: - response = await self._client.completions.create(**request) + response = await self._get_client(provider_model_id).completions.create(**request) except APIConnectionError as e: raise ConnectionError(f"Failed to connect to NVIDIA NIM at {self._config.url}: {e}") from e @@ -206,6 +238,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper): await check_health(self._config) # this raises errors + provider_model_id = self.get_provider_model_id(model_id) request = await convert_chat_completion_request( request=ChatCompletionRequest( model=self.get_provider_model_id(model_id), @@ -221,7 +254,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper): ) try: - response = await self._client.chat.completions.create(**request) + response = await self._get_client(provider_model_id).chat.completions.create(**request) except APIConnectionError as e: raise ConnectionError(f"Failed to connect to NVIDIA NIM at {self._config.url}: {e}") from e From 814eb753216bf1e12c26a6e9267449e3fc167dbf Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Tue, 18 Mar 2025 15:17:21 -0400 Subject: [PATCH 11/16] chore: enable ruff for ./scripts too (#1643) # What does this PR do? Enable ruff for scripts. [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan [Describe the tests you ran to verify your changes with result summaries. 
*Provide clear instructions so the plan can be easily re-executed.*] [//]: # (## Documentation) Signed-off-by: Ihar Hrachyshka --- pyproject.toml | 1 - scripts/gen-changelog.py | 8 +++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a006d69f9..7972bf211 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -114,7 +114,6 @@ exclude = [ "./.git", "./docs/*", "./build", - "./scripts", "./venv", "*.pyi", ".pre-commit-config.yaml", diff --git a/scripts/gen-changelog.py b/scripts/gen-changelog.py index ac4053339..3df2af06b 100755 --- a/scripts/gen-changelog.py +++ b/scripts/gen-changelog.py @@ -11,7 +11,7 @@ import requests def get_all_releases(token): - url = f"https://api.github.com/repos/meta-llama/llama-stack/releases" + url = "https://api.github.com/repos/meta-llama/llama-stack/releases" headers = {"Accept": "application/vnd.github.v3+json"} if token: @@ -22,9 +22,7 @@ def get_all_releases(token): if response.status_code == 200: return response.json() else: - raise Exception( - f"Error fetching releases: {response.status_code}, {response.text}" - ) + raise Exception(f"Error fetching releases: {response.status_code}, {response.text}") def clean_release_body(body): @@ -55,7 +53,7 @@ def merge_release_notes(output_file, token=None): releases = get_all_releases(token) with open(output_file, "w", encoding="utf-8") as md_file: - md_file.write(f"# Changelog\n\n") + md_file.write("# Changelog\n\n") for release in releases: md_file.write(f"# {release['tag_name']}\n") From 141b3c14dd23822117e8136d6ab466ca09170e3a Mon Sep 17 00:00:00 2001 From: Nathan Weinberg <31703736+nathan-weinberg@users.noreply.github.com> Date: Tue, 18 Mar 2025 16:39:46 -0400 Subject: [PATCH 12/16] docs: fix broken test path in CONTRIBUTING.md (#1679) # What does this PR do? fix broken test path in CONTRIBUTING.md Signed-off-by: Nathan Weinberg --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e458fec0a..505d6b162 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -86,7 +86,7 @@ LLAMA_STACK_CONFIG= And then use this dotenv file when running client SDK tests via the following: ```bash -uv run --env-file .env -- pytest -v tests/api/inference/test_text_inference.py +uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py ``` ## Pre-commit Hooks From cca9bd6cc3d0fc3046b91bd6fecf9b4a85f466cf Mon Sep 17 00:00:00 2001 From: Daniele Martinoli <86618610+dmartinol@users.noreply.github.com> Date: Tue, 18 Mar 2025 22:04:21 +0100 Subject: [PATCH 13/16] feat: Qdrant inline provider (#1273) # What does this PR do? Removed local execution option from the remote Qdrant provider and introduced an explicit inline provider for the embedded execution. Updated the ollama template to include this option: this part can be reverted in case we don't want to have two default `vector_io` providers. 
(Closes #1082) ## Test Plan Build and run an ollama distro: ```bash llama stack build --template ollama --image-type conda llama stack run --image-type conda ollama ``` Run one of the sample ingestionapplicatinos like [rag_with_vector_db.py](https://github.com/meta-llama/llama-stack-apps/blob/main/examples/agents/rag_with_vector_db.py), but replace this line: ```py selected_vector_provider = vector_providers[0] ``` with the following, to use the `qdrant` provider: ```py selected_vector_provider = vector_providers[1] ``` After running the test code, verify the timestamp of the Qdrant store: ```bash % ls -ltr ~/.llama/distributions/ollama/qdrant.db/collection/test_vector_db_* total 784 -rw-r--r--@ 1 dmartino staff 401408 Feb 26 10:07 storage.sqlite ``` [//]: # (## Documentation) --------- Signed-off-by: Daniele Martinoli Co-authored-by: Francisco Arceo --- docs/source/providers/vector_io/qdrant.md | 21 +- .../inline/vector_io/qdrant/__init__.py | 19 ++ .../inline/vector_io/qdrant/config.py | 23 ++ llama_stack/providers/registry/vector_io.py | 8 + .../remote/vector_io/qdrant/config.py | 1 - .../remote/vector_io/qdrant/qdrant.py | 20 +- pyproject.toml | 3 +- tests/unit/providers/vector_io/conftest.py | 42 ++++ tests/unit/providers/vector_io/test_qdrant.py | 135 ++++++++++++ .../providers/vector_io/test_sqlite_vec.py | 32 +-- uv.lock | 198 +++++++++++++++++- 11 files changed, 454 insertions(+), 48 deletions(-) create mode 100644 llama_stack/providers/inline/vector_io/qdrant/__init__.py create mode 100644 llama_stack/providers/inline/vector_io/qdrant/config.py create mode 100644 tests/unit/providers/vector_io/conftest.py create mode 100644 tests/unit/providers/vector_io/test_qdrant.py diff --git a/docs/source/providers/vector_io/qdrant.md b/docs/source/providers/vector_io/qdrant.md index a0de0be98..8b0cbeef8 100644 --- a/docs/source/providers/vector_io/qdrant.md +++ b/docs/source/providers/vector_io/qdrant.md @@ -3,21 +3,36 @@ orphan: true --- # Qdrant -[Qdrant](https://qdrant.tech/documentation/) is a remote vector database provider for Llama Stack. It +[Qdrant](https://qdrant.tech/documentation/) is an inline and remote vector database provider for Llama Stack. It allows you to store and query vectors directly in memory. That means you'll get fast and efficient vector retrieval. +> By default, Qdrant stores vectors in RAM, delivering incredibly fast access for datasets that fit comfortably in +> memory. But when your dataset exceeds RAM capacity, Qdrant offers Memmap as an alternative. +> +> \[[An Introduction to Vector Databases](https://qdrant.tech/articles/what-is-a-vector-database/)\] + + + ## Features -- Easy to use +- Lightweight and easy to use - Fully integrated with Llama Stack +- Apache 2.0 license terms +- Store embeddings and their metadata +- Supports search by + [Keyword](https://qdrant.tech/articles/qdrant-introduces-full-text-filters-and-indexes/) + and [Hybrid](https://qdrant.tech/articles/hybrid-search/#building-a-hybrid-search-system-in-qdrant) search +- [Multilingual and Multimodal retrieval](https://qdrant.tech/documentation/multimodal-search/) +- [Medatata filtering](https://qdrant.tech/articles/vector-search-filtering/) +- [GPU support](https://qdrant.tech/documentation/guides/running-with-gpu/) ## Usage To use Qdrant in your Llama Stack project, follow these steps: 1. Install the necessary dependencies. -2. Configure your Llama Stack project to use Faiss. +2. Configure your Llama Stack project to use Qdrant. 3. Start storing and querying vectors. 
## Installation diff --git a/llama_stack/providers/inline/vector_io/qdrant/__init__.py b/llama_stack/providers/inline/vector_io/qdrant/__init__.py new file mode 100644 index 000000000..8f0b91c61 --- /dev/null +++ b/llama_stack/providers/inline/vector_io/qdrant/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from typing import Dict + +from llama_stack.providers.datatypes import Api, ProviderSpec + +from .config import QdrantVectorIOConfig + + +async def get_adapter_impl(config: QdrantVectorIOConfig, deps: Dict[Api, ProviderSpec]): + from llama_stack.providers.remote.vector_io.qdrant.qdrant import QdrantVectorIOAdapter + + impl = QdrantVectorIOAdapter(config, deps[Api.inference]) + await impl.initialize() + return impl diff --git a/llama_stack/providers/inline/vector_io/qdrant/config.py b/llama_stack/providers/inline/vector_io/qdrant/config.py new file mode 100644 index 000000000..282e951b0 --- /dev/null +++ b/llama_stack/providers/inline/vector_io/qdrant/config.py @@ -0,0 +1,23 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + + +from typing import Any, Dict + +from pydantic import BaseModel + +from llama_stack.schema_utils import json_schema_type + + +@json_schema_type +class QdrantVectorIOConfig(BaseModel): + path: str + + @classmethod + def sample_run_config(cls, __distro_dir__: str) -> Dict[str, Any]: + return { + "path": "${env.QDRANT_PATH:~/.llama/" + __distro_dir__ + "}/" + "qdrant.db", + } diff --git a/llama_stack/providers/registry/vector_io.py b/llama_stack/providers/registry/vector_io.py index fbc495d83..93031763d 100644 --- a/llama_stack/providers/registry/vector_io.py +++ b/llama_stack/providers/registry/vector_io.py @@ -92,6 +92,14 @@ def available_providers() -> List[ProviderSpec]: ), api_dependencies=[Api.inference], ), + InlineProviderSpec( + api=Api.vector_io, + provider_type="inline::qdrant", + pip_packages=["qdrant-client"], + module="llama_stack.providers.inline.vector_io.qdrant", + config_class="llama_stack.providers.inline.vector_io.qdrant.QdrantVectorIOConfig", + api_dependencies=[Api.inference], + ), remote_provider_spec( Api.vector_io, AdapterSpec( diff --git a/llama_stack/providers/remote/vector_io/qdrant/config.py b/llama_stack/providers/remote/vector_io/qdrant/config.py index ce68aa492..6d7eebe23 100644 --- a/llama_stack/providers/remote/vector_io/qdrant/config.py +++ b/llama_stack/providers/remote/vector_io/qdrant/config.py @@ -23,7 +23,6 @@ class QdrantVectorIOConfig(BaseModel): prefix: Optional[str] = None timeout: Optional[int] = None host: Optional[str] = None - path: Optional[str] = None @classmethod def sample_run_config(cls, **kwargs: Any) -> Dict[str, Any]: diff --git a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py index 586b8ca95..9e7788dc0 100644 --- a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py +++ b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py @@ -6,7 +6,7 @@ import logging import uuid -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union from numpy.typing import NDArray from qdrant_client import AsyncQdrantClient, models @@ -16,12 +16,13 @@ from llama_stack.apis.inference 
import InterleavedContent from llama_stack.apis.vector_dbs import VectorDB from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate +from llama_stack.providers.inline.vector_io.qdrant import QdrantVectorIOConfig as InlineQdrantVectorIOConfig from llama_stack.providers.utils.memory.vector_store import ( EmbeddingIndex, VectorDBWithIndex, ) -from .config import QdrantVectorIOConfig +from .config import QdrantVectorIOConfig as RemoteQdrantVectorIOConfig log = logging.getLogger(__name__) CHUNK_ID_KEY = "_chunk_id" @@ -99,17 +100,19 @@ class QdrantIndex(EmbeddingIndex): class QdrantVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate): - def __init__(self, config: QdrantVectorIOConfig, inference_api: Api.inference) -> None: + def __init__( + self, config: Union[RemoteQdrantVectorIOConfig, InlineQdrantVectorIOConfig], inference_api: Api.inference + ) -> None: self.config = config - self.client = AsyncQdrantClient(**self.config.model_dump(exclude_none=True)) + self.client: AsyncQdrantClient = None self.cache = {} self.inference_api = inference_api async def initialize(self) -> None: - pass + self.client = AsyncQdrantClient(**self.config.model_dump(exclude_none=True)) async def shutdown(self) -> None: - self.client.close() + await self.client.close() async def register_vector_db( self, @@ -123,6 +126,11 @@ class QdrantVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate): self.cache[vector_db.identifier] = index + async def unregister_vector_db(self, vector_db_id: str) -> None: + if vector_db_id in self.cache: + await self.cache[vector_db_id].index.delete() + del self.cache[vector_db_id] + async def _get_and_cache_vector_db_index(self, vector_db_id: str) -> Optional[VectorDBWithIndex]: if vector_db_id in self.cache: return self.cache[vector_db_id] diff --git a/pyproject.toml b/pyproject.toml index 7972bf211..f57b91462 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,7 +56,7 @@ dev = [ "ruamel.yaml", # needed for openapi generator ] # These are the dependencies required for running unit tests. -unit = ["sqlite-vec", "openai", "aiosqlite", "pypdf", "chardet"] +unit = ["sqlite-vec", "openai", "aiosqlite", "pypdf", "chardet", "qdrant-client"] # These are the core dependencies required for running integration tests. They are shared across all # providers. If a provider requires additional dependencies, please add them to your environment # separately. If you are using "uv" to execute your tests, you can use the "--with" flag to specify extra @@ -247,6 +247,7 @@ exclude = [ "^llama_stack/providers/inline/vector_io/chroma/", "^llama_stack/providers/inline/vector_io/faiss/", "^llama_stack/providers/inline/vector_io/milvus/", + "^llama_stack/providers/inline/vector_io/qdrant/", "^llama_stack/providers/inline/vector_io/sqlite_vec/", "^llama_stack/providers/remote/agents/sample/", "^llama_stack/providers/remote/datasetio/huggingface/", diff --git a/tests/unit/providers/vector_io/conftest.py b/tests/unit/providers/vector_io/conftest.py new file mode 100644 index 000000000..3bcd0613f --- /dev/null +++ b/tests/unit/providers/vector_io/conftest.py @@ -0,0 +1,42 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +import random + +import numpy as np +import pytest + +from llama_stack.apis.vector_io import Chunk + +EMBEDDING_DIMENSION = 384 + + +@pytest.fixture +def vector_db_id() -> str: + return f"test-vector-db-{random.randint(1, 100)}" + + +@pytest.fixture(scope="session") +def embedding_dimension() -> int: + return EMBEDDING_DIMENSION + + +@pytest.fixture(scope="session") +def sample_chunks(): + """Generates chunks that force multiple batches for a single document to expose ID conflicts.""" + n, k = 10, 3 + sample = [ + Chunk(content=f"Sentence {i} from document {j}", metadata={"document_id": f"document-{j}"}) + for j in range(k) + for i in range(n) + ] + return sample + + +@pytest.fixture(scope="session") +def sample_embeddings(sample_chunks): + np.random.seed(42) + return np.array([np.random.rand(EMBEDDING_DIMENSION).astype(np.float32) for _ in sample_chunks]) diff --git a/tests/unit/providers/vector_io/test_qdrant.py b/tests/unit/providers/vector_io/test_qdrant.py new file mode 100644 index 000000000..bc97719c0 --- /dev/null +++ b/tests/unit/providers/vector_io/test_qdrant.py @@ -0,0 +1,135 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import asyncio +import os +from typing import Any +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +import pytest_asyncio + +from llama_stack.apis.inference import EmbeddingsResponse, Inference +from llama_stack.apis.vector_io import ( + QueryChunksResponse, + VectorDB, + VectorDBStore, +) +from llama_stack.providers.inline.vector_io.qdrant.config import ( + QdrantVectorIOConfig as InlineQdrantVectorIOConfig, +) +from llama_stack.providers.remote.vector_io.qdrant.qdrant import ( + QdrantVectorIOAdapter, +) + +# This test is a unit test for the QdrantVectorIOAdapter class. This should only contain +# tests which are specific to this class. 
More general (API-level) tests should be placed in +# tests/integration/vector_io/ +# +# How to run this test: +# +# pytest tests/unit/providers/vector_io/test_qdrant.py \ +# -v -s --tb=short --disable-warnings --asyncio-mode=auto + + +@pytest.fixture +def qdrant_config(tmp_path) -> InlineQdrantVectorIOConfig: + return InlineQdrantVectorIOConfig(path=os.path.join(tmp_path, "qdrant.db")) + + +@pytest.fixture(scope="session") +def loop(): + return asyncio.new_event_loop() + + +@pytest.fixture +def mock_vector_db(vector_db_id) -> MagicMock: + mock_vector_db = MagicMock(spec=VectorDB) + mock_vector_db.embedding_model = "embedding_model" + mock_vector_db.identifier = vector_db_id + return mock_vector_db + + +@pytest.fixture +def mock_vector_db_store(mock_vector_db) -> MagicMock: + mock_store = MagicMock(spec=VectorDBStore) + mock_store.get_vector_db = AsyncMock(return_value=mock_vector_db) + return mock_store + + +@pytest.fixture +def mock_api_service(sample_embeddings): + mock_api_service = MagicMock(spec=Inference) + mock_api_service.embeddings = AsyncMock(return_value=EmbeddingsResponse(embeddings=sample_embeddings)) + return mock_api_service + + +@pytest_asyncio.fixture +async def qdrant_adapter(qdrant_config, mock_vector_db_store, mock_api_service, loop) -> QdrantVectorIOAdapter: + adapter = QdrantVectorIOAdapter(config=qdrant_config, inference_api=mock_api_service) + adapter.vector_db_store = mock_vector_db_store + await adapter.initialize() + yield adapter + await adapter.shutdown() + + +__QUERY = "Sample query" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("max_query_chunks, expected_chunks", [(2, 2), (100, 30)]) +async def test_qdrant_adapter_returns_expected_chunks( + qdrant_adapter: QdrantVectorIOAdapter, + vector_db_id, + sample_chunks, + sample_embeddings, + max_query_chunks, + expected_chunks, +) -> None: + assert qdrant_adapter is not None + await qdrant_adapter.insert_chunks(vector_db_id, sample_chunks) + + index = await qdrant_adapter._get_and_cache_vector_db_index(vector_db_id=vector_db_id) + assert index is not None + + response = await qdrant_adapter.query_chunks( + query=__QUERY, + vector_db_id=vector_db_id, + params={"max_chunks": max_query_chunks}, + ) + assert isinstance(response, QueryChunksResponse) + assert len(response.chunks) == expected_chunks + + +# To by-pass attempt to convert a Mock to JSON +def _prepare_for_json(value: Any) -> str: + return str(value) + + +@patch("llama_stack.providers.utils.telemetry.trace_protocol._prepare_for_json", new=_prepare_for_json) +@pytest.mark.asyncio +async def test_qdrant_register_and_unregister_vector_db( + qdrant_adapter: QdrantVectorIOAdapter, + mock_vector_db, + sample_chunks, +) -> None: + # Initially, no collections + vector_db_id = mock_vector_db.identifier + assert len((await qdrant_adapter.client.get_collections()).collections) == 0 + + # Register does not create a collection + assert not (await qdrant_adapter.client.collection_exists(vector_db_id)) + await qdrant_adapter.register_vector_db(mock_vector_db) + assert not (await qdrant_adapter.client.collection_exists(vector_db_id)) + + # First insert creates the collection + await qdrant_adapter.insert_chunks(vector_db_id, sample_chunks) + assert await qdrant_adapter.client.collection_exists(vector_db_id) + + # Unregister deletes the collection + await qdrant_adapter.unregister_vector_db(vector_db_id) + assert not (await qdrant_adapter.client.collection_exists(vector_db_id)) + assert len((await qdrant_adapter.client.get_collections()).collections) == 0 diff --git 
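Taken together, the new inline provider and its unit test describe a small lifecycle: build the adapter from a path-based config, create the `AsyncQdrantClient` lazily in `initialize()`, materialize a collection only on the first insert, and drop it again on unregister. The sketch below walks that lifecycle with a mocked `Inference` dependency; the local path and the use of a mock are illustrative assumptions, not part of the patch.

```
import asyncio
from unittest.mock import AsyncMock, MagicMock

from llama_stack.apis.inference import Inference
from llama_stack.providers.inline.vector_io.qdrant.config import QdrantVectorIOConfig
from llama_stack.providers.remote.vector_io.qdrant.qdrant import QdrantVectorIOAdapter


async def main() -> None:
    # Inline (path-based) config: Qdrant persists to a local directory, no server needed.
    config = QdrantVectorIOConfig(path="/tmp/qdrant.db")

    # Any Inference implementation will do for construction; a mock keeps the sketch small.
    inference = MagicMock(spec=Inference, embeddings=AsyncMock())

    adapter = QdrantVectorIOAdapter(config=config, inference_api=inference)
    await adapter.initialize()  # the AsyncQdrantClient is created here, not in __init__

    # register_vector_db() only caches an index; the Qdrant collection itself is created by
    # the first insert_chunks() call and removed again by unregister_vector_db().

    await adapter.shutdown()  # close() is awaited now that the client is async


asyncio.run(main())
```

When the provider is loaded from a distribution config, `get_adapter_impl()` in the new `__init__.py` performs the same construction and `initialize()` steps.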
a/tests/unit/providers/vector_io/test_sqlite_vec.py b/tests/unit/providers/vector_io/test_sqlite_vec.py index eb5660a85..cff988c53 100644 --- a/tests/unit/providers/vector_io/test_sqlite_vec.py +++ b/tests/unit/providers/vector_io/test_sqlite_vec.py @@ -29,8 +29,6 @@ from llama_stack.providers.inline.vector_io.sqlite_vec.sqlite_vec import ( # -v -s --tb=short --disable-warnings --asyncio-mode=auto SQLITE_VEC_PROVIDER = "sqlite_vec" -EMBEDDING_DIMENSION = 384 -EMBEDDING_MODEL = "all-MiniLM-L6-v2" @pytest.fixture(scope="session") @@ -50,26 +48,8 @@ def sqlite_connection(loop): @pytest_asyncio.fixture(scope="session", autouse=True) -async def sqlite_vec_index(sqlite_connection): - return await SQLiteVecIndex.create(dimension=EMBEDDING_DIMENSION, connection=sqlite_connection, bank_id="test_bank") - - -@pytest.fixture(scope="session") -def sample_chunks(): - """Generates chunks that force multiple batches for a single document to expose ID conflicts.""" - n, k = 10, 3 - sample = [ - Chunk(content=f"Sentence {i} from document {j}", metadata={"document_id": f"document-{j}"}) - for j in range(k) - for i in range(n) - ] - return sample - - -@pytest.fixture(scope="session") -def sample_embeddings(sample_chunks): - np.random.seed(42) - return np.array([np.random.rand(EMBEDDING_DIMENSION).astype(np.float32) for _ in sample_chunks]) +async def sqlite_vec_index(sqlite_connection, embedding_dimension): + return await SQLiteVecIndex.create(dimension=embedding_dimension, connection=sqlite_connection, bank_id="test_bank") @pytest.mark.asyncio @@ -82,21 +62,21 @@ async def test_add_chunks(sqlite_vec_index, sample_chunks, sample_embeddings): @pytest.mark.asyncio -async def test_query_chunks(sqlite_vec_index, sample_chunks, sample_embeddings): +async def test_query_chunks(sqlite_vec_index, sample_chunks, sample_embeddings, embedding_dimension): await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings) - query_embedding = np.random.rand(EMBEDDING_DIMENSION).astype(np.float32) + query_embedding = np.random.rand(embedding_dimension).astype(np.float32) response = await sqlite_vec_index.query(query_embedding, k=2, score_threshold=0.0) assert isinstance(response, QueryChunksResponse) assert len(response.chunks) == 2 @pytest.mark.asyncio -async def test_chunk_id_conflict(sqlite_vec_index, sample_chunks): +async def test_chunk_id_conflict(sqlite_vec_index, sample_chunks, embedding_dimension): """Test that chunk IDs do not conflict across batches when inserting chunks.""" # Reduce batch size to force multiple batches for same document # since there are 10 chunks per document and batch size is 2 batch_size = 2 - sample_embeddings = np.random.rand(len(sample_chunks), EMBEDDING_DIMENSION).astype(np.float32) + sample_embeddings = np.random.rand(len(sample_chunks), embedding_dimension).astype(np.float32) await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings, batch_size=batch_size) diff --git a/uv.lock b/uv.lock index 860b29241..b63d23b14 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,4 @@ version = 1 -revision = 1 requires-python = ">=3.10" resolution-markers = [ "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')", @@ -8,9 +7,12 @@ resolution-markers = [ "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version == 
'3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.11.*' and sys_platform == 'darwin'", - "(python_full_version >= '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", - "python_full_version >= '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "python_full_version >= '3.12' and sys_platform == 'darwin'", + "(python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.13' and sys_platform != 'darwin' and sys_platform != 'linux')", + "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version >= '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version >= '3.13' and sys_platform == 'darwin'", + "python_full_version == '3.12.*' and sys_platform == 'darwin'", ] [[package]] @@ -793,6 +795,107 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/89/30/2bd0eb03a7dee7727cd2ec643d1e992979e62d5e7443507381cce0455132/googleapis_common_protos-1.67.0-py2.py3-none-any.whl", hash = "sha256:579de760800d13616f51cf8be00c876f00a9f146d3e6510e19d1f4111758b741", size = 164985 }, ] +[[package]] +name = "grpcio" +version = "1.71.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1c/95/aa11fc09a85d91fbc7dd405dcb2a1e0256989d67bf89fa65ae24b3ba105a/grpcio-1.71.0.tar.gz", hash = "sha256:2b85f7820475ad3edec209d3d89a7909ada16caab05d3f2e08a7e8ae3200a55c", size = 12549828 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7c/c5/ef610b3f988cc0cc67b765f72b8e2db06a1db14e65acb5ae7810a6b7042e/grpcio-1.71.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:c200cb6f2393468142eb50ab19613229dcc7829b5ccee8b658a36005f6669fdd", size = 5210643 }, + { url = "https://files.pythonhosted.org/packages/bf/de/c84293c961622df302c0d5d07ec6e2d4cd3874ea42f602be2df09c4ad44f/grpcio-1.71.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:b2266862c5ad664a380fbbcdbdb8289d71464c42a8c29053820ee78ba0119e5d", size = 11308962 }, + { url = "https://files.pythonhosted.org/packages/7c/38/04c9e0dc8c904570c80faa1f1349b190b63e45d6b2782ec8567b050efa9d/grpcio-1.71.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:0ab8b2864396663a5b0b0d6d79495657ae85fa37dcb6498a2669d067c65c11ea", size = 5699236 }, + { url = "https://files.pythonhosted.org/packages/95/96/e7be331d1298fa605ea7c9ceafc931490edd3d5b33c4f695f1a0667f3491/grpcio-1.71.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c30f393f9d5ff00a71bb56de4aa75b8fe91b161aeb61d39528db6b768d7eac69", size = 6339767 }, + { url = "https://files.pythonhosted.org/packages/5d/b7/7e7b7bb6bb18baf156fd4f2f5b254150dcdd6cbf0def1ee427a2fb2bfc4d/grpcio-1.71.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f250ff44843d9a0615e350c77f890082102a0318d66a99540f54769c8766ab73", size = 5943028 }, + { url = "https://files.pythonhosted.org/packages/13/aa/5fb756175995aeb47238d706530772d9a7ac8e73bcca1b47dc145d02c95f/grpcio-1.71.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e6d8de076528f7c43a2f576bc311799f89d795aa6c9b637377cc2b1616473804", size = 6031841 }, + { url = 
"https://files.pythonhosted.org/packages/54/93/172783e01eed61f7f180617b7fa4470f504e383e32af2587f664576a7101/grpcio-1.71.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:9b91879d6da1605811ebc60d21ab6a7e4bae6c35f6b63a061d61eb818c8168f6", size = 6651039 }, + { url = "https://files.pythonhosted.org/packages/6f/99/62654b220a27ed46d3313252214f4bc66261143dc9b58004085cd0646753/grpcio-1.71.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f71574afdf944e6652203cd1badcda195b2a27d9c83e6d88dc1ce3cfb73b31a5", size = 6198465 }, + { url = "https://files.pythonhosted.org/packages/68/35/96116de833b330abe4412cc94edc68f99ed2fa3e39d8713ff307b3799e81/grpcio-1.71.0-cp310-cp310-win32.whl", hash = "sha256:8997d6785e93308f277884ee6899ba63baafa0dfb4729748200fcc537858a509", size = 3620382 }, + { url = "https://files.pythonhosted.org/packages/b7/09/f32ef637e386f3f2c02effac49699229fa560ce9007682d24e9e212d2eb4/grpcio-1.71.0-cp310-cp310-win_amd64.whl", hash = "sha256:7d6ac9481d9d0d129224f6d5934d5832c4b1cddb96b59e7eba8416868909786a", size = 4280302 }, + { url = "https://files.pythonhosted.org/packages/63/04/a085f3ad4133426f6da8c1becf0749872a49feb625a407a2e864ded3fb12/grpcio-1.71.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:d6aa986318c36508dc1d5001a3ff169a15b99b9f96ef5e98e13522c506b37eef", size = 5210453 }, + { url = "https://files.pythonhosted.org/packages/b4/d5/0bc53ed33ba458de95020970e2c22aa8027b26cc84f98bea7fcad5d695d1/grpcio-1.71.0-cp311-cp311-macosx_10_14_universal2.whl", hash = "sha256:d2c170247315f2d7e5798a22358e982ad6eeb68fa20cf7a820bb74c11f0736e7", size = 11347567 }, + { url = "https://files.pythonhosted.org/packages/e3/6d/ce334f7e7a58572335ccd61154d808fe681a4c5e951f8a1ff68f5a6e47ce/grpcio-1.71.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:e6f83a583ed0a5b08c5bc7a3fe860bb3c2eac1f03f1f63e0bc2091325605d2b7", size = 5696067 }, + { url = "https://files.pythonhosted.org/packages/05/4a/80befd0b8b1dc2b9ac5337e57473354d81be938f87132e147c4a24a581bd/grpcio-1.71.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4be74ddeeb92cc87190e0e376dbc8fc7736dbb6d3d454f2fa1f5be1dee26b9d7", size = 6348377 }, + { url = "https://files.pythonhosted.org/packages/c7/67/cbd63c485051eb78663355d9efd1b896cfb50d4a220581ec2cb9a15cd750/grpcio-1.71.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4dd0dfbe4d5eb1fcfec9490ca13f82b089a309dc3678e2edabc144051270a66e", size = 5940407 }, + { url = "https://files.pythonhosted.org/packages/98/4b/7a11aa4326d7faa499f764eaf8a9b5a0eb054ce0988ee7ca34897c2b02ae/grpcio-1.71.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a2242d6950dc892afdf9e951ed7ff89473aaf744b7d5727ad56bdaace363722b", size = 6030915 }, + { url = "https://files.pythonhosted.org/packages/eb/a2/cdae2d0e458b475213a011078b0090f7a1d87f9a68c678b76f6af7c6ac8c/grpcio-1.71.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:0fa05ee31a20456b13ae49ad2e5d585265f71dd19fbd9ef983c28f926d45d0a7", size = 6648324 }, + { url = "https://files.pythonhosted.org/packages/27/df/f345c8daaa8d8574ce9869f9b36ca220c8845923eb3087e8f317eabfc2a8/grpcio-1.71.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3d081e859fb1ebe176de33fc3adb26c7d46b8812f906042705346b314bde32c3", size = 6197839 }, + { url = "https://files.pythonhosted.org/packages/f2/2c/cd488dc52a1d0ae1bad88b0d203bc302efbb88b82691039a6d85241c5781/grpcio-1.71.0-cp311-cp311-win32.whl", hash = "sha256:d6de81c9c00c8a23047136b11794b3584cdc1460ed7cbc10eada50614baa1444", size = 3619978 }, + { url = 
"https://files.pythonhosted.org/packages/ee/3f/cf92e7e62ccb8dbdf977499547dfc27133124d6467d3a7d23775bcecb0f9/grpcio-1.71.0-cp311-cp311-win_amd64.whl", hash = "sha256:24e867651fc67717b6f896d5f0cac0ec863a8b5fb7d6441c2ab428f52c651c6b", size = 4282279 }, + { url = "https://files.pythonhosted.org/packages/4c/83/bd4b6a9ba07825bd19c711d8b25874cd5de72c2a3fbf635c3c344ae65bd2/grpcio-1.71.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:0ff35c8d807c1c7531d3002be03221ff9ae15712b53ab46e2a0b4bb271f38537", size = 5184101 }, + { url = "https://files.pythonhosted.org/packages/31/ea/2e0d90c0853568bf714693447f5c73272ea95ee8dad107807fde740e595d/grpcio-1.71.0-cp312-cp312-macosx_10_14_universal2.whl", hash = "sha256:b78a99cd1ece4be92ab7c07765a0b038194ded2e0a26fd654591ee136088d8d7", size = 11310927 }, + { url = "https://files.pythonhosted.org/packages/ac/bc/07a3fd8af80467390af491d7dc66882db43884128cdb3cc8524915e0023c/grpcio-1.71.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:dc1a1231ed23caac1de9f943d031f1bc38d0f69d2a3b243ea0d664fc1fbd7fec", size = 5654280 }, + { url = "https://files.pythonhosted.org/packages/16/af/21f22ea3eed3d0538b6ef7889fce1878a8ba4164497f9e07385733391e2b/grpcio-1.71.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e6beeea5566092c5e3c4896c6d1d307fb46b1d4bdf3e70c8340b190a69198594", size = 6312051 }, + { url = "https://files.pythonhosted.org/packages/49/9d/e12ddc726dc8bd1aa6cba67c85ce42a12ba5b9dd75d5042214a59ccf28ce/grpcio-1.71.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5170929109450a2c031cfe87d6716f2fae39695ad5335d9106ae88cc32dc84c", size = 5910666 }, + { url = "https://files.pythonhosted.org/packages/d9/e9/38713d6d67aedef738b815763c25f092e0454dc58e77b1d2a51c9d5b3325/grpcio-1.71.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:5b08d03ace7aca7b2fadd4baf291139b4a5f058805a8327bfe9aece7253b6d67", size = 6012019 }, + { url = "https://files.pythonhosted.org/packages/80/da/4813cd7adbae6467724fa46c952d7aeac5e82e550b1c62ed2aeb78d444ae/grpcio-1.71.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:f903017db76bf9cc2b2d8bdd37bf04b505bbccad6be8a81e1542206875d0e9db", size = 6637043 }, + { url = "https://files.pythonhosted.org/packages/52/ca/c0d767082e39dccb7985c73ab4cf1d23ce8613387149e9978c70c3bf3b07/grpcio-1.71.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:469f42a0b410883185eab4689060a20488a1a0a00f8bbb3cbc1061197b4c5a79", size = 6186143 }, + { url = "https://files.pythonhosted.org/packages/00/61/7b2c8ec13303f8fe36832c13d91ad4d4ba57204b1c723ada709c346b2271/grpcio-1.71.0-cp312-cp312-win32.whl", hash = "sha256:ad9f30838550695b5eb302add33f21f7301b882937460dd24f24b3cc5a95067a", size = 3604083 }, + { url = "https://files.pythonhosted.org/packages/fd/7c/1e429c5fb26122055d10ff9a1d754790fb067d83c633ff69eddcf8e3614b/grpcio-1.71.0-cp312-cp312-win_amd64.whl", hash = "sha256:652350609332de6dac4ece254e5d7e1ff834e203d6afb769601f286886f6f3a8", size = 4272191 }, + { url = "https://files.pythonhosted.org/packages/04/dd/b00cbb45400d06b26126dcfdbdb34bb6c4f28c3ebbd7aea8228679103ef6/grpcio-1.71.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:cebc1b34ba40a312ab480ccdb396ff3c529377a2fce72c45a741f7215bfe8379", size = 5184138 }, + { url = "https://files.pythonhosted.org/packages/ed/0a/4651215983d590ef53aac40ba0e29dda941a02b097892c44fa3357e706e5/grpcio-1.71.0-cp313-cp313-macosx_10_14_universal2.whl", hash = "sha256:85da336e3649a3d2171e82f696b5cad2c6231fdd5bad52616476235681bee5b3", size = 11310747 }, + { url = 
"https://files.pythonhosted.org/packages/57/a3/149615b247f321e13f60aa512d3509d4215173bdb982c9098d78484de216/grpcio-1.71.0-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:f9a412f55bb6e8f3bb000e020dbc1e709627dcb3a56f6431fa7076b4c1aab0db", size = 5653991 }, + { url = "https://files.pythonhosted.org/packages/ca/56/29432a3e8d951b5e4e520a40cd93bebaa824a14033ea8e65b0ece1da6167/grpcio-1.71.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:47be9584729534660416f6d2a3108aaeac1122f6b5bdbf9fd823e11fe6fbaa29", size = 6312781 }, + { url = "https://files.pythonhosted.org/packages/a3/f8/286e81a62964ceb6ac10b10925261d4871a762d2a763fbf354115f9afc98/grpcio-1.71.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c9c80ac6091c916db81131d50926a93ab162a7e97e4428ffc186b6e80d6dda4", size = 5910479 }, + { url = "https://files.pythonhosted.org/packages/35/67/d1febb49ec0f599b9e6d4d0d44c2d4afdbed9c3e80deb7587ec788fcf252/grpcio-1.71.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:789d5e2a3a15419374b7b45cd680b1e83bbc1e52b9086e49308e2c0b5bbae6e3", size = 6013262 }, + { url = "https://files.pythonhosted.org/packages/a1/04/f9ceda11755f0104a075ad7163fc0d96e2e3a9fe25ef38adfc74c5790daf/grpcio-1.71.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:1be857615e26a86d7363e8a163fade914595c81fec962b3d514a4b1e8760467b", size = 6643356 }, + { url = "https://files.pythonhosted.org/packages/fb/ce/236dbc3dc77cf9a9242adcf1f62538734ad64727fabf39e1346ad4bd5c75/grpcio-1.71.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:a76d39b5fafd79ed604c4be0a869ec3581a172a707e2a8d7a4858cb05a5a7637", size = 6186564 }, + { url = "https://files.pythonhosted.org/packages/10/fd/b3348fce9dd4280e221f513dd54024e765b21c348bc475516672da4218e9/grpcio-1.71.0-cp313-cp313-win32.whl", hash = "sha256:74258dce215cb1995083daa17b379a1a5a87d275387b7ffe137f1d5131e2cfbb", size = 3601890 }, + { url = "https://files.pythonhosted.org/packages/be/f8/db5d5f3fc7e296166286c2a397836b8b042f7ad1e11028d82b061701f0f7/grpcio-1.71.0-cp313-cp313-win_amd64.whl", hash = "sha256:22c3bc8d488c039a199f7a003a38cb7635db6656fa96437a8accde8322ce2366", size = 4273308 }, +] + +[[package]] +name = "grpcio-tools" +version = "1.71.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "grpcio" }, + { name = "protobuf" }, + { name = "setuptools" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/05/d2/c0866a48c355a6a4daa1f7e27e210c7fa561b1f3b7c0bce2671e89cfa31e/grpcio_tools-1.71.0.tar.gz", hash = "sha256:38dba8e0d5e0fb23a034e09644fdc6ed862be2371887eee54901999e8f6792a8", size = 5326008 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f9/60/aa7f261eda558d018457e5c8bd8a8079136e5107a0942fd3167477ab50e2/grpcio_tools-1.71.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:f4ad7f0d756546902597053d70b3af2606fbd70d7972876cd75c1e241d22ae00", size = 2385558 }, + { url = "https://files.pythonhosted.org/packages/0d/e3/e47b96e93e51398ba3462e027d93a10c0c23fffc31733de9bd4f44a2b867/grpcio_tools-1.71.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:64bdb291df61cf570b5256777ad5fe2b1db6d67bc46e55dc56a0a862722ae329", size = 5930039 }, + { url = "https://files.pythonhosted.org/packages/a6/69/5d8920002483b2a65ae3b03329dfe3b668c3592f001d5358e1538f540012/grpcio_tools-1.71.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:8dd9795e982d77a4b496f7278b943c2563d9afde2069cdee78c111a40cc4d675", size = 2351932 }, + { url = 
"https://files.pythonhosted.org/packages/c4/50/8116e307662a2337cdc3f0e1a8b23af197129448b7ff7e0cf1a76c9b0178/grpcio_tools-1.71.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c1b5860c41a36b26fec4f52998f1a451d0525a5c9a4fb06b6ea3e9211abdb925", size = 2744962 }, + { url = "https://files.pythonhosted.org/packages/e3/4b/d95be4aaf78d7b02dff3bd332c75c228288178e92af0e5228759ac5002a0/grpcio_tools-1.71.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3059c14035e5dc03d462f261e5900b9a077fd1a36976c3865b8507474520bad4", size = 2476716 }, + { url = "https://files.pythonhosted.org/packages/37/c2/c784a3705b1a1fd277751a8fc881d5a29325a460b9211e3c6164f594b178/grpcio_tools-1.71.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:f360981b215b1d5aff9235b37e7e1826246e35bbac32a53e41d4e990a37b8f4c", size = 2854132 }, + { url = "https://files.pythonhosted.org/packages/93/8f/173adbf72ed3996e1962182b55abf30151edc8b53daac0bf15cc3dc4b09e/grpcio_tools-1.71.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:bfe3888c3bbe16a5aa39409bc38744a31c0c3d2daa2b0095978c56e106c85b42", size = 3305069 }, + { url = "https://files.pythonhosted.org/packages/e4/a8/b1e7df63e7f83336275922f92ded1cd6918964c511280b31c872c54538f4/grpcio_tools-1.71.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:145985c0bf12131f0a1503e65763e0f060473f7f3928ed1ff3fb0e8aad5bc8ac", size = 2916636 }, + { url = "https://files.pythonhosted.org/packages/be/a3/53f1e74c6e1c92ad94d7a0127a60fe913276a3e8c864737a053a1574b05c/grpcio_tools-1.71.0-cp310-cp310-win32.whl", hash = "sha256:82c430edd939bb863550ee0fecf067d78feff828908a1b529bbe33cc57f2419c", size = 949576 }, + { url = "https://files.pythonhosted.org/packages/97/43/4a3ae830c1405bcb1ba47f2225779dbe9fc009ba341d4a90012919304855/grpcio_tools-1.71.0-cp310-cp310-win_amd64.whl", hash = "sha256:83e90724e3f02415c628e4ead1d6ffe063820aaaa078d9a39176793df958cd5a", size = 1121087 }, + { url = "https://files.pythonhosted.org/packages/5d/ec/73b9797ffec80e1faf039ce3e2f0513e26e1a68eedc525ed294ae2a44d03/grpcio_tools-1.71.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:1f19b16b49afa5d21473f49c0966dd430c88d089cd52ac02404d8cef67134efb", size = 2385557 }, + { url = "https://files.pythonhosted.org/packages/bf/87/42c6e192b7b09c9610a53e771797f7826aee4f6e769683985ae406a2d862/grpcio_tools-1.71.0-cp311-cp311-macosx_10_14_universal2.whl", hash = "sha256:459c8f5e00e390aecd5b89de67deb3ec7188a274bc6cb50e43cef35ab3a3f45d", size = 5954404 }, + { url = "https://files.pythonhosted.org/packages/25/30/3fd385a56d32dce34cde09a64dbaf7cf85d395f2bcd86dd41e4b4ee5938f/grpcio_tools-1.71.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:edab7e6518de01196be37f96cb1e138c3819986bf5e2a6c9e1519b4d716b2f5a", size = 2352061 }, + { url = "https://files.pythonhosted.org/packages/87/eb/e9971c7693a2d85e7f55760f7906211a95ff74af4d41b05d187849d7fb58/grpcio_tools-1.71.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8b93b9f6adc7491d4c10144c0643409db298e5e63c997106a804f6f0248dbaf4", size = 2745033 }, + { url = "https://files.pythonhosted.org/packages/15/72/4e69beae87a1b334f80da9e93c8e2f5c8fe4860c956a781246a092dc4c97/grpcio_tools-1.71.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ae5f2efa9e644c10bf1021600bfc099dfbd8e02b184d2d25dc31fcd6c2bc59e", size = 2476743 }, + { url = "https://files.pythonhosted.org/packages/b5/f3/336d2c83f1bfc00a5376bf20dd2273d7aa891b03dd91b11c71ca47392351/grpcio_tools-1.71.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = 
"sha256:65aa082f4435571d65d5ce07fc444f23c3eff4f3e34abef599ef8c9e1f6f360f", size = 2853693 }, + { url = "https://files.pythonhosted.org/packages/62/ba/cc7ace518c11501a4b8620df5edb8188e81470e5b82dc6829212f3e9b2ff/grpcio_tools-1.71.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:1331e726e08b7bdcbf2075fcf4b47dff07842b04845e6e220a08a4663e232d7f", size = 3304474 }, + { url = "https://files.pythonhosted.org/packages/00/0d/4b843654af3d5aa2f1a5775df1d583e6e3471e6d569106fd3213ad185a98/grpcio_tools-1.71.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6693a7d3ba138b0e693b3d1f687cdd9db9e68976c3fa2b951c17a072fea8b583", size = 2916147 }, + { url = "https://files.pythonhosted.org/packages/e4/14/047e1c817422bc3d434247b9c640c51fd51ca4e047583ff31d927c3dea73/grpcio_tools-1.71.0-cp311-cp311-win32.whl", hash = "sha256:6d11ed3ff7b6023b5c72a8654975324bb98c1092426ba5b481af406ff559df00", size = 949374 }, + { url = "https://files.pythonhosted.org/packages/86/cb/739a1b6d517672693796022c0f9061f63eaa243ec70cbbfa59bf881ed9fb/grpcio_tools-1.71.0-cp311-cp311-win_amd64.whl", hash = "sha256:072b2a5805ac97e4623b3aa8f7818275f3fb087f4aa131b0fce00471065f6eaa", size = 1120786 }, + { url = "https://files.pythonhosted.org/packages/de/e4/156956b92ad0298290c3d68e6670bc5a6fbefcccfe1ec3997480605e7135/grpcio_tools-1.71.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:61c0409d5bdac57a7bd0ce0ab01c1c916728fe4c8a03d77a25135ad481eb505c", size = 2385480 }, + { url = "https://files.pythonhosted.org/packages/c1/08/9930eb4bb38c5214041c9f24f8b35e9864a7938282db986836546c782d52/grpcio_tools-1.71.0-cp312-cp312-macosx_10_14_universal2.whl", hash = "sha256:28784f39921d061d2164a9dcda5164a69d07bf29f91f0ea50b505958292312c9", size = 5951891 }, + { url = "https://files.pythonhosted.org/packages/73/65/931f29ec9c33719d48e1e30446ecce6f5d2cd4e4934fa73fbe07de41c43b/grpcio_tools-1.71.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:192808cf553cedca73f0479cc61d5684ad61f24db7a5f3c4dfe1500342425866", size = 2351967 }, + { url = "https://files.pythonhosted.org/packages/b8/26/2ec8748534406214f20a4809c36efcfa88d1a26246e8312102e3ef8c295d/grpcio_tools-1.71.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:989ee9da61098230d3d4c8f8f8e27c2de796f1ff21b1c90110e636d9acd9432b", size = 2745003 }, + { url = "https://files.pythonhosted.org/packages/f1/33/87b4610c86a4e10ee446b543a4d536f94ab04f828bab841f0bc1a083de72/grpcio_tools-1.71.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:541a756276c8a55dec991f6c0106ae20c8c8f5ce8d0bdbfcb01e2338d1a8192b", size = 2476455 }, + { url = "https://files.pythonhosted.org/packages/00/7c/f7f0cc36a43be9d45b3ce2a55245f3c7d063a24b7930dd719929e58871a4/grpcio_tools-1.71.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:870c0097700d13c403e5517cb7750ab5b4a791ce3e71791c411a38c5468b64bd", size = 2854333 }, + { url = "https://files.pythonhosted.org/packages/07/c4/34b9ea62b173c13fa7accba5f219355b320c05c80c79c3ba70fe52f47b2f/grpcio_tools-1.71.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:abd57f615e88bf93c3c6fd31f923106e3beb12f8cd2df95b0d256fa07a7a0a57", size = 3304297 }, + { url = "https://files.pythonhosted.org/packages/5c/ef/9d3449db8a07688dc3de7dcbd2a07048a128610b1a491c5c0cb3e90a00c5/grpcio_tools-1.71.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:753270e2d06d37e6d7af8967d1d059ec635ad215882041a36294f4e2fd502b2e", size = 2916212 }, + { url = 
"https://files.pythonhosted.org/packages/2e/c6/990e8194c934dfe7cf89ef307c319fa4f2bc0b78aeca707addbfa1e502f1/grpcio_tools-1.71.0-cp312-cp312-win32.whl", hash = "sha256:0e647794bd7138b8c215e86277a9711a95cf6a03ff6f9e555d54fdf7378b9f9d", size = 948849 }, + { url = "https://files.pythonhosted.org/packages/42/95/3c36d3205e6bd19853cc2420e44b6ef302eb4cfcf56498973c7e85f6c03b/grpcio_tools-1.71.0-cp312-cp312-win_amd64.whl", hash = "sha256:48debc879570972d28bfe98e4970eff25bb26da3f383e0e49829b2d2cd35ad87", size = 1120294 }, + { url = "https://files.pythonhosted.org/packages/84/a7/70dc7e9957bcbaccd4dcb6cc11215e0b918f546d55599221522fe0d073e0/grpcio_tools-1.71.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:9a78d07d6c301a25ef5ede962920a522556a1dfee1ccc05795994ceb867f766c", size = 2384758 }, + { url = "https://files.pythonhosted.org/packages/65/79/57320b28d0a0c5ec94095fd571a65292f8ed7e1c47e59ae4021e8a48d49b/grpcio_tools-1.71.0-cp313-cp313-macosx_10_14_universal2.whl", hash = "sha256:580ac88141c9815557e63c9c04f5b1cdb19b4db8d0cb792b573354bde1ee8b12", size = 5951661 }, + { url = "https://files.pythonhosted.org/packages/80/3d/343df5ed7c5dd66fc7a19e4ef3e97ccc4f5d802122b04cd6492f0dcd79f5/grpcio_tools-1.71.0-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:f7c678e68ece0ae908ecae1c4314a0c2c7f83e26e281738b9609860cc2c82d96", size = 2351571 }, + { url = "https://files.pythonhosted.org/packages/56/2f/b9736e8c84e880c4237f5b880c6c799b4977c5cde190999bc7ab4b2ec445/grpcio_tools-1.71.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:56ecd6cc89b5e5eed1de5eb9cafce86c9c9043ee3840888cc464d16200290b53", size = 2744580 }, + { url = "https://files.pythonhosted.org/packages/76/9b/bdb384967353da7bf64bac4232f4cf8ae43f19d0f2f640978d4d4197e667/grpcio_tools-1.71.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e52a041afc20ab2431d756b6295d727bd7adee813b21b06a3483f4a7a15ea15f", size = 2475978 }, + { url = "https://files.pythonhosted.org/packages/26/71/1411487fd7862d347b98fda5e3beef611a71b2ac2faac62a965d9e2536b3/grpcio_tools-1.71.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:2a1712f12102b60c8d92779b89d0504e0d6f3a59f2b933e5622b8583f5c02992", size = 2853314 }, + { url = "https://files.pythonhosted.org/packages/03/06/59d0523eb1ba2f64edc72cb150152fa1b2e77061cae3ef3ecd3ef2a87f51/grpcio_tools-1.71.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:41878cb7a75477e62fdd45e7e9155b3af1b7a5332844021e2511deaf99ac9e6c", size = 3303981 }, + { url = "https://files.pythonhosted.org/packages/c2/71/fb9fb49f2b738ec1dfbbc8cdce0b26e5f9c5fc0edef72e453580620d6a36/grpcio_tools-1.71.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:682e958b476049ccc14c71bedf3f979bced01f6e0c04852efc5887841a32ad6b", size = 2915876 }, + { url = "https://files.pythonhosted.org/packages/bd/0f/0d49f6fe6fa2d09e9820dd9eeb30437e86002303076be2b6ada0fb52b8f2/grpcio_tools-1.71.0-cp313-cp313-win32.whl", hash = "sha256:0ccfb837152b7b858b9f26bb110b3ae8c46675d56130f6c2f03605c4f129be13", size = 948245 }, + { url = "https://files.pythonhosted.org/packages/bb/14/ab131a39187bfea950280b2277a82d2033469fe8c86f73b10b19f53cc5ca/grpcio_tools-1.71.0-cp313-cp313-win_amd64.whl", hash = "sha256:ffff9bc5eacb34dd26b487194f7d44a3e64e752fc2cf049d798021bf25053b87", size = 1119649 }, +] + [[package]] name = "h11" version = "0.14.0" @@ -802,6 +905,28 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/95/04/ff642e65ad6b90db43e668d70ffb6736436c7ce41fcc549f4e9472234127/h11-0.14.0-py3-none-any.whl", hash = 
"sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761", size = 58259 }, ] +[[package]] +name = "h2" +version = "4.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "hpack" }, + { name = "hyperframe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1b/38/d7f80fd13e6582fb8e0df8c9a653dcc02b03ca34f4d72f34869298c5baf8/h2-4.2.0.tar.gz", hash = "sha256:c8a52129695e88b1a0578d8d2cc6842bbd79128ac685463b887ee278126ad01f", size = 2150682 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/9e/984486f2d0a0bd2b024bf4bc1c62688fcafa9e61991f041fb0e2def4a982/h2-4.2.0-py3-none-any.whl", hash = "sha256:479a53ad425bb29af087f3458a61d30780bc818e4ebcf01f0b536ba916462ed0", size = 60957 }, +] + +[[package]] +name = "hpack" +version = "4.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2c/48/71de9ed269fdae9c8057e5a4c0aa7402e8bb16f2c6e90b3aa53327b113f8/hpack-4.1.0.tar.gz", hash = "sha256:ec5eca154f7056aa06f196a557655c5b009b382873ac8d1e66e79e87535f1dca", size = 51276 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/07/c6/80c95b1b2b94682a72cbdbfb85b81ae2daffa4291fbfa1b1464502ede10d/hpack-4.1.0-py3-none-any.whl", hash = "sha256:157ac792668d995c657d93111f46b4535ed114f0c9c8d672271bbec7eae1b496", size = 34357 }, +] + [[package]] name = "httpcore" version = "1.0.7" @@ -830,6 +955,11 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517 }, ] +[package.optional-dependencies] +http2 = [ + { name = "h2" }, +] + [[package]] name = "httpx-sse" version = "0.4.0" @@ -857,6 +987,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/4d/8092df2cb0cafa9fcaf691db851b2fccfe9cad4048e081436bbbdf56e4e1/huggingface_hub-0.29.0-py3-none-any.whl", hash = "sha256:c02daa0b6bafbdacb1320fdfd1dc7151d0940825c88c4ef89837fdb1f6ea0afe", size = 468012 }, ] +[[package]] +name = "hyperframe" +version = "6.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/02/e7/94f8232d4a74cc99514c13a9f995811485a6903d48e5d952771ef6322e30/hyperframe-6.1.0.tar.gz", hash = "sha256:f630908a00854a7adeabd6382b43923a4c4cd4b821fcb527e6ab9e15382a3b08", size = 26566 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/48/30/47d0bf6072f7252e6521f3447ccfa40b421b6824517f82854703d0f5a98b/hyperframe-6.1.0-py3-none-any.whl", hash = "sha256:b03380493a519fce58ea5af42e4a42317bf9bd425596f7a0835ffce80f1a42e5", size = 13007 }, +] + [[package]] name = "identify" version = "2.6.7" @@ -1250,6 +1389,7 @@ unit = [ { name = "chardet" }, { name = "openai" }, { name = "pypdf" }, + { name = "qdrant-client" }, { name = "sqlite-vec" }, ] @@ -1290,6 +1430,7 @@ requires-dist = [ { name = "pytest-cov", marker = "extra == 'dev'" }, { name = "pytest-html", marker = "extra == 'dev'" }, { name = "python-dotenv" }, + { name = "qdrant-client", marker = "extra == 'unit'" }, { name = "requests" }, { name = "rich" }, { name = "rich", marker = "extra == 'codegen'" }, @@ -1314,7 +1455,6 @@ requires-dist = [ { name = "types-setuptools", marker = "extra == 'dev'" }, { name = "uvicorn", marker = "extra == 'dev'" }, ] -provides-extras = ["dev", "unit", "test", "docs", "codegen"] [[package]] name = "llama-stack-client" @@ -2062,6 +2202,18 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 }, ] +[[package]] +name = "portalocker" +version = "2.10.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pywin32", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ed/d3/c6c64067759e87af98cc668c1cc75171347d0f1577fab7ca3749134e3cd4/portalocker-2.10.1.tar.gz", hash = "sha256:ef1bf844e878ab08aee7e40184156e1151f228f103aa5c6bd0724cc330960f8f", size = 40891 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9b/fb/a70a4214956182e0d7a9099ab17d50bfcba1056188e9b14f35b9e2b62a0d/portalocker-2.10.1-py3-none-any.whl", hash = "sha256:53a5984ebc86a025552264b459b46a2086e269b21823cb572f8f28ee759e45bf", size = 18423 }, +] + [[package]] name = "pre-commit" version = "4.1.0" @@ -2668,6 +2820,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e3/fe/72e7e166bda3885810bee7b23049133e142f7c80c295bae02c562caeea16/pyzmq-26.2.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:bd8fdee945b877aa3bffc6a5a8816deb048dab0544f9df3731ecd0e54d8c84c9", size = 556563 }, ] +[[package]] +name = "qdrant-client" +version = "1.13.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "grpcio" }, + { name = "grpcio-tools" }, + { name = "httpx", extra = ["http2"] }, + { name = "numpy" }, + { name = "portalocker" }, + { name = "pydantic" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/eb/58/1e4acd7ff7637ed56a66e5044699e7af6067232703d0b34f05068fc6234b/qdrant_client-1.13.3.tar.gz", hash = "sha256:61ca09e07c6d7ac0dfbdeb13dca4fe5f3e08fa430cb0d74d66ef5d023a70adfc", size = 266278 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dd/b4/bd676f91f5234ab59282e4a110f324029684482cbe08e7a1c77b6338013b/qdrant_client-1.13.3-py3-none-any.whl", hash = "sha256:f52cacbb936e547d3fceb1aaed3e3c56be0ebfd48e8ea495ea3dbc89c671d1d2", size = 306674 }, +] + [[package]] name = "rapidfuzz" version = "3.12.2" @@ -3417,7 +3587,8 @@ source = { registry = "https://download.pytorch.org/whl/cpu" } resolution-markers = [ "python_full_version < '3.11' and sys_platform == 'darwin'", "python_full_version == '3.11.*' and sys_platform == 'darwin'", - "python_full_version >= '3.12' and sys_platform == 'darwin'", + "python_full_version >= '3.13' and sys_platform == 'darwin'", + "python_full_version == '3.12.*' and sys_platform == 'darwin'", ] dependencies = [ { name = "filelock", marker = "sys_platform == 'darwin'" }, @@ -3444,8 +3615,10 @@ resolution-markers = [ "python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'", "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "(python_full_version >= '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", - "python_full_version >= '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.13' and 
sys_platform != 'darwin' and sys_platform != 'linux')", + "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version >= '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", ] dependencies = [ { name = "filelock", marker = "sys_platform != 'darwin'" }, @@ -3482,8 +3655,10 @@ resolution-markers = [ "python_full_version < '3.11' and sys_platform == 'darwin'", "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.11.*' and sys_platform == 'darwin'", - "python_full_version >= '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "python_full_version >= '3.12' and sys_platform == 'darwin'", + "python_full_version >= '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version >= '3.13' and sys_platform == 'darwin'", + "python_full_version == '3.12.*' and sys_platform == 'darwin'", ] dependencies = [ { name = "numpy", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" }, @@ -3509,7 +3684,8 @@ source = { registry = "https://download.pytorch.org/whl/cpu" } resolution-markers = [ "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')", "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')", - "(python_full_version >= '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", + "(python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.13' and sys_platform != 'darwin' and sys_platform != 'linux')", + "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')", ] dependencies = [ { name = "numpy", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, From c029fbcd13ff270888f3e34e5369fb9d750821d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Tue, 18 Mar 2025 22:06:53 +0100 Subject: [PATCH 14/16] fix: return 4xx for non-existent resources in GET requests (#1635) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? - Removed Optional return types for GET methods - Raised ValueError when requested resource is not found - Ensures proper 4xx response for missing resources - Updated the API generator to check for wrong signatures ``` $ uv run --with ".[dev]" ./docs/openapi_generator/run_openapi_generator.sh Validating API method return types... 
API Method Return Type Validation Errors: Method ScoringFunctions.get_scoring_function returns Optional type ``` Closes: https://github.com/meta-llama/llama-stack/issues/1630 ## Test Plan Run the server then: ``` curl http://127.0.0.1:8321/v1/models/foo {"detail":"Invalid value: Model 'foo' not found"}% ``` Server log: ``` INFO: 127.0.0.1:52307 - "GET /v1/models/foo HTTP/1.1" 400 Bad Request 09:51:42.654 [END] /v1/models/foo [StatusCode.OK] (134.65ms) 09:51:42.651 [ERROR] Error executing endpoint route='/v1/models/{model_id:path}' method='get' Traceback (most recent call last): File "/Users/leseb/Documents/AI/llama-stack/llama_stack/distribution/server/server.py", line 193, in endpoint return await maybe_await(value) File "/Users/leseb/Documents/AI/llama-stack/llama_stack/distribution/server/server.py", line 156, in maybe_await return await value File "/Users/leseb/Documents/AI/llama-stack/llama_stack/providers/utils/telemetry/trace_protocol.py", line 102, in async_wrapper result = await method(self, *args, **kwargs) File "/Users/leseb/Documents/AI/llama-stack/llama_stack/distribution/routers/routing_tables.py", line 217, in get_model raise ValueError(f"Model '{model_id}' not found") ValueError: Model 'foo' not found ``` Signed-off-by: Sébastien Han --- docs/_static/llama-stack-spec.html | 90 +++---------------- docs/_static/llama-stack-spec.yaml | 40 +++------ docs/openapi_generator/generate.py | 12 ++- docs/openapi_generator/pyopenapi/utility.py | 39 +++++++- llama_stack/apis/benchmarks/benchmarks.py | 2 +- llama_stack/apis/datasets/datasets.py | 2 +- llama_stack/apis/eval/eval.py | 2 +- llama_stack/apis/files/files.py | 2 +- llama_stack/apis/models/models.py | 2 +- .../apis/post_training/post_training.py | 4 +- .../scoring_functions/scoring_functions.py | 2 +- llama_stack/apis/shields/shields.py | 2 +- llama_stack/apis/vector_dbs/vector_dbs.py | 2 +- .../distribution/routers/routing_tables.py | 47 +++++++--- 14 files changed, 112 insertions(+), 136 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 210a84b03..72b2e6b17 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -1101,14 +1101,7 @@ "content": { "application/json": { "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/Benchmark" - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/Benchmark" } } } @@ -1150,14 +1143,7 @@ "content": { "application/json": { "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/Dataset" - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/Dataset" } } } @@ -1232,14 +1218,7 @@ "content": { "application/json": { "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/Model" - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/Model" } } } @@ -1314,14 +1293,7 @@ "content": { "application/json": { "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/ScoringFn" - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/ScoringFn" } } } @@ -1363,14 +1335,7 @@ "content": { "application/json": { "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/Shield" - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/Shield" } } } @@ -1673,14 +1638,7 @@ "content": { "application/json": { "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/PostTrainingJobArtifactsResponse" - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/PostTrainingJobArtifactsResponse" } } } @@ -1722,14 +1680,7 @@ "content": { 
"application/json": { "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/PostTrainingJobStatusResponse" - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/PostTrainingJobStatusResponse" } } } @@ -1804,14 +1755,7 @@ "content": { "application/json": { "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/FileUploadResponse" - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/FileUploadResponse" } } } @@ -1913,14 +1857,7 @@ "content": { "application/json": { "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/VectorDB" - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/VectorDB" } } } @@ -2246,14 +2183,7 @@ "content": { "application/json": { "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/JobStatus" - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/JobStatus" } } } diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index a1eb07444..6f4a9528b 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -757,9 +757,7 @@ paths: content: application/json: schema: - oneOf: - - $ref: '#/components/schemas/Benchmark' - - type: 'null' + $ref: '#/components/schemas/Benchmark' '400': $ref: '#/components/responses/BadRequest400' '429': @@ -787,9 +785,7 @@ paths: content: application/json: schema: - oneOf: - - $ref: '#/components/schemas/Dataset' - - type: 'null' + $ref: '#/components/schemas/Dataset' '400': $ref: '#/components/responses/BadRequest400' '429': @@ -840,9 +836,7 @@ paths: content: application/json: schema: - oneOf: - - $ref: '#/components/schemas/Model' - - type: 'null' + $ref: '#/components/schemas/Model' '400': $ref: '#/components/responses/BadRequest400' '429': @@ -893,9 +887,7 @@ paths: content: application/json: schema: - oneOf: - - $ref: '#/components/schemas/ScoringFn' - - type: 'null' + $ref: '#/components/schemas/ScoringFn' '400': $ref: '#/components/responses/BadRequest400' '429': @@ -923,9 +915,7 @@ paths: content: application/json: schema: - oneOf: - - $ref: '#/components/schemas/Shield' - - type: 'null' + $ref: '#/components/schemas/Shield' '400': $ref: '#/components/responses/BadRequest400' '429': @@ -1127,9 +1117,7 @@ paths: content: application/json: schema: - oneOf: - - $ref: '#/components/schemas/PostTrainingJobArtifactsResponse' - - type: 'null' + $ref: '#/components/schemas/PostTrainingJobArtifactsResponse' '400': $ref: '#/components/responses/BadRequest400' '429': @@ -1157,9 +1145,7 @@ paths: content: application/json: schema: - oneOf: - - $ref: '#/components/schemas/PostTrainingJobStatusResponse' - - type: 'null' + $ref: '#/components/schemas/PostTrainingJobStatusResponse' '400': $ref: '#/components/responses/BadRequest400' '429': @@ -1210,9 +1196,7 @@ paths: content: application/json: schema: - oneOf: - - $ref: '#/components/schemas/FileUploadResponse' - - type: 'null' + $ref: '#/components/schemas/FileUploadResponse' '400': $ref: '#/components/responses/BadRequest400' '429': @@ -1281,9 +1265,7 @@ paths: content: application/json: schema: - oneOf: - - $ref: '#/components/schemas/VectorDB' - - type: 'null' + $ref: '#/components/schemas/VectorDB' '400': $ref: '#/components/responses/BadRequest400' '429': @@ -1509,9 +1491,7 @@ paths: content: application/json: schema: - oneOf: - - $ref: '#/components/schemas/JobStatus' - - type: 'null' + $ref: '#/components/schemas/JobStatus' '400': $ref: '#/components/responses/BadRequest400' '429': diff --git a/docs/openapi_generator/generate.py 
b/docs/openapi_generator/generate.py index a2553f905..879ac95e2 100644 --- a/docs/openapi_generator/generate.py +++ b/docs/openapi_generator/generate.py @@ -12,7 +12,7 @@ from datetime import datetime from pathlib import Path - +import sys import fire import ruamel.yaml as yaml @@ -21,7 +21,7 @@ from llama_stack.distribution.stack import LlamaStack # noqa: E402 from .pyopenapi.options import Options # noqa: E402 from .pyopenapi.specification import Info, Server # noqa: E402 -from .pyopenapi.utility import Specification # noqa: E402 +from .pyopenapi.utility import Specification, validate_api_method_return_types # noqa: E402 def str_presenter(dumper, data): @@ -39,6 +39,14 @@ def main(output_dir: str): if not output_dir.exists(): raise ValueError(f"Directory {output_dir} does not exist") + # Validate API protocols before generating spec + print("Validating API method return types...") + return_type_errors = validate_api_method_return_types() + if return_type_errors: + print("\nAPI Method Return Type Validation Errors:\n") + for error in return_type_errors: + print(error) + sys.exit(1) now = str(datetime.now()) print( "Converting the spec to YAML (openapi.yaml) and HTML (openapi.html) at " + now diff --git a/docs/openapi_generator/pyopenapi/utility.py b/docs/openapi_generator/pyopenapi/utility.py index f134aab4b..f60a33bb7 100644 --- a/docs/openapi_generator/pyopenapi/utility.py +++ b/docs/openapi_generator/pyopenapi/utility.py @@ -6,16 +6,19 @@ import json import typing +import inspect +import os from pathlib import Path from typing import TextIO +from typing import Any, Dict, List, Optional, Protocol, Type, Union, get_type_hints, get_origin, get_args from llama_stack.strong_typing.schema import object_to_json, StrictJsonType +from llama_stack.distribution.resolver import api_protocol_map from .generator import Generator from .options import Options from .specification import Document - THIS_DIR = Path(__file__).parent @@ -114,3 +117,37 @@ class Specification: ) f.write(html) + +def is_optional_type(type_: Any) -> bool: + """Check if a type is Optional.""" + origin = get_origin(type_) + args = get_args(type_) + return origin is Optional or (origin is Union and type(None) in args) + + +def validate_api_method_return_types() -> List[str]: + """Validate that all API methods have proper return types.""" + errors = [] + protocols = api_protocol_map() + + for protocol_name, protocol in protocols.items(): + methods = inspect.getmembers(protocol, predicate=inspect.isfunction) + + for method_name, method in methods: + if not hasattr(method, '__webmethod__'): + continue + + # Only check GET methods + if method.__webmethod__.method != "GET": + continue + + hints = get_type_hints(method) + + if 'return' not in hints: + errors.append(f"Method {protocol_name}.{method_name} has no return type annotation") + else: + return_type = hints['return'] + if is_optional_type(return_type): + errors.append(f"Method {protocol_name}.{method_name} returns Optional type") + + return errors diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py index 39ba355e9..809af8868 100644 --- a/llama_stack/apis/benchmarks/benchmarks.py +++ b/llama_stack/apis/benchmarks/benchmarks.py @@ -52,7 +52,7 @@ class Benchmarks(Protocol): async def get_benchmark( self, benchmark_id: str, - ) -> Optional[Benchmark]: ... + ) -> Benchmark: ... 
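Beyond the generator-side check, the routing-table changes later in this patch turn a missing resource from a silent `None` into a raised `ValueError`, which the server reports as a 400. A minimal sketch of what that means for code calling the routing table directly follows; the fallback behaviour shown is the caller's choice, not something the patch prescribes.

```
from llama_stack.distribution.routers.routing_tables import ModelsRoutingTable


async def maybe_get_model(table: ModelsRoutingTable, model_id: str):
    # get_model() now always returns a Model; a missing id raises ValueError,
    # which the HTTP layer surfaces as a 400 Bad Request for GET /v1/models/{id}.
    try:
        return await table.get_model(model_id)
    except ValueError:
        return None  # e.g. treat "not found" as an absent resource in caller code
```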
@webmethod(route="/eval/benchmarks", method="POST") async def register_benchmark( diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index d033d0b70..616371c7d 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -201,7 +201,7 @@ class Datasets(Protocol): async def get_dataset( self, dataset_id: str, - ) -> Optional[Dataset]: ... + ) -> Dataset: ... @webmethod(route="/datasets", method="GET") async def list_datasets(self) -> ListDatasetsResponse: ... diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index dec018d83..51c38b16a 100644 --- a/llama_stack/apis/eval/eval.py +++ b/llama_stack/apis/eval/eval.py @@ -117,7 +117,7 @@ class Eval(Protocol): """ @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET") - async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: + async def job_status(self, benchmark_id: str, job_id: str) -> JobStatus: """Get the status of a job. :param benchmark_id: The ID of the benchmark to run the evaluation on. diff --git a/llama_stack/apis/files/files.py b/llama_stack/apis/files/files.py index f17fadc8c..65c1ead6a 100644 --- a/llama_stack/apis/files/files.py +++ b/llama_stack/apis/files/files.py @@ -115,7 +115,7 @@ class Files(Protocol): async def get_upload_session_info( self, upload_id: str, - ) -> Optional[FileUploadResponse]: + ) -> FileUploadResponse: """ Returns information about an existsing upload session diff --git a/llama_stack/apis/models/models.py b/llama_stack/apis/models/models.py index 64b9510ea..893ebc179 100644 --- a/llama_stack/apis/models/models.py +++ b/llama_stack/apis/models/models.py @@ -66,7 +66,7 @@ class Models(Protocol): async def get_model( self, model_id: str, - ) -> Optional[Model]: ... + ) -> Model: ... @webmethod(route="/models", method="POST") async def register_model( diff --git a/llama_stack/apis/post_training/post_training.py b/llama_stack/apis/post_training/post_training.py index ed15c6de4..636eb7e7b 100644 --- a/llama_stack/apis/post_training/post_training.py +++ b/llama_stack/apis/post_training/post_training.py @@ -202,10 +202,10 @@ class PostTraining(Protocol): async def get_training_jobs(self) -> ListPostTrainingJobsResponse: ... @webmethod(route="/post-training/job/status", method="GET") - async def get_training_job_status(self, job_uuid: str) -> Optional[PostTrainingJobStatusResponse]: ... + async def get_training_job_status(self, job_uuid: str) -> PostTrainingJobStatusResponse: ... @webmethod(route="/post-training/job/cancel", method="POST") async def cancel_training_job(self, job_uuid: str) -> None: ... @webmethod(route="/post-training/job/artifacts", method="GET") - async def get_training_job_artifacts(self, job_uuid: str) -> Optional[PostTrainingJobArtifactsResponse]: ... + async def get_training_job_artifacts(self, job_uuid: str) -> PostTrainingJobArtifactsResponse: ... diff --git a/llama_stack/apis/scoring_functions/scoring_functions.py b/llama_stack/apis/scoring_functions/scoring_functions.py index 52508d2ec..b02a7a0c4 100644 --- a/llama_stack/apis/scoring_functions/scoring_functions.py +++ b/llama_stack/apis/scoring_functions/scoring_functions.py @@ -135,7 +135,7 @@ class ScoringFunctions(Protocol): async def list_scoring_functions(self) -> ListScoringFunctionsResponse: ... @webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="GET") - async def get_scoring_function(self, scoring_fn_id: str, /) -> Optional[ScoringFn]: ... 
+ async def get_scoring_function(self, scoring_fn_id: str, /) -> ScoringFn: ... @webmethod(route="/scoring-functions", method="POST") async def register_scoring_function( diff --git a/llama_stack/apis/shields/shields.py b/llama_stack/apis/shields/shields.py index ec1179ac4..67f3bd27b 100644 --- a/llama_stack/apis/shields/shields.py +++ b/llama_stack/apis/shields/shields.py @@ -49,7 +49,7 @@ class Shields(Protocol): async def list_shields(self) -> ListShieldsResponse: ... @webmethod(route="/shields/{identifier:path}", method="GET") - async def get_shield(self, identifier: str) -> Optional[Shield]: ... + async def get_shield(self, identifier: str) -> Shield: ... @webmethod(route="/shields", method="POST") async def register_shield( diff --git a/llama_stack/apis/vector_dbs/vector_dbs.py b/llama_stack/apis/vector_dbs/vector_dbs.py index 9a4aa322f..fe6c33919 100644 --- a/llama_stack/apis/vector_dbs/vector_dbs.py +++ b/llama_stack/apis/vector_dbs/vector_dbs.py @@ -50,7 +50,7 @@ class VectorDBs(Protocol): async def get_vector_db( self, vector_db_id: str, - ) -> Optional[VectorDB]: ... + ) -> VectorDB: ... @webmethod(route="/vector-dbs", method="POST") async def register_vector_db( diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 533993421..5dea942f7 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -219,8 +219,11 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models): async def list_models(self) -> ListModelsResponse: return ListModelsResponse(data=await self.get_all_with_type("model")) - async def get_model(self, model_id: str) -> Optional[Model]: - return await self.get_object_by_identifier("model", model_id) + async def get_model(self, model_id: str) -> Model: + model = await self.get_object_by_identifier("model", model_id) + if model is None: + raise ValueError(f"Model '{model_id}' not found") + return model async def register_model( self, @@ -267,8 +270,11 @@ class ShieldsRoutingTable(CommonRoutingTableImpl, Shields): async def list_shields(self) -> ListShieldsResponse: return ListShieldsResponse(data=await self.get_all_with_type(ResourceType.shield.value)) - async def get_shield(self, identifier: str) -> Optional[Shield]: - return await self.get_object_by_identifier("shield", identifier) + async def get_shield(self, identifier: str) -> Shield: + shield = await self.get_object_by_identifier("shield", identifier) + if shield is None: + raise ValueError(f"Shield '{identifier}' not found") + return shield async def register_shield( self, @@ -303,8 +309,11 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs): async def list_vector_dbs(self) -> ListVectorDBsResponse: return ListVectorDBsResponse(data=await self.get_all_with_type("vector_db")) - async def get_vector_db(self, vector_db_id: str) -> Optional[VectorDB]: - return await self.get_object_by_identifier("vector_db", vector_db_id) + async def get_vector_db(self, vector_db_id: str) -> VectorDB: + vector_db = await self.get_object_by_identifier("vector_db", vector_db_id) + if vector_db is None: + raise ValueError(f"Vector DB '{vector_db_id}' not found") + return vector_db async def register_vector_db( self, @@ -355,8 +364,11 @@ class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets): async def list_datasets(self) -> ListDatasetsResponse: return ListDatasetsResponse(data=await self.get_all_with_type(ResourceType.dataset.value)) - async def get_dataset(self, dataset_id: str) -> 
Optional[Dataset]: - return await self.get_object_by_identifier("dataset", dataset_id) + async def get_dataset(self, dataset_id: str) -> Dataset: + dataset = await self.get_object_by_identifier("dataset", dataset_id) + if dataset is None: + raise ValueError(f"Dataset '{dataset_id}' not found") + return dataset async def register_dataset( self, @@ -408,8 +420,11 @@ class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions): async def list_scoring_functions(self) -> ListScoringFunctionsResponse: return ListScoringFunctionsResponse(data=await self.get_all_with_type(ResourceType.scoring_function.value)) - async def get_scoring_function(self, scoring_fn_id: str) -> Optional[ScoringFn]: - return await self.get_object_by_identifier("scoring_function", scoring_fn_id) + async def get_scoring_function(self, scoring_fn_id: str) -> ScoringFn: + scoring_fn = await self.get_object_by_identifier("scoring_function", scoring_fn_id) + if scoring_fn is None: + raise ValueError(f"Scoring function '{scoring_fn_id}' not found") + return scoring_fn async def register_scoring_function( self, @@ -445,8 +460,11 @@ class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks): async def list_benchmarks(self) -> ListBenchmarksResponse: return ListBenchmarksResponse(data=await self.get_all_with_type("benchmark")) - async def get_benchmark(self, benchmark_id: str) -> Optional[Benchmark]: - return await self.get_object_by_identifier("benchmark", benchmark_id) + async def get_benchmark(self, benchmark_id: str) -> Benchmark: + benchmark = await self.get_object_by_identifier("benchmark", benchmark_id) + if benchmark is None: + raise ValueError(f"Benchmark '{benchmark_id}' not found") + return benchmark async def register_benchmark( self, @@ -490,7 +508,10 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups): return ListToolGroupsResponse(data=await self.get_all_with_type("tool_group")) async def get_tool_group(self, toolgroup_id: str) -> ToolGroup: - return await self.get_object_by_identifier("tool_group", toolgroup_id) + tool_group = await self.get_object_by_identifier("tool_group", toolgroup_id) + if tool_group is None: + raise ValueError(f"Tool group '{toolgroup_id}' not found") + return tool_group async def get_tool(self, tool_name: str) -> Tool: return await self.get_object_by_identifier("tool", tool_name) From d609ffce2adef44c9448691085ea9fb9fe57a9b1 Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Tue, 18 Mar 2025 17:12:17 -0400 Subject: [PATCH 15/16] chore: Add links and badges to both unit and integration tests (#1632) # What does this PR do? This makes it easier to know the statuses of both and to identify failed builds.
Signed-off-by: Yuan Tang --- .github/workflows/integration-tests.yml | 2 +- README.md | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 86adf8a15..0af46e1f0 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -1,4 +1,4 @@ -name: Integration tests +name: Integration Tests on: push: diff --git a/README.md b/README.md index aade9c15f..d2adc3376 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,8 @@ [![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-stack)](https://pypi.org/project/llama-stack/) [![License](https://img.shields.io/pypi/l/llama_stack.svg)](https://github.com/meta-llama/llama-stack/blob/main/LICENSE) [![Discord](https://img.shields.io/discord/1257833999603335178)](https://discord.gg/llama-stack) -![Unit](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml/badge.svg?branch=main) +[![Unit Tests](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain) +[![Integration Tests](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain) [**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) From 5ece26297642d99e183c050be153707e8f9828ca Mon Sep 17 00:00:00 2001 From: Sarthak Deshpande <60317842+cheesecake100201@users.noreply.github.com> Date: Wed, 19 Mar 2025 02:43:46 +0530 Subject: [PATCH 16/16] chore: Make code interpreter async (#1654) # What does this PR do? Made the code interpreter tool call async so that it is non-blocking. ## Test Plan pytest -s -v tests/integration/agents/test_agents.py --stack-config=together --text-model=meta-llama/Llama-3.3-70B-Instruct [//]: # (## Documentation) Co-authored-by: sarthakdeshpande --- .../inline/tool_runtime/code_interpreter/code_interpreter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py index 4b97914c5..9610b9b46 100644 --- a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py +++ b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py @@ -5,6 +5,7 @@ # the root directory of this source tree.
+import asyncio import logging import os import tempfile @@ -37,7 +38,7 @@ class CodeInterpreterToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime): async def initialize(self): pass - async def register_tool(self, tool: Tool): + async def register_tool(self, tool: Tool) -> None: pass async def unregister_tool(self, tool_id: str) -> None: @@ -65,7 +66,7 @@ class CodeInterpreterToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime): # Use environment variable to control bwrap usage force_disable_bwrap = os.environ.get("DISABLE_CODE_SANDBOX", "").lower() in ("1", "true", "yes") req = CodeExecutionRequest(scripts=[script], use_bwrap=not force_disable_bwrap) - res = self.code_executor.execute(req) + res = await asyncio.to_thread(self.code_executor.execute, req) pieces = [res["process_status"]] for out_type in ["stdout", "stderr"]: res_out = res[out_type]
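For illustration, a minimal sketch of the `asyncio.to_thread` pattern the hunk above relies on — `blocking_execute` and the sleep duration here are hypothetical stand-ins for the synchronous code executor, not code from this repository:

```python
import asyncio
import time


def blocking_execute(script: str) -> str:
    # Hypothetical stand-in for the synchronous code executor:
    # it ties up its thread until the "script" finishes.
    time.sleep(0.5)
    return f"ran: {script}"


async def run_tool(script: str) -> str:
    # Offload the blocking call to a worker thread so the event loop
    # stays free to schedule other coroutines while we wait.
    return await asyncio.to_thread(blocking_execute, script)


async def main() -> None:
    # Both calls overlap instead of serializing on the event loop.
    results = await asyncio.gather(run_tool("print(1)"), run_tool("print(2)"))
    print(results)


if __name__ == "__main__":
    asyncio.run(main())
```

Running the blocking executor on a worker thread keeps other coroutines (for example, concurrent agent turns) responsive while the script executes, which is the non-blocking behavior the patch is after.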