From 40e71758d97e5df0f233f3f82b873d38712488a1 Mon Sep 17 00:00:00 2001
From: Ben Browning
Date: Wed, 7 May 2025 08:34:47 -0400
Subject: [PATCH] fix: inference providers still using tools with
 `tool_choice="none"` (#2048)

# What does this PR do?

In our OpenAI API verification tests, some providers were still calling
tools even when `tool_choice="none"` was passed in the chat completion
requests. Because they aren't all respecting `tool_choice` properly, this
adjusts our routing implementation to remove the `tools` and `tool_choice`
from the request if `tool_choice="none"` is passed in, so that the provider
does not attempt to call any of those tools. Adjusting this in the router
fixes the behavior across all providers.

This also cleans up the non-streaming together.ai responses for tools,
ensuring they return `None` instead of an empty list when there are no
tool calls, to exactly match the OpenAI API responses in that case.

## Test Plan

I observed existing failures in our OpenAI API verification suite - see
https://github.com/bbrowning/llama-stack-tests/blob/main/openai-api-verification/2025-04-27.md#together-llama-stack
for the failing `test_chat_*_tool_choice_none` tests. All streaming and
non-streaming variants were failing across all 3 tested models.

After this change, all of those 6 failing tests are now passing with no
regression in the other tests.

I verified this via:

```
llama stack run --image-type venv \
  tests/verifications/openai-api-verification-run.yaml
```

```
python -m pytest -s -v \
  'tests/verifications/openai_api/test_chat_completion.py' \
  --provider=together-llama-stack
```

The entire verification suite is not 100% on together.ai yet, but it's
getting closer. This also increased the pass rate for fireworks.ai, and
did not regress the groq or openai tests at all.

Signed-off-by: Ben Browning
---
 llama_stack/distribution/routers/routers.py | 20 ++++++++++++++++++-
 .../remote/inference/ollama/ollama.py       |  6 ------
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py
index 737a384d8..7e80a067f 100644
--- a/llama_stack/distribution/routers/routers.py
+++ b/llama_stack/distribution/routers/routers.py
@@ -573,6 +573,12 @@ class InferenceRouter(Inference):
             for tool in tools:
                 TypeAdapter(OpenAIChatCompletionToolParam).validate_python(tool)
 
+        # Some providers make tool calls even when tool_choice is "none"
+        # so just clear them both out to avoid unexpected tool calls
+        if tool_choice == "none" and tools is not None:
+            tool_choice = None
+            tools = None
+
         params = dict(
             model=model_obj.identifier,
             messages=messages,
@@ -600,7 +606,19 @@
         )
 
         provider = self.routing_table.get_provider_impl(model_obj.identifier)
-        return await provider.openai_chat_completion(**params)
+        if stream:
+            return await provider.openai_chat_completion(**params)
+        else:
+            return await self._nonstream_openai_chat_completion(provider, params)
+
+    async def _nonstream_openai_chat_completion(self, provider: Inference, params: dict) -> OpenAIChatCompletion:
+        response = await provider.openai_chat_completion(**params)
+        for choice in response.choices:
+            # some providers return an empty list for no tool calls in non-streaming responses
+            # but the OpenAI API returns None. So, set tool_calls to None if it's empty
+            if choice.message and choice.message.tool_calls is not None and len(choice.message.tool_calls) == 0:
+                choice.message.tool_calls = None
+        return response
 
     async def health(self) -> dict[str, HealthResponse]:
         health_statuses = {}
diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py
index d0d45b429..32e2b17d0 100644
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@@ -447,12 +447,6 @@ class OllamaInferenceAdapter(
         user: str | None = None,
     ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
         model_obj = await self._get_model(model)
-
-        # ollama still makes tool calls even when tool_choice is "none"
-        # so we need to remove the tools in that case
-        if tool_choice == "none" and tools is not None:
-            tools = None
-
         params = {
             k: v
             for k, v in {
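For reviewers who want to reproduce the fixed behavior by hand, below is a
minimal sketch of the kind of request the verification suite sends, assuming
a locally running Llama Stack server exposing the OpenAI-compatible API and
using the official OpenAI Python client; the base URL, API key, and model ID
are placeholders, not part of this patch:

```
from openai import OpenAI

# Placeholder endpoint and credentials; point these at your own stack.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

response = client.chat.completions.create(
    model="meta-llama/Llama-3.3-70B-Instruct",  # placeholder model ID
    messages=[{"role": "user", "content": "What is the weather in Paris?"}],
    tools=[
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a city",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }
    ],
    tool_choice="none",
)

# With tool_choice="none" the model must answer in plain text; after this
# patch the router guarantees tool_calls is None rather than an empty list,
# matching the OpenAI API behavior.
assert response.choices[0].message.tool_calls is None
print(response.choices[0].message.content)
```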