From 63cce5673a30abbf2278f83ec746d691cf3b0a3a Mon Sep 17 00:00:00 2001 From: ilya-kolchinsky Date: Wed, 14 May 2025 12:39:32 +0200 Subject: [PATCH] Resolving merge conflicts. --- .../providers/remote/inference/vllm/vllm.py | 14 ++-- .../providers/inference/test_remote_vllm.py | 79 +++++++++++++++++++ 2 files changed, 86 insertions(+), 7 deletions(-) diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py index 049eb4fcf..070d94df8 100644 --- a/llama_stack/providers/remote/inference/vllm/vllm.py +++ b/llama_stack/providers/remote/inference/vllm/vllm.py @@ -234,6 +234,12 @@ async def _process_vllm_chat_completion_stream_response( log.warning("vLLM failed to generation any completions - check the vLLM server logs for an error.") return choice = chunk.choices[0] + if choice.delta.tool_calls: + tool_call = convert_tool_call(choice.delta.tool_calls[0]) + tool_call_buf.tool_name += str(tool_call.tool_name) + tool_call_buf.call_id += tool_call.call_id + # TODO: remove str() when dict type for 'arguments' is no longer allowed + tool_call_buf.arguments += str(tool_call.arguments) if choice.finish_reason: chunks = _process_vllm_chat_completion_end_of_stream( finish_reason=choice.finish_reason, @@ -244,13 +250,7 @@ async def _process_vllm_chat_completion_stream_response( for c in chunks: yield c end_of_stream_processed = True - elif choice.delta.tool_calls: - tool_call = convert_tool_call(choice.delta.tool_calls[0]) - tool_call_buf.tool_name += str(tool_call.tool_name) - tool_call_buf.call_id += tool_call.call_id - # TODO: remove str() when dict type for 'arguments' is no longer allowed - tool_call_buf.arguments += str(tool_call.arguments) - else: + elif not choice.delta.tool_calls: yield ChatCompletionResponseStreamChunk( event=ChatCompletionResponseEvent( event_type=event_type, diff --git a/tests/unit/providers/inference/test_remote_vllm.py b/tests/unit/providers/inference/test_remote_vllm.py index 5c9bda74a..6e1623131 100644 --- a/tests/unit/providers/inference/test_remote_vllm.py +++ b/tests/unit/providers/inference/test_remote_vllm.py @@ -297,6 +297,85 @@ async def test_get_params_empty_tools(vllm_inference_adapter): assert "tools" not in params +@pytest.mark.asyncio +async def test_process_vllm_chat_completion_stream_response_tool_call_args_last_chunk(): + """ + Tests the edge case where the model returns the arguments for the tool call in the same chunk that + contains the finish reason (i.e., the last one). + We want to make sure the tool call is executed in this case, and the parameters are passed correctly. 
+ """ + + mock_tool_name = "mock_tool" + mock_tool_arguments = {"arg1": 0, "arg2": 100} + mock_tool_arguments_str = json.dumps(mock_tool_arguments) + + async def mock_stream(): + mock_chunks = [ + OpenAIChatCompletionChunk( + id="chunk-1", + created=1, + model="foo", + object="chat.completion.chunk", + choices=[ + { + "delta": { + "content": None, + "tool_calls": [ + { + "index": 0, + "id": "mock_id", + "type": "function", + "function": { + "name": mock_tool_name, + "arguments": None, + }, + } + ], + }, + "finish_reason": None, + "logprobs": None, + "index": 0, + } + ], + ), + OpenAIChatCompletionChunk( + id="chunk-1", + created=1, + model="foo", + object="chat.completion.chunk", + choices=[ + { + "delta": { + "content": None, + "tool_calls": [ + { + "index": 0, + "id": None, + "function": { + "name": None, + "arguments": mock_tool_arguments_str, + }, + } + ], + }, + "finish_reason": "tool_calls", + "logprobs": None, + "index": 0, + } + ], + ), + ] + for chunk in mock_chunks: + yield chunk + + chunks = [chunk async for chunk in _process_vllm_chat_completion_stream_response(mock_stream())] + assert len(chunks) == 2 + assert chunks[-1].event.event_type == ChatCompletionResponseEventType.complete + assert chunks[-2].event.delta.type == "tool_call" + assert chunks[-2].event.delta.tool_call.tool_name == mock_tool_name + assert chunks[-2].event.delta.tool_call.arguments == mock_tool_arguments + + @pytest.mark.asyncio async def test_process_vllm_chat_completion_stream_response_no_finish_reason(): """