fix: remote vLLM tool execution now works when the last chunk contains the call arguments (#2112)

# What does this PR do?
Closes #2111.
Fixes a bug that caused Llama Stack to return a bare `<tool_call>` and
complete the turn without actually executing the tool. This happened when
the remote vLLM server delivered the tool call's arguments in the final
streamed chunk, the same chunk that carries the finish reason. See the
issue description for more detail.
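
The root cause is the branch ordering in the streaming handler: when the final chunk carries both the tool-call delta and the finish reason, the old `if finish_reason: ... elif delta.tool_calls: ...` structure finalized the turn before the arguments were ever buffered. A minimal, self-contained sketch of the two orderings (the plain dicts and the `get_weather` payload are illustrative stand-ins, not the provider's actual types):

```python
# Illustrative final streamed chunk: the tool-call arguments and the finish
# reason arrive together. Keys mimic choice.delta.tool_calls / choice.finish_reason.
final_chunk = {
    "delta_tool_calls": [{"function": {"name": "get_weather", "arguments": '{"city": "Tokyo"}'}}],
    "finish_reason": "tool_calls",
}

def old_ordering(chunk: dict, buf: str) -> str:
    # Old code: finish_reason wins, so the elif that buffers the arguments
    # never runs for a chunk that carries both fields.
    if chunk["finish_reason"]:
        return buf  # turn is finalized with an empty buffer -> bare <tool_call>
    elif chunk["delta_tool_calls"]:
        buf += chunk["delta_tool_calls"][0]["function"]["arguments"]
    return buf

def new_ordering(chunk: dict, buf: str) -> str:
    # Fixed code: buffer the tool-call delta first in its own `if`,
    # then handle finish_reason.
    if chunk["delta_tool_calls"]:
        buf += chunk["delta_tool_calls"][0]["function"]["arguments"]
    if chunk["finish_reason"]:
        return buf  # turn is finalized with the arguments included
    return buf

assert old_ordering(final_chunk, "") == ""                   # arguments lost
assert new_ordering(final_chunk, "") == '{"city": "Tokyo"}'  # arguments kept
```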

## Test Plan
1) Ran existing unit tests
2) Added a dedicated test verifying correct behavior in this edge case (a sketch of such a test follows this list)
3) Ran the code snapshot from #2111
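
For reference, a sketch of what such an edge-case test can look like, assuming pytest-asyncio and that the helper is importable as `llama_stack.providers.remote.inference.vllm.vllm._process_vllm_chat_completion_stream_response`; the tool name, arguments, and assertions are illustrative and not copied from the test added in this PR:

```python
import pytest
from openai.types.chat.chat_completion_chunk import (
    ChatCompletionChunk,
    Choice,
    ChoiceDelta,
    ChoiceDeltaToolCall,
    ChoiceDeltaToolCallFunction,
)

# Assumed import path for the streaming helper under test.
from llama_stack.providers.remote.inference.vllm.vllm import (
    _process_vllm_chat_completion_stream_response,
)


def _chunk(idx: int, delta: ChoiceDelta, finish_reason: str | None) -> ChatCompletionChunk:
    """Build a minimal OpenAI-style streaming chunk."""
    return ChatCompletionChunk(
        id=f"chunk-{idx}",
        created=idx,
        model="mock-model",
        object="chat.completion.chunk",
        choices=[Choice(index=0, delta=delta, finish_reason=finish_reason)],
    )


@pytest.mark.asyncio
async def test_tool_call_args_arrive_in_last_chunk():
    """Arguments are delivered only in the chunk that also carries finish_reason."""

    async def mock_stream():
        # First chunk: tool name and call id, no arguments yet.
        yield _chunk(
            1,
            ChoiceDelta(
                tool_calls=[
                    ChoiceDeltaToolCall(
                        index=0,
                        id="call_1",
                        function=ChoiceDeltaToolCallFunction(name="get_weather", arguments=None),
                    )
                ]
            ),
            None,
        )
        # Last chunk: the arguments arrive together with the finish reason.
        yield _chunk(
            2,
            ChoiceDelta(
                tool_calls=[
                    ChoiceDeltaToolCall(
                        index=0,
                        function=ChoiceDeltaToolCallFunction(arguments='{"city": "Tokyo"}'),
                    )
                ]
            ),
            "tool_calls",
        )

    events = [c.event async for c in _process_vllm_chat_completion_stream_response(mock_stream())]

    # Illustrative assertion: some emitted event should carry the parsed tool call
    # rather than the turn completing with nothing to execute.
    assert any(
        getattr(getattr(e.delta, "tool_call", None), "tool_name", None) == "get_weather"
        for e in events
    )
```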
Ilya Kolchinsky 2025-05-14 11:38:00 +02:00 committed by GitHub
parent 1de0dfaab5
commit 43d4447ff0
2 changed files with 87 additions and 7 deletions

@@ -168,6 +168,12 @@ async def _process_vllm_chat_completion_stream_response(
             log.warning("vLLM failed to generation any completions - check the vLLM server logs for an error.")
             continue
         choice = chunk.choices[0]
+        if choice.delta.tool_calls:
+            tool_call = convert_tool_call(choice.delta.tool_calls[0])
+            tool_call_buf.tool_name += str(tool_call.tool_name)
+            tool_call_buf.call_id += tool_call.call_id
+            # TODO: remove str() when dict type for 'arguments' is no longer allowed
+            tool_call_buf.arguments += str(tool_call.arguments)
         if choice.finish_reason:
             args_str = tool_call_buf.arguments
             args = None
@@ -208,13 +214,7 @@ async def _process_vllm_chat_completion_stream_response(
                         stop_reason=_convert_to_vllm_finish_reason(choice.finish_reason),
                     )
                 )
-        elif choice.delta.tool_calls:
-            tool_call = convert_tool_call(choice.delta.tool_calls[0])
-            tool_call_buf.tool_name += str(tool_call.tool_name)
-            tool_call_buf.call_id += tool_call.call_id
-            # TODO: remove str() when dict type for 'arguments' is no longer allowed
-            tool_call_buf.arguments += str(tool_call.arguments)
-        else:
+        elif not choice.delta.tool_calls:
             yield ChatCompletionResponseStreamChunk(
                 event=ChatCompletionResponseEvent(
                     event_type=event_type,