Fixed an "out of token budget" tool execution bug in the remote vLLM provider.

ilya-kolchinsky 2025-05-08 10:42:26 +02:00
parent fe5f5e530c
commit 7784307a5f
2 changed files with 141 additions and 39 deletions

@@ -28,6 +28,7 @@ from openai.types.model import Model as OpenAIModel
from llama_stack.apis.inference import (
    ChatCompletionRequest,
    ChatCompletionResponseEventType,
    CompletionMessage,
    SystemMessage,
    ToolChoice,
@@ -294,3 +295,57 @@ async def test_get_params_empty_tools(vllm_inference_adapter):
    )
    params = await vllm_inference_adapter._get_params(request)
    assert "tools" not in params


@pytest.mark.asyncio
async def test_process_vllm_chat_completion_stream_response_no_finish_reason():
    """
    Tests the edge case where the model requests a tool call and stays idle without explicitly providing the
    finish reason.
    We want to make sure that this case is recognized and handled correctly, i.e., as a valid end of message.
    """
    mock_tool_name = "mock_tool"
    mock_tool_arguments = {"arg1": 0, "arg2": 100}
    mock_tool_arguments_str = '"{\\"arg1\\": 0, \\"arg2\\": 100}"'

    async def mock_stream():
        mock_chunks = [
            OpenAIChatCompletionChunk(
                id="chunk-1",
                created=1,
                model="foo",
                object="chat.completion.chunk",
                choices=[
                    {
                        "delta": {
                            "content": None,
                            "tool_calls": [
                                {
                                    "index": 0,
                                    "id": "mock_id",
                                    "type": "function",
                                    "function": {
                                        "name": mock_tool_name,
                                        "arguments": mock_tool_arguments_str,
                                    },
                                }
                            ],
                        },
                        "finish_reason": None,
                        "logprobs": None,
                        "index": 0,
                    }
                ],
            ),
        ]
        for chunk in mock_chunks:
            print(f"Test chunk:\n{chunk}")
            yield chunk

    chunks = [chunk async for chunk in _process_vllm_chat_completion_stream_response(mock_stream())]
    assert len(chunks) == 2
    assert chunks[-1].event.event_type == ChatCompletionResponseEventType.complete
    assert chunks[-2].event.delta.type == "tool_call"
    assert chunks[-2].event.delta.tool_call.tool_name == mock_tool_name
    assert chunks[-2].event.delta.tool_call.arguments == mock_tool_arguments
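
For orientation, below is a minimal sketch of the behavior this test pins down, assuming a stream processor that buffers OpenAI-style tool-call deltas and flushes them when the stream ends without an explicit finish_reason. The names here (process_stream, BufferedToolCall, the event tuples) are illustrative stand-ins, not the adapter's actual _process_vllm_chat_completion_stream_response internals.

# Illustrative sketch only -- not the actual llama_stack adapter code.
import json
from dataclasses import dataclass
from typing import Any, AsyncIterator


@dataclass
class BufferedToolCall:
    # Accumulates the (possibly partial) name/argument fragments streamed for one tool call.
    name: str = ""
    arguments: str = ""


async def process_stream(stream: AsyncIterator[Any]):
    """Yield ("tool_call", name, args) and ("complete",) events from OpenAI-style chunks."""
    buffered: dict[int, BufferedToolCall] = {}

    async for chunk in stream:
        choice = chunk.choices[0]
        for tc in choice.delta.tool_calls or []:
            buf = buffered.setdefault(tc.index, BufferedToolCall())
            if tc.function.name:
                buf.name += tc.function.name
            if tc.function.arguments:
                buf.arguments += tc.function.arguments
        if choice.finish_reason is not None:
            # Normal path: the model explicitly signalled completion.
            break

    # Flush whatever was buffered. The case exercised by the test above is
    # reaching end-of-stream with finish_reason never set: it must land here
    # and be treated as a valid end of message rather than being dropped.
    for buf in buffered.values():
        yield ("tool_call", buf.name, json.loads(buf.arguments))
    yield ("complete",)

Feeding the single-chunk mock_stream above through a processor like this yields one tool-call event followed by one completion event, which is what the test's two-chunk assertion checks.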