Fixed an "out of token budget" tool execution bug in the remote vLLM provider.

ilya-kolchinsky 2025-05-08 10:42:26 +02:00
parent fe5f5e530c
commit 7784307a5f
2 changed files with 141 additions and 39 deletions

@@ -28,6 +28,7 @@ from openai.types.model import Model as OpenAIModel
from llama_stack.apis.inference import (
    ChatCompletionRequest,
    ChatCompletionResponseEventType,
    CompletionMessage,
    SystemMessage,
    ToolChoice,
@@ -294,3 +295,57 @@ async def test_get_params_empty_tools(vllm_inference_adapter):
    )
    params = await vllm_inference_adapter._get_params(request)
    assert "tools" not in params


@pytest.mark.asyncio
async def test_process_vllm_chat_completion_stream_response_no_finish_reason():
    """
    Tests the edge case where the model requests a tool call and stays idle without explicitly providing the
    finish reason.
    We want to make sure that this case is recognized and handled correctly, i.e., as a valid end of message.
    """
    mock_tool_name = "mock_tool"
    mock_tool_arguments = {"arg1": 0, "arg2": 100}
    mock_tool_arguments_str = '"{\\"arg1\\": 0, \\"arg2\\": 100}"'

    async def mock_stream():
        mock_chunks = [
            OpenAIChatCompletionChunk(
                id="chunk-1",
                created=1,
                model="foo",
                object="chat.completion.chunk",
                choices=[
                    {
                        "delta": {
                            "content": None,
                            "tool_calls": [
                                {
                                    "index": 0,
                                    "id": "mock_id",
                                    "type": "function",
                                    "function": {
                                        "name": mock_tool_name,
                                        "arguments": mock_tool_arguments_str,
                                    },
                                }
                            ],
                        },
                        "finish_reason": None,
                        "logprobs": None,
                        "index": 0,
                    }
                ],
            ),
        ]
        for chunk in mock_chunks:
            print(f"Test chunk:\n{chunk}")
            yield chunk

    chunks = [chunk async for chunk in _process_vllm_chat_completion_stream_response(mock_stream())]
    assert len(chunks) == 2
    assert chunks[-1].event.event_type == ChatCompletionResponseEventType.complete
    assert chunks[-2].event.delta.type == "tool_call"
    assert chunks[-2].event.delta.tool_call.tool_name == mock_tool_name
    assert chunks[-2].event.delta.tool_call.arguments == mock_tool_arguments
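
For orientation, below is a minimal sketch of the behavior this test pins down, assuming a stream processor that buffers OpenAI-style tool-call deltas and flushes them when the stream ends without an explicit finish_reason. The names here (process_stream, BufferedToolCall, the event tuples) are illustrative stand-ins, not the adapter's actual _process_vllm_chat_completion_stream_response internals.

# Illustrative sketch only -- not the actual llama_stack adapter code.
import json
from dataclasses import dataclass
from typing import Any, AsyncIterator


@dataclass
class BufferedToolCall:
    # Accumulates the (possibly partial) name/argument fragments streamed for one tool call.
    name: str = ""
    arguments: str = ""


async def process_stream(stream: AsyncIterator[Any]):
    """Yield ("tool_call", name, args) and ("complete",) events from OpenAI-style chunks."""
    buffered: dict[int, BufferedToolCall] = {}

    async for chunk in stream:
        choice = chunk.choices[0]
        for tc in choice.delta.tool_calls or []:
            buf = buffered.setdefault(tc.index, BufferedToolCall())
            if tc.function.name:
                buf.name += tc.function.name
            if tc.function.arguments:
                buf.arguments += tc.function.arguments
        if choice.finish_reason is not None:
            # Normal path: the model explicitly signalled completion.
            break

    # Flush whatever was buffered. The case exercised by the test above is
    # reaching end-of-stream with finish_reason never set: it must land here
    # and be treated as a valid end of message rather than being dropped.
    for buf in buffered.values():
        yield ("tool_call", buf.name, json.loads(buf.arguments))
    yield ("complete",)

Feeding the single-chunk mock_stream above through a processor like this yields one tool-call event followed by one completion event, which is what the test's two-chunk assertion checks.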