Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-16 23:03:49 +00:00)
chore: update the vLLM inference impl to use OpenAIMixin for openai-compat functions
Inference recordings were captured from Qwen3-0.6B on vLLM 0.8.3, served with:

```
docker run --gpus all -v ~/.cache/huggingface:/root/.cache/huggingface -p 8000:8000 --ipc=host \
    vllm/vllm-openai:latest \
    --model Qwen/Qwen3-0.6B --enable-auto-tool-choice --tool-call-parser hermes
```

Test with:

```
./scripts/integration-tests.sh --stack-config server:ci-tests --setup vllm --subdirs inference
```
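For context on the shape of the change: rather than hand-rolling each openai-compat method, the vLLM adapter can inherit them from `OpenAIMixin` and supply only connection details. A minimal sketch follows; the import path and the `get_base_url()`/`get_api_key()` hook names are assumptions about the mixin's interface, not taken from this diff:

```
# Hedged sketch of the adapter shape; import path and hook names are assumptions.
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin


class VLLMInferenceAdapter(OpenAIMixin):
    """vLLM adapter that inherits the openai-compat methods (chat completions,
    completions, ...) from OpenAIMixin instead of reimplementing them."""

    def __init__(self, url: str, api_token: str = "fake"):
        self.url = url  # e.g. "http://localhost:8000/v1"
        self.api_token = api_token  # vLLM accepts any token by default

    def get_base_url(self) -> str:
        # The mixin builds its OpenAI client against this URL.
        return self.url

    def get_api_key(self) -> str:
        return self.api_token
```

With something of that shape in place, the recordings below exercise the inherited chat-completion path against the vLLM server started above.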
parent c86e45496e
commit c2a9c65fff
33 changed files with 51813 additions and 203 deletions
tests/integration/recordings/responses/033340f58ebc.json (new file, 105 lines)
```
{
  "request": {
    "method": "POST",
    "url": "http://localhost:8000/v1/v1/chat/completions",
    "headers": {},
    "body": {
      "model": "Qwen/Qwen3-0.6B",
      "tools": [
        {
          "type": "function",
          "function": {
            "name": "get_weather",
            "description": "Get the current weather",
            "parameters": {
              "type": "object",
              "properties": {
                "location": {
                  "type": "string",
                  "description": "The city and state (both required), e.g. San Francisco, CA."
                }
              },
              "required": [
                "location"
              ]
            }
          }
        }
      ],
      "messages": [
        {
          "role": "system",
          "content": [
            {
              "type": "text",
              "text": "Pretend you are a weather assistant."
            }
          ]
        },
        {
          "role": "user",
          "content": [
            {
              "type": "text",
              "text": "What's the weather like in San Francisco, CA?"
            }
          ]
        }
      ],
      "stream": false,
      "temperature": 0.0,
      "max_tokens": 4096
    },
    "endpoint": "/v1/chat/completions",
    "model": "Qwen/Qwen3-0.6B"
  },
  "response": {
    "body": {
      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
      "__data__": {
        "id": "chatcmpl-3d55352696e445f598c3d881a1130454",
        "choices": [
          {
            "finish_reason": "tool_calls",
            "index": 0,
            "logprobs": null,
            "message": {
              "content": "<think>\nOkay, the user is asking about the weather in San Francisco, CA. I need to use the get_weather function. The function requires the location parameter, which in this case is \"San Francisco, CA\". I should make sure to format the arguments correctly as a JSON object. Let me check the required parameters again to confirm. The location is required, so I'll include that. No other parameters are needed. Alright, I'll structure the tool call accordingly.\n</think>\n\n",
              "refusal": null,
              "role": "assistant",
              "annotations": null,
              "audio": null,
              "function_call": null,
              "tool_calls": [
                {
                  "id": "chatcmpl-tool-98de38728cf6486d925102e0515d6d8f",
                  "function": {
                    "arguments": "{\"location\": \"San Francisco, CA\"}",
                    "name": "get_weather"
                  },
                  "type": "function"
                }
              ],
              "reasoning_content": null
            },
            "stop_reason": null
          }
        ],
        "created": 1757524137,
        "model": "Qwen/Qwen3-0.6B",
        "object": "chat.completion",
        "service_tier": null,
        "system_fingerprint": null,
        "usage": {
          "completion_tokens": 120,
          "prompt_tokens": 185,
          "total_tokens": 305,
          "completion_tokens_details": null,
          "prompt_tokens_details": null
        },
        "prompt_logprobs": null
      }
    },
    "is_streaming": false
  }
}
```
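Read back as an API call, the recorded request is equivalent to the following openai-client invocation against the vLLM server; this is a reconstruction from the recording, not code from this commit. (The doubled `/v1/v1` in the recorded url appears to be the client's `/v1` base URL concatenated with the `/v1/chat/completions` endpoint string.)

```
# Reconstruction of the recorded request with the openai client; model,
# messages, tool schema, and sampling params mirror the recording above.
# The "fake" API key is an assumption (vLLM requires none by default).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake")

response = client.chat.completions.create(
    model="Qwen/Qwen3-0.6B",
    messages=[
        {"role": "system", "content": "Pretend you are a weather assistant."},
        {"role": "user", "content": "What's the weather like in San Francisco, CA?"},
    ],
    tools=[
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "The city and state (both required), e.g. San Francisco, CA.",
                        }
                    },
                    "required": ["location"],
                },
            },
        }
    ],
    stream=False,
    temperature=0.0,
    max_tokens=4096,
)

# With --enable-auto-tool-choice and --tool-call-parser hermes, the answer
# comes back as a tool call rather than plain text, as in the recording.
print(response.choices[0].finish_reason)  # "tool_calls"
print(response.choices[0].message.tool_calls[0].function.arguments)
```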