test

# What does this PR do? ## Test Plan
2025-10-11 05:38:38 +00:00 · 2025-10-09 20:53:19 -07:00 · 2025-10-09 20:53:19 -07:00 · 4a3d1e33f8
commit 4a3d1e33f8
parent f50ce11a3b
31 changed files with 727 additions and 892 deletions
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 from collections.abc import AsyncIterator
-from typing import Any
 from urllib.parse import urljoin

 import httpx
@@ -15,8 +14,7 @@ from pydantic import ConfigDict

 from llama_stack.apis.inference import (
    OpenAIChatCompletion,
-    OpenAIMessageParam,
-    OpenAIResponseFormatParam,
+    OpenaiChatCompletionRequest,
    ToolChoice,
 )
 from llama_stack.log import get_logger
@@ -79,61 +77,20 @@ class VLLMInferenceAdapter(OpenAIMixin):

    async def openai_chat_completion(
        self,
-        model: str,
-        messages: list[OpenAIMessageParam],
-        frequency_penalty: float | None = None,
-        function_call: str | dict[str, Any] | None = None,
-        functions: list[dict[str, Any]] | None = None,
-        logit_bias: dict[str, float] | None = None,
-        logprobs: bool | None = None,
-        max_completion_tokens: int | None = None,
-        max_tokens: int | None = None,
-        n: int | None = None,
-        parallel_tool_calls: bool | None = None,
-        presence_penalty: float | None = None,
-        response_format: OpenAIResponseFormatParam | None = None,
-        seed: int | None = None,
-        stop: str | list[str] | None = None,
-        stream: bool | None = None,
-        stream_options: dict[str, Any] | None = None,
-        temperature: float | None = None,
-        tool_choice: str | dict[str, Any] | None = None,
-        tools: list[dict[str, Any]] | None = None,
-        top_logprobs: int | None = None,
-        top_p: float | None = None,
-        user: str | None = None,
+        params: "OpenaiChatCompletionRequest",
    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
-        max_tokens = max_tokens or self.config.max_tokens
+        # Copy params to avoid mutating the original
+        params = params.model_copy()
+
+        # Apply vLLM-specific defaults
+        if params.max_tokens is None and self.config.max_tokens:
+            params.max_tokens = self.config.max_tokens

        # This is to be consistent with OpenAI API and support vLLM <= v0.6.3
        # References:
        #   * https://platform.openai.com/docs/api-reference/chat/create#chat-create-tool_choice
        #   * https://github.com/vllm-project/vllm/pull/10000
-        if not tools and tool_choice is not None:
-            tool_choice = ToolChoice.none.value
+        if not params.tools and params.tool_choice is not None:
+            params.tool_choice = ToolChoice.none.value

-        return await super().openai_chat_completion(
-            model=model,
-            messages=messages,
-            frequency_penalty=frequency_penalty,
-            function_call=function_call,
-            functions=functions,
-            logit_bias=logit_bias,
-            logprobs=logprobs,
-            max_completion_tokens=max_completion_tokens,
-            max_tokens=max_tokens,
-            n=n,
-            parallel_tool_calls=parallel_tool_calls,
-            presence_penalty=presence_penalty,
-            response_format=response_format,
-            seed=seed,
-            stop=stop,
-            stream=stream,
-            stream_options=stream_options,
-            temperature=temperature,
-            tool_choice=tool_choice,
-            tools=tools,
-            top_logprobs=top_logprobs,
-            top_p=top_p,
-            user=user,
-        )
+        return await super().openai_chat_completion(params)