refactor

# What does this PR do?

## Test Plan
2025-10-12 05:54:38 +00:00 · 2025-10-10 10:55:28 -07:00 · 2025-10-10 10:55:28 -07:00 · ab7888e927
commit ab7888e927
parent 548ccff368
32 changed files with 571 additions and 885 deletions
--- a/tests/unit/providers/inference/test_remote_vllm.py
+++ b/tests/unit/providers/inference/test_remote_vllm.py
@@ -13,6 +13,7 @@ import pytest
 from llama_stack.apis.inference import (
    OpenAIAssistantMessageParam,
    OpenAIChatCompletion,
+    OpenaiChatCompletionRequest,
    OpenAIChoice,
    ToolChoice,
 )
@@ -56,13 +57,14 @@ async def test_old_vllm_tool_choice(vllm_inference_adapter):
        mock_client_property.return_value = mock_client

        # No tools but auto tool choice
-        await vllm_inference_adapter.openai_chat_completion(
-            "mock-model",
-            [],
+        params = OpenaiChatCompletionRequest(
+            model="mock-model",
+            messages=[{"role": "user", "content": "test"}],
            stream=False,
            tools=None,
            tool_choice=ToolChoice.auto.value,
        )
+        await vllm_inference_adapter.openai_chat_completion(params)
        mock_client.chat.completions.create.assert_called()
        call_args = mock_client.chat.completions.create.call_args
        # Ensure tool_choice gets converted to none for older vLLM versions
@@ -171,9 +173,12 @@ async def test_openai_chat_completion_is_async(vllm_inference_adapter):
        )

    async def do_inference():
-        await vllm_inference_adapter.openai_chat_completion(
-            "mock-model", messages=["one fish", "two fish"], stream=False
+        params = OpenaiChatCompletionRequest(
+            model="mock-model",
+            messages=[{"role": "user", "content": "one fish two fish"}],
+            stream=False,
        )
+        await vllm_inference_adapter.openai_chat_completion(params)

    with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_create_client:
        mock_client = MagicMock()