feat: support passing "extra body" through to providers

# What does this PR do? Allows passing `extra_body` parameters through to inference providers. Closes #2720. ## Test Plan CI, plus a newly added test.
2025-10-12 13:57:57 +00:00 · 2025-10-10 15:37:57 -07:00 · 2025-10-10 15:37:57 -07:00 · bb5dc85012
commit bb5dc85012
parent cb7fb0705b
35 changed files with 1895 additions and 197 deletions
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 from collections.abc import AsyncIterator
+from typing import Any
 from urllib.parse import urljoin

 import httpx
@@ -14,7 +15,7 @@ from pydantic import ConfigDict

 from llama_stack.apis.inference import (
    OpenAIChatCompletion,
-    OpenAIChatCompletionRequest,
+    OpenAIChatCompletionRequestWithExtraBody,
    ToolChoice,
 )
 from llama_stack.log import get_logger
@@ -93,7 +94,7 @@ class VLLMInferenceAdapter(OpenAIMixin):

    async def openai_chat_completion(
        self,
-        params: OpenAIChatCompletionRequest,
+        params: OpenAIChatCompletionRequestWithExtraBody,
    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
        params = params.model_copy()