diff --git a/docs/static/deprecated-llama-stack-spec.html b/docs/static/deprecated-llama-stack-spec.html index f9bcb48f7..61fc8966a 100644 --- a/docs/static/deprecated-llama-stack-spec.html +++ b/docs/static/deprecated-llama-stack-spec.html @@ -8097,17 +8097,6 @@ "type": "string", "description": "(Optional) The user to use." }, - "guided_choice": { - "type": "array", - "items": { - "type": "string" - }, - "description": "(Optional) vLLM-specific parameter for guided generation with a list of choices." - }, - "prompt_logprobs": { - "type": "integer", - "description": "(Optional) vLLM-specific parameter for number of log probabilities to return for prompt tokens." - }, "suffix": { "type": "string", "description": "(Optional) The suffix that should be appended to the completion." diff --git a/docs/static/deprecated-llama-stack-spec.yaml b/docs/static/deprecated-llama-stack-spec.yaml index 552555f7a..2aaada447 100644 --- a/docs/static/deprecated-llama-stack-spec.yaml +++ b/docs/static/deprecated-llama-stack-spec.yaml @@ -5973,18 +5973,6 @@ components: user: type: string description: (Optional) The user to use. - guided_choice: - type: array - items: - type: string - description: >- - (Optional) vLLM-specific parameter for guided generation with a list of - choices. - prompt_logprobs: - type: integer - description: >- - (Optional) vLLM-specific parameter for number of log probabilities to - return for prompt tokens. suffix: type: string description: >- diff --git a/docs/static/llama-stack-spec.html b/docs/static/llama-stack-spec.html index 8f8ff66c9..8b0b47394 100644 --- a/docs/static/llama-stack-spec.html +++ b/docs/static/llama-stack-spec.html @@ -5593,17 +5593,6 @@ "type": "string", "description": "(Optional) The user to use." }, - "guided_choice": { - "type": "array", - "items": { - "type": "string" - }, - "description": "(Optional) vLLM-specific parameter for guided generation with a list of choices." - }, - "prompt_logprobs": { - "type": "integer", - "description": "(Optional) vLLM-specific parameter for number of log probabilities to return for prompt tokens." - }, "suffix": { "type": "string", "description": "(Optional) The suffix that should be appended to the completion." diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml index 97742f19a..d6b16b517 100644 --- a/docs/static/llama-stack-spec.yaml +++ b/docs/static/llama-stack-spec.yaml @@ -4222,18 +4222,6 @@ components: user: type: string description: (Optional) The user to use. - guided_choice: - type: array - items: - type: string - description: >- - (Optional) vLLM-specific parameter for guided generation with a list of - choices. - prompt_logprobs: - type: integer - description: >- - (Optional) vLLM-specific parameter for number of log probabilities to - return for prompt tokens. suffix: type: string description: >- diff --git a/docs/static/stainless-llama-stack-spec.html b/docs/static/stainless-llama-stack-spec.html index fcdcd76c5..33a5bfdf5 100644 --- a/docs/static/stainless-llama-stack-spec.html +++ b/docs/static/stainless-llama-stack-spec.html @@ -7602,17 +7602,6 @@ "type": "string", "description": "(Optional) The user to use." }, - "guided_choice": { - "type": "array", - "items": { - "type": "string" - }, - "description": "(Optional) vLLM-specific parameter for guided generation with a list of choices." - }, - "prompt_logprobs": { - "type": "integer", - "description": "(Optional) vLLM-specific parameter for number of log probabilities to return for prompt tokens." - }, "suffix": { "type": "string", "description": "(Optional) The suffix that should be appended to the completion." diff --git a/docs/static/stainless-llama-stack-spec.yaml b/docs/static/stainless-llama-stack-spec.yaml index 09fc3ded4..968fca37d 100644 --- a/docs/static/stainless-llama-stack-spec.yaml +++ b/docs/static/stainless-llama-stack-spec.yaml @@ -5667,18 +5667,6 @@ components: user: type: string description: (Optional) The user to use. - guided_choice: - type: array - items: - type: string - description: >- - (Optional) vLLM-specific parameter for guided generation with a list of - choices. - prompt_logprobs: - type: integer - description: >- - (Optional) vLLM-specific parameter for number of log probabilities to - return for prompt tokens. suffix: type: string description: >- diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py index fb3e78afc..4570eaa71 100644 --- a/llama_stack/apis/inference/inference.py +++ b/llama_stack/apis/inference/inference.py @@ -1058,8 +1058,6 @@ class OpenAICompletionRequest(BaseModel): :param top_p: (Optional) The top p to use. :param user: (Optional) The user to use. :param suffix: (Optional) The suffix that should be appended to the completion. - :param guided_choice: (Optional) vLLM-specific parameter for guided generation with a list of choices. - :param prompt_logprobs: (Optional) vLLM-specific parameter for number of log probabilities to return for prompt tokens. """ model_config = ConfigDict(extra="allow") @@ -1082,12 +1080,6 @@ class OpenAICompletionRequest(BaseModel): temperature: float | None = None top_p: float | None = None user: str | None = None - - # vLLM-specific parameters (documented here but also allowed via extra fields) - guided_choice: list[str] | None = None - prompt_logprobs: int | None = None - - # for fill-in-the-middle type completion suffix: str | None = None diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py index 5c7532e70..1a02f7a12 100644 --- a/llama_stack/core/routers/inference.py +++ b/llama_stack/core/routers/inference.py @@ -194,12 +194,13 @@ class InferenceRouter(Inference): params.model = model_obj.identifier provider = await self.routing_table.get_provider_impl(model_obj.identifier) + extra_body = dict(params.__pydantic_extra__ or {}) if params.stream: - return await provider.openai_completion(params) + return await provider.openai_completion(params, **extra_body) # TODO: Metrics do NOT work with openai_completion stream=True due to the fact # that we do not return an AsyncIterator, our tests expect a stream of chunks we cannot intercept currently. - response = await provider.openai_completion(params) + response = await provider.openai_completion(params, **extra_body) if self.telemetry: metrics = self._construct_metrics( prompt_tokens=response.usage.prompt_tokens, @@ -246,7 +247,8 @@ class InferenceRouter(Inference): provider = await self.routing_table.get_provider_impl(model_obj.identifier) if params.stream: - response_stream = await provider.openai_chat_completion(params) + extra_body = dict(params.__pydantic_extra__ or {}) + response_stream = await provider.openai_chat_completion(params, **extra_body) # For streaming, the provider returns AsyncIterator[OpenAIChatCompletionChunk] # We need to add metrics to each chunk and store the final completion @@ -319,7 +321,8 @@ class InferenceRouter(Inference): async def _nonstream_openai_chat_completion( self, provider: Inference, params: OpenAIChatCompletionRequest ) -> OpenAIChatCompletion: - response = await provider.openai_chat_completion(params) + extra_body = dict(params.__pydantic_extra__ or {}) + response = await provider.openai_chat_completion(params, **extra_body) for choice in response.choices: # some providers return an empty list for no tool calls in non-streaming responses # but the OpenAI API returns None. So, set tool_calls to None if it's empty diff --git a/llama_stack/providers/remote/inference/databricks/databricks.py b/llama_stack/providers/remote/inference/databricks/databricks.py index 512913226..c6d9a7ec6 100644 --- a/llama_stack/providers/remote/inference/databricks/databricks.py +++ b/llama_stack/providers/remote/inference/databricks/databricks.py @@ -5,6 +5,7 @@ # the root directory of this source tree. from collections.abc import Iterable +from typing import Any from databricks.sdk import WorkspaceClient @@ -40,5 +41,6 @@ class DatabricksInferenceAdapter(OpenAIMixin): async def openai_completion( self, params: OpenAICompletionRequest, + **kwargs: Any, ) -> OpenAICompletion: raise NotImplementedError() diff --git a/llama_stack/providers/remote/inference/llama_openai_compat/llama.py b/llama_stack/providers/remote/inference/llama_openai_compat/llama.py index 5a8bdd55e..12f58e919 100644 --- a/llama_stack/providers/remote/inference/llama_openai_compat/llama.py +++ b/llama_stack/providers/remote/inference/llama_openai_compat/llama.py @@ -3,6 +3,8 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any + from llama_stack.apis.inference.inference import OpenAICompletion, OpenAICompletionRequest, OpenAIEmbeddingsResponse from llama_stack.log import get_logger from llama_stack.providers.remote.inference.llama_openai_compat.config import LlamaCompatConfig @@ -30,6 +32,7 @@ class LlamaCompatInferenceAdapter(OpenAIMixin): async def openai_completion( self, params: OpenAICompletionRequest, + **kwargs: Any, ) -> OpenAICompletion: raise NotImplementedError() @@ -40,5 +43,6 @@ class LlamaCompatInferenceAdapter(OpenAIMixin): encoding_format: str | None = "float", dimensions: int | None = None, user: str | None = None, + **kwargs: Any, ) -> OpenAIEmbeddingsResponse: raise NotImplementedError() diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py index b09326271..36b7d43d9 100644 --- a/llama_stack/providers/remote/inference/vllm/vllm.py +++ b/llama_stack/providers/remote/inference/vllm/vllm.py @@ -4,6 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. from collections.abc import AsyncIterator +from typing import Any from urllib.parse import urljoin import httpx @@ -94,6 +95,7 @@ class VLLMInferenceAdapter(OpenAIMixin): async def openai_chat_completion( self, params: OpenAIChatCompletionRequest, + **kwargs: Any, ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: params = params.model_copy() @@ -108,4 +110,4 @@ class VLLMInferenceAdapter(OpenAIMixin): if not params.tools and params.tool_choice is not None: params.tool_choice = ToolChoice.none.value - return await super().openai_chat_completion(params) + return await super().openai_chat_completion(params, **kwargs) diff --git a/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/llama_stack/providers/utils/inference/litellm_openai_mixin.py index eed078a0e..d35652b97 100644 --- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py +++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py @@ -248,8 +248,6 @@ class LiteLLMOpenAIMixin( temperature=params.temperature, top_p=params.top_p, user=params.user, - guided_choice=params.guided_choice, - prompt_logprobs=params.prompt_logprobs, suffix=params.suffix, api_key=self.get_api_key(), api_base=self.api_base, diff --git a/llama_stack/providers/utils/inference/openai_mixin.py b/llama_stack/providers/utils/inference/openai_mixin.py index 502bc207b..a7b54f2af 100644 --- a/llama_stack/providers/utils/inference/openai_mixin.py +++ b/llama_stack/providers/utils/inference/openai_mixin.py @@ -224,20 +224,11 @@ class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel): async def openai_completion( self, params: OpenAICompletionRequest, + **kwargs: Any, ) -> OpenAICompletion: """ Direct OpenAI completion API call. """ - # Handle parameters that are not supported by OpenAI API, but may be by the provider - # prompt_logprobs is supported by vLLM - # guided_choice is supported by vLLM - # TODO: test coverage - extra_body: dict[str, Any] = {} - if params.prompt_logprobs is not None and params.prompt_logprobs >= 0: - extra_body["prompt_logprobs"] = params.prompt_logprobs - if params.guided_choice: - extra_body["guided_choice"] = params.guided_choice - # TODO: fix openai_completion to return type compatible with OpenAI's API response completion_kwargs = await prepare_openai_completion_params( model=await self._get_provider_model_id(params.model), @@ -259,13 +250,16 @@ class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel): user=params.user, suffix=params.suffix, ) - resp = await self.client.completions.create(**completion_kwargs, extra_body=extra_body) + if extra_body := kwargs: + completion_kwargs["extra_body"] = extra_body + resp = await self.client.completions.create(**completion_kwargs) return await self._maybe_overwrite_id(resp, params.stream) # type: ignore[no-any-return] async def openai_chat_completion( self, params: OpenAIChatCompletionRequest, + **kwargs: Any, ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: """ Direct OpenAI chat completion API call. @@ -316,6 +310,8 @@ class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel): user=params.user, ) + if extra_body := kwargs: + request_params["extra_body"] = extra_body resp = await self.client.chat.completions.create(**request_params) return await self._maybe_overwrite_id(resp, params.stream) # type: ignore[no-any-return] diff --git a/tests/integration/common/recordings/models-64a2277c90f0f42576f60c1030e3a020403d34a95f56931b792d5939f4cebc57-9ecd9600.json b/tests/integration/common/recordings/models-64a2277c90f0f42576f60c1030e3a020403d34a95f56931b792d5939f4cebc57-9ecd9600.json new file mode 100644 index 000000000..2d89edb5a --- /dev/null +++ b/tests/integration/common/recordings/models-64a2277c90f0f42576f60c1030e3a020403d34a95f56931b792d5939f4cebc57-9ecd9600.json @@ -0,0 +1,881 @@ +{ + "test_id": null, + "request": { + "method": "POST", + "url": "https://api.openai.com/v1/v1/models", + "headers": {}, + "body": {}, + "endpoint": "/v1/models", + "model": "" + }, + "response": { + "body": [ + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4-0613", + "created": 1686588896, + "object": "model", + "owned_by": "openai" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4", + "created": 1687882411, + "object": "model", + "owned_by": "openai" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-3.5-turbo", + "created": 1677610602, + "object": "model", + "owned_by": "openai" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "sora-2-pro", + "created": 1759708663, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-audio-mini-2025-10-06", + "created": 1759512137, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-realtime-mini", + "created": 1759517133, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-realtime-mini-2025-10-06", + "created": 1759517175, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "sora-2", + "created": 1759708615, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "davinci-002", + "created": 1692634301, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "babbage-002", + "created": 1692634615, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-3.5-turbo-instruct", + "created": 1692901427, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-3.5-turbo-instruct-0914", + "created": 1694122472, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "dall-e-3", + "created": 1698785189, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "dall-e-2", + "created": 1698798177, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4-1106-preview", + "created": 1698957206, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-3.5-turbo-1106", + "created": 1698959748, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "tts-1-hd", + "created": 1699046015, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "tts-1-1106", + "created": 1699053241, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "tts-1-hd-1106", + "created": 1699053533, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "text-embedding-3-small", + "created": 1705948997, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "text-embedding-3-large", + "created": 1705953180, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4-0125-preview", + "created": 1706037612, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4-turbo-preview", + "created": 1706037777, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-3.5-turbo-0125", + "created": 1706048358, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4-turbo", + "created": 1712361441, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4-turbo-2024-04-09", + "created": 1712601677, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o", + "created": 1715367049, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-2024-05-13", + "created": 1715368132, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-mini-2024-07-18", + "created": 1721172717, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-mini", + "created": 1721172741, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-2024-08-06", + "created": 1722814719, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "chatgpt-4o-latest", + "created": 1723515131, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o1-mini-2024-09-12", + "created": 1725648979, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o1-mini", + "created": 1725649008, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-realtime-preview-2024-10-01", + "created": 1727131766, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-audio-preview-2024-10-01", + "created": 1727389042, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-audio-preview", + "created": 1727460443, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-realtime-preview", + "created": 1727659998, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "omni-moderation-latest", + "created": 1731689265, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "omni-moderation-2024-09-26", + "created": 1732734466, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-realtime-preview-2024-12-17", + "created": 1733945430, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-audio-preview-2024-12-17", + "created": 1734034239, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-mini-realtime-preview-2024-12-17", + "created": 1734112601, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-mini-audio-preview-2024-12-17", + "created": 1734115920, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o1-2024-12-17", + "created": 1734326976, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o1", + "created": 1734375816, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-mini-realtime-preview", + "created": 1734387380, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-mini-audio-preview", + "created": 1734387424, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o3-mini", + "created": 1737146383, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o3-mini-2025-01-31", + "created": 1738010200, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-2024-11-20", + "created": 1739331543, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-search-preview-2025-03-11", + "created": 1741388170, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-search-preview", + "created": 1741388720, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-mini-search-preview-2025-03-11", + "created": 1741390858, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-mini-search-preview", + "created": 1741391161, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-transcribe", + "created": 1742068463, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-mini-transcribe", + "created": 1742068596, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o1-pro-2025-03-19", + "created": 1742251504, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o1-pro", + "created": 1742251791, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-mini-tts", + "created": 1742403959, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o3-2025-04-16", + "created": 1744133301, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o4-mini-2025-04-16", + "created": 1744133506, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o3", + "created": 1744225308, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o4-mini", + "created": 1744225351, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4.1-2025-04-14", + "created": 1744315746, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4.1", + "created": 1744316542, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4.1-mini-2025-04-14", + "created": 1744317547, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4.1-mini", + "created": 1744318173, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4.1-nano-2025-04-14", + "created": 1744321025, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4.1-nano", + "created": 1744321707, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-image-1", + "created": 1745517030, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "codex-mini-latest", + "created": 1746673257, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-realtime-preview-2025-06-03", + "created": 1748907838, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-audio-preview-2025-06-03", + "created": 1748908498, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o4-mini-deep-research", + "created": 1749685485, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o4-mini-deep-research-2025-06-26", + "created": 1750866121, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-5-chat-latest", + "created": 1754073306, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-5-2025-08-07", + "created": 1754075360, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-5", + "created": 1754425777, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-5-mini-2025-08-07", + "created": 1754425867, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-5-mini", + "created": 1754425928, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-5-nano-2025-08-07", + "created": 1754426303, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-5-nano", + "created": 1754426384, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-audio-2025-08-28", + "created": 1756256146, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-realtime", + "created": 1756271701, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-realtime-2025-08-28", + "created": 1756271773, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-audio", + "created": 1756339249, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-5-codex", + "created": 1757527818, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-image-1-mini", + "created": 1758845821, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-5-pro-2025-10-06", + "created": 1759469707, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-5-pro", + "created": 1759469822, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-audio-mini", + "created": 1759512027, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-3.5-turbo-16k", + "created": 1683758102, + "object": "model", + "owned_by": "openai-internal" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "tts-1", + "created": 1681940951, + "object": "model", + "owned_by": "openai-internal" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "whisper-1", + "created": 1677532384, + "object": "model", + "owned_by": "openai-internal" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "text-embedding-ada-002", + "created": 1671217299, + "object": "model", + "owned_by": "openai-internal" + } + } + ], + "is_streaming": false + }, + "id_normalization_mapping": {} +} diff --git a/tests/integration/common/recordings/models-64a2277c90f0f42576f60c1030e3a020403d34a95f56931b792d5939f4cebc57-fb68f5a6.json b/tests/integration/common/recordings/models-64a2277c90f0f42576f60c1030e3a020403d34a95f56931b792d5939f4cebc57-fb68f5a6.json new file mode 100644 index 000000000..4d3e7b685 --- /dev/null +++ b/tests/integration/common/recordings/models-64a2277c90f0f42576f60c1030e3a020403d34a95f56931b792d5939f4cebc57-fb68f5a6.json @@ -0,0 +1,45 @@ +{ + "test_id": null, + "request": { + "method": "POST", + "url": "http://localhost:8000/v1/v1/models", + "headers": {}, + "body": {}, + "endpoint": "/v1/models", + "model": "" + }, + "response": { + "body": [ + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "Qwen/Qwen3-0.6B", + "created": 1760133400, + "object": "model", + "owned_by": "vllm", + "root": "Qwen/Qwen3-0.6B", + "parent": null, + "max_model_len": 4096, + "permission": [ + { + "id": "modelperm-c53f50aaa5e8413ca316dd27c5867394", + "object": "model_permission", + "created": 1760133400, + "allow_create_engine": false, + "allow_sampling": true, + "allow_logprobs": true, + "allow_search_indices": false, + "allow_view": true, + "allow_fine_tuning": false, + "organization": "*", + "group": null, + "is_blocking": false + } + ] + } + } + ], + "is_streaming": false + }, + "id_normalization_mapping": {} +} diff --git a/tests/integration/common/recordings/models-bd3df37825f32706c88677a327960bfa47dcf93f2ea6ed882f1186cf4fdda5bb-f15cee9a.json b/tests/integration/common/recordings/models-bd3df37825f32706c88677a327960bfa47dcf93f2ea6ed882f1186cf4fdda5bb-f15cee9a.json new file mode 100644 index 000000000..84e8eec92 --- /dev/null +++ b/tests/integration/common/recordings/models-bd3df37825f32706c88677a327960bfa47dcf93f2ea6ed882f1186cf4fdda5bb-f15cee9a.json @@ -0,0 +1,543 @@ +{ + "test_id": null, + "request": { + "method": "POST", + "url": "https://api.fireworks.ai/inference/v1/v1/models", + "headers": {}, + "body": {}, + "endpoint": "/v1/models", + "model": "" + }, + "response": { + "body": [ + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/flux-1-dev-fp8", + "created": 1729532889, + "object": "model", + "owned_by": "fireworks", + "kind": "FLUMINA_BASE_MODEL", + "supports_chat": false, + "supports_image_input": false, + "supports_tools": false + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/flux-kontext-max", + "created": 1750714611, + "object": "model", + "owned_by": "fireworks", + "kind": "FLUMINA_BASE_MODEL", + "supports_chat": true, + "supports_image_input": true, + "supports_tools": false + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/flux-kontext-pro", + "created": 1750488264, + "object": "model", + "owned_by": "fireworks", + "kind": "FLUMINA_BASE_MODEL", + "supports_chat": true, + "supports_image_input": true, + "supports_tools": false + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/sentientfoundation-serverless/models/dobby-mini-unhinged-plus-llama-3-1-8b", + "created": 1748467427, + "object": "model", + "owned_by": "sentientfoundation-serverless", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/sentientfoundation/models/dobby-unhinged-llama-3-3-70b-new", + "created": 1739563474, + "object": "model", + "owned_by": "sentientfoundation", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/gpt-oss-120b", + "created": 1754345600, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/qwen3-235b-a22b-instruct-2507", + "created": 1753124424, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 262144 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/qwen3-235b-a22b-thinking-2507", + "created": 1753455434, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false, + "context_length": 262144 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/deepseek-v3-0324", + "created": 1742827220, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 163840 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/kimi-k2-instruct", + "created": 1752259096, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/gpt-oss-20b", + "created": 1754345466, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/kimi-k2-instruct-0905", + "created": 1757018994, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 262144 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/llama-v3p3-70b-instruct", + "created": 1733442103, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/qwen3-235b-a22b", + "created": 1745885249, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/glm-4p5-air", + "created": 1754089426, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/deepseek-v3p1", + "created": 1755758988, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 163840 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/flux-1-schnell-fp8", + "created": 1729535376, + "object": "model", + "owned_by": "fireworks", + "kind": "FLUMINA_BASE_MODEL", + "supports_chat": false, + "supports_image_input": false, + "supports_tools": false + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/llama-v3p1-405b-instruct", + "created": 1721428386, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/llama4-scout-instruct-basic", + "created": 1743878279, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": true, + "supports_tools": true, + "context_length": 1048576 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/qwen3-30b-a3b", + "created": 1745878133, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/llama-v3p1-70b-instruct", + "created": 1721287357, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/deepseek-r1-0528", + "created": 1748456377, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 163840 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/mixtral-8x22b-instruct", + "created": 1713375508, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 65536 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/llama4-maverick-instruct-basic", + "created": 1743878495, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": true, + "supports_tools": true, + "context_length": 1048576 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/qwen2p5-vl-32b-instruct", + "created": 1743392739, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": true, + "supports_tools": false, + "context_length": 128000 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/deepseek-v3p1-terminus", + "created": 1758586241, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 163840 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/llama-v3p1-8b-instruct", + "created": 1721692808, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/qwen3-coder-480b-a35b-instruct", + "created": 1753211090, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 262144 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/qwen3-30b-a3b-thinking-2507", + "created": 1753916446, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/qwen3-embedding-8b", + "created": 1755707090, + "object": "model", + "owned_by": "fireworks", + "kind": "EMBEDDING_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false, + "context_length": 40960 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/qwen3-reranker-8b", + "created": 1759865045, + "object": "model", + "owned_by": "fireworks", + "kind": "EMBEDDING_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false, + "context_length": 40960 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/glm-4p5", + "created": 1753809636, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/qwen3-coder-30b-a3b-instruct", + "created": 1754063588, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false, + "context_length": 262144 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/deepseek-r1", + "created": 1737397673, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false, + "context_length": 163840 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/deepseek-v3", + "created": 1735576668, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/deepseek-r1-basic", + "created": 1742306746, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false, + "context_length": 163840 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/qwen3-30b-a3b-instruct-2507", + "created": 1753808388, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false, + "context_length": 262144 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/tvergho-87e44d/models/debatecards-70b-ft-3epoch-dpo-v2", + "created": 1743381121, + "object": "model", + "owned_by": "tvergho-87e44d", + "kind": "HF_PEFT_ADDON", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false + } + } + ], + "is_streaming": false + }, + "id_normalization_mapping": {} +} diff --git a/tests/integration/inference/recordings/d2ba309413e85d6166f7543a879b890b4e65a5f9917a2d75c5795782ab7cbfff.json b/tests/integration/inference/recordings/d2ba309413e85d6166f7543a879b890b4e65a5f9917a2d75c5795782ab7cbfff.json new file mode 100644 index 000000000..6b726d9fe --- /dev/null +++ b/tests/integration/inference/recordings/d2ba309413e85d6166f7543a879b890b4e65a5f9917a2d75c5795782ab7cbfff.json @@ -0,0 +1,48 @@ +{ + "test_id": "tests/integration/inference/test_openai_completion.py::test_openai_completion_guided_choice[txt=vllm/Qwen/Qwen3-0.6B]", + "request": { + "method": "POST", + "url": "http://localhost:8000/v1/v1/completions", + "headers": {}, + "body": { + "model": "Qwen/Qwen3-0.6B", + "prompt": "I am feeling really sad today.", + "stream": false + }, + "endpoint": "/v1/completions", + "model": "Qwen/Qwen3-0.6B" + }, + "response": { + "body": { + "__type__": "openai.types.completion.Completion", + "__data__": { + "id": "rec-d2ba309413e8", + "choices": [ + { + "finish_reason": "length", + "index": 0, + "logprobs": null, + "text": " I have been working on a project that I feel like I'm not doing well", + "stop_reason": null, + "prompt_logprobs": null + } + ], + "created": 0, + "model": "Qwen/Qwen3-0.6B", + "object": "text_completion", + "system_fingerprint": null, + "usage": { + "completion_tokens": 16, + "prompt_tokens": 7, + "total_tokens": 23, + "completion_tokens_details": null, + "prompt_tokens_details": null + }, + "service_tier": null, + "kv_transfer_params": null + } + }, + "is_streaming": false + }, + "id_normalization_mapping": {} +} diff --git a/tests/integration/inference/recordings/e3727f6c749ab8bdee2f581300092002485023b937d72b7aa8d4c15c9204fc5c.json b/tests/integration/inference/recordings/e3727f6c749ab8bdee2f581300092002485023b937d72b7aa8d4c15c9204fc5c.json new file mode 100644 index 000000000..21cc0300f --- /dev/null +++ b/tests/integration/inference/recordings/e3727f6c749ab8bdee2f581300092002485023b937d72b7aa8d4c15c9204fc5c.json @@ -0,0 +1,54 @@ +{ + "test_id": "tests/integration/inference/test_openai_completion.py::test_openai_completion_guided_choice[txt=vllm/Qwen/Qwen3-0.6B]", + "request": { + "method": "POST", + "url": "http://localhost:8000/v1/v1/completions", + "headers": {}, + "body": { + "model": "Qwen/Qwen3-0.6B", + "prompt": "I am feeling really sad today.", + "stream": false, + "extra_body": { + "guided_choices": [ + "joy", + "sadness" + ] + } + }, + "endpoint": "/v1/completions", + "model": "Qwen/Qwen3-0.6B" + }, + "response": { + "body": { + "__type__": "openai.types.completion.Completion", + "__data__": { + "id": "rec-e3727f6c749a", + "choices": [ + { + "finish_reason": "length", + "index": 0, + "logprobs": null, + "text": " I feel that I am not good enough, and I feel like I have no", + "stop_reason": null, + "prompt_logprobs": null + } + ], + "created": 0, + "model": "Qwen/Qwen3-0.6B", + "object": "text_completion", + "system_fingerprint": null, + "usage": { + "completion_tokens": 16, + "prompt_tokens": 7, + "total_tokens": 23, + "completion_tokens_details": null, + "prompt_tokens_details": null + }, + "service_tier": null, + "kv_transfer_params": null + } + }, + "is_streaming": false + }, + "id_normalization_mapping": {} +} diff --git a/tests/integration/inference/recordings/f02f1bfd75adaea87b91dedc59430b99015b5ed0e2bbf24418a31146ffcbca9b.json b/tests/integration/inference/recordings/f02f1bfd75adaea87b91dedc59430b99015b5ed0e2bbf24418a31146ffcbca9b.json new file mode 100644 index 000000000..8a54ca1f7 --- /dev/null +++ b/tests/integration/inference/recordings/f02f1bfd75adaea87b91dedc59430b99015b5ed0e2bbf24418a31146ffcbca9b.json @@ -0,0 +1,54 @@ +{ + "test_id": "tests/integration/inference/test_openai_completion.py::test_openai_completion_guided_choice[txt=vllm/Qwen/Qwen3-0.6B]", + "request": { + "method": "POST", + "url": "http://localhost:8000/v1/v1/completions", + "headers": {}, + "body": { + "model": "Qwen/Qwen3-0.6B", + "prompt": "I am feeling really sad today.", + "stream": false, + "extra_body": { + "guided_choice": [ + "joy", + "sadness" + ] + } + }, + "endpoint": "/v1/completions", + "model": "Qwen/Qwen3-0.6B" + }, + "response": { + "body": { + "__type__": "openai.types.completion.Completion", + "__data__": { + "id": "rec-f02f1bfd75ad", + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "text": "sadness", + "stop_reason": null, + "prompt_logprobs": null + } + ], + "created": 0, + "model": "Qwen/Qwen3-0.6B", + "object": "text_completion", + "system_fingerprint": null, + "usage": { + "completion_tokens": 3, + "prompt_tokens": 7, + "total_tokens": 10, + "completion_tokens_details": null, + "prompt_tokens_details": null + }, + "service_tier": null, + "kv_transfer_params": null + } + }, + "is_streaming": false + }, + "id_normalization_mapping": {} +} diff --git a/tests/integration/inference/test_openai_completion.py b/tests/integration/inference/test_openai_completion.py index 2c065560e..3f0cffb2d 100644 --- a/tests/integration/inference/test_openai_completion.py +++ b/tests/integration/inference/test_openai_completion.py @@ -223,7 +223,7 @@ def test_openai_completion_guided_choice(llama_stack_client, client_with_models, model=text_model_id, prompt=prompt, stream=False, - guided_choice=["joy", "sadness"], + extra_body={"guided_choice": ["joy", "sadness"]}, ) assert len(response.choices) > 0 choice = response.choices[0] diff --git a/tests/unit/providers/inference/test_remote_vllm.py b/tests/unit/providers/inference/test_remote_vllm.py index 569fb5031..2312e36a5 100644 --- a/tests/unit/providers/inference/test_remote_vllm.py +++ b/tests/unit/providers/inference/test_remote_vllm.py @@ -15,9 +15,14 @@ from llama_stack.apis.inference import ( OpenAIChatCompletion, OpenAIChatCompletionRequest, OpenAIChoice, + OpenAICompletion, + OpenAICompletionChoice, + OpenAICompletionRequest, ToolChoice, ) from llama_stack.apis.models import Model +from llama_stack.core.routers.inference import InferenceRouter +from llama_stack.core.routing_tables.models import ModelsRoutingTable from llama_stack.providers.datatypes import HealthStatus from llama_stack.providers.remote.inference.vllm.config import VLLMInferenceAdapterConfig from llama_stack.providers.remote.inference.vllm.vllm import VLLMInferenceAdapter @@ -191,3 +196,148 @@ async def test_openai_chat_completion_is_async(vllm_inference_adapter): assert mock_create_client.call_count == 4 # no cheating assert total_time < (sleep_time * 2), f"Total time taken: {total_time}s exceeded expected max" + + +async def test_vllm_completion_extra_body(): + """ + Test that vLLM-specific guided_choice and prompt_logprobs parameters are correctly forwarded + via extra_body to the underlying OpenAI client through the InferenceRouter. + """ + # Set up the vLLM adapter + config = VLLMInferenceAdapterConfig(url="http://mocked.localhost:12345") + vllm_adapter = VLLMInferenceAdapter(config=config) + vllm_adapter.__provider_id__ = "vllm" + await vllm_adapter.initialize() + + # Create a mock model store + mock_model_store = AsyncMock() + mock_model = Model(identifier="mock-model", provider_resource_id="mock-model", provider_id="vllm") + mock_model_store.get_model.return_value = mock_model + mock_model_store.has_model.return_value = True + + # Create a mock dist_registry + mock_dist_registry = MagicMock() + mock_dist_registry.get = AsyncMock(return_value=mock_model) + mock_dist_registry.set = AsyncMock() + + # Set up the routing table + routing_table = ModelsRoutingTable( + impls_by_provider_id={"vllm": vllm_adapter}, + dist_registry=mock_dist_registry, + policy=[], + ) + # Inject the model store into the adapter + vllm_adapter.model_store = routing_table + + # Create the InferenceRouter + router = InferenceRouter(routing_table=routing_table) + + # Patch the OpenAI client + with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_client_property: + mock_client = MagicMock() + mock_client.completions.create = AsyncMock( + return_value=OpenAICompletion( + id="cmpl-abc123", + created=1, + model="mock-model", + choices=[ + OpenAICompletionChoice( + text="joy", + finish_reason="stop", + index=0, + ) + ], + ) + ) + mock_client_property.return_value = mock_client + + # Test with guided_choice and prompt_logprobs as extra fields + params = OpenAICompletionRequest( + model="mock-model", + prompt="I am feeling happy", + stream=False, + guided_choice=["joy", "sadness"], + prompt_logprobs=5, + ) + await router.openai_completion(params) + + # Verify that the client was called with extra_body containing both parameters + mock_client.completions.create.assert_called_once() + call_kwargs = mock_client.completions.create.call_args.kwargs + assert "extra_body" in call_kwargs + assert "guided_choice" in call_kwargs["extra_body"] + assert call_kwargs["extra_body"]["guided_choice"] == ["joy", "sadness"] + assert "prompt_logprobs" in call_kwargs["extra_body"] + assert call_kwargs["extra_body"]["prompt_logprobs"] == 5 + + +async def test_vllm_chat_completion_extra_body(): + """ + Test that vLLM-specific parameters (e.g., chat_template_kwargs) are correctly forwarded + via extra_body to the underlying OpenAI client through the InferenceRouter for chat completion. + """ + # Set up the vLLM adapter + config = VLLMInferenceAdapterConfig(url="http://mocked.localhost:12345") + vllm_adapter = VLLMInferenceAdapter(config=config) + vllm_adapter.__provider_id__ = "vllm" + await vllm_adapter.initialize() + + # Create a mock model store + mock_model_store = AsyncMock() + mock_model = Model(identifier="mock-model", provider_resource_id="mock-model", provider_id="vllm") + mock_model_store.get_model.return_value = mock_model + mock_model_store.has_model.return_value = True + + # Create a mock dist_registry + mock_dist_registry = MagicMock() + mock_dist_registry.get = AsyncMock(return_value=mock_model) + mock_dist_registry.set = AsyncMock() + + # Set up the routing table + routing_table = ModelsRoutingTable( + impls_by_provider_id={"vllm": vllm_adapter}, + dist_registry=mock_dist_registry, + policy=[], + ) + # Inject the model store into the adapter + vllm_adapter.model_store = routing_table + + # Create the InferenceRouter + router = InferenceRouter(routing_table=routing_table) + + # Patch the OpenAI client + with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_client_property: + mock_client = MagicMock() + mock_client.chat.completions.create = AsyncMock( + return_value=OpenAIChatCompletion( + id="chatcmpl-abc123", + created=1, + model="mock-model", + choices=[ + OpenAIChoice( + message=OpenAIAssistantMessageParam( + content="test response", + ), + finish_reason="stop", + index=0, + ) + ], + ) + ) + mock_client_property.return_value = mock_client + + # Test with chat_template_kwargs as extra field + params = OpenAIChatCompletionRequest( + model="mock-model", + messages=[{"role": "user", "content": "test"}], + stream=False, + chat_template_kwargs={"thinking": True}, + ) + await router.openai_chat_completion(params) + + # Verify that the client was called with extra_body containing chat_template_kwargs + mock_client.chat.completions.create.assert_called_once() + call_kwargs = mock_client.chat.completions.create.call_args.kwargs + assert "extra_body" in call_kwargs + assert "chat_template_kwargs" in call_kwargs["extra_body"] + assert call_kwargs["extra_body"]["chat_template_kwargs"] == {"thinking": True}