From 6f1a935365e8d176e7ad6fd05a17cf3e9667db88 Mon Sep 17 00:00:00 2001
From: Varsha
Date: Mon, 16 Jun 2025 16:06:05 -0700
Subject: [PATCH] chore: Add OpenAI compatibility for vLLM embeddings (#2448)

# What does this PR do?
- Implement OpenAI-compatible embeddings endpoint in vLLM provider
- Support both float and base64 encoding formats
- Add proper error handling and response formatting

Closes #2447

## Test Plan

Signed-off-by: Varsha Prasad Narsing
---
 .../providers/remote/inference/vllm/vllm.py | 36 ++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index 16d133c81..3424be6b4 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -38,7 +38,9 @@ from llama_stack.apis.inference import (
     JsonSchemaResponseFormat,
     LogProbConfig,
     Message,
+    OpenAIEmbeddingData,
     OpenAIEmbeddingsResponse,
+    OpenAIEmbeddingUsage,
     ResponseFormat,
     SamplingParams,
     TextTruncation,
@@ -536,7 +538,39 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         dimensions: int | None = None,
         user: str | None = None,
     ) -> OpenAIEmbeddingsResponse:
-        raise NotImplementedError()
+        self._lazy_initialize_client()
+        assert self.client is not None
+        model_obj = await self._get_model(model)
+        assert model_obj.model_type == ModelType.embedding
+
+        # Convert input to list if it's a string
+        input_list = [input] if isinstance(input, str) else input
+
+        # Call vLLM embeddings endpoint with encoding_format
+        response = await self.client.embeddings.create(
+            model=model_obj.provider_resource_id,
+            input=input_list,
+            dimensions=dimensions,
+            encoding_format=encoding_format,
+        )
+
+        # Convert response to OpenAI format
+        data = [
+            OpenAIEmbeddingData(
+                embedding=embedding_data.embedding,
+                index=i,
+            )
+            for i, embedding_data in enumerate(response.data)
+        ]
+
+        # Not returning actual token usage since vLLM doesn't provide it
+        usage = OpenAIEmbeddingUsage(prompt_tokens=-1, total_tokens=-1)
+
+        return OpenAIEmbeddingsResponse(
+            data=data,
+            model=model_obj.provider_resource_id,
+            usage=usage,
+        )
 
     async def openai_completion(
         self,
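
As a quick manual check of the new endpoint, one could point the stock `openai` client at a Llama Stack server and request float-format embeddings. This is a minimal sketch, not part of the patch: the base URL, port, API key, and model id below are assumptions for illustration.

```python
# Sketch: exercise the OpenAI-compatible embeddings endpoint via a
# Llama Stack server. Base URL, port, and model id are assumptions.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8321/v1/openai/v1",  # hypothetical local server
    api_key="none",  # placeholder; assumes no auth is configured
)

response = client.embeddings.create(
    model="all-MiniLM-L6-v2",  # hypothetical registered embedding model
    input=["Hello, world!", "The quick brown fox"],
    encoding_format="float",
)

for item in response.data:
    # Each item carries its position in the input batch and the vector itself
    print(item.index, len(item.embedding))
```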
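
For `encoding_format="base64"`, the OpenAI convention is to return each embedding as a base64-encoded buffer of little-endian float32 values rather than a JSON list of floats. A client could decode such a payload as follows (again a sketch, independent of the patch):

```python
import base64
import struct

def decode_base64_embedding(b64: str) -> list[float]:
    """Decode a base64 embedding payload into a list of float32 values."""
    raw = base64.b64decode(b64)
    # "<Nf" unpacks N little-endian float32 values; each float is 4 bytes
    return list(struct.unpack(f"<{len(raw) // 4}f", raw))

# Usage with a response item from the example above:
# vector = decode_base64_embedding(item.embedding)
```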