From 554ada57b03c2dc38d7039f4dcef0bea8840ebd6 Mon Sep 17 00:00:00 2001
From: Francisco Arceo
Date: Fri, 13 Jun 2025 12:28:51 -0600
Subject: [PATCH] chore: Add OpenAI compatibility for Ollama embeddings (#2440)

# What does this PR do?
This PR adds OpenAI compatibility for Ollama embeddings.

Closes https://github.com/meta-llama/llama-stack/issues/2428

Summary of changes:
- `llama_stack/providers/remote/inference/ollama/ollama.py`
  - Implements the OpenAI embeddings endpoint for Ollama, replacing the NotImplementedError with a full function that validates the model, prepares parameters, calls the client, encodes embedding data (optionally in base64), and returns a correctly structured response.
  - Updates import statements to include the new embedding response utilities.
- `llama_stack/providers/utils/inference/litellm_openai_mixin.py`
  - Refactors the embedding data encoding logic to use a new shared utility (`b64_encode_openai_embeddings_response`) instead of inline base64 encoding and packing logic.
  - Cleans up imports accordingly.
- `llama_stack/providers/utils/inference/openai_compat.py`
  - Adds `b64_encode_openai_embeddings_response` to handle encoding OpenAI embedding outputs (including base64 support) in a reusable way.
  - Adds `prepare_openai_embeddings_params` utility for standardizing embedding parameter preparation.
  - Updates imports to include the new embedding data class.
- `tests/integration/inference/test_openai_embeddings.py`
  - Removes `"remote::ollama"` from the list of providers that skip OpenAI embeddings tests, since support is now implemented.

## Note
There was one minor issue: I had to override the `OpenAIEmbeddingsResponse.model` field with `self._get_model(model).identifier`, which is very unsatisfying.

## Test Plan
Unit Tests and integration tests

---------

Signed-off-by: Francisco Javier Arceo
---
 .../remote/inference/ollama/ollama.py         | 35 +++++++++++-
 .../utils/inference/litellm_openai_mixin.py   | 15 +----
 .../utils/inference/openai_compat.py          | 55 +++++++++++++++++++
 .../inference/test_openai_embeddings.py       |  1 -
 4 files changed, 90 insertions(+), 16 deletions(-)

diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py
index 358a29d4c..f49348c27 100644
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@@ -33,7 +33,6 @@ from llama_stack.apis.inference import (
     JsonSchemaResponseFormat,
     LogProbConfig,
     Message,
-    OpenAIEmbeddingsResponse,
     ResponseFormat,
     SamplingParams,
     TextTruncation,
@@ -46,6 +45,8 @@ from llama_stack.apis.inference.inference import (
     OpenAIChatCompletion,
     OpenAIChatCompletionChunk,
     OpenAICompletion,
+    OpenAIEmbeddingsResponse,
+    OpenAIEmbeddingUsage,
     OpenAIMessageParam,
     OpenAIResponseFormatParam,
 )
@@ -62,8 +63,10 @@ from llama_stack.providers.utils.inference.model_registry import (
 from llama_stack.providers.utils.inference.openai_compat import (
     OpenAICompatCompletionChoice,
     OpenAICompatCompletionResponse,
+    b64_encode_openai_embeddings_response,
     get_sampling_options,
     prepare_openai_completion_params,
+    prepare_openai_embeddings_params,
     process_chat_completion_response,
     process_chat_completion_stream_response,
     process_completion_response,
@@ -386,7 +389,35 @@ class OllamaInferenceAdapter(
         dimensions: int | None = None,
         user: str | None = None,
     ) -> OpenAIEmbeddingsResponse:
-        raise NotImplementedError()
+        model_obj = await self._get_model(model)
+        if model_obj.model_type != ModelType.embedding:
raise ValueError(f"Model {model} is not an embedding model") + + if model_obj.provider_resource_id is None: + raise ValueError(f"Model {model} has no provider_resource_id set") + + # Note, at the moment Ollama does not support encoding_format, dimensions, and user parameters + params = prepare_openai_embeddings_params( + model=model_obj.provider_resource_id, + input=input, + encoding_format=encoding_format, + dimensions=dimensions, + user=user, + ) + + response = await self.openai_client.embeddings.create(**params) + data = b64_encode_openai_embeddings_response(response.data, encoding_format) + + usage = OpenAIEmbeddingUsage( + prompt_tokens=response.usage.prompt_tokens, + total_tokens=response.usage.total_tokens, + ) + # TODO: Investigate why model_obj.identifier is used instead of response.model + return OpenAIEmbeddingsResponse( + data=data, + model=model_obj.identifier, + usage=usage, + ) async def openai_completion( self, diff --git a/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/llama_stack/providers/utils/inference/litellm_openai_mixin.py index dab10bc55..13381f3c9 100644 --- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py +++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py @@ -4,8 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -import base64 -import struct from collections.abc import AsyncGenerator, AsyncIterator from typing import Any @@ -37,7 +35,6 @@ from llama_stack.apis.inference.inference import ( OpenAIChatCompletion, OpenAIChatCompletionChunk, OpenAICompletion, - OpenAIEmbeddingData, OpenAIEmbeddingsResponse, OpenAIEmbeddingUsage, OpenAIMessageParam, @@ -48,6 +45,7 @@ from llama_stack.distribution.request_headers import NeedsRequestProviderData from llama_stack.log import get_logger from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper from llama_stack.providers.utils.inference.openai_compat import ( + b64_encode_openai_embeddings_response, convert_message_to_openai_dict_new, convert_openai_chat_completion_choice, convert_openai_chat_completion_stream, @@ -293,16 +291,7 @@ class LiteLLMOpenAIMixin( ) # Convert response to OpenAI format - data = [] - for i, embedding_data in enumerate(response["data"]): - # we encode to base64 if the encoding format is base64 in the request - if encoding_format == "base64": - byte_data = b"".join(struct.pack("f", f) for f in embedding_data["embedding"]) - embedding = base64.b64encode(byte_data).decode("utf-8") - else: - embedding = embedding_data["embedding"] - - data.append(OpenAIEmbeddingData(embedding=embedding, index=i)) + data = b64_encode_openai_embeddings_response(response.data, encoding_format) usage = OpenAIEmbeddingUsage( prompt_tokens=response["usage"]["prompt_tokens"], diff --git a/llama_stack/providers/utils/inference/openai_compat.py b/llama_stack/providers/utils/inference/openai_compat.py index 049f06fdb..5f0f7fa58 100644 --- a/llama_stack/providers/utils/inference/openai_compat.py +++ b/llama_stack/providers/utils/inference/openai_compat.py @@ -3,8 +3,10 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+import base64
 import json
 import logging
+import struct
 import time
 import uuid
 import warnings
@@ -108,6 +110,7 @@ from llama_stack.apis.inference.inference import (
     OpenAIChatCompletion,
     OpenAICompletion,
     OpenAICompletionChoice,
+    OpenAIEmbeddingData,
     OpenAIMessageParam,
     OpenAIResponseFormatParam,
     ToolConfig,
@@ -1483,3 +1486,55 @@ class OpenAIChatCompletionToLlamaStackMixin:
             model=model,
             object="chat.completion",
         )
+
+
+def prepare_openai_embeddings_params(
+    model: str,
+    input: str | list[str],
+    encoding_format: str | None = "float",
+    dimensions: int | None = None,
+    user: str | None = None,
+):
+    if model is None:
+        raise ValueError("Model must be provided for embeddings")
+
+    input_list = [input] if isinstance(input, str) else input
+
+    params: dict[str, Any] = {
+        "model": model,
+        "input": input_list,
+    }
+
+    if encoding_format is not None:
+        params["encoding_format"] = encoding_format
+    if dimensions is not None:
+        params["dimensions"] = dimensions
+    if user is not None:
+        params["user"] = user
+
+    return params
+
+
+def b64_encode_openai_embeddings_response(
+    response_data: dict, encoding_format: str | None = "float"
+) -> list[OpenAIEmbeddingData]:
+    """
+    Process the OpenAI embeddings response to encode the embeddings in base64 format if specified.
+    """
+    data = []
+    for i, embedding_data in enumerate(response_data):
+        if encoding_format == "base64":
+            byte_array = bytearray()
+            for embedding_value in embedding_data.embedding:
+                byte_array.extend(struct.pack("f", float(embedding_value)))
+
+            response_embedding = base64.b64encode(byte_array).decode("utf-8")
+        else:
+            response_embedding = embedding_data.embedding
+        data.append(
+            OpenAIEmbeddingData(
+                embedding=response_embedding,
+                index=i,
+            )
+        )
+    return data
diff --git a/tests/integration/inference/test_openai_embeddings.py b/tests/integration/inference/test_openai_embeddings.py
index 90a91a206..1b8bd9038 100644
--- a/tests/integration/inference/test_openai_embeddings.py
+++ b/tests/integration/inference/test_openai_embeddings.py
@@ -51,7 +51,6 @@ def skip_if_model_doesnt_support_openai_embeddings(client, model_id):
         "remote::runpod",
         "remote::sambanova",
         "remote::tgi",
-        "remote::ollama",
     ):
         pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI embeddings.")
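
Below is a minimal usage sketch (not part of the patch) of the OpenAI-compatible embeddings endpoint this change enables for Ollama-backed models. It assumes a locally running Llama Stack server; the base URL, API key, and the `all-MiniLM-L6-v2` model id are placeholders for whatever your own deployment registers.

```python
# Illustrative sketch only -- not part of this patch. The base URL, API key,
# and model id below are assumptions about a local Llama Stack + Ollama setup.
import base64
import struct

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

response = client.embeddings.create(
    model="all-MiniLM-L6-v2",
    input=["Hello, world!"],
    encoding_format="base64",
)

# With an explicit encoding_format="base64", each embedding is returned as a
# base64 string of packed float32 values (mirroring what
# b64_encode_openai_embeddings_response produces); unpack it back into floats.
raw = base64.b64decode(response.data[0].embedding)
vector = struct.unpack(f"{len(raw) // 4}f", raw)
print(response.model, len(vector))
```

Requesting `encoding_format="base64"` exercises the packing path added in `openai_compat.py`; with `encoding_format="float"` the embeddings come back as plain JSON lists of floats instead.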