diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 9c1c3170f..770abfb27 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -3607,6 +3607,49 @@
}
}
},
+ "/v1/openai/v1/embeddings": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "An OpenAIEmbeddingsResponse containing the embeddings.",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/OpenAIEmbeddingsResponse"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Inference"
+ ],
+ "description": "Generate OpenAI-compatible embeddings for the given input using the specified model.",
+ "parameters": [],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/OpenaiEmbeddingsRequest"
+ }
+ }
+ },
+ "required": true
+ }
+ }
+ },
"/v1/openai/v1/models": {
"get": {
"responses": {
@@ -11767,6 +11810,139 @@
"title": "OpenAICompletionChoice",
"description": "A choice from an OpenAI-compatible completion response."
},
+ "OpenaiEmbeddingsRequest": {
+ "type": "object",
+ "properties": {
+ "model": {
+ "type": "string",
+ "description": "The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint."
+ },
+ "input": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ ],
+ "description": "Input text to embed, encoded as a string or array of strings. To embed multiple inputs in a single request, pass an array of strings."
+ },
+ "encoding_format": {
+ "type": "string",
+ "description": "(Optional) The format to return the embeddings in. Can be either \"float\" or \"base64\". Defaults to \"float\"."
+ },
+ "dimensions": {
+ "type": "integer",
+ "description": "(Optional) The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models."
+ },
+ "user": {
+ "type": "string",
+ "description": "(Optional) A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "model",
+ "input"
+ ],
+ "title": "OpenaiEmbeddingsRequest"
+ },
+ "OpenAIEmbeddingData": {
+ "type": "object",
+ "properties": {
+ "object": {
+ "type": "string",
+ "const": "embedding",
+ "default": "embedding",
+ "description": "The object type, which will be \"embedding\""
+ },
+ "embedding": {
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "number"
+ }
+ },
+ {
+ "type": "string"
+ }
+ ],
+ "description": "The embedding vector as a list of floats (when encoding_format=\"float\") or as a base64-encoded string (when encoding_format=\"base64\")"
+ },
+ "index": {
+ "type": "integer",
+ "description": "The index of the embedding in the input list"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "object",
+ "embedding",
+ "index"
+ ],
+ "title": "OpenAIEmbeddingData",
+ "description": "A single embedding data object from an OpenAI-compatible embeddings response."
+ },
+ "OpenAIEmbeddingUsage": {
+ "type": "object",
+ "properties": {
+ "prompt_tokens": {
+ "type": "integer",
+ "description": "The number of tokens in the input"
+ },
+ "total_tokens": {
+ "type": "integer",
+ "description": "The total number of tokens used"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "prompt_tokens",
+ "total_tokens"
+ ],
+ "title": "OpenAIEmbeddingUsage",
+ "description": "Usage information for an OpenAI-compatible embeddings response."
+ },
+ "OpenAIEmbeddingsResponse": {
+ "type": "object",
+ "properties": {
+ "object": {
+ "type": "string",
+ "const": "list",
+ "default": "list",
+ "description": "The object type, which will be \"list\""
+ },
+ "data": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/OpenAIEmbeddingData"
+ },
+ "description": "List of embedding data objects"
+ },
+ "model": {
+ "type": "string",
+ "description": "The model that was used to generate the embeddings"
+ },
+ "usage": {
+ "$ref": "#/components/schemas/OpenAIEmbeddingUsage",
+ "description": "Usage information"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "object",
+ "data",
+ "model",
+ "usage"
+ ],
+ "title": "OpenAIEmbeddingsResponse",
+ "description": "Response from an OpenAI-compatible embeddings request."
+ },
"OpenAIModel": {
"type": "object",
"properties": {
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 1afe870cf..15842ff19 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -2520,6 +2520,38 @@ paths:
schema:
$ref: '#/components/schemas/OpenaiCompletionRequest'
required: true
+ /v1/openai/v1/embeddings:
+ post:
+ responses:
+ '200':
+ description: >-
+ An OpenAIEmbeddingsResponse containing the embeddings.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/OpenAIEmbeddingsResponse'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Inference
+ description: >-
+ Generate OpenAI-compatible embeddings for the given input using the specified
+ model.
+ parameters: []
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/OpenaiEmbeddingsRequest'
+ required: true
/v1/openai/v1/models:
get:
responses:
@@ -8177,6 +8209,118 @@ components:
title: OpenAICompletionChoice
description: >-
A choice from an OpenAI-compatible completion response.
+ OpenaiEmbeddingsRequest:
+ type: object
+ properties:
+ model:
+ type: string
+ description: >-
+ The identifier of the model to use. The model must be an embedding model
+ registered with Llama Stack and available via the /models endpoint.
+ input:
+ oneOf:
+ - type: string
+ - type: array
+ items:
+ type: string
+ description: >-
+ Input text to embed, encoded as a string or array of strings. To embed
+ multiple inputs in a single request, pass an array of strings.
+ encoding_format:
+ type: string
+ description: >-
+ (Optional) The format to return the embeddings in. Can be either "float"
+ or "base64". Defaults to "float".
+ dimensions:
+ type: integer
+ description: >-
+ (Optional) The number of dimensions the resulting output embeddings should
+ have. Only supported in text-embedding-3 and later models.
+ user:
+ type: string
+ description: >-
+ (Optional) A unique identifier representing your end-user, which can help
+ OpenAI to monitor and detect abuse.
+ additionalProperties: false
+ required:
+ - model
+ - input
+ title: OpenaiEmbeddingsRequest
+ OpenAIEmbeddingData:
+ type: object
+ properties:
+ object:
+ type: string
+ const: embedding
+ default: embedding
+ description: >-
+ The object type, which will be "embedding"
+ embedding:
+ oneOf:
+ - type: array
+ items:
+ type: number
+ - type: string
+ description: >-
+ The embedding vector as a list of floats (when encoding_format="float")
+ or as a base64-encoded string (when encoding_format="base64")
+ index:
+ type: integer
+ description: >-
+ The index of the embedding in the input list
+ additionalProperties: false
+ required:
+ - object
+ - embedding
+ - index
+ title: OpenAIEmbeddingData
+ description: >-
+ A single embedding data object from an OpenAI-compatible embeddings response.
+ OpenAIEmbeddingUsage:
+ type: object
+ properties:
+ prompt_tokens:
+ type: integer
+ description: The number of tokens in the input
+ total_tokens:
+ type: integer
+ description: The total number of tokens used
+ additionalProperties: false
+ required:
+ - prompt_tokens
+ - total_tokens
+ title: OpenAIEmbeddingUsage
+ description: >-
+ Usage information for an OpenAI-compatible embeddings response.
+ OpenAIEmbeddingsResponse:
+ type: object
+ properties:
+ object:
+ type: string
+ const: list
+ default: list
+ description: The object type, which will be "list"
+ data:
+ type: array
+ items:
+ $ref: '#/components/schemas/OpenAIEmbeddingData'
+ description: List of embedding data objects
+ model:
+ type: string
+ description: >-
+ The model that was used to generate the embeddings
+ usage:
+ $ref: '#/components/schemas/OpenAIEmbeddingUsage'
+ description: Usage information
+ additionalProperties: false
+ required:
+ - object
+ - data
+ - model
+ - usage
+ title: OpenAIEmbeddingsResponse
+ description: >-
+ Response from an OpenAI-compatible embeddings request.
OpenAIModel:
type: object
properties:
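
For reference, a minimal client-side sketch of the endpoint added above. It assumes a Llama Stack server at http://localhost:8321 and an embedding model registered under the id "all-MiniLM-L6-v2" (both placeholders); the stock OpenAI Python client is simply pointed at the OpenAI-compatible prefix:

    from openai import OpenAI

    # Placeholder base URL and model id; adjust to the running stack.
    client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

    response = client.embeddings.create(
        model="all-MiniLM-L6-v2",
        input=["first document", "second document"],
        encoding_format="float",
    )

    print(response.model, response.usage.total_tokens)
    print(len(response.data), len(response.data[0].embedding))
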
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index e79dc6d94..74697dd18 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -783,6 +783,48 @@ class OpenAICompletion(BaseModel):
object: Literal["text_completion"] = "text_completion"
+@json_schema_type
+class OpenAIEmbeddingData(BaseModel):
+ """A single embedding data object from an OpenAI-compatible embeddings response.
+
+ :param object: The object type, which will be "embedding"
+ :param embedding: The embedding vector as a list of floats (when encoding_format="float") or as a base64-encoded string (when encoding_format="base64")
+ :param index: The index of the embedding in the input list
+ """
+
+ object: Literal["embedding"] = "embedding"
+ embedding: list[float] | str
+ index: int
+
+
+@json_schema_type
+class OpenAIEmbeddingUsage(BaseModel):
+ """Usage information for an OpenAI-compatible embeddings response.
+
+ :param prompt_tokens: The number of tokens in the input
+ :param total_tokens: The total number of tokens used
+ """
+
+ prompt_tokens: int
+ total_tokens: int
+
+
+@json_schema_type
+class OpenAIEmbeddingsResponse(BaseModel):
+ """Response from an OpenAI-compatible embeddings request.
+
+ :param object: The object type, which will be "list"
+ :param data: List of embedding data objects
+ :param model: The model that was used to generate the embeddings
+ :param usage: Usage information
+ """
+
+ object: Literal["list"] = "list"
+ data: list[OpenAIEmbeddingData]
+ model: str
+ usage: OpenAIEmbeddingUsage
+
+
class ModelStore(Protocol):
async def get_model(self, identifier: str) -> Model: ...
@@ -1076,6 +1118,26 @@ class InferenceProvider(Protocol):
"""
...
+ @webmethod(route="/openai/v1/embeddings", method="POST")
+ async def openai_embeddings(
+ self,
+ model: str,
+ input: str | list[str],
+ encoding_format: str | None = "float",
+ dimensions: int | None = None,
+ user: str | None = None,
+ ) -> OpenAIEmbeddingsResponse:
+ """Generate OpenAI-compatible embeddings for the given input using the specified model.
+
+ :param model: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint.
+ :param input: Input text to embed, encoded as a string or array of strings. To embed multiple inputs in a single request, pass an array of strings.
+ :param encoding_format: (Optional) The format to return the embeddings in. Can be either "float" or "base64". Defaults to "float".
+ :param dimensions: (Optional) The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models.
+ :param user: (Optional) A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse.
+ :returns: An OpenAIEmbeddingsResponse containing the embeddings.
+ """
+ ...
+
class Inference(InferenceProvider):
"""Llama Stack Inference API for generating completions, chat completions, and embeddings.
diff --git a/llama_stack/distribution/routers/inference.py b/llama_stack/distribution/routers/inference.py
index f77b19302..763bd9105 100644
--- a/llama_stack/distribution/routers/inference.py
+++ b/llama_stack/distribution/routers/inference.py
@@ -45,6 +45,7 @@ from llama_stack.apis.inference.inference import (
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAICompletion,
+ OpenAIEmbeddingsResponse,
OpenAIMessageParam,
OpenAIResponseFormatParam,
)
@@ -546,6 +547,34 @@ class InferenceRouter(Inference):
await self.store.store_chat_completion(response, messages)
return response
+ async def openai_embeddings(
+ self,
+ model: str,
+ input: str | list[str],
+ encoding_format: str | None = "float",
+ dimensions: int | None = None,
+ user: str | None = None,
+ ) -> OpenAIEmbeddingsResponse:
+ logger.debug(
+ f"InferenceRouter.openai_embeddings: {model=}, input_type={type(input)}, {encoding_format=}, {dimensions=}",
+ )
+ model_obj = await self.routing_table.get_model(model)
+ if model_obj is None:
+ raise ValueError(f"Model '{model}' not found")
+ if model_obj.model_type != ModelType.embedding:
+ raise ValueError(f"Model '{model}' is not an embedding model")
+
+ params = dict(
+ model=model_obj.identifier,
+ input=input,
+ encoding_format=encoding_format,
+ dimensions=dimensions,
+ user=user,
+ )
+
+ provider = self.routing_table.get_provider_impl(model_obj.identifier)
+ return await provider.openai_embeddings(**params)
+
async def list_chat_completions(
self,
after: str | None = None,
diff --git a/llama_stack/providers/inline/inference/vllm/vllm.py b/llama_stack/providers/inline/inference/vllm/vllm.py
index 438cb14a0..bf54462b5 100644
--- a/llama_stack/providers/inline/inference/vllm/vllm.py
+++ b/llama_stack/providers/inline/inference/vllm/vllm.py
@@ -40,6 +40,7 @@ from llama_stack.apis.inference import (
JsonSchemaResponseFormat,
LogProbConfig,
Message,
+ OpenAIEmbeddingsResponse,
ResponseFormat,
SamplingParams,
TextTruncation,
@@ -410,6 +411,16 @@ class VLLMInferenceImpl(
) -> EmbeddingsResponse:
raise NotImplementedError()
+ async def openai_embeddings(
+ self,
+ model: str,
+ input: str | list[str],
+ encoding_format: str | None = "float",
+ dimensions: int | None = None,
+ user: str | None = None,
+ ) -> OpenAIEmbeddingsResponse:
+ raise NotImplementedError()
+
async def chat_completion(
self,
model_id: str,
diff --git a/llama_stack/providers/remote/inference/bedrock/bedrock.py b/llama_stack/providers/remote/inference/bedrock/bedrock.py
index 0404a578f..952d86f1a 100644
--- a/llama_stack/providers/remote/inference/bedrock/bedrock.py
+++ b/llama_stack/providers/remote/inference/bedrock/bedrock.py
@@ -22,6 +22,7 @@ from llama_stack.apis.inference import (
Inference,
LogProbConfig,
Message,
+ OpenAIEmbeddingsResponse,
ResponseFormat,
SamplingParams,
TextTruncation,
@@ -197,3 +198,13 @@ class BedrockInferenceAdapter(
response_body = json.loads(response.get("body").read())
embeddings.append(response_body.get("embedding"))
return EmbeddingsResponse(embeddings=embeddings)
+
+ async def openai_embeddings(
+ self,
+ model: str,
+ input: str | list[str],
+ encoding_format: str | None = "float",
+ dimensions: int | None = None,
+ user: str | None = None,
+ ) -> OpenAIEmbeddingsResponse:
+ raise NotImplementedError()
diff --git a/llama_stack/providers/remote/inference/cerebras/cerebras.py b/llama_stack/providers/remote/inference/cerebras/cerebras.py
index 685375346..952118e24 100644
--- a/llama_stack/providers/remote/inference/cerebras/cerebras.py
+++ b/llama_stack/providers/remote/inference/cerebras/cerebras.py
@@ -21,6 +21,7 @@ from llama_stack.apis.inference import (
Inference,
LogProbConfig,
Message,
+ OpenAIEmbeddingsResponse,
ResponseFormat,
SamplingParams,
TextTruncation,
@@ -194,3 +195,13 @@ class CerebrasInferenceAdapter(
task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse:
raise NotImplementedError()
+
+ async def openai_embeddings(
+ self,
+ model: str,
+ input: str | list[str],
+ encoding_format: str | None = "float",
+ dimensions: int | None = None,
+ user: str | None = None,
+ ) -> OpenAIEmbeddingsResponse:
+ raise NotImplementedError()
diff --git a/llama_stack/providers/remote/inference/databricks/databricks.py b/llama_stack/providers/remote/inference/databricks/databricks.py
index 5c36eac3e..1dc18b97f 100644
--- a/llama_stack/providers/remote/inference/databricks/databricks.py
+++ b/llama_stack/providers/remote/inference/databricks/databricks.py
@@ -20,6 +20,7 @@ from llama_stack.apis.inference import (
Inference,
LogProbConfig,
Message,
+ OpenAIEmbeddingsResponse,
ResponseFormat,
SamplingParams,
TextTruncation,
@@ -152,3 +153,13 @@ class DatabricksInferenceAdapter(
task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse:
raise NotImplementedError()
+
+ async def openai_embeddings(
+ self,
+ model: str,
+ input: str | list[str],
+ encoding_format: str | None = "float",
+ dimensions: int | None = None,
+ user: str | None = None,
+ ) -> OpenAIEmbeddingsResponse:
+ raise NotImplementedError()
diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py
index b6d3984c6..fe21685dd 100644
--- a/llama_stack/providers/remote/inference/fireworks/fireworks.py
+++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py
@@ -37,6 +37,7 @@ from llama_stack.apis.inference.inference import (
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAICompletion,
+ OpenAIEmbeddingsResponse,
OpenAIMessageParam,
OpenAIResponseFormatParam,
)
@@ -286,6 +287,16 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
embeddings = [data.embedding for data in response.data]
return EmbeddingsResponse(embeddings=embeddings)
+ async def openai_embeddings(
+ self,
+ model: str,
+ input: str | list[str],
+ encoding_format: str | None = "float",
+ dimensions: int | None = None,
+ user: str | None = None,
+ ) -> OpenAIEmbeddingsResponse:
+ raise NotImplementedError()
+
async def openai_completion(
self,
model: str,
diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py
index 333486fe4..4c68322e0 100644
--- a/llama_stack/providers/remote/inference/nvidia/nvidia.py
+++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py
@@ -29,6 +29,7 @@ from llama_stack.apis.inference import (
Inference,
LogProbConfig,
Message,
+ OpenAIEmbeddingsResponse,
ResponseFormat,
SamplingParams,
TextTruncation,
@@ -238,6 +239,16 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
#
return EmbeddingsResponse(embeddings=[embedding.embedding for embedding in response.data])
+ async def openai_embeddings(
+ self,
+ model: str,
+ input: str | list[str],
+ encoding_format: str | None = "float",
+ dimensions: int | None = None,
+ user: str | None = None,
+ ) -> OpenAIEmbeddingsResponse:
+ raise NotImplementedError()
+
async def chat_completion(
self,
model_id: str,
diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py
index 3b4287673..8863e0edc 100644
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@@ -32,6 +32,7 @@ from llama_stack.apis.inference import (
JsonSchemaResponseFormat,
LogProbConfig,
Message,
+ OpenAIEmbeddingsResponse,
ResponseFormat,
SamplingParams,
TextTruncation,
@@ -370,6 +371,16 @@ class OllamaInferenceAdapter(
return model
+ async def openai_embeddings(
+ self,
+ model: str,
+ input: str | list[str],
+ encoding_format: str | None = "float",
+ dimensions: int | None = None,
+ user: str | None = None,
+ ) -> OpenAIEmbeddingsResponse:
+ raise NotImplementedError()
+
async def openai_completion(
self,
model: str,
diff --git a/llama_stack/providers/remote/inference/openai/openai.py b/llama_stack/providers/remote/inference/openai/openai.py
index c3c25edd3..6f3a686a8 100644
--- a/llama_stack/providers/remote/inference/openai/openai.py
+++ b/llama_stack/providers/remote/inference/openai/openai.py
@@ -14,6 +14,9 @@ from llama_stack.apis.inference.inference import (
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAICompletion,
+ OpenAIEmbeddingData,
+ OpenAIEmbeddingsResponse,
+ OpenAIEmbeddingUsage,
OpenAIMessageParam,
OpenAIResponseFormatParam,
)
@@ -38,6 +41,7 @@ logger = logging.getLogger(__name__)
# | batch_chat_completion | LiteLLMOpenAIMixin |
# | openai_completion | AsyncOpenAI |
# | openai_chat_completion | AsyncOpenAI |
+# | openai_embeddings | AsyncOpenAI |
#
class OpenAIInferenceAdapter(LiteLLMOpenAIMixin):
def __init__(self, config: OpenAIConfig) -> None:
@@ -171,3 +175,51 @@ class OpenAIInferenceAdapter(LiteLLMOpenAIMixin):
user=user,
)
return await self._openai_client.chat.completions.create(**params)
+
+ async def openai_embeddings(
+ self,
+ model: str,
+ input: str | list[str],
+ encoding_format: str | None = "float",
+ dimensions: int | None = None,
+ user: str | None = None,
+ ) -> OpenAIEmbeddingsResponse:
+ model_id = (await self.model_store.get_model(model)).provider_resource_id
+ if model_id.startswith("openai/"):
+ model_id = model_id[len("openai/") :]
+
+ # Prepare parameters for OpenAI embeddings API
+ params = {
+ "model": model_id,
+ "input": input,
+ }
+
+ if encoding_format is not None:
+ params["encoding_format"] = encoding_format
+ if dimensions is not None:
+ params["dimensions"] = dimensions
+ if user is not None:
+ params["user"] = user
+
+ # Call OpenAI embeddings API
+ response = await self._openai_client.embeddings.create(**params)
+
+ data = []
+ for i, embedding_data in enumerate(response.data):
+ data.append(
+ OpenAIEmbeddingData(
+ embedding=embedding_data.embedding,
+ index=i,
+ )
+ )
+
+ usage = OpenAIEmbeddingUsage(
+ prompt_tokens=response.usage.prompt_tokens,
+ total_tokens=response.usage.total_tokens,
+ )
+
+ return OpenAIEmbeddingsResponse(
+ data=data,
+ model=response.model,
+ usage=usage,
+ )
diff --git a/llama_stack/providers/remote/inference/passthrough/passthrough.py b/llama_stack/providers/remote/inference/passthrough/passthrough.py
index 78ee52641..6cf4680e2 100644
--- a/llama_stack/providers/remote/inference/passthrough/passthrough.py
+++ b/llama_stack/providers/remote/inference/passthrough/passthrough.py
@@ -19,6 +19,7 @@ from llama_stack.apis.inference import (
Inference,
LogProbConfig,
Message,
+ OpenAIEmbeddingsResponse,
ResponseFormat,
SamplingParams,
TextTruncation,
@@ -210,6 +211,16 @@ class PassthroughInferenceAdapter(Inference):
task_type=task_type,
)
+ async def openai_embeddings(
+ self,
+ model: str,
+ input: str | list[str],
+ encoding_format: str | None = "float",
+ dimensions: int | None = None,
+ user: str | None = None,
+ ) -> OpenAIEmbeddingsResponse:
+ raise NotImplementedError()
+
async def openai_completion(
self,
model: str,
diff --git a/llama_stack/providers/remote/inference/runpod/runpod.py b/llama_stack/providers/remote/inference/runpod/runpod.py
index 2706aa15e..f8c98893e 100644
--- a/llama_stack/providers/remote/inference/runpod/runpod.py
+++ b/llama_stack/providers/remote/inference/runpod/runpod.py
@@ -8,6 +8,7 @@ from collections.abc import AsyncGenerator
from openai import OpenAI
from llama_stack.apis.inference import * # noqa: F403
+from llama_stack.apis.inference.inference import OpenAIEmbeddingsResponse
# from llama_stack.providers.datatypes import ModelsProtocolPrivate
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
@@ -134,3 +135,13 @@ class RunpodInferenceAdapter(
task_type: Optional[EmbeddingTaskType] = None,
) -> EmbeddingsResponse:
raise NotImplementedError()
+
+ async def openai_embeddings(
+ self,
+ model: str,
+ input: str | list[str],
+ encoding_format: str | None = "float",
+ dimensions: int | None = None,
+ user: str | None = None,
+ ) -> OpenAIEmbeddingsResponse:
+ raise NotImplementedError()
diff --git a/llama_stack/providers/remote/inference/tgi/tgi.py b/llama_stack/providers/remote/inference/tgi/tgi.py
index 8f6666462..292d74ef8 100644
--- a/llama_stack/providers/remote/inference/tgi/tgi.py
+++ b/llama_stack/providers/remote/inference/tgi/tgi.py
@@ -23,6 +23,7 @@ from llama_stack.apis.inference import (
Inference,
LogProbConfig,
Message,
+ OpenAIEmbeddingsResponse,
ResponseFormat,
ResponseFormatType,
SamplingParams,
@@ -291,6 +292,16 @@ class _HfAdapter(
) -> EmbeddingsResponse:
raise NotImplementedError()
+ async def openai_embeddings(
+ self,
+ model: str,
+ input: str | list[str],
+ encoding_format: str | None = "float",
+ dimensions: int | None = None,
+ user: str | None = None,
+ ) -> OpenAIEmbeddingsResponse:
+ raise NotImplementedError()
+
class TGIAdapter(_HfAdapter):
async def initialize(self, config: TGIImplConfig) -> None:
diff --git a/llama_stack/providers/remote/inference/together/together.py b/llama_stack/providers/remote/inference/together/together.py
index 562e6e0ff..7305a638d 100644
--- a/llama_stack/providers/remote/inference/together/together.py
+++ b/llama_stack/providers/remote/inference/together/together.py
@@ -23,6 +23,7 @@ from llama_stack.apis.inference import (
Inference,
LogProbConfig,
Message,
+ OpenAIEmbeddingsResponse,
ResponseFormat,
ResponseFormatType,
SamplingParams,
@@ -267,6 +268,16 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
embeddings = [item.embedding for item in r.data]
return EmbeddingsResponse(embeddings=embeddings)
+ async def openai_embeddings(
+ self,
+ model: str,
+ input: str | list[str],
+ encoding_format: str | None = "float",
+ dimensions: int | None = None,
+ user: str | None = None,
+ ) -> OpenAIEmbeddingsResponse:
+ raise NotImplementedError()
+
async def openai_completion(
self,
model: str,
diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index fe2d8bec1..9f38d9abf 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -38,6 +38,7 @@ from llama_stack.apis.inference import (
JsonSchemaResponseFormat,
LogProbConfig,
Message,
+ OpenAIEmbeddingsResponse,
ResponseFormat,
SamplingParams,
TextTruncation,
@@ -507,6 +508,16 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
embeddings = [data.embedding for data in response.data]
return EmbeddingsResponse(embeddings=embeddings)
+ async def openai_embeddings(
+ self,
+ model: str,
+ input: str | list[str],
+ encoding_format: str | None = "float",
+ dimensions: int | None = None,
+ user: str | None = None,
+ ) -> OpenAIEmbeddingsResponse:
+ raise NotImplementedError()
+
async def openai_completion(
self,
model: str,
diff --git a/llama_stack/providers/remote/inference/watsonx/watsonx.py b/llama_stack/providers/remote/inference/watsonx/watsonx.py
index c1299e11f..59f5f5562 100644
--- a/llama_stack/providers/remote/inference/watsonx/watsonx.py
+++ b/llama_stack/providers/remote/inference/watsonx/watsonx.py
@@ -21,6 +21,7 @@ from llama_stack.apis.inference import (
Inference,
LogProbConfig,
Message,
+ OpenAIEmbeddingsResponse,
ResponseFormat,
SamplingParams,
TextTruncation,
@@ -260,6 +261,16 @@ class WatsonXInferenceAdapter(Inference, ModelRegistryHelper):
) -> EmbeddingsResponse:
raise NotImplementedError("embedding is not supported for watsonx")
+ async def openai_embeddings(
+ self,
+ model: str,
+ input: str | list[str],
+ encoding_format: str | None = "float",
+ dimensions: int | None = None,
+ user: str | None = None,
+ ) -> OpenAIEmbeddingsResponse:
+ raise NotImplementedError()
+
async def openai_completion(
self,
model: str,
diff --git a/llama_stack/providers/utils/inference/embedding_mixin.py b/llama_stack/providers/utils/inference/embedding_mixin.py
index 7c8144c62..97cf87360 100644
--- a/llama_stack/providers/utils/inference/embedding_mixin.py
+++ b/llama_stack/providers/utils/inference/embedding_mixin.py
@@ -4,7 +4,9 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
+import base64
import logging
+import struct
from typing import TYPE_CHECKING
if TYPE_CHECKING:
@@ -15,6 +17,9 @@ from llama_stack.apis.inference import (
EmbeddingTaskType,
InterleavedContentItem,
ModelStore,
+ OpenAIEmbeddingData,
+ OpenAIEmbeddingsResponse,
+ OpenAIEmbeddingUsage,
TextTruncation,
)
from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
@@ -43,6 +48,50 @@ class SentenceTransformerEmbeddingMixin:
)
return EmbeddingsResponse(embeddings=embeddings)
+ async def openai_embeddings(
+ self,
+ model: str,
+ input: str | list[str],
+ encoding_format: str | None = "float",
+ dimensions: int | None = None,
+ user: str | None = None,
+ ) -> OpenAIEmbeddingsResponse:
+ # Convert input to list format if it's a single string
+ input_list = [input] if isinstance(input, str) else input
+ if not input_list:
+ raise ValueError("Empty list not supported")
+
+ # Get the model and generate embeddings
+ model_obj = await self.model_store.get_model(model)
+ embedding_model = self._load_sentence_transformer_model(model_obj.provider_resource_id)
+ embeddings = embedding_model.encode(input_list, show_progress_bar=False)
+
+ # Convert embeddings to the requested format
+ data = []
+ for i, embedding in enumerate(embeddings):
+ if encoding_format == "base64":
+ # Convert float array to base64 string
+ float_bytes = struct.pack(f"{len(embedding)}f", *embedding)
+ embedding_value = base64.b64encode(float_bytes).decode("ascii")
+ else:
+ # Default to float format
+ embedding_value = embedding.tolist()
+
+ data.append(
+ OpenAIEmbeddingData(
+ embedding=embedding_value,
+ index=i,
+ )
+ )
+
+        # sentence-transformers does not report token usage, so return -1 placeholders
+ usage = OpenAIEmbeddingUsage(prompt_tokens=-1, total_tokens=-1)
+ return OpenAIEmbeddingsResponse(
+ data=data,
+ model=model_obj.provider_resource_id,
+ usage=usage,
+ )
+
def _load_sentence_transformer_model(self, model: str) -> "SentenceTransformer":
global EMBEDDING_MODELS
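
The base64 branch above packs the raw 32-bit floats with struct and base64-encodes the bytes (the LiteLLM mixin below does the same per float). A client that requested encoding_format="base64" can reverse the transformation with the mirror-image calls; a minimal sketch, assuming the native byte order that struct.pack uses by default:

    import base64
    import struct

    def decode_base64_embedding(value: str) -> list[float]:
        raw = base64.b64decode(value)
        count = len(raw) // 4  # 4 bytes per packed 32-bit float
        return list(struct.unpack(f"{count}f", raw))
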
diff --git a/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
index 4d17db21e..dab10bc55 100644
--- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py
+++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
@@ -4,6 +4,8 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
+import base64
+import struct
from collections.abc import AsyncGenerator, AsyncIterator
from typing import Any
@@ -35,6 +37,9 @@ from llama_stack.apis.inference.inference import (
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAICompletion,
+ OpenAIEmbeddingData,
+ OpenAIEmbeddingsResponse,
+ OpenAIEmbeddingUsage,
OpenAIMessageParam,
OpenAIResponseFormatParam,
)
@@ -264,6 +269,52 @@ class LiteLLMOpenAIMixin(
embeddings = [data["embedding"] for data in response["data"]]
return EmbeddingsResponse(embeddings=embeddings)
+ async def openai_embeddings(
+ self,
+ model: str,
+ input: str | list[str],
+ encoding_format: str | None = "float",
+ dimensions: int | None = None,
+ user: str | None = None,
+ ) -> OpenAIEmbeddingsResponse:
+ model_obj = await self.model_store.get_model(model)
+
+ # Convert input to list if it's a string
+ input_list = [input] if isinstance(input, str) else input
+
+ # Call litellm embedding function
+ # litellm.drop_params = True
+ response = litellm.embedding(
+ model=self.get_litellm_model_name(model_obj.provider_resource_id),
+ input=input_list,
+ api_key=self.get_api_key(),
+ api_base=self.api_base,
+ dimensions=dimensions,
+ )
+
+ # Convert response to OpenAI format
+ data = []
+ for i, embedding_data in enumerate(response["data"]):
+            # Encode the vector as a base64 string when the request asked for encoding_format="base64"
+ if encoding_format == "base64":
+ byte_data = b"".join(struct.pack("f", f) for f in embedding_data["embedding"])
+ embedding = base64.b64encode(byte_data).decode("utf-8")
+ else:
+ embedding = embedding_data["embedding"]
+
+ data.append(OpenAIEmbeddingData(embedding=embedding, index=i))
+
+ usage = OpenAIEmbeddingUsage(
+ prompt_tokens=response["usage"]["prompt_tokens"],
+ total_tokens=response["usage"]["total_tokens"],
+ )
+
+ return OpenAIEmbeddingsResponse(
+ data=data,
+ model=model_obj.provider_resource_id,
+ usage=usage,
+ )
+
async def openai_completion(
self,
model: str,