diff --git a/docs/static/llama-stack-spec.html b/docs/static/llama-stack-spec.html
index 9e28e0f42..7845fb068 100644
--- a/docs/static/llama-stack-spec.html
+++ b/docs/static/llama-stack-spec.html
@@ -1035,50 +1035,6 @@
]
}
},
- "/v1/inference/embeddings": {
- "post": {
- "responses": {
- "200": {
- "description": "An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}.",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/EmbeddingsResponse"
- }
- }
- }
- },
- "400": {
- "$ref": "#/components/responses/BadRequest400"
- },
- "429": {
- "$ref": "#/components/responses/TooManyRequests429"
- },
- "500": {
- "$ref": "#/components/responses/InternalServerError500"
- },
- "default": {
- "$ref": "#/components/responses/DefaultError"
- }
- },
- "tags": [
- "Inference"
- ],
- "summary": "Generate embeddings for content pieces using the specified model.",
- "description": "Generate embeddings for content pieces using the specified model.",
- "parameters": [],
- "requestBody": {
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/EmbeddingsRequest"
- }
- }
- },
- "required": true
- }
- }
- },
"/v1alpha/eval/benchmarks/{benchmark_id}/evaluations": {
"post": {
"responses": {
@@ -10547,80 +10503,6 @@
"title": "OpenAIDeleteResponseObject",
"description": "Response object confirming deletion of an OpenAI response."
},
- "EmbeddingsRequest": {
- "type": "object",
- "properties": {
- "model_id": {
- "type": "string",
- "description": "The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint."
- },
- "contents": {
- "oneOf": [
- {
- "type": "array",
- "items": {
- "type": "string"
- }
- },
- {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/InterleavedContentItem"
- }
- }
- ],
- "description": "List of contents to generate embeddings for. Each content can be a string or an InterleavedContentItem (and hence can be multimodal). The behavior depends on the model and provider. Some models may only support text."
- },
- "text_truncation": {
- "type": "string",
- "enum": [
- "none",
- "start",
- "end"
- ],
- "description": "(Optional) Config for how to truncate text for embedding when text is longer than the model's max sequence length."
- },
- "output_dimension": {
- "type": "integer",
- "description": "(Optional) Output dimensionality for the embeddings. Only supported by Matryoshka models."
- },
- "task_type": {
- "type": "string",
- "enum": [
- "query",
- "document"
- ],
- "description": "(Optional) How is the embedding being used? This is only supported by asymmetric embedding models."
- }
- },
- "additionalProperties": false,
- "required": [
- "model_id",
- "contents"
- ],
- "title": "EmbeddingsRequest"
- },
- "EmbeddingsResponse": {
- "type": "object",
- "properties": {
- "embeddings": {
- "type": "array",
- "items": {
- "type": "array",
- "items": {
- "type": "number"
- }
- },
- "description": "List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}"
- }
- },
- "additionalProperties": false,
- "required": [
- "embeddings"
- ],
- "title": "EmbeddingsResponse",
- "description": "Response containing generated embeddings."
- },
"AgentCandidate": {
"type": "object",
"properties": {
diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml
index 1c06c74a5..8cbbccaa2 100644
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@@ -720,41 +720,6 @@ paths:
required: true
schema:
type: string
- /v1/inference/embeddings:
- post:
- responses:
- '200':
- description: >-
- An array of embeddings, one for each content. Each embedding is a list
- of floats. The dimensionality of the embedding is model-specific; you
- can check model metadata using /models/{model_id}.
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/EmbeddingsResponse'
- '400':
- $ref: '#/components/responses/BadRequest400'
- '429':
- $ref: >-
- #/components/responses/TooManyRequests429
- '500':
- $ref: >-
- #/components/responses/InternalServerError500
- default:
- $ref: '#/components/responses/DefaultError'
- tags:
- - Inference
- summary: >-
- Generate embeddings for content pieces using the specified model.
- description: >-
- Generate embeddings for content pieces using the specified model.
- parameters: []
- requestBody:
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/EmbeddingsRequest'
- required: true
/v1alpha/eval/benchmarks/{benchmark_id}/evaluations:
post:
responses:
@@ -7795,72 +7760,6 @@ components:
title: OpenAIDeleteResponseObject
description: >-
Response object confirming deletion of an OpenAI response.
- EmbeddingsRequest:
- type: object
- properties:
- model_id:
- type: string
- description: >-
- The identifier of the model to use. The model must be an embedding model
- registered with Llama Stack and available via the /models endpoint.
- contents:
- oneOf:
- - type: array
- items:
- type: string
- - type: array
- items:
- $ref: '#/components/schemas/InterleavedContentItem'
- description: >-
- List of contents to generate embeddings for. Each content can be a string
- or an InterleavedContentItem (and hence can be multimodal). The behavior
- depends on the model and provider. Some models may only support text.
- text_truncation:
- type: string
- enum:
- - none
- - start
- - end
- description: >-
- (Optional) Config for how to truncate text for embedding when text is
- longer than the model's max sequence length.
- output_dimension:
- type: integer
- description: >-
- (Optional) Output dimensionality for the embeddings. Only supported by
- Matryoshka models.
- task_type:
- type: string
- enum:
- - query
- - document
- description: >-
- (Optional) How is the embedding being used? This is only supported by
- asymmetric embedding models.
- additionalProperties: false
- required:
- - model_id
- - contents
- title: EmbeddingsRequest
- EmbeddingsResponse:
- type: object
- properties:
- embeddings:
- type: array
- items:
- type: array
- items:
- type: number
- description: >-
- List of embedding vectors, one per input content. Each embedding is a
- list of floats. The dimensionality of the embedding is model-specific;
- you can check model metadata using /models/{model_id}
- additionalProperties: false
- required:
- - embeddings
- title: EmbeddingsResponse
- description: >-
- Response containing generated embeddings.
AgentCandidate:
type: object
properties:
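
Both generated spec files drop the legacy /v1/inference/embeddings route along with its EmbeddingsRequest and EmbeddingsResponse schemas; the OpenAI-compatible embeddings endpoint is the surviving path. A minimal sketch of the replacement call, assuming a local Llama Stack server and the stock OpenAI Python client (the base URL prefix and model id below are assumptions, not taken from this diff):

from openai import OpenAI

# Point the standard OpenAI client at the stack's OpenAI-compatible surface.
# The exact base path depends on the server version; /v1/openai/v1 is assumed here.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="not-needed")

resp = client.embeddings.create(
    model="all-MiniLM-L6-v2",                      # any registered embedding model
    input=["first document", "second document"],   # str or list[str], per the OpenAI schema
)
vectors = [item.embedding for item in resp.data]   # one list[float] per input
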
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index 756896796..c6a4e4f60 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -17,7 +17,7 @@ from typing import (
from pydantic import BaseModel, Field, field_validator
from typing_extensions import TypedDict
-from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent, InterleavedContentItem
+from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent
from llama_stack.apis.common.responses import Order
from llama_stack.apis.models import Model
from llama_stack.apis.telemetry import MetricResponseMixin
@@ -1070,26 +1070,6 @@ class InferenceProvider(Protocol):
"""
...
- @webmethod(route="/inference/embeddings", method="POST", level=LLAMA_STACK_API_V1)
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- """Generate embeddings for content pieces using the specified model.
-
- :param model_id: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint.
- :param contents: List of contents to generate embeddings for. Each content can be a string or an InterleavedContentItem (and hence can be multimodal). The behavior depends on the model and provider. Some models may only support text.
- :param output_dimension: (Optional) Output dimensionality for the embeddings. Only supported by Matryoshka models.
- :param text_truncation: (Optional) Config for how to truncate text for embedding when text is longer than the model's max sequence length.
- :param task_type: (Optional) How is the embedding being used? This is only supported by asymmetric embedding models.
- :returns: An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}.
- """
- ...
-
@webmethod(route="/inference/rerank", method="POST", experimental=True, level=LLAMA_STACK_API_V1)
async def rerank(
self,
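
Removing embeddings() from InferenceProvider leaves the OpenAI-compatible method as the only embeddings surface providers must implement. A rough sketch of that remaining signature, modeled on the OpenAI embeddings API (parameter names follow the OpenAI schema and may not match inference.py exactly):

# Sketch of the surviving provider method; not copied from inference.py.
async def openai_embeddings(
    self,
    model: str,                        # provider-registered model identifier
    input: str | list[str],            # text(s) to embed
    encoding_format: str | None = "float",
    dimensions: int | None = None,     # optional output dimensionality
    user: str | None = None,
) -> OpenAIEmbeddingsResponse:
    ...
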
diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py
index fcf01a9c4..80f47fb5d 100644
--- a/llama_stack/core/routers/inference.py
+++ b/llama_stack/core/routers/inference.py
@@ -16,7 +16,6 @@ from pydantic import Field, TypeAdapter
from llama_stack.apis.common.content_types import (
InterleavedContent,
- InterleavedContentItem,
)
from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError
from llama_stack.apis.inference import (
@@ -26,8 +25,6 @@ from llama_stack.apis.inference import (
CompletionMessage,
CompletionResponse,
CompletionResponseStreamChunk,
- EmbeddingsResponse,
- EmbeddingTaskType,
Inference,
ListOpenAIChatCompletionResponse,
LogProbConfig,
@@ -48,7 +45,6 @@ from llama_stack.apis.inference import (
ResponseFormat,
SamplingParams,
StopReason,
- TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@@ -312,25 +308,6 @@ class InferenceRouter(Inference):
return response
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- logger.debug(f"InferenceRouter.embeddings: {model_id}")
- await self._get_model(model_id, ModelType.embedding)
- provider = await self.routing_table.get_provider_impl(model_id)
- return await provider.embeddings(
- model_id=model_id,
- contents=contents,
- text_truncation=text_truncation,
- output_dimension=output_dimension,
- task_type=task_type,
- )
-
async def openai_completion(
self,
model: str,
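
The deleted router method resolved the model, checked that it was registered as an embedding model, and forwarded the call to the owning provider. The same pattern applies to the OpenAI-compatible path; a sketch assuming the helper names visible in the removed code (_get_model, routing_table.get_provider_impl), not the file's actual implementation:

# Sketch only; mirrors the routing logic of the removed embeddings() method.
async def openai_embeddings(self, model: str, input, **kwargs):
    await self._get_model(model, ModelType.embedding)            # reject non-embedding models
    provider = await self.routing_table.get_provider_impl(model)
    return await provider.openai_embeddings(model=model, input=input, **kwargs)
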
diff --git a/llama_stack/providers/remote/inference/bedrock/bedrock.py b/llama_stack/providers/remote/inference/bedrock/bedrock.py
index 29b935bbd..2206aa641 100644
--- a/llama_stack/providers/remote/inference/bedrock/bedrock.py
+++ b/llama_stack/providers/remote/inference/bedrock/bedrock.py
@@ -11,21 +11,17 @@ from botocore.client import BaseClient
from llama_stack.apis.common.content_types import (
InterleavedContent,
- InterleavedContentItem,
)
from llama_stack.apis.inference import (
ChatCompletionRequest,
ChatCompletionResponse,
ChatCompletionResponseStreamChunk,
- EmbeddingsResponse,
- EmbeddingTaskType,
Inference,
LogProbConfig,
Message,
OpenAIEmbeddingsResponse,
ResponseFormat,
SamplingParams,
- TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@@ -47,8 +43,6 @@ from llama_stack.providers.utils.inference.openai_compat import (
)
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_prompt,
- content_has_media,
- interleaved_content_as_str,
)
from .models import MODEL_ENTRIES
@@ -218,36 +212,6 @@ class BedrockInferenceAdapter(
),
}
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- model = await self.model_store.get_model(model_id)
-
- # Convert foundation model ID to inference profile ID
- region_name = self.client.meta.region_name
- inference_profile_id = _to_inference_profile_id(model.provider_resource_id, region_name)
-
- embeddings = []
- for content in contents:
- assert not content_has_media(content), "Bedrock does not support media for embeddings"
- input_text = interleaved_content_as_str(content)
- input_body = {"inputText": input_text}
- body = json.dumps(input_body)
- response = self.client.invoke_model(
- body=body,
- modelId=inference_profile_id,
- accept="application/json",
- contentType="application/json",
- )
- response_body = json.loads(response.get("body").read())
- embeddings.append(response_body.get("embedding"))
- return EmbeddingsResponse(embeddings=embeddings)
-
async def openai_embeddings(
self,
model: str,
diff --git a/llama_stack/providers/remote/inference/cerebras/cerebras.py b/llama_stack/providers/remote/inference/cerebras/cerebras.py
index 6662f004d..6be39fa5d 100644
--- a/llama_stack/providers/remote/inference/cerebras/cerebras.py
+++ b/llama_stack/providers/remote/inference/cerebras/cerebras.py
@@ -11,21 +11,17 @@ from cerebras.cloud.sdk import AsyncCerebras
from llama_stack.apis.common.content_types import (
InterleavedContent,
- InterleavedContentItem,
)
from llama_stack.apis.inference import (
ChatCompletionRequest,
CompletionRequest,
CompletionResponse,
- EmbeddingsResponse,
- EmbeddingTaskType,
Inference,
LogProbConfig,
Message,
OpenAIEmbeddingsResponse,
ResponseFormat,
SamplingParams,
- TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@@ -187,16 +183,6 @@ class CerebrasInferenceAdapter(
**get_sampling_options(request.sampling_params),
}
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- raise NotImplementedError()
-
async def openai_embeddings(
self,
model: str,
diff --git a/llama_stack/providers/remote/inference/databricks/databricks.py b/llama_stack/providers/remote/inference/databricks/databricks.py
index 6eac6e4f4..d85b477f5 100644
--- a/llama_stack/providers/remote/inference/databricks/databricks.py
+++ b/llama_stack/providers/remote/inference/databricks/databricks.py
@@ -11,15 +11,12 @@ from databricks.sdk import WorkspaceClient
from llama_stack.apis.common.content_types import (
InterleavedContent,
- InterleavedContentItem,
)
from llama_stack.apis.inference import (
ChatCompletionResponse,
ChatCompletionResponseStreamChunk,
CompletionResponse,
CompletionResponseStreamChunk,
- EmbeddingsResponse,
- EmbeddingTaskType,
Inference,
LogProbConfig,
Message,
@@ -27,7 +24,6 @@ from llama_stack.apis.inference import (
OpenAICompletion,
ResponseFormat,
SamplingParams,
- TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@@ -118,16 +114,6 @@ class DatabricksInferenceAdapter(
) -> ChatCompletionResponse | AsyncIterator[ChatCompletionResponseStreamChunk]:
raise NotImplementedError()
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- raise NotImplementedError()
-
async def list_models(self) -> list[Model] | None:
self._model_cache = {} # from OpenAIMixin
ws_client = WorkspaceClient(host=self.config.url, token=self.get_api_key()) # TODO: this is not async
diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py
index 069a0a674..ed4b56fad 100644
--- a/llama_stack/providers/remote/inference/fireworks/fireworks.py
+++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py
@@ -10,22 +10,18 @@ from fireworks.client import Fireworks
from llama_stack.apis.common.content_types import (
InterleavedContent,
- InterleavedContentItem,
)
from llama_stack.apis.inference import (
ChatCompletionRequest,
ChatCompletionResponse,
CompletionRequest,
CompletionResponse,
- EmbeddingsResponse,
- EmbeddingTaskType,
Inference,
LogProbConfig,
Message,
ResponseFormat,
ResponseFormatType,
SamplingParams,
- TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@@ -48,8 +44,6 @@ from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_prompt,
completion_request_to_prompt,
- content_has_media,
- interleaved_content_as_str,
request_has_media,
)
@@ -259,28 +253,3 @@ class FireworksInferenceAdapter(OpenAIMixin, ModelRegistryHelper, Inference, Nee
logger.debug(f"params to fireworks: {params}")
return params
-
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- model = await self.model_store.get_model(model_id)
-
- kwargs = {}
- if model.metadata.get("embedding_dimension"):
- kwargs["dimensions"] = model.metadata.get("embedding_dimension")
- assert all(not content_has_media(content) for content in contents), (
- "Fireworks does not support media for embeddings"
- )
- response = self._get_client().embeddings.create(
- model=model.provider_resource_id,
- input=[interleaved_content_as_str(content) for content in contents],
- **kwargs,
- )
-
- embeddings = [data.embedding for data in response.data]
- return EmbeddingsResponse(embeddings=embeddings)
diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py
index 92094a0f3..a31981adb 100644
--- a/llama_stack/providers/remote/inference/nvidia/nvidia.py
+++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py
@@ -11,8 +11,6 @@ from openai import NOT_GIVEN, APIConnectionError
from llama_stack.apis.common.content_types import (
InterleavedContent,
- InterleavedContentItem,
- TextContentItem,
)
from llama_stack.apis.inference import (
ChatCompletionRequest,
@@ -21,8 +19,6 @@ from llama_stack.apis.inference import (
CompletionRequest,
CompletionResponse,
CompletionResponseStreamChunk,
- EmbeddingsResponse,
- EmbeddingTaskType,
Inference,
LogProbConfig,
Message,
@@ -31,7 +27,6 @@ from llama_stack.apis.inference import (
OpenAIEmbeddingUsage,
ResponseFormat,
SamplingParams,
- TextTruncation,
ToolChoice,
ToolConfig,
)
@@ -156,60 +151,6 @@ class NVIDIAInferenceAdapter(OpenAIMixin, Inference):
# we pass n=1 to get only one completion
return convert_openai_completion_choice(response.choices[0])
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- if any(content_has_media(content) for content in contents):
- raise NotImplementedError("Media is not supported")
-
- #
- # Llama Stack: contents = list[str] | list[InterleavedContentItem]
- # ->
- # OpenAI: input = str | list[str]
- #
- # we can ignore str and always pass list[str] to OpenAI
- #
- flat_contents = [content.text if isinstance(content, TextContentItem) else content for content in contents]
- input = [content.text if isinstance(content, TextContentItem) else content for content in flat_contents]
- provider_model_id = await self._get_provider_model_id(model_id)
-
- extra_body = {}
-
- if text_truncation is not None:
- text_truncation_options = {
- TextTruncation.none: "NONE",
- TextTruncation.end: "END",
- TextTruncation.start: "START",
- }
- extra_body["truncate"] = text_truncation_options[text_truncation]
-
- if output_dimension is not None:
- extra_body["dimensions"] = output_dimension
-
- if task_type is not None:
- task_type_options = {
- EmbeddingTaskType.document: "passage",
- EmbeddingTaskType.query: "query",
- }
- extra_body["input_type"] = task_type_options[task_type]
-
- response = await self.client.embeddings.create(
- model=provider_model_id,
- input=input,
- extra_body=extra_body,
- )
- #
- # OpenAI: CreateEmbeddingResponse(data=[Embedding(embedding=list[float], ...)], ...)
- # ->
- # Llama Stack: EmbeddingsResponse(embeddings=list[list[float]])
- #
- return EmbeddingsResponse(embeddings=[embedding.embedding for embedding in response.data])
-
async def openai_embeddings(
self,
model: str,
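
The removed NVIDIA adapter translated text_truncation, output_dimension, and task_type into provider-specific request fields (truncate, dimensions, input_type). Callers that depended on those options can still reach them through the OpenAI client's extra_body passthrough; a hedged sketch reusing the value mappings from the deleted code, where client is assumed to be an AsyncOpenAI instance pointed at the NIM endpoint:

# Sketch: forwarding the NVIDIA-specific knobs the removed embeddings() used to map.
resp = await client.embeddings.create(
    model=provider_model_id,
    input=["how do I reset my password?"],
    dimensions=384,                  # was output_dimension
    extra_body={
        "truncate": "END",           # was text_truncation=TextTruncation.end
        "input_type": "query",       # was task_type=EmbeddingTaskType.query
    },
)
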
diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py
index 3fb10445f..16b104fb5 100644
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@@ -14,7 +14,6 @@ from ollama import AsyncClient as AsyncOllamaClient
from llama_stack.apis.common.content_types import (
ImageContentItem,
InterleavedContent,
- InterleavedContentItem,
TextContentItem,
)
from llama_stack.apis.common.errors import UnsupportedModelError
@@ -25,8 +24,6 @@ from llama_stack.apis.inference import (
CompletionRequest,
CompletionResponse,
CompletionResponseStreamChunk,
- EmbeddingsResponse,
- EmbeddingTaskType,
GrammarResponseFormat,
InferenceProvider,
JsonSchemaResponseFormat,
@@ -34,7 +31,6 @@ from llama_stack.apis.inference import (
Message,
ResponseFormat,
SamplingParams,
- TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@@ -66,9 +62,7 @@ from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_prompt,
completion_request_to_prompt,
- content_has_media,
convert_image_content_to_url,
- interleaved_content_as_str,
request_has_media,
)
@@ -363,27 +357,6 @@ class OllamaInferenceAdapter(
async for chunk in process_chat_completion_stream_response(stream, request):
yield chunk
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- model = await self._get_model(model_id)
-
- assert all(not content_has_media(content) for content in contents), (
- "Ollama does not support media for embeddings"
- )
- response = await self.ollama_client.embed(
- model=model.provider_resource_id,
- input=[interleaved_content_as_str(content) for content in contents],
- )
- embeddings = response["embeddings"]
-
- return EmbeddingsResponse(embeddings=embeddings)
-
async def register_model(self, model: Model) -> Model:
if await self.check_model_availability(model.provider_model_id):
return model
diff --git a/llama_stack/providers/remote/inference/passthrough/passthrough.py b/llama_stack/providers/remote/inference/passthrough/passthrough.py
index a2bdf0369..ae482b7b0 100644
--- a/llama_stack/providers/remote/inference/passthrough/passthrough.py
+++ b/llama_stack/providers/remote/inference/passthrough/passthrough.py
@@ -14,8 +14,6 @@ from llama_stack.apis.inference import (
ChatCompletionResponse,
ChatCompletionResponseStreamChunk,
CompletionMessage,
- EmbeddingsResponse,
- EmbeddingTaskType,
Inference,
LogProbConfig,
Message,
@@ -27,7 +25,6 @@ from llama_stack.apis.inference import (
OpenAIResponseFormatParam,
ResponseFormat,
SamplingParams,
- TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@@ -190,25 +187,6 @@ class PassthroughInferenceAdapter(Inference):
chunk = convert_to_pydantic(ChatCompletionResponseStreamChunk, chunk)
yield chunk
- async def embeddings(
- self,
- model_id: str,
- contents: list[InterleavedContent],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- client = self._get_client()
- model = await self.model_store.get_model(model_id)
-
- return await client.inference.embeddings(
- model_id=model.provider_resource_id,
- contents=contents,
- text_truncation=text_truncation,
- output_dimension=output_dimension,
- task_type=task_type,
- )
-
async def openai_embeddings(
self,
model: str,
diff --git a/llama_stack/providers/remote/inference/runpod/runpod.py b/llama_stack/providers/remote/inference/runpod/runpod.py
index ff2fe6401..82252b04d 100644
--- a/llama_stack/providers/remote/inference/runpod/runpod.py
+++ b/llama_stack/providers/remote/inference/runpod/runpod.py
@@ -136,16 +136,6 @@ class RunpodInferenceAdapter(
**get_sampling_options(request.sampling_params),
}
- async def embeddings(
- self,
- model: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- raise NotImplementedError()
-
async def openai_embeddings(
self,
model: str,
diff --git a/llama_stack/providers/remote/inference/tgi/tgi.py b/llama_stack/providers/remote/inference/tgi/tgi.py
index 27597900f..e1632e4a0 100644
--- a/llama_stack/providers/remote/inference/tgi/tgi.py
+++ b/llama_stack/providers/remote/inference/tgi/tgi.py
@@ -12,14 +12,11 @@ from pydantic import SecretStr
from llama_stack.apis.common.content_types import (
InterleavedContent,
- InterleavedContentItem,
)
from llama_stack.apis.inference import (
ChatCompletionRequest,
ChatCompletionResponse,
CompletionRequest,
- EmbeddingsResponse,
- EmbeddingTaskType,
Inference,
LogProbConfig,
Message,
@@ -27,7 +24,6 @@ from llama_stack.apis.inference import (
ResponseFormat,
ResponseFormatType,
SamplingParams,
- TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@@ -306,16 +302,6 @@ class _HfAdapter(
**self._build_options(request.sampling_params, request.response_format),
)
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- raise NotImplementedError()
-
async def openai_embeddings(
self,
model: str,
diff --git a/llama_stack/providers/remote/inference/together/together.py b/llama_stack/providers/remote/inference/together/together.py
index c199677be..083c528bb 100644
--- a/llama_stack/providers/remote/inference/together/together.py
+++ b/llama_stack/providers/remote/inference/together/together.py
@@ -12,14 +12,11 @@ from together.constants import BASE_URL
from llama_stack.apis.common.content_types import (
InterleavedContent,
- InterleavedContentItem,
)
from llama_stack.apis.inference import (
ChatCompletionRequest,
ChatCompletionResponse,
CompletionRequest,
- EmbeddingsResponse,
- EmbeddingTaskType,
Inference,
LogProbConfig,
Message,
@@ -27,7 +24,6 @@ from llama_stack.apis.inference import (
ResponseFormat,
ResponseFormatType,
SamplingParams,
- TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@@ -50,8 +46,6 @@ from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_prompt,
completion_request_to_prompt,
- content_has_media,
- interleaved_content_as_str,
request_has_media,
)
@@ -247,26 +241,6 @@ class TogetherInferenceAdapter(OpenAIMixin, ModelRegistryHelper, Inference, Need
logger.debug(f"params to together: {params}")
return params
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- model = await self.model_store.get_model(model_id)
- assert all(not content_has_media(content) for content in contents), (
- "Together does not support media for embeddings"
- )
- client = self._get_client()
- r = await client.embeddings.create(
- model=model.provider_resource_id,
- input=[interleaved_content_as_str(content) for content in contents],
- )
- embeddings = [item.embedding for item in r.data]
- return EmbeddingsResponse(embeddings=embeddings)
-
async def list_models(self) -> list[Model] | None:
self._model_cache = {}
# Together's /v1/models is not compatible with OpenAI's /v1/models. Together support ticket #13355 -> will not fix, use Together's own client
diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index 8fbb4b815..bef5cbf2c 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -16,7 +16,6 @@ from openai.types.chat.chat_completion_chunk import (
from llama_stack.apis.common.content_types import (
InterleavedContent,
- InterleavedContentItem,
TextDelta,
ToolCallDelta,
ToolCallParseStatus,
@@ -31,8 +30,6 @@ from llama_stack.apis.inference import (
CompletionRequest,
CompletionResponse,
CompletionResponseStreamChunk,
- EmbeddingsResponse,
- EmbeddingTaskType,
GrammarResponseFormat,
Inference,
JsonSchemaResponseFormat,
@@ -41,7 +38,6 @@ from llama_stack.apis.inference import (
ModelStore,
ResponseFormat,
SamplingParams,
- TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@@ -74,8 +70,6 @@ from llama_stack.providers.utils.inference.openai_compat import (
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import (
completion_request_to_prompt,
- content_has_media,
- interleaved_content_as_str,
request_has_media,
)
@@ -550,27 +544,3 @@ class VLLMInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin, Inference, ModelsPro
"stream": request.stream,
**options,
}
-
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- model = await self._get_model(model_id)
-
- kwargs = {}
- assert model.model_type == ModelType.embedding
- assert model.metadata.get("embedding_dimension")
- kwargs["dimensions"] = model.metadata.get("embedding_dimension")
- assert all(not content_has_media(content) for content in contents), "VLLM does not support media for embeddings"
- response = await self.client.embeddings.create(
- model=model.provider_resource_id,
- input=[interleaved_content_as_str(content) for content in contents],
- **kwargs,
- )
-
- embeddings = [data.embedding for data in response.data]
- return EmbeddingsResponse(embeddings=embeddings)
diff --git a/llama_stack/providers/remote/inference/watsonx/watsonx.py b/llama_stack/providers/remote/inference/watsonx/watsonx.py
index cb8b45565..00b9acc06 100644
--- a/llama_stack/providers/remote/inference/watsonx/watsonx.py
+++ b/llama_stack/providers/remote/inference/watsonx/watsonx.py
@@ -11,13 +11,11 @@ from ibm_watsonx_ai.foundation_models import Model
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
from openai import AsyncOpenAI
-from llama_stack.apis.common.content_types import InterleavedContent, InterleavedContentItem
+from llama_stack.apis.common.content_types import InterleavedContent
from llama_stack.apis.inference import (
ChatCompletionRequest,
ChatCompletionResponse,
CompletionRequest,
- EmbeddingsResponse,
- EmbeddingTaskType,
GreedySamplingStrategy,
Inference,
LogProbConfig,
@@ -30,7 +28,6 @@ from llama_stack.apis.inference import (
OpenAIResponseFormatParam,
ResponseFormat,
SamplingParams,
- TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@@ -265,16 +262,6 @@ class WatsonXInferenceAdapter(Inference, ModelRegistryHelper):
}
return params
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- raise NotImplementedError("embedding is not supported for watsonx")
-
async def openai_embeddings(
self,
model: str,
diff --git a/llama_stack/providers/utils/inference/embedding_mixin.py b/llama_stack/providers/utils/inference/embedding_mixin.py
index 9bd0aa8ce..facc59f65 100644
--- a/llama_stack/providers/utils/inference/embedding_mixin.py
+++ b/llama_stack/providers/utils/inference/embedding_mixin.py
@@ -15,16 +15,11 @@ if TYPE_CHECKING:
from sentence_transformers import SentenceTransformer
from llama_stack.apis.inference import (
- EmbeddingsResponse,
- EmbeddingTaskType,
- InterleavedContentItem,
ModelStore,
OpenAIEmbeddingData,
OpenAIEmbeddingsResponse,
OpenAIEmbeddingUsage,
- TextTruncation,
)
-from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
EMBEDDING_MODELS = {}
@@ -35,23 +30,6 @@ log = get_logger(name=__name__, category="providers::utils")
class SentenceTransformerEmbeddingMixin:
model_store: ModelStore
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- model = await self.model_store.get_model(model_id)
- embedding_model = await self._load_sentence_transformer_model(model.provider_resource_id)
- embeddings = await asyncio.to_thread(
- embedding_model.encode,
- [interleaved_content_as_str(content) for content in contents],
- show_progress_bar=False,
- )
- return EmbeddingsResponse(embeddings=embeddings)
-
async def openai_embeddings(
self,
model: str,
diff --git a/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
index b1e38f323..966081e9f 100644
--- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py
+++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
@@ -11,14 +11,11 @@ import litellm
from llama_stack.apis.common.content_types import (
InterleavedContent,
- InterleavedContentItem,
)
from llama_stack.apis.inference import (
ChatCompletionRequest,
ChatCompletionResponse,
ChatCompletionResponseStreamChunk,
- EmbeddingsResponse,
- EmbeddingTaskType,
InferenceProvider,
JsonSchemaResponseFormat,
LogProbConfig,
@@ -32,7 +29,6 @@ from llama_stack.apis.inference import (
OpenAIResponseFormatParam,
ResponseFormat,
SamplingParams,
- TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@@ -50,9 +46,6 @@ from llama_stack.providers.utils.inference.openai_compat import (
get_sampling_options,
prepare_openai_completion_params,
)
-from llama_stack.providers.utils.inference.prompt_adapter import (
- interleaved_content_as_str,
-)
logger = get_logger(name=__name__, category="providers::utils")
@@ -269,24 +262,6 @@ class LiteLLMOpenAIMixin(
)
return api_key
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- model = await self.model_store.get_model(model_id)
-
- response = litellm.embedding(
- model=self.get_litellm_model_name(model.provider_resource_id),
- input=[interleaved_content_as_str(content) for content in contents],
- )
-
- embeddings = [data["embedding"] for data in response["data"]]
- return EmbeddingsResponse(embeddings=embeddings)
-
async def openai_embeddings(
self,
model: str,
diff --git a/tests/unit/providers/vector_io/test_faiss.py b/tests/unit/providers/vector_io/test_faiss.py
index 90108d7a0..9ee5c82f4 100644
--- a/tests/unit/providers/vector_io/test_faiss.py
+++ b/tests/unit/providers/vector_io/test_faiss.py
@@ -5,13 +5,12 @@
# the root directory of this source tree.
import asyncio
-from unittest.mock import AsyncMock, MagicMock, patch
+from unittest.mock import MagicMock, patch
import numpy as np
import pytest
from llama_stack.apis.files import Files
-from llama_stack.apis.inference import EmbeddingsResponse, Inference
from llama_stack.apis.vector_dbs import VectorDB
from llama_stack.apis.vector_io import Chunk, QueryChunksResponse
from llama_stack.providers.datatypes import HealthStatus
@@ -70,13 +69,6 @@ def mock_vector_db(vector_db_id, embedding_dimension) -> MagicMock:
return mock_vector_db
-@pytest.fixture
-def mock_inference_api(sample_embeddings):
- mock_api = MagicMock(spec=Inference)
- mock_api.embeddings = AsyncMock(return_value=EmbeddingsResponse(embeddings=sample_embeddings))
- return mock_api
-
-
@pytest.fixture
def mock_files_api():
mock_api = MagicMock(spec=Files)
@@ -96,22 +88,6 @@ async def faiss_index(embedding_dimension):
yield index
-@pytest.fixture
-async def faiss_adapter(faiss_config, mock_inference_api, mock_files_api) -> FaissVectorIOAdapter:
- # Create the adapter
- adapter = FaissVectorIOAdapter(config=faiss_config, inference_api=mock_inference_api, files_api=mock_files_api)
-
- # Create a mock KVStore
- mock_kvstore = MagicMock()
- mock_kvstore.values_in_range = AsyncMock(return_value=[])
-
- # Patch the initialize method to avoid the kvstore_impl call
- with patch.object(FaissVectorIOAdapter, "initialize"):
- # Set the kvstore directly
- adapter.kvstore = mock_kvstore
- yield adapter
-
-
async def test_faiss_query_vector_returns_infinity_when_query_and_embedding_are_identical(
faiss_index, sample_chunks, sample_embeddings, embedding_dimension
):
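
The deleted fixtures stubbed the legacy Inference.embeddings call for the Faiss adapter tests. If an equivalent stub were ever needed against the OpenAI-compatible surface, a hypothetical sketch could look like the following (fixture name, imports, and response construction are assumptions and are not part of this change):

# Hypothetical replacement fixture, not part of this diff.
from unittest.mock import AsyncMock, MagicMock

from llama_stack.apis.inference import (
    Inference,
    OpenAIEmbeddingData,
    OpenAIEmbeddingsResponse,
    OpenAIEmbeddingUsage,
)


@pytest.fixture
def mock_inference_api(sample_embeddings):
    mock_api = MagicMock(spec=Inference)
    mock_api.openai_embeddings = AsyncMock(
        return_value=OpenAIEmbeddingsResponse(
            data=[
                OpenAIEmbeddingData(embedding=list(emb), index=i)
                for i, emb in enumerate(sample_embeddings)
            ],
            model="test-embedding-model",
            usage=OpenAIEmbeddingUsage(prompt_tokens=0, total_tokens=0),
        )
    )
    return mock_api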