diff --git a/docs/static/llama-stack-spec.html b/docs/static/llama-stack-spec.html
index 9e28e0f42..7845fb068 100644
--- a/docs/static/llama-stack-spec.html
+++ b/docs/static/llama-stack-spec.html
@@ -1035,50 +1035,6 @@
                 ]
             }
         },
-        "/v1/inference/embeddings": {
-            "post": {
-                "responses": {
-                    "200": {
-                        "description": "An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}.",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/EmbeddingsResponse"
-                                }
-                            }
-                        }
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "Inference"
-                ],
-                "summary": "Generate embeddings for content pieces using the specified model.",
-                "description": "Generate embeddings for content pieces using the specified model.",
-                "parameters": [],
-                "requestBody": {
-                    "content": {
-                        "application/json": {
-                            "schema": {
-                                "$ref": "#/components/schemas/EmbeddingsRequest"
-                            }
-                        }
-                    },
-                    "required": true
-                }
-            }
-        },
         "/v1alpha/eval/benchmarks/{benchmark_id}/evaluations": {
             "post": {
                 "responses": {
@@ -10547,80 +10503,6 @@
                 "title": "OpenAIDeleteResponseObject",
                 "description": "Response object confirming deletion of an OpenAI response."
             },
-            "EmbeddingsRequest": {
-                "type": "object",
-                "properties": {
-                    "model_id": {
-                        "type": "string",
-                        "description": "The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint."
-                    },
-                    "contents": {
-                        "oneOf": [
-                            {
-                                "type": "array",
-                                "items": {
-                                    "type": "string"
-                                }
-                            },
-                            {
-                                "type": "array",
-                                "items": {
-                                    "$ref": "#/components/schemas/InterleavedContentItem"
-                                }
-                            }
-                        ],
-                        "description": "List of contents to generate embeddings for. Each content can be a string or an InterleavedContentItem (and hence can be multimodal). The behavior depends on the model and provider. Some models may only support text."
-                    },
-                    "text_truncation": {
-                        "type": "string",
-                        "enum": [
-                            "none",
-                            "start",
-                            "end"
-                        ],
-                        "description": "(Optional) Config for how to truncate text for embedding when text is longer than the model's max sequence length."
-                    },
-                    "output_dimension": {
-                        "type": "integer",
-                        "description": "(Optional) Output dimensionality for the embeddings. Only supported by Matryoshka models."
-                    },
-                    "task_type": {
-                        "type": "string",
-                        "enum": [
-                            "query",
-                            "document"
-                        ],
-                        "description": "(Optional) How is the embedding being used? This is only supported by asymmetric embedding models."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "model_id",
-                    "contents"
-                ],
-                "title": "EmbeddingsRequest"
-            },
-            "EmbeddingsResponse": {
-                "type": "object",
-                "properties": {
-                    "embeddings": {
-                        "type": "array",
-                        "items": {
-                            "type": "array",
-                            "items": {
-                                "type": "number"
-                            }
-                        },
-                        "description": "List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "embeddings"
-                ],
-                "title": "EmbeddingsResponse",
-                "description": "Response containing generated embeddings."
-            },
         "AgentCandidate": {
             "type": "object",
             "properties": {
diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml
index 1c06c74a5..8cbbccaa2 100644
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@@ -720,41 +720,6 @@ paths:
           required: true
           schema:
             type: string
-  /v1/inference/embeddings:
-    post:
-      responses:
-        '200':
-          description: >-
-            An array of embeddings, one for each content. Each embedding is a list
-            of floats. The dimensionality of the embedding is model-specific; you
-            can check model metadata using /models/{model_id}.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/EmbeddingsResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Inference
-      summary: >-
-        Generate embeddings for content pieces using the specified model.
-      description: >-
-        Generate embeddings for content pieces using the specified model.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/EmbeddingsRequest'
-        required: true
   /v1alpha/eval/benchmarks/{benchmark_id}/evaluations:
     post:
       responses:
@@ -7795,72 +7760,6 @@ components:
       title: OpenAIDeleteResponseObject
       description: >-
         Response object confirming deletion of an OpenAI response.
-    EmbeddingsRequest:
-      type: object
-      properties:
-        model_id:
-          type: string
-          description: >-
-            The identifier of the model to use. The model must be an embedding model
-            registered with Llama Stack and available via the /models endpoint.
-        contents:
-          oneOf:
-            - type: array
-              items:
-                type: string
-            - type: array
-              items:
-                $ref: '#/components/schemas/InterleavedContentItem'
-          description: >-
-            List of contents to generate embeddings for. Each content can be a string
-            or an InterleavedContentItem (and hence can be multimodal). The behavior
-            depends on the model and provider. Some models may only support text.
-        text_truncation:
-          type: string
-          enum:
-            - none
-            - start
-            - end
-          description: >-
-            (Optional) Config for how to truncate text for embedding when text is
-            longer than the model's max sequence length.
-        output_dimension:
-          type: integer
-          description: >-
-            (Optional) Output dimensionality for the embeddings. Only supported by
-            Matryoshka models.
-        task_type:
-          type: string
-          enum:
-            - query
-            - document
-          description: >-
-            (Optional) How is the embedding being used? This is only supported by
-            asymmetric embedding models.
-      additionalProperties: false
-      required:
-        - model_id
-        - contents
-      title: EmbeddingsRequest
-    EmbeddingsResponse:
-      type: object
-      properties:
-        embeddings:
-          type: array
-          items:
-            type: array
-            items:
-              type: number
-          description: >-
-            List of embedding vectors, one per input content. Each embedding is a
-            list of floats. The dimensionality of the embedding is model-specific;
-            you can check model metadata using /models/{model_id}
-      additionalProperties: false
-      required:
-        - embeddings
-      title: EmbeddingsResponse
-      description: >-
-        Response containing generated embeddings.
     AgentCandidate:
       type: object
       properties:
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index 756896796..c6a4e4f60 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -17,7 +17,7 @@ from typing import (
 from pydantic import BaseModel, Field, field_validator
 from typing_extensions import TypedDict

-from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent, InterleavedContentItem
+from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent
 from llama_stack.apis.common.responses import Order
 from llama_stack.apis.models import Model
 from llama_stack.apis.telemetry import MetricResponseMixin
@@ -1070,26 +1070,6 @@ class InferenceProvider(Protocol):
         """
         ...

-    @webmethod(route="/inference/embeddings", method="POST", level=LLAMA_STACK_API_V1)
-    async def embeddings(
-        self,
-        model_id: str,
-        contents: list[str] | list[InterleavedContentItem],
-        text_truncation: TextTruncation | None = TextTruncation.none,
-        output_dimension: int | None = None,
-        task_type: EmbeddingTaskType | None = None,
-    ) -> EmbeddingsResponse:
-        """Generate embeddings for content pieces using the specified model.
-
-        :param model_id: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint.
-        :param contents: List of contents to generate embeddings for. Each content can be a string or an InterleavedContentItem (and hence can be multimodal). The behavior depends on the model and provider. Some models may only support text.
-        :param output_dimension: (Optional) Output dimensionality for the embeddings. Only supported by Matryoshka models.
-        :param text_truncation: (Optional) Config for how to truncate text for embedding when text is longer than the model's max sequence length.
-        :param task_type: (Optional) How is the embedding being used? This is only supported by asymmetric embedding models.
-        :returns: An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}.
-        """
-        ...
-
     @webmethod(route="/inference/rerank", method="POST", experimental=True, level=LLAMA_STACK_API_V1)
     async def rerank(
         self,
diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py
index fcf01a9c4..80f47fb5d 100644
--- a/llama_stack/core/routers/inference.py
+++ b/llama_stack/core/routers/inference.py
@@ -16,7 +16,6 @@ from pydantic import Field, TypeAdapter

 from llama_stack.apis.common.content_types import (
     InterleavedContent,
-    InterleavedContentItem,
 )
 from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError
 from llama_stack.apis.inference import (
@@ -26,8 +25,6 @@ from llama_stack.apis.inference import (
     CompletionMessage,
     CompletionResponse,
     CompletionResponseStreamChunk,
-    EmbeddingsResponse,
-    EmbeddingTaskType,
     Inference,
     ListOpenAIChatCompletionResponse,
     LogProbConfig,
@@ -48,7 +45,6 @@ from llama_stack.apis.inference import (
     ResponseFormat,
     SamplingParams,
     StopReason,
-    TextTruncation,
     ToolChoice,
     ToolConfig,
     ToolDefinition,
@@ -312,25 +308,6 @@ class InferenceRouter(Inference):

         return response

-    async def embeddings(
-        self,
-        model_id: str,
-        contents: list[str] | list[InterleavedContentItem],
-        text_truncation: TextTruncation | None = TextTruncation.none,
-        output_dimension: int | None = None,
-        task_type: EmbeddingTaskType | None = None,
-    ) -> EmbeddingsResponse:
-        logger.debug(f"InferenceRouter.embeddings: {model_id}")
-        await self._get_model(model_id, ModelType.embedding)
-        provider = await self.routing_table.get_provider_impl(model_id)
-        return await provider.embeddings(
-            model_id=model_id,
-            contents=contents,
-            text_truncation=text_truncation,
-            output_dimension=output_dimension,
-            task_type=task_type,
-        )
-
     async def openai_completion(
         self,
         model: str,
diff --git a/llama_stack/providers/remote/inference/bedrock/bedrock.py b/llama_stack/providers/remote/inference/bedrock/bedrock.py
index 29b935bbd..2206aa641 100644
--- a/llama_stack/providers/remote/inference/bedrock/bedrock.py
+++ b/llama_stack/providers/remote/inference/bedrock/bedrock.py
@@ -11,21 +11,17 @@ from botocore.client import BaseClient

 from llama_stack.apis.common.content_types import (
     InterleavedContent,
-    InterleavedContentItem,
 )
 from llama_stack.apis.inference import (
     ChatCompletionRequest,
     ChatCompletionResponse,
     ChatCompletionResponseStreamChunk,
-    EmbeddingsResponse,
-    EmbeddingTaskType,
     Inference,
     LogProbConfig,
     Message,
     OpenAIEmbeddingsResponse,
     ResponseFormat,
     SamplingParams,
-    TextTruncation,
     ToolChoice,
     ToolConfig,
     ToolDefinition,
@@ -47,8 +43,6 @@ from llama_stack.providers.utils.inference.openai_compat import (
 )
 from llama_stack.providers.utils.inference.prompt_adapter import (
     chat_completion_request_to_prompt,
-    content_has_media,
-    interleaved_content_as_str,
 )

 from .models import MODEL_ENTRIES
@@ -218,36 +212,6 @@ class BedrockInferenceAdapter(
             ),
         }

-    async def embeddings(
-        self,
-        model_id: str,
-        contents: list[str] | list[InterleavedContentItem],
-        text_truncation: TextTruncation | None = TextTruncation.none,
-        output_dimension: int | None = None,
-        task_type: EmbeddingTaskType | None = None,
-    ) -> EmbeddingsResponse:
-        model = await self.model_store.get_model(model_id)
-
-        # Convert foundation model ID to inference profile ID
-        region_name = self.client.meta.region_name
-        inference_profile_id = _to_inference_profile_id(model.provider_resource_id, region_name)
-
-        embeddings = []
-        for content in contents:
-            assert not content_has_media(content), "Bedrock does not support media for embeddings"
-            input_text = interleaved_content_as_str(content)
-            input_body = {"inputText": input_text}
-            body = json.dumps(input_body)
-            response = self.client.invoke_model(
-                body=body,
-                modelId=inference_profile_id,
-                accept="application/json",
-                contentType="application/json",
-            )
-            response_body = json.loads(response.get("body").read())
-            embeddings.append(response_body.get("embedding"))
-        return EmbeddingsResponse(embeddings=embeddings)
-
     async def openai_embeddings(
         self,
         model: str,
diff --git a/llama_stack/providers/remote/inference/cerebras/cerebras.py b/llama_stack/providers/remote/inference/cerebras/cerebras.py
index 6662f004d..6be39fa5d 100644
--- a/llama_stack/providers/remote/inference/cerebras/cerebras.py
+++ b/llama_stack/providers/remote/inference/cerebras/cerebras.py
@@ -11,21 +11,17 @@ from cerebras.cloud.sdk import AsyncCerebras

 from llama_stack.apis.common.content_types import (
     InterleavedContent,
-    InterleavedContentItem,
 )
 from llama_stack.apis.inference import (
     ChatCompletionRequest,
     CompletionRequest,
     CompletionResponse,
-    EmbeddingsResponse,
-    EmbeddingTaskType,
     Inference,
     LogProbConfig,
     Message,
     OpenAIEmbeddingsResponse,
     ResponseFormat,
     SamplingParams,
-    TextTruncation,
     ToolChoice,
     ToolConfig,
     ToolDefinition,
@@ -187,16 +183,6 @@ class CerebrasInferenceAdapter(
             **get_sampling_options(request.sampling_params),
         }

-    async def embeddings(
-        self,
-        model_id: str,
-        contents: list[str] | list[InterleavedContentItem],
-        text_truncation: TextTruncation | None = TextTruncation.none,
-        output_dimension: int | None = None,
-        task_type: EmbeddingTaskType | None = None,
-    ) -> EmbeddingsResponse:
-        raise NotImplementedError()
-
     async def openai_embeddings(
         self,
         model: str,
diff --git a/llama_stack/providers/remote/inference/databricks/databricks.py b/llama_stack/providers/remote/inference/databricks/databricks.py
index 6eac6e4f4..d85b477f5 100644
--- a/llama_stack/providers/remote/inference/databricks/databricks.py
+++ b/llama_stack/providers/remote/inference/databricks/databricks.py
@@ -11,15 +11,12 @@ from databricks.sdk import WorkspaceClient

 from llama_stack.apis.common.content_types import (
     InterleavedContent,
-    InterleavedContentItem,
 )
 from llama_stack.apis.inference import (
     ChatCompletionResponse,
     ChatCompletionResponseStreamChunk,
     CompletionResponse,
     CompletionResponseStreamChunk,
-    EmbeddingsResponse,
-    EmbeddingTaskType,
     Inference,
     LogProbConfig,
     Message,
@@ -27,7 +24,6 @@ from llama_stack.apis.inference import (
     OpenAICompletion,
     ResponseFormat,
     SamplingParams,
-    TextTruncation,
     ToolChoice,
     ToolConfig,
     ToolDefinition,
@@ -118,16 +114,6 @@ class DatabricksInferenceAdapter(
     ) -> ChatCompletionResponse | AsyncIterator[ChatCompletionResponseStreamChunk]:
         raise NotImplementedError()

-    async def embeddings(
-        self,
-        model_id: str,
-        contents: list[str] | list[InterleavedContentItem],
-        text_truncation: TextTruncation | None = TextTruncation.none,
-        output_dimension: int | None = None,
-        task_type: EmbeddingTaskType | None = None,
-    ) -> EmbeddingsResponse:
-        raise NotImplementedError()
-
     async def list_models(self) -> list[Model] | None:
         self._model_cache = {}  # from OpenAIMixin
         ws_client = WorkspaceClient(host=self.config.url, token=self.get_api_key())  # TODO: this is not async
diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py
index 069a0a674..ed4b56fad 100644
--- a/llama_stack/providers/remote/inference/fireworks/fireworks.py
+++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py
@@ -10,22 +10,18 @@ from fireworks.client import Fireworks

 from llama_stack.apis.common.content_types import (
     InterleavedContent,
-    InterleavedContentItem,
 )
 from llama_stack.apis.inference import (
     ChatCompletionRequest,
     ChatCompletionResponse,
     CompletionRequest,
     CompletionResponse,
-    EmbeddingsResponse,
-    EmbeddingTaskType,
     Inference,
     LogProbConfig,
     Message,
     ResponseFormat,
     ResponseFormatType,
     SamplingParams,
-    TextTruncation,
     ToolChoice,
     ToolConfig,
     ToolDefinition,
@@ -48,8 +44,6 @@ from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
 from llama_stack.providers.utils.inference.prompt_adapter import (
     chat_completion_request_to_prompt,
     completion_request_to_prompt,
-    content_has_media,
-    interleaved_content_as_str,
     request_has_media,
 )

@@ -259,28 +253,3 @@ class FireworksInferenceAdapter(OpenAIMixin, ModelRegistryHelper, Inference, Nee
         logger.debug(f"params to fireworks: {params}")

         return params
-
-    async def embeddings(
-        self,
-        model_id: str,
-        contents: list[str] | list[InterleavedContentItem],
-        text_truncation: TextTruncation | None = TextTruncation.none,
-        output_dimension: int | None = None,
-        task_type: EmbeddingTaskType | None = None,
-    ) -> EmbeddingsResponse:
-        model = await self.model_store.get_model(model_id)
-
-        kwargs = {}
-        if model.metadata.get("embedding_dimension"):
-            kwargs["dimensions"] = model.metadata.get("embedding_dimension")
-        assert all(not content_has_media(content) for content in contents), (
-            "Fireworks does not support media for embeddings"
-        )
-        response = self._get_client().embeddings.create(
-            model=model.provider_resource_id,
-            input=[interleaved_content_as_str(content) for content in contents],
-            **kwargs,
-        )
-
-        embeddings = [data.embedding for data in response.data]
-        return EmbeddingsResponse(embeddings=embeddings)
diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py
index 92094a0f3..a31981adb 100644
--- a/llama_stack/providers/remote/inference/nvidia/nvidia.py
+++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py
@@ -11,8 +11,6 @@ from openai import NOT_GIVEN, APIConnectionError

 from llama_stack.apis.common.content_types import (
     InterleavedContent,
-    InterleavedContentItem,
-    TextContentItem,
 )
 from llama_stack.apis.inference import (
     ChatCompletionRequest,
@@ -21,8 +19,6 @@ from llama_stack.apis.inference import (
     CompletionRequest,
     CompletionResponse,
     CompletionResponseStreamChunk,
-    EmbeddingsResponse,
-    EmbeddingTaskType,
     Inference,
     LogProbConfig,
     Message,
@@ -31,7 +27,6 @@ from llama_stack.apis.inference import (
     OpenAIEmbeddingUsage,
     ResponseFormat,
     SamplingParams,
-    TextTruncation,
     ToolChoice,
     ToolConfig,
 )
@@ -156,60 +151,6 @@ class NVIDIAInferenceAdapter(OpenAIMixin, Inference):
         # we pass n=1 to get only one completion
         return convert_openai_completion_choice(response.choices[0])

-    async def embeddings(
-        self,
-        model_id: str,
-        contents: list[str] | list[InterleavedContentItem],
-        text_truncation: TextTruncation | None = TextTruncation.none,
-        output_dimension: int | None = None,
-        task_type: EmbeddingTaskType | None = None,
-    ) -> EmbeddingsResponse:
-        if any(content_has_media(content) for content in contents):
-            raise NotImplementedError("Media is not supported")
-
-        #
-        # Llama Stack: contents = list[str] | list[InterleavedContentItem]
-        # ->
-        # OpenAI: input = str | list[str]
-        #
-        # we can ignore str and always pass list[str] to OpenAI
-        #
-        flat_contents = [content.text if isinstance(content, TextContentItem) else content for content in contents]
-        input = [content.text if isinstance(content, TextContentItem) else content for content in flat_contents]
-        provider_model_id = await self._get_provider_model_id(model_id)
-
-        extra_body = {}
-
-        if text_truncation is not None:
-            text_truncation_options = {
-                TextTruncation.none: "NONE",
-                TextTruncation.end: "END",
-                TextTruncation.start: "START",
-            }
-            extra_body["truncate"] = text_truncation_options[text_truncation]
-
-        if output_dimension is not None:
-            extra_body["dimensions"] = output_dimension
-
-        if task_type is not None:
-            task_type_options = {
-                EmbeddingTaskType.document: "passage",
-                EmbeddingTaskType.query: "query",
-            }
-            extra_body["input_type"] = task_type_options[task_type]
-
-        response = await self.client.embeddings.create(
-            model=provider_model_id,
-            input=input,
-            extra_body=extra_body,
-        )
-        #
-        # OpenAI: CreateEmbeddingResponse(data=[Embedding(embedding=list[float], ...)], ...)
-        # ->
-        # Llama Stack: EmbeddingsResponse(embeddings=list[list[float]])
-        #
-        return EmbeddingsResponse(embeddings=[embedding.embedding for embedding in response.data])
-
     async def openai_embeddings(
         self,
         model: str,
diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py
index 3fb10445f..16b104fb5 100644
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@@ -14,7 +14,6 @@ from ollama import AsyncClient as AsyncOllamaClient
 from llama_stack.apis.common.content_types import (
     ImageContentItem,
     InterleavedContent,
-    InterleavedContentItem,
     TextContentItem,
 )
 from llama_stack.apis.common.errors import UnsupportedModelError
@@ -25,8 +24,6 @@ from llama_stack.apis.inference import (
     CompletionRequest,
     CompletionResponse,
     CompletionResponseStreamChunk,
-    EmbeddingsResponse,
-    EmbeddingTaskType,
     GrammarResponseFormat,
     InferenceProvider,
     JsonSchemaResponseFormat,
@@ -34,7 +31,6 @@ from llama_stack.apis.inference import (
     Message,
     ResponseFormat,
     SamplingParams,
-    TextTruncation,
     ToolChoice,
     ToolConfig,
     ToolDefinition,
@@ -66,9 +62,7 @@ from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
 from llama_stack.providers.utils.inference.prompt_adapter import (
     chat_completion_request_to_prompt,
     completion_request_to_prompt,
-    content_has_media,
     convert_image_content_to_url,
-    interleaved_content_as_str,
     request_has_media,
 )

@@ -363,27 +357,6 @@ class OllamaInferenceAdapter(
         async for chunk in process_chat_completion_stream_response(stream, request):
             yield chunk

-    async def embeddings(
-        self,
-        model_id: str,
-        contents: list[str] | list[InterleavedContentItem],
-        text_truncation: TextTruncation | None = TextTruncation.none,
-        output_dimension: int | None = None,
-        task_type: EmbeddingTaskType | None = None,
-    ) -> EmbeddingsResponse:
-        model = await self._get_model(model_id)
-
-        assert all(not content_has_media(content) for content in contents), (
-            "Ollama does not support media for embeddings"
-        )
-        response = await self.ollama_client.embed(
-            model=model.provider_resource_id,
-            input=[interleaved_content_as_str(content) for content in contents],
-        )
-        embeddings = response["embeddings"]
-
-        return EmbeddingsResponse(embeddings=embeddings)
-
     async def register_model(self, model: Model) -> Model:
         if await self.check_model_availability(model.provider_model_id):
             return model
diff --git a/llama_stack/providers/remote/inference/passthrough/passthrough.py b/llama_stack/providers/remote/inference/passthrough/passthrough.py
index a2bdf0369..ae482b7b0 100644
--- a/llama_stack/providers/remote/inference/passthrough/passthrough.py
+++ b/llama_stack/providers/remote/inference/passthrough/passthrough.py
@@ -14,8 +14,6 @@ from llama_stack.apis.inference import (
     ChatCompletionResponse,
     ChatCompletionResponseStreamChunk,
     CompletionMessage,
-    EmbeddingsResponse,
-    EmbeddingTaskType,
     Inference,
     LogProbConfig,
     Message,
@@ -27,7 +25,6 @@ from llama_stack.apis.inference import (
     OpenAIResponseFormatParam,
     ResponseFormat,
     SamplingParams,
-    TextTruncation,
     ToolChoice,
     ToolConfig,
     ToolDefinition,
@@ -190,25 +187,6 @@ class PassthroughInferenceAdapter(Inference):
             chunk = convert_to_pydantic(ChatCompletionResponseStreamChunk, chunk)
             yield chunk

-    async def embeddings(
-        self,
-        model_id: str,
-        contents: list[InterleavedContent],
-        text_truncation: TextTruncation | None = TextTruncation.none,
-        output_dimension: int | None = None,
-        task_type: EmbeddingTaskType | None = None,
-    ) -> EmbeddingsResponse:
-        client = self._get_client()
-        model = await self.model_store.get_model(model_id)
-
-        return await client.inference.embeddings(
-            model_id=model.provider_resource_id,
-            contents=contents,
-            text_truncation=text_truncation,
-            output_dimension=output_dimension,
-            task_type=task_type,
-        )
-
     async def openai_embeddings(
         self,
         model: str,
diff --git a/llama_stack/providers/remote/inference/runpod/runpod.py b/llama_stack/providers/remote/inference/runpod/runpod.py
index ff2fe6401..82252b04d 100644
--- a/llama_stack/providers/remote/inference/runpod/runpod.py
+++ b/llama_stack/providers/remote/inference/runpod/runpod.py
@@ -136,16 +136,6 @@ class RunpodInferenceAdapter(
             **get_sampling_options(request.sampling_params),
         }

-    async def embeddings(
-        self,
-        model: str,
-        contents: list[str] | list[InterleavedContentItem],
-        text_truncation: TextTruncation | None = TextTruncation.none,
-        output_dimension: int | None = None,
-        task_type: EmbeddingTaskType | None = None,
-    ) -> EmbeddingsResponse:
-        raise NotImplementedError()
-
     async def openai_embeddings(
         self,
         model: str,
diff --git a/llama_stack/providers/remote/inference/tgi/tgi.py b/llama_stack/providers/remote/inference/tgi/tgi.py
index 27597900f..e1632e4a0 100644
--- a/llama_stack/providers/remote/inference/tgi/tgi.py
+++ b/llama_stack/providers/remote/inference/tgi/tgi.py
@@ -12,14 +12,11 @@ from pydantic import SecretStr

 from llama_stack.apis.common.content_types import (
     InterleavedContent,
-    InterleavedContentItem,
 )
 from llama_stack.apis.inference import (
     ChatCompletionRequest,
     ChatCompletionResponse,
     CompletionRequest,
-    EmbeddingsResponse,
-    EmbeddingTaskType,
     Inference,
     LogProbConfig,
     Message,
@@ -27,7 +24,6 @@ from llama_stack.apis.inference import (
     ResponseFormat,
     ResponseFormatType,
     SamplingParams,
-    TextTruncation,
     ToolChoice,
     ToolConfig,
     ToolDefinition,
@@ -306,16 +302,6 @@ class _HfAdapter(
             **self._build_options(request.sampling_params, request.response_format),
         )

-    async def embeddings(
-        self,
-        model_id: str,
-        contents: list[str] | list[InterleavedContentItem],
-        text_truncation: TextTruncation | None = TextTruncation.none,
-        output_dimension: int | None = None,
-        task_type: EmbeddingTaskType | None = None,
-    ) -> EmbeddingsResponse:
-        raise NotImplementedError()
-
     async def openai_embeddings(
         self,
         model: str,
diff --git a/llama_stack/providers/remote/inference/together/together.py b/llama_stack/providers/remote/inference/together/together.py
index c199677be..083c528bb 100644
--- a/llama_stack/providers/remote/inference/together/together.py
+++ b/llama_stack/providers/remote/inference/together/together.py
@@ -12,14 +12,11 @@ from together.constants import BASE_URL

 from llama_stack.apis.common.content_types import (
     InterleavedContent,
-    InterleavedContentItem,
 )
 from llama_stack.apis.inference import (
     ChatCompletionRequest,
     ChatCompletionResponse,
     CompletionRequest,
-    EmbeddingsResponse,
-    EmbeddingTaskType,
     Inference,
     LogProbConfig,
     Message,
@@ -27,7 +24,6 @@ from llama_stack.apis.inference import (
     ResponseFormat,
     ResponseFormatType,
     SamplingParams,
-    TextTruncation,
     ToolChoice,
     ToolConfig,
     ToolDefinition,
@@ -50,8 +46,6 @@ from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
 from llama_stack.providers.utils.inference.prompt_adapter import (
     chat_completion_request_to_prompt,
     completion_request_to_prompt,
-    content_has_media,
-    interleaved_content_as_str,
     request_has_media,
 )

@@ -247,26 +241,6 @@ class TogetherInferenceAdapter(OpenAIMixin, ModelRegistryHelper, Inference, Need
         logger.debug(f"params to together: {params}")
         return params

-    async def embeddings(
-        self,
-        model_id: str,
-        contents: list[str] | list[InterleavedContentItem],
-        text_truncation: TextTruncation | None = TextTruncation.none,
-        output_dimension: int | None = None,
-        task_type: EmbeddingTaskType | None = None,
-    ) -> EmbeddingsResponse:
-        model = await self.model_store.get_model(model_id)
-        assert all(not content_has_media(content) for content in contents), (
-            "Together does not support media for embeddings"
-        )
-        client = self._get_client()
-        r = await client.embeddings.create(
-            model=model.provider_resource_id,
-            input=[interleaved_content_as_str(content) for content in contents],
-        )
-        embeddings = [item.embedding for item in r.data]
-        return EmbeddingsResponse(embeddings=embeddings)
-
     async def list_models(self) -> list[Model] | None:
         self._model_cache = {}
         # Together's /v1/models is not compatible with OpenAI's /v1/models. Together support ticket #13355 -> will not fix, use Together's own client
diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index 8fbb4b815..bef5cbf2c 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -16,7 +16,6 @@ from openai.types.chat.chat_completion_chunk import (

 from llama_stack.apis.common.content_types import (
     InterleavedContent,
-    InterleavedContentItem,
     TextDelta,
     ToolCallDelta,
     ToolCallParseStatus,
@@ -31,8 +30,6 @@ from llama_stack.apis.inference import (
     CompletionRequest,
     CompletionResponse,
     CompletionResponseStreamChunk,
-    EmbeddingsResponse,
-    EmbeddingTaskType,
     GrammarResponseFormat,
     Inference,
     JsonSchemaResponseFormat,
@@ -41,7 +38,6 @@ from llama_stack.apis.inference import (
     ModelStore,
     ResponseFormat,
     SamplingParams,
-    TextTruncation,
     ToolChoice,
     ToolConfig,
     ToolDefinition,
@@ -74,8 +70,6 @@ from llama_stack.providers.utils.inference.openai_compat import (
 from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
 from llama_stack.providers.utils.inference.prompt_adapter import (
     completion_request_to_prompt,
-    content_has_media,
-    interleaved_content_as_str,
     request_has_media,
 )

@@ -550,27 +544,3 @@ class VLLMInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin, Inference, ModelsPro
             "stream": request.stream,
             **options,
         }
-
-    async def embeddings(
-        self,
-        model_id: str,
-        contents: list[str] | list[InterleavedContentItem],
-        text_truncation: TextTruncation | None = TextTruncation.none,
-        output_dimension: int | None = None,
-        task_type: EmbeddingTaskType | None = None,
-    ) -> EmbeddingsResponse:
-        model = await self._get_model(model_id)
-
-        kwargs = {}
-        assert model.model_type == ModelType.embedding
-        assert model.metadata.get("embedding_dimension")
-        kwargs["dimensions"] = model.metadata.get("embedding_dimension")
-        assert all(not content_has_media(content) for content in contents), "VLLM does not support media for embeddings"
-        response = await self.client.embeddings.create(
-            model=model.provider_resource_id,
-            input=[interleaved_content_as_str(content) for content in contents],
-            **kwargs,
-        )
-
-        embeddings = [data.embedding for data in response.data]
-        return EmbeddingsResponse(embeddings=embeddings)
diff --git a/llama_stack/providers/remote/inference/watsonx/watsonx.py b/llama_stack/providers/remote/inference/watsonx/watsonx.py
index cb8b45565..00b9acc06 100644
--- a/llama_stack/providers/remote/inference/watsonx/watsonx.py
+++ b/llama_stack/providers/remote/inference/watsonx/watsonx.py
@@ -11,13 +11,11 @@ from ibm_watsonx_ai.foundation_models import Model
 from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
 from openai import AsyncOpenAI

-from llama_stack.apis.common.content_types import InterleavedContent, InterleavedContentItem
+from llama_stack.apis.common.content_types import InterleavedContent
 from llama_stack.apis.inference import (
     ChatCompletionRequest,
     ChatCompletionResponse,
     CompletionRequest,
-    EmbeddingsResponse,
-    EmbeddingTaskType,
     GreedySamplingStrategy,
     Inference,
     LogProbConfig,
@@ -30,7 +28,6 @@ from llama_stack.apis.inference import (
     OpenAIResponseFormatParam,
     ResponseFormat,
     SamplingParams,
-    TextTruncation,
     ToolChoice,
     ToolConfig,
     ToolDefinition,
@@ -265,16 +262,6 @@ class WatsonXInferenceAdapter(Inference, ModelRegistryHelper):
         }
         return params

-    async def embeddings(
-        self,
-        model_id: str,
-        contents: list[str] | list[InterleavedContentItem],
-        text_truncation: TextTruncation | None = TextTruncation.none,
-        output_dimension: int | None = None,
-        task_type: EmbeddingTaskType | None = None,
-    ) -> EmbeddingsResponse:
-        raise NotImplementedError("embedding is not supported for watsonx")
-
     async def openai_embeddings(
         self,
         model: str,
diff --git a/llama_stack/providers/utils/inference/embedding_mixin.py b/llama_stack/providers/utils/inference/embedding_mixin.py
index 9bd0aa8ce..facc59f65 100644
--- a/llama_stack/providers/utils/inference/embedding_mixin.py
+++ b/llama_stack/providers/utils/inference/embedding_mixin.py
@@ -15,16 +15,11 @@ if TYPE_CHECKING:
     from sentence_transformers import SentenceTransformer

 from llama_stack.apis.inference import (
-    EmbeddingsResponse,
-    EmbeddingTaskType,
-    InterleavedContentItem,
     ModelStore,
     OpenAIEmbeddingData,
     OpenAIEmbeddingsResponse,
     OpenAIEmbeddingUsage,
-    TextTruncation,
 )
-from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str

 EMBEDDING_MODELS = {}

@@ -35,23 +30,6 @@ log = get_logger(name=__name__, category="providers::utils")
 class SentenceTransformerEmbeddingMixin:
     model_store: ModelStore

-    async def embeddings(
-        self,
-        model_id: str,
-        contents: list[str] | list[InterleavedContentItem],
-        text_truncation: TextTruncation | None = TextTruncation.none,
-        output_dimension: int | None = None,
-        task_type: EmbeddingTaskType | None = None,
-    ) -> EmbeddingsResponse:
-        model = await self.model_store.get_model(model_id)
-        embedding_model = await self._load_sentence_transformer_model(model.provider_resource_id)
-        embeddings = await asyncio.to_thread(
-            embedding_model.encode,
-            [interleaved_content_as_str(content) for content in contents],
-            show_progress_bar=False,
-        )
-        return EmbeddingsResponse(embeddings=embeddings)
-
     async def openai_embeddings(
         self,
         model: str,
diff --git a/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
index b1e38f323..966081e9f 100644
--- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py
+++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
@@ -11,14 +11,11 @@ import litellm

 from llama_stack.apis.common.content_types import (
     InterleavedContent,
-    InterleavedContentItem,
 )
 from llama_stack.apis.inference import (
     ChatCompletionRequest,
     ChatCompletionResponse,
     ChatCompletionResponseStreamChunk,
-    EmbeddingsResponse,
-    EmbeddingTaskType,
     InferenceProvider,
     JsonSchemaResponseFormat,
     LogProbConfig,
@@ -32,7 +29,6 @@ from llama_stack.apis.inference import (
     OpenAIResponseFormatParam,
     ResponseFormat,
     SamplingParams,
-    TextTruncation,
     ToolChoice,
     ToolConfig,
     ToolDefinition,
@@ -50,9 +46,6 @@ from llama_stack.providers.utils.inference.openai_compat import (
     get_sampling_options,
     prepare_openai_completion_params,
 )
-from llama_stack.providers.utils.inference.prompt_adapter import (
-    interleaved_content_as_str,
-)

 logger = get_logger(name=__name__, category="providers::utils")

@@ -269,24 +262,6 @@ class LiteLLMOpenAIMixin(
         )
         return api_key

-    async def embeddings(
-        self,
-        model_id: str,
-        contents: list[str] | list[InterleavedContentItem],
-        text_truncation: TextTruncation | None = TextTruncation.none,
-        output_dimension: int | None = None,
-        task_type: EmbeddingTaskType | None = None,
-    ) -> EmbeddingsResponse:
-        model = await self.model_store.get_model(model_id)
-
-        response = litellm.embedding(
-            model=self.get_litellm_model_name(model.provider_resource_id),
-            input=[interleaved_content_as_str(content) for content in contents],
-        )
-
-        embeddings = [data["embedding"] for data in response["data"]]
-        return EmbeddingsResponse(embeddings=embeddings)
-
     async def openai_embeddings(
         self,
         model: str,
diff --git a/tests/unit/providers/vector_io/test_faiss.py b/tests/unit/providers/vector_io/test_faiss.py
index 90108d7a0..9ee5c82f4 100644
--- a/tests/unit/providers/vector_io/test_faiss.py
+++ b/tests/unit/providers/vector_io/test_faiss.py
@@ -5,13 +5,12 @@
 # the root directory of this source tree.

 import asyncio
-from unittest.mock import AsyncMock, MagicMock, patch
+from unittest.mock import MagicMock, patch

 import numpy as np
 import pytest

 from llama_stack.apis.files import Files
-from llama_stack.apis.inference import EmbeddingsResponse, Inference
 from llama_stack.apis.vector_dbs import VectorDB
 from llama_stack.apis.vector_io import Chunk, QueryChunksResponse
 from llama_stack.providers.datatypes import HealthStatus
@@ -70,13 +69,6 @@ def mock_vector_db(vector_db_id, embedding_dimension) -> MagicMock:
     return mock_vector_db


-@pytest.fixture
-def mock_inference_api(sample_embeddings):
-    mock_api = MagicMock(spec=Inference)
-    mock_api.embeddings = AsyncMock(return_value=EmbeddingsResponse(embeddings=sample_embeddings))
-    return mock_api
-
-
 @pytest.fixture
 def mock_files_api():
     mock_api = MagicMock(spec=Files)
@@ -96,22 +88,6 @@ async def faiss_index(embedding_dimension):
     yield index


-@pytest.fixture
-async def faiss_adapter(faiss_config, mock_inference_api, mock_files_api) -> FaissVectorIOAdapter:
-    # Create the adapter
-    adapter = FaissVectorIOAdapter(config=faiss_config, inference_api=mock_inference_api, files_api=mock_files_api)
-
-    # Create a mock KVStore
-    mock_kvstore = MagicMock()
-    mock_kvstore.values_in_range = AsyncMock(return_value=[])
-
-    # Patch the initialize method to avoid the kvstore_impl call
-    with patch.object(FaissVectorIOAdapter, "initialize"):
-        # Set the kvstore directly
-        adapter.kvstore = mock_kvstore
-        yield adapter
-
-
 async def test_faiss_query_vector_returns_infinity_when_query_and_embedding_are_identical(
     faiss_index, sample_chunks, sample_embeddings, embedding_dimension
 ):