chore(api): remove deprecated embeddings impls (#3301)

# What does this PR do? remove deprecated embeddings implementations
2025-12-03 09:53:45 +00:00 · 2025-09-29 14:45:09 -04:00 · 2025-09-29 14:45:09 -04:00 · 975ead1d6a
commit 975ead1d6a
parent aab22dc759
19 changed files with 3 additions and 632 deletions
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@ -17,7 +17,7 @@ from typing import (
 from pydantic import BaseModel, Field, field_validator
 from typing_extensions import TypedDict

-from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent, InterleavedContentItem
+from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent
 from llama_stack.apis.common.responses import Order
 from llama_stack.apis.models import Model
 from llama_stack.apis.telemetry import MetricResponseMixin
@ -1070,26 +1070,6 @@ class InferenceProvider(Protocol):
        """
        ...

-    @webmethod(route="/inference/embeddings", method="POST", level=LLAMA_STACK_API_V1)
-    async def embeddings(
-        self,
-        model_id: str,
-        contents: list[str] | list[InterleavedContentItem],
-        text_truncation: TextTruncation | None = TextTruncation.none,
-        output_dimension: int | None = None,
-        task_type: EmbeddingTaskType | None = None,
-    ) -> EmbeddingsResponse:
-        """Generate embeddings for content pieces using the specified model.
-
-        :param model_id: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint.
-        :param contents: List of contents to generate embeddings for. Each content can be a string or an InterleavedContentItem (and hence can be multimodal). The behavior depends on the model and provider. Some models may only support text.
-        :param output_dimension: (Optional) Output dimensionality for the embeddings. Only supported by Matryoshka models.
-        :param text_truncation: (Optional) Config for how to truncate text for embedding when text is longer than the model's max sequence length.
-        :param task_type: (Optional) How is the embedding being used? This is only supported by asymmetric embedding models.
-        :returns: An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}.
-        """
-        ...
-
    @webmethod(route="/inference/rerank", method="POST", experimental=True, level=LLAMA_STACK_API_V1)
    async def rerank(
        self,