diff --git a/docs/static/llama-stack-spec.html b/docs/static/llama-stack-spec.html
index 9e28e0f42..7845fb068 100644
--- a/docs/static/llama-stack-spec.html
+++ b/docs/static/llama-stack-spec.html
@@ -1035,50 +1035,6 @@
]
}
},
- "/v1/inference/embeddings": {
- "post": {
- "responses": {
- "200": {
- "description": "An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}.",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/EmbeddingsResponse"
- }
- }
- }
- },
- "400": {
- "$ref": "#/components/responses/BadRequest400"
- },
- "429": {
- "$ref": "#/components/responses/TooManyRequests429"
- },
- "500": {
- "$ref": "#/components/responses/InternalServerError500"
- },
- "default": {
- "$ref": "#/components/responses/DefaultError"
- }
- },
- "tags": [
- "Inference"
- ],
- "summary": "Generate embeddings for content pieces using the specified model.",
- "description": "Generate embeddings for content pieces using the specified model.",
- "parameters": [],
- "requestBody": {
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/EmbeddingsRequest"
- }
- }
- },
- "required": true
- }
- }
- },
"/v1alpha/eval/benchmarks/{benchmark_id}/evaluations": {
"post": {
"responses": {
@@ -10547,80 +10503,6 @@
"title": "OpenAIDeleteResponseObject",
"description": "Response object confirming deletion of an OpenAI response."
},
- "EmbeddingsRequest": {
- "type": "object",
- "properties": {
- "model_id": {
- "type": "string",
- "description": "The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint."
- },
- "contents": {
- "oneOf": [
- {
- "type": "array",
- "items": {
- "type": "string"
- }
- },
- {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/InterleavedContentItem"
- }
- }
- ],
- "description": "List of contents to generate embeddings for. Each content can be a string or an InterleavedContentItem (and hence can be multimodal). The behavior depends on the model and provider. Some models may only support text."
- },
- "text_truncation": {
- "type": "string",
- "enum": [
- "none",
- "start",
- "end"
- ],
- "description": "(Optional) Config for how to truncate text for embedding when text is longer than the model's max sequence length."
- },
- "output_dimension": {
- "type": "integer",
- "description": "(Optional) Output dimensionality for the embeddings. Only supported by Matryoshka models."
- },
- "task_type": {
- "type": "string",
- "enum": [
- "query",
- "document"
- ],
- "description": "(Optional) How is the embedding being used? This is only supported by asymmetric embedding models."
- }
- },
- "additionalProperties": false,
- "required": [
- "model_id",
- "contents"
- ],
- "title": "EmbeddingsRequest"
- },
- "EmbeddingsResponse": {
- "type": "object",
- "properties": {
- "embeddings": {
- "type": "array",
- "items": {
- "type": "array",
- "items": {
- "type": "number"
- }
- },
- "description": "List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}"
- }
- },
- "additionalProperties": false,
- "required": [
- "embeddings"
- ],
- "title": "EmbeddingsResponse",
- "description": "Response containing generated embeddings."
- },
"AgentCandidate": {
"type": "object",
"properties": {
diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml
index 1c06c74a5..8cbbccaa2 100644
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@@ -720,41 +720,6 @@ paths:
required: true
schema:
type: string
- /v1/inference/embeddings:
- post:
- responses:
- '200':
- description: >-
- An array of embeddings, one for each content. Each embedding is a list
- of floats. The dimensionality of the embedding is model-specific; you
- can check model metadata using /models/{model_id}.
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/EmbeddingsResponse'
- '400':
- $ref: '#/components/responses/BadRequest400'
- '429':
- $ref: >-
- #/components/responses/TooManyRequests429
- '500':
- $ref: >-
- #/components/responses/InternalServerError500
- default:
- $ref: '#/components/responses/DefaultError'
- tags:
- - Inference
- summary: >-
- Generate embeddings for content pieces using the specified model.
- description: >-
- Generate embeddings for content pieces using the specified model.
- parameters: []
- requestBody:
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/EmbeddingsRequest'
- required: true
/v1alpha/eval/benchmarks/{benchmark_id}/evaluations:
post:
responses:
@@ -7795,72 +7760,6 @@ components:
title: OpenAIDeleteResponseObject
description: >-
Response object confirming deletion of an OpenAI response.
- EmbeddingsRequest:
- type: object
- properties:
- model_id:
- type: string
- description: >-
- The identifier of the model to use. The model must be an embedding model
- registered with Llama Stack and available via the /models endpoint.
- contents:
- oneOf:
- - type: array
- items:
- type: string
- - type: array
- items:
- $ref: '#/components/schemas/InterleavedContentItem'
- description: >-
- List of contents to generate embeddings for. Each content can be a string
- or an InterleavedContentItem (and hence can be multimodal). The behavior
- depends on the model and provider. Some models may only support text.
- text_truncation:
- type: string
- enum:
- - none
- - start
- - end
- description: >-
- (Optional) Config for how to truncate text for embedding when text is
- longer than the model's max sequence length.
- output_dimension:
- type: integer
- description: >-
- (Optional) Output dimensionality for the embeddings. Only supported by
- Matryoshka models.
- task_type:
- type: string
- enum:
- - query
- - document
- description: >-
- (Optional) How is the embedding being used? This is only supported by
- asymmetric embedding models.
- additionalProperties: false
- required:
- - model_id
- - contents
- title: EmbeddingsRequest
- EmbeddingsResponse:
- type: object
- properties:
- embeddings:
- type: array
- items:
- type: array
- items:
- type: number
- description: >-
- List of embedding vectors, one per input content. Each embedding is a
- list of floats. The dimensionality of the embedding is model-specific;
- you can check model metadata using /models/{model_id}
- additionalProperties: false
- required:
- - embeddings
- title: EmbeddingsResponse
- description: >-
- Response containing generated embeddings.
AgentCandidate:
type: object
properties:
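
Both generated spec files drop the legacy /v1/inference/embeddings route along with its EmbeddingsRequest and EmbeddingsResponse schemas; the OpenAI-compatible embeddings endpoint is the surviving path. A minimal sketch of the replacement call, assuming a local Llama Stack server and the stock OpenAI Python client (the base URL prefix and model id below are assumptions, not taken from this diff):

from openai import OpenAI

# Point the standard OpenAI client at the stack's OpenAI-compatible surface.
# The exact base path depends on the server version; /v1/openai/v1 is assumed here.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="not-needed")

resp = client.embeddings.create(
    model="all-MiniLM-L6-v2",                      # any registered embedding model
    input=["first document", "second document"],   # str or list[str], per the OpenAI schema
)
vectors = [item.embedding for item in resp.data]   # one list[float] per input
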
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index 756896796..c6a4e4f60 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -17,7 +17,7 @@ from typing import (
from pydantic import BaseModel, Field, field_validator
from typing_extensions import TypedDict
-from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent, InterleavedContentItem
+from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent
from llama_stack.apis.common.responses import Order
from llama_stack.apis.models import Model
from llama_stack.apis.telemetry import MetricResponseMixin
@@ -1070,26 +1070,6 @@ class InferenceProvider(Protocol):
"""
...
- @webmethod(route="/inference/embeddings", method="POST", level=LLAMA_STACK_API_V1)
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- """Generate embeddings for content pieces using the specified model.
-
- :param model_id: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint.
- :param contents: List of contents to generate embeddings for. Each content can be a string or an InterleavedContentItem (and hence can be multimodal). The behavior depends on the model and provider. Some models may only support text.
- :param output_dimension: (Optional) Output dimensionality for the embeddings. Only supported by Matryoshka models.
- :param text_truncation: (Optional) Config for how to truncate text for embedding when text is longer than the model's max sequence length.
- :param task_type: (Optional) How is the embedding being used? This is only supported by asymmetric embedding models.
- :returns: An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}.
- """
- ...
-
@webmethod(route="/inference/rerank", method="POST", experimental=True, level=LLAMA_STACK_API_V1)
async def rerank(
self,
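
Removing embeddings() from InferenceProvider leaves the OpenAI-compatible method as the only embeddings surface providers must implement. A rough sketch of that remaining signature, modeled on the OpenAI embeddings API (parameter names follow the OpenAI schema and may not match inference.py exactly):

# Sketch of the surviving provider method; not copied from inference.py.
async def openai_embeddings(
    self,
    model: str,                        # provider-registered model identifier
    input: str | list[str],            # text(s) to embed
    encoding_format: str | None = "float",
    dimensions: int | None = None,     # optional output dimensionality
    user: str | None = None,
) -> OpenAIEmbeddingsResponse:
    ...
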
diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py
index fcf01a9c4..80f47fb5d 100644
--- a/llama_stack/core/routers/inference.py
+++ b/llama_stack/core/routers/inference.py
@@ -16,7 +16,6 @@ from pydantic import Field, TypeAdapter
from llama_stack.apis.common.content_types import (
InterleavedContent,
- InterleavedContentItem,
)
from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError
from llama_stack.apis.inference import (
@@ -26,8 +25,6 @@ from llama_stack.apis.inference import (
CompletionMessage,
CompletionResponse,
CompletionResponseStreamChunk,
- EmbeddingsResponse,
- EmbeddingTaskType,
Inference,
ListOpenAIChatCompletionResponse,
LogProbConfig,
@@ -48,7 +45,6 @@ from llama_stack.apis.inference import (
ResponseFormat,
SamplingParams,
StopReason,
- TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@@ -312,25 +308,6 @@ class InferenceRouter(Inference):
return response
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- logger.debug(f"InferenceRouter.embeddings: {model_id}")
- await self._get_model(model_id, ModelType.embedding)
- provider = await self.routing_table.get_provider_impl(model_id)
- return await provider.embeddings(
- model_id=model_id,
- contents=contents,
- text_truncation=text_truncation,
- output_dimension=output_dimension,
- task_type=task_type,
- )
-
async def openai_completion(
self,
model: str,
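
The deleted router method resolved the model, checked that it was registered as an embedding model, and forwarded the call to the owning provider. The same pattern applies to the OpenAI-compatible path; a sketch assuming the helper names visible in the removed code (_get_model, routing_table.get_provider_impl), not the file's actual implementation:

# Sketch only; mirrors the routing logic of the removed embeddings() method.
async def openai_embeddings(self, model: str, input, **kwargs):
    await self._get_model(model, ModelType.embedding)            # reject non-embedding models
    provider = await self.routing_table.get_provider_impl(model)
    return await provider.openai_embeddings(model=model, input=input, **kwargs)
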
diff --git a/llama_stack/providers/remote/inference/bedrock/bedrock.py b/llama_stack/providers/remote/inference/bedrock/bedrock.py
index 29b935bbd..2206aa641 100644
--- a/llama_stack/providers/remote/inference/bedrock/bedrock.py
+++ b/llama_stack/providers/remote/inference/bedrock/bedrock.py
@@ -11,21 +11,17 @@ from botocore.client import BaseClient
from llama_stack.apis.common.content_types import (
InterleavedContent,
- InterleavedContentItem,
)
from llama_stack.apis.inference import (
ChatCompletionRequest,
ChatCompletionResponse,
ChatCompletionResponseStreamChunk,
- EmbeddingsResponse,
- EmbeddingTaskType,
Inference,
LogProbConfig,
Message,
OpenAIEmbeddingsResponse,
ResponseFormat,
SamplingParams,
- TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@@ -47,8 +43,6 @@ from llama_stack.providers.utils.inference.openai_compat import (
)
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_prompt,
- content_has_media,
- interleaved_content_as_str,
)
from .models import MODEL_ENTRIES
@@ -218,36 +212,6 @@ class BedrockInferenceAdapter(
),
}
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- model = await self.model_store.get_model(model_id)
-
- # Convert foundation model ID to inference profile ID
- region_name = self.client.meta.region_name
- inference_profile_id = _to_inference_profile_id(model.provider_resource_id, region_name)
-
- embeddings = []
- for content in contents:
- assert not content_has_media(content), "Bedrock does not support media for embeddings"
- input_text = interleaved_content_as_str(content)
- input_body = {"inputText": input_text}
- body = json.dumps(input_body)
- response = self.client.invoke_model(
- body=body,
- modelId=inference_profile_id,
- accept="application/json",
- contentType="application/json",
- )
- response_body = json.loads(response.get("body").read())
- embeddings.append(response_body.get("embedding"))
- return EmbeddingsResponse(embeddings=embeddings)
-
async def openai_embeddings(
self,
model: str,
diff --git a/llama_stack/providers/remote/inference/cerebras/cerebras.py b/llama_stack/providers/remote/inference/cerebras/cerebras.py
index 6662f004d..6be39fa5d 100644
--- a/llama_stack/providers/remote/inference/cerebras/cerebras.py
+++ b/llama_stack/providers/remote/inference/cerebras/cerebras.py
@@ -11,21 +11,17 @@ from cerebras.cloud.sdk import AsyncCerebras
from llama_stack.apis.common.content_types import (
InterleavedContent,
- InterleavedContentItem,
)
from llama_stack.apis.inference import (
ChatCompletionRequest,
CompletionRequest,
CompletionResponse,
- EmbeddingsResponse,
- EmbeddingTaskType,
Inference,
LogProbConfig,
Message,
OpenAIEmbeddingsResponse,
ResponseFormat,
SamplingParams,
- TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@@ -187,16 +183,6 @@ class CerebrasInferenceAdapter(
**get_sampling_options(request.sampling_params),
}
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- raise NotImplementedError()
-
async def openai_embeddings(
self,
model: str,
diff --git a/llama_stack/providers/remote/inference/databricks/databricks.py b/llama_stack/providers/remote/inference/databricks/databricks.py
index 6eac6e4f4..d85b477f5 100644
--- a/llama_stack/providers/remote/inference/databricks/databricks.py
+++ b/llama_stack/providers/remote/inference/databricks/databricks.py
@@ -11,15 +11,12 @@ from databricks.sdk import WorkspaceClient
from llama_stack.apis.common.content_types import (
InterleavedContent,
- InterleavedContentItem,
)
from llama_stack.apis.inference import (
ChatCompletionResponse,
ChatCompletionResponseStreamChunk,
CompletionResponse,
CompletionResponseStreamChunk,
- EmbeddingsResponse,
- EmbeddingTaskType,
Inference,
LogProbConfig,
Message,
@@ -27,7 +24,6 @@ from llama_stack.apis.inference import (
OpenAICompletion,
ResponseFormat,
SamplingParams,
- TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@@ -118,16 +114,6 @@ class DatabricksInferenceAdapter(
) -> ChatCompletionResponse | AsyncIterator[ChatCompletionResponseStreamChunk]:
raise NotImplementedError()
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- raise NotImplementedError()
-
async def list_models(self) -> list[Model] | None:
self._model_cache = {} # from OpenAIMixin
ws_client = WorkspaceClient(host=self.config.url, token=self.get_api_key()) # TODO: this is not async
diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py
index 069a0a674..ed4b56fad 100644
--- a/llama_stack/providers/remote/inference/fireworks/fireworks.py
+++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py
@@ -10,22 +10,18 @@ from fireworks.client import Fireworks
from llama_stack.apis.common.content_types import (
InterleavedContent,
- InterleavedContentItem,
)
from llama_stack.apis.inference import (
ChatCompletionRequest,
ChatCompletionResponse,
CompletionRequest,
CompletionResponse,
- EmbeddingsResponse,
- EmbeddingTaskType,
Inference,
LogProbConfig,
Message,
ResponseFormat,
ResponseFormatType,
SamplingParams,
- TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@@ -48,8 +44,6 @@ from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_prompt,
completion_request_to_prompt,
- content_has_media,
- interleaved_content_as_str,
request_has_media,
)
@@ -259,28 +253,3 @@ class FireworksInferenceAdapter(OpenAIMixin, ModelRegistryHelper, Inference, Nee
logger.debug(f"params to fireworks: {params}")
return params
-
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- model = await self.model_store.get_model(model_id)
-
- kwargs = {}
- if model.metadata.get("embedding_dimension"):
- kwargs["dimensions"] = model.metadata.get("embedding_dimension")
- assert all(not content_has_media(content) for content in contents), (
- "Fireworks does not support media for embeddings"
- )
- response = self._get_client().embeddings.create(
- model=model.provider_resource_id,
- input=[interleaved_content_as_str(content) for content in contents],
- **kwargs,
- )
-
- embeddings = [data.embedding for data in response.data]
- return EmbeddingsResponse(embeddings=embeddings)
diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py
index 92094a0f3..a31981adb 100644
--- a/llama_stack/providers/remote/inference/nvidia/nvidia.py
+++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py
@@ -11,8 +11,6 @@ from openai import NOT_GIVEN, APIConnectionError
from llama_stack.apis.common.content_types import (
InterleavedContent,
- InterleavedContentItem,
- TextContentItem,
)
from llama_stack.apis.inference import (
ChatCompletionRequest,
@@ -21,8 +19,6 @@ from llama_stack.apis.inference import (
CompletionRequest,
CompletionResponse,
CompletionResponseStreamChunk,
- EmbeddingsResponse,
- EmbeddingTaskType,
Inference,
LogProbConfig,
Message,
@@ -31,7 +27,6 @@ from llama_stack.apis.inference import (
OpenAIEmbeddingUsage,
ResponseFormat,
SamplingParams,
- TextTruncation,
ToolChoice,
ToolConfig,
)
@@ -156,60 +151,6 @@ class NVIDIAInferenceAdapter(OpenAIMixin, Inference):
# we pass n=1 to get only one completion
return convert_openai_completion_choice(response.choices[0])
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- if any(content_has_media(content) for content in contents):
- raise NotImplementedError("Media is not supported")
-
- #
- # Llama Stack: contents = list[str] | list[InterleavedContentItem]
- # ->
- # OpenAI: input = str | list[str]
- #
- # we can ignore str and always pass list[str] to OpenAI
- #
- flat_contents = [content.text if isinstance(content, TextContentItem) else content for content in contents]
- input = [content.text if isinstance(content, TextContentItem) else content for content in flat_contents]
- provider_model_id = await self._get_provider_model_id(model_id)
-
- extra_body = {}
-
- if text_truncation is not None:
- text_truncation_options = {
- TextTruncation.none: "NONE",
- TextTruncation.end: "END",
- TextTruncation.start: "START",
- }
- extra_body["truncate"] = text_truncation_options[text_truncation]
-
- if output_dimension is not None:
- extra_body["dimensions"] = output_dimension
-
- if task_type is not None:
- task_type_options = {
- EmbeddingTaskType.document: "passage",
- EmbeddingTaskType.query: "query",
- }
- extra_body["input_type"] = task_type_options[task_type]
-
- response = await self.client.embeddings.create(
- model=provider_model_id,
- input=input,
- extra_body=extra_body,
- )
- #
- # OpenAI: CreateEmbeddingResponse(data=[Embedding(embedding=list[float], ...)], ...)
- # ->
- # Llama Stack: EmbeddingsResponse(embeddings=list[list[float]])
- #
- return EmbeddingsResponse(embeddings=[embedding.embedding for embedding in response.data])
-
async def openai_embeddings(
self,
model: str,
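
The removed NVIDIA adapter translated text_truncation, output_dimension, and task_type into provider-specific request fields (truncate, dimensions, input_type). Callers that depended on those options can still reach them through the OpenAI client's extra_body passthrough; a hedged sketch reusing the value mappings from the deleted code, where client is assumed to be an AsyncOpenAI instance pointed at the NIM endpoint:

# Sketch: forwarding the NVIDIA-specific knobs the removed embeddings() used to map.
resp = await client.embeddings.create(
    model=provider_model_id,
    input=["how do I reset my password?"],
    dimensions=384,                  # was output_dimension
    extra_body={
        "truncate": "END",           # was text_truncation=TextTruncation.end
        "input_type": "query",       # was task_type=EmbeddingTaskType.query
    },
)
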
diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py
index 3fb10445f..16b104fb5 100644
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@@ -14,7 +14,6 @@ from ollama import AsyncClient as AsyncOllamaClient
from llama_stack.apis.common.content_types import (
ImageContentItem,
InterleavedContent,
- InterleavedContentItem,
TextContentItem,
)
from llama_stack.apis.common.errors import UnsupportedModelError
@@ -25,8 +24,6 @@ from llama_stack.apis.inference import (
CompletionRequest,
CompletionResponse,
CompletionResponseStreamChunk,
- EmbeddingsResponse,
- EmbeddingTaskType,
GrammarResponseFormat,
InferenceProvider,
JsonSchemaResponseFormat,
@@ -34,7 +31,6 @@ from llama_stack.apis.inference import (
Message,
ResponseFormat,
SamplingParams,
- TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@@ -66,9 +62,7 @@ from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_prompt,
completion_request_to_prompt,
- content_has_media,
convert_image_content_to_url,
- interleaved_content_as_str,
request_has_media,
)
@@ -363,27 +357,6 @@ class OllamaInferenceAdapter(
async for chunk in process_chat_completion_stream_response(stream, request):
yield chunk
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- model = await self._get_model(model_id)
-
- assert all(not content_has_media(content) for content in contents), (
- "Ollama does not support media for embeddings"
- )
- response = await self.ollama_client.embed(
- model=model.provider_resource_id,
- input=[interleaved_content_as_str(content) for content in contents],
- )
- embeddings = response["embeddings"]
-
- return EmbeddingsResponse(embeddings=embeddings)
-
async def register_model(self, model: Model) -> Model:
if await self.check_model_availability(model.provider_model_id):
return model
diff --git a/llama_stack/providers/remote/inference/passthrough/passthrough.py b/llama_stack/providers/remote/inference/passthrough/passthrough.py
index a2bdf0369..ae482b7b0 100644
--- a/llama_stack/providers/remote/inference/passthrough/passthrough.py
+++ b/llama_stack/providers/remote/inference/passthrough/passthrough.py
@@ -14,8 +14,6 @@ from llama_stack.apis.inference import (
ChatCompletionResponse,
ChatCompletionResponseStreamChunk,
CompletionMessage,
- EmbeddingsResponse,
- EmbeddingTaskType,
Inference,
LogProbConfig,
Message,
@@ -27,7 +25,6 @@ from llama_stack.apis.inference import (
OpenAIResponseFormatParam,
ResponseFormat,
SamplingParams,
- TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@@ -190,25 +187,6 @@ class PassthroughInferenceAdapter(Inference):
chunk = convert_to_pydantic(ChatCompletionResponseStreamChunk, chunk)
yield chunk
- async def embeddings(
- self,
- model_id: str,
- contents: list[InterleavedContent],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- client = self._get_client()
- model = await self.model_store.get_model(model_id)
-
- return await client.inference.embeddings(
- model_id=model.provider_resource_id,
- contents=contents,
- text_truncation=text_truncation,
- output_dimension=output_dimension,
- task_type=task_type,
- )
-
async def openai_embeddings(
self,
model: str,
diff --git a/llama_stack/providers/remote/inference/runpod/runpod.py b/llama_stack/providers/remote/inference/runpod/runpod.py
index ff2fe6401..82252b04d 100644
--- a/llama_stack/providers/remote/inference/runpod/runpod.py
+++ b/llama_stack/providers/remote/inference/runpod/runpod.py
@@ -136,16 +136,6 @@ class RunpodInferenceAdapter(
**get_sampling_options(request.sampling_params),
}
- async def embeddings(
- self,
- model: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- raise NotImplementedError()
-
async def openai_embeddings(
self,
model: str,
diff --git a/llama_stack/providers/remote/inference/tgi/tgi.py b/llama_stack/providers/remote/inference/tgi/tgi.py
index 27597900f..e1632e4a0 100644
--- a/llama_stack/providers/remote/inference/tgi/tgi.py
+++ b/llama_stack/providers/remote/inference/tgi/tgi.py
@@ -12,14 +12,11 @@ from pydantic import SecretStr
from llama_stack.apis.common.content_types import (
InterleavedContent,
- InterleavedContentItem,
)
from llama_stack.apis.inference import (
ChatCompletionRequest,
ChatCompletionResponse,
CompletionRequest,
- EmbeddingsResponse,
- EmbeddingTaskType,
Inference,
LogProbConfig,
Message,
@@ -27,7 +24,6 @@ from llama_stack.apis.inference import (
ResponseFormat,
ResponseFormatType,
SamplingParams,
- TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@@ -306,16 +302,6 @@ class _HfAdapter(
**self._build_options(request.sampling_params, request.response_format),
)
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- raise NotImplementedError()
-
async def openai_embeddings(
self,
model: str,
diff --git a/llama_stack/providers/remote/inference/together/together.py b/llama_stack/providers/remote/inference/together/together.py
index c199677be..083c528bb 100644
--- a/llama_stack/providers/remote/inference/together/together.py
+++ b/llama_stack/providers/remote/inference/together/together.py
@@ -12,14 +12,11 @@ from together.constants import BASE_URL
from llama_stack.apis.common.content_types import (
InterleavedContent,
- InterleavedContentItem,
)
from llama_stack.apis.inference import (
ChatCompletionRequest,
ChatCompletionResponse,
CompletionRequest,
- EmbeddingsResponse,
- EmbeddingTaskType,
Inference,
LogProbConfig,
Message,
@@ -27,7 +24,6 @@ from llama_stack.apis.inference import (
ResponseFormat,
ResponseFormatType,
SamplingParams,
- TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@@ -50,8 +46,6 @@ from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_prompt,
completion_request_to_prompt,
- content_has_media,
- interleaved_content_as_str,
request_has_media,
)
@@ -247,26 +241,6 @@ class TogetherInferenceAdapter(OpenAIMixin, ModelRegistryHelper, Inference, Need
logger.debug(f"params to together: {params}")
return params
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- model = await self.model_store.get_model(model_id)
- assert all(not content_has_media(content) for content in contents), (
- "Together does not support media for embeddings"
- )
- client = self._get_client()
- r = await client.embeddings.create(
- model=model.provider_resource_id,
- input=[interleaved_content_as_str(content) for content in contents],
- )
- embeddings = [item.embedding for item in r.data]
- return EmbeddingsResponse(embeddings=embeddings)
-
async def list_models(self) -> list[Model] | None:
self._model_cache = {}
# Together's /v1/models is not compatible with OpenAI's /v1/models. Together support ticket #13355 -> will not fix, use Together's own client
diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index 8fbb4b815..bef5cbf2c 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -16,7 +16,6 @@ from openai.types.chat.chat_completion_chunk import (
from llama_stack.apis.common.content_types import (
InterleavedContent,
- InterleavedContentItem,
TextDelta,
ToolCallDelta,
ToolCallParseStatus,
@@ -31,8 +30,6 @@ from llama_stack.apis.inference import (
CompletionRequest,
CompletionResponse,
CompletionResponseStreamChunk,
- EmbeddingsResponse,
- EmbeddingTaskType,
GrammarResponseFormat,
Inference,
JsonSchemaResponseFormat,
@@ -41,7 +38,6 @@ from llama_stack.apis.inference import (
ModelStore,
ResponseFormat,
SamplingParams,
- TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@@ -74,8 +70,6 @@ from llama_stack.providers.utils.inference.openai_compat import (
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import (
completion_request_to_prompt,
- content_has_media,
- interleaved_content_as_str,
request_has_media,
)
@@ -550,27 +544,3 @@ class VLLMInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin, Inference, ModelsPro
"stream": request.stream,
**options,
}
-
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- model = await self._get_model(model_id)
-
- kwargs = {}
- assert model.model_type == ModelType.embedding
- assert model.metadata.get("embedding_dimension")
- kwargs["dimensions"] = model.metadata.get("embedding_dimension")
- assert all(not content_has_media(content) for content in contents), "VLLM does not support media for embeddings"
- response = await self.client.embeddings.create(
- model=model.provider_resource_id,
- input=[interleaved_content_as_str(content) for content in contents],
- **kwargs,
- )
-
- embeddings = [data.embedding for data in response.data]
- return EmbeddingsResponse(embeddings=embeddings)
diff --git a/llama_stack/providers/remote/inference/watsonx/watsonx.py b/llama_stack/providers/remote/inference/watsonx/watsonx.py
index cb8b45565..00b9acc06 100644
--- a/llama_stack/providers/remote/inference/watsonx/watsonx.py
+++ b/llama_stack/providers/remote/inference/watsonx/watsonx.py
@@ -11,13 +11,11 @@ from ibm_watsonx_ai.foundation_models import Model
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
from openai import AsyncOpenAI
-from llama_stack.apis.common.content_types import InterleavedContent, InterleavedContentItem
+from llama_stack.apis.common.content_types import InterleavedContent
from llama_stack.apis.inference import (
ChatCompletionRequest,
ChatCompletionResponse,
CompletionRequest,
- EmbeddingsResponse,
- EmbeddingTaskType,
GreedySamplingStrategy,
Inference,
LogProbConfig,
@@ -30,7 +28,6 @@ from llama_stack.apis.inference import (
OpenAIResponseFormatParam,
ResponseFormat,
SamplingParams,
- TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@@ -265,16 +262,6 @@ class WatsonXInferenceAdapter(Inference, ModelRegistryHelper):
}
return params
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- raise NotImplementedError("embedding is not supported for watsonx")
-
async def openai_embeddings(
self,
model: str,
diff --git a/llama_stack/providers/utils/inference/embedding_mixin.py b/llama_stack/providers/utils/inference/embedding_mixin.py
index 9bd0aa8ce..facc59f65 100644
--- a/llama_stack/providers/utils/inference/embedding_mixin.py
+++ b/llama_stack/providers/utils/inference/embedding_mixin.py
@@ -15,16 +15,11 @@ if TYPE_CHECKING:
from sentence_transformers import SentenceTransformer
from llama_stack.apis.inference import (
- EmbeddingsResponse,
- EmbeddingTaskType,
- InterleavedContentItem,
ModelStore,
OpenAIEmbeddingData,
OpenAIEmbeddingsResponse,
OpenAIEmbeddingUsage,
- TextTruncation,
)
-from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
EMBEDDING_MODELS = {}
@@ -35,23 +30,6 @@ log = get_logger(name=__name__, category="providers::utils")
class SentenceTransformerEmbeddingMixin:
model_store: ModelStore
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- model = await self.model_store.get_model(model_id)
- embedding_model = await self._load_sentence_transformer_model(model.provider_resource_id)
- embeddings = await asyncio.to_thread(
- embedding_model.encode,
- [interleaved_content_as_str(content) for content in contents],
- show_progress_bar=False,
- )
- return EmbeddingsResponse(embeddings=embeddings)
-
async def openai_embeddings(
self,
model: str,
diff --git a/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
index b1e38f323..966081e9f 100644
--- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py
+++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
@@ -11,14 +11,11 @@ import litellm
from llama_stack.apis.common.content_types import (
InterleavedContent,
- InterleavedContentItem,
)
from llama_stack.apis.inference import (
ChatCompletionRequest,
ChatCompletionResponse,
ChatCompletionResponseStreamChunk,
- EmbeddingsResponse,
- EmbeddingTaskType,
InferenceProvider,
JsonSchemaResponseFormat,
LogProbConfig,
@@ -32,7 +29,6 @@ from llama_stack.apis.inference import (
OpenAIResponseFormatParam,
ResponseFormat,
SamplingParams,
- TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@@ -50,9 +46,6 @@ from llama_stack.providers.utils.inference.openai_compat import (
get_sampling_options,
prepare_openai_completion_params,
)
-from llama_stack.providers.utils.inference.prompt_adapter import (
- interleaved_content_as_str,
-)
logger = get_logger(name=__name__, category="providers::utils")
@@ -269,24 +262,6 @@ class LiteLLMOpenAIMixin(
)
return api_key
- async def embeddings(
- self,
- model_id: str,
- contents: list[str] | list[InterleavedContentItem],
- text_truncation: TextTruncation | None = TextTruncation.none,
- output_dimension: int | None = None,
- task_type: EmbeddingTaskType | None = None,
- ) -> EmbeddingsResponse:
- model = await self.model_store.get_model(model_id)
-
- response = litellm.embedding(
- model=self.get_litellm_model_name(model.provider_resource_id),
- input=[interleaved_content_as_str(content) for content in contents],
- )
-
- embeddings = [data["embedding"] for data in response["data"]]
- return EmbeddingsResponse(embeddings=embeddings)
-
async def openai_embeddings(
self,
model: str,
diff --git a/tests/unit/providers/vector_io/test_faiss.py b/tests/unit/providers/vector_io/test_faiss.py
index 90108d7a0..9ee5c82f4 100644
--- a/tests/unit/providers/vector_io/test_faiss.py
+++ b/tests/unit/providers/vector_io/test_faiss.py
@@ -5,13 +5,12 @@
# the root directory of this source tree.
import asyncio
-from unittest.mock import AsyncMock, MagicMock, patch
+from unittest.mock import MagicMock, patch
import numpy as np
import pytest
from llama_stack.apis.files import Files
-from llama_stack.apis.inference import EmbeddingsResponse, Inference
from llama_stack.apis.vector_dbs import VectorDB
from llama_stack.apis.vector_io import Chunk, QueryChunksResponse
from llama_stack.providers.datatypes import HealthStatus
@@ -70,13 +69,6 @@ def mock_vector_db(vector_db_id, embedding_dimension) -> MagicMock:
return mock_vector_db
-@pytest.fixture
-def mock_inference_api(sample_embeddings):
- mock_api = MagicMock(spec=Inference)
- mock_api.embeddings = AsyncMock(return_value=EmbeddingsResponse(embeddings=sample_embeddings))
- return mock_api
-
-
@pytest.fixture
def mock_files_api():
mock_api = MagicMock(spec=Files)
@@ -96,22 +88,6 @@ async def faiss_index(embedding_dimension):
yield index
-@pytest.fixture
-async def faiss_adapter(faiss_config, mock_inference_api, mock_files_api) -> FaissVectorIOAdapter:
- # Create the adapter
- adapter = FaissVectorIOAdapter(config=faiss_config, inference_api=mock_inference_api, files_api=mock_files_api)
-
- # Create a mock KVStore
- mock_kvstore = MagicMock()
- mock_kvstore.values_in_range = AsyncMock(return_value=[])
-
- # Patch the initialize method to avoid the kvstore_impl call
- with patch.object(FaissVectorIOAdapter, "initialize"):
- # Set the kvstore directly
- adapter.kvstore = mock_kvstore
- yield adapter
-
-
async def test_faiss_query_vector_returns_infinity_when_query_and_embedding_are_identical(
faiss_index, sample_chunks, sample_embeddings, embedding_dimension
):
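
The deleted fixtures stubbed the legacy Inference.embeddings call for the Faiss adapter tests. If an equivalent stub were ever needed against the OpenAI-compatible surface, a hypothetical sketch could look like the following (fixture name, imports, and response construction are assumptions and are not part of this change):

# Hypothetical replacement fixture, not part of this diff.
from unittest.mock import AsyncMock, MagicMock

from llama_stack.apis.inference import (
    Inference,
    OpenAIEmbeddingData,
    OpenAIEmbeddingsResponse,
    OpenAIEmbeddingUsage,
)


@pytest.fixture
def mock_inference_api(sample_embeddings):
    mock_api = MagicMock(spec=Inference)
    mock_api.openai_embeddings = AsyncMock(
        return_value=OpenAIEmbeddingsResponse(
            data=[
                OpenAIEmbeddingData(embedding=list(emb), index=i)
                for i, emb in enumerate(sample_embeddings)
            ],
            model="test-embedding-model",
            usage=OpenAIEmbeddingUsage(prompt_tokens=0, total_tokens=0),
        )
    )
    return mock_api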