Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-06-27 18:50:41 +00:00)
feat: New OpenAI compat embeddings API (#2314)
Some checks failed
Integration Tests / test-matrix (http, agents) (push) Failing after 9s
Integration Tests / test-matrix (http, scoring) (push) Failing after 9s
Integration Tests / test-matrix (library, inference) (push) Failing after 9s
Integration Tests / test-matrix (library, inspect) (push) Failing after 9s
Integration Tests / test-matrix (library, post_training) (push) Failing after 15s
Integration Tests / test-matrix (library, providers) (push) Failing after 14s
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 43s
Integration Tests / test-matrix (library, scoring) (push) Failing after 8s
Integration Tests / test-matrix (http, inference) (push) Failing after 46s
Integration Tests / test-matrix (library, tool_runtime) (push) Failing after 8s
Integration Tests / test-matrix (library, agents) (push) Failing after 44s
Integration Tests / test-matrix (http, inspect) (push) Failing after 47s
Integration Tests / test-matrix (http, providers) (push) Failing after 45s
Integration Tests / test-matrix (library, datasets) (push) Failing after 45s
Integration Tests / test-matrix (http, post_training) (push) Failing after 46s
Integration Tests / test-matrix (http, tool_runtime) (push) Failing after 47s
Integration Tests / test-matrix (http, datasets) (push) Failing after 49s
Test External Providers / test-external-providers (venv) (push) Failing after 6s
Update ReadTheDocs / update-readthedocs (push) Failing after 6s
Unit Tests / unit-tests (3.12) (push) Failing after 7s
Unit Tests / unit-tests (3.10) (push) Failing after 8s
Unit Tests / unit-tests (3.11) (push) Failing after 8s
Unit Tests / unit-tests (3.13) (push) Failing after 7s
Pre-commit / pre-commit (push) Successful in 1m12s
# What does this PR do?

Adds a new endpoint, `/openai/v1/embeddings`, that is compatible with the OpenAI embeddings API. Providers were added for OpenAI, LiteLLM, and SentenceTransformer.

## Test Plan

```
LLAMA_STACK_CONFIG=http://localhost:8321 pytest -sv tests/integration/inference/test_openai_embeddings.py --embedding-model all-MiniLM-L6-v2,text-embedding-3-small,gemini/text-embedding-004
```
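For illustration, here is a minimal sketch of calling the new endpoint with the official `openai` Python client. It assumes a Llama Stack server running at `http://localhost:8321` (as in the test plan above) with `all-MiniLM-L6-v2` registered as an embedding model; the base URL prefix is inferred from the `/v1/openai/v1/embeddings` route added in this PR.

```python
# Sketch: exercise the new OpenAI-compatible embeddings route with the openai client.
# Server URL, API key, and model name are assumptions for illustration.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8321/v1/openai/v1",  # OpenAI-compat prefix served by Llama Stack
    api_key="none",  # placeholder; auth depends on your deployment
)

response = client.embeddings.create(
    model="all-MiniLM-L6-v2",
    input=["Hello, world!", "Llama Stack embeddings"],
)

for item in response.data:
    print(item.index, len(item.embedding))  # each item carries an index and a float vector
print(response.usage)  # prompt_tokens / total_tokens
```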
parent 277f8690ef
commit b21050935e
21 changed files with 981 additions and 0 deletions
docs/_static/llama-stack-spec.html (vendored), 176 additions
@@ -3607,6 +3607,49 @@
        }
      }
    },
    "/v1/openai/v1/embeddings": {
      "post": {
        "responses": {
          "200": {
            "description": "An OpenAIEmbeddingsResponse containing the embeddings.",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/OpenAIEmbeddingsResponse"
                }
              }
            }
          },
          "400": {
            "$ref": "#/components/responses/BadRequest400"
          },
          "429": {
            "$ref": "#/components/responses/TooManyRequests429"
          },
          "500": {
            "$ref": "#/components/responses/InternalServerError500"
          },
          "default": {
            "$ref": "#/components/responses/DefaultError"
          }
        },
        "tags": [
          "Inference"
        ],
        "description": "Generate OpenAI-compatible embeddings for the given input using the specified model.",
        "parameters": [],
        "requestBody": {
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/OpenaiEmbeddingsRequest"
              }
            }
          },
          "required": true
        }
      }
    },
    "/v1/openai/v1/models": {
      "get": {
        "responses": {

@@ -11777,6 +11820,139 @@
      "title": "OpenAICompletionChoice",
      "description": "A choice from an OpenAI-compatible completion response."
    },
    "OpenaiEmbeddingsRequest": {
      "type": "object",
      "properties": {
        "model": {
          "type": "string",
          "description": "The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint."
        },
        "input": {
          "oneOf": [
            {
              "type": "string"
            },
            {
              "type": "array",
              "items": {
                "type": "string"
              }
            }
          ],
          "description": "Input text to embed, encoded as a string or array of strings. To embed multiple inputs in a single request, pass an array of strings."
        },
        "encoding_format": {
          "type": "string",
          "description": "(Optional) The format to return the embeddings in. Can be either \"float\" or \"base64\". Defaults to \"float\"."
        },
        "dimensions": {
          "type": "integer",
          "description": "(Optional) The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models."
        },
        "user": {
          "type": "string",
          "description": "(Optional) A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse."
        }
      },
      "additionalProperties": false,
      "required": [
        "model",
        "input"
      ],
      "title": "OpenaiEmbeddingsRequest"
    },
    "OpenAIEmbeddingData": {
      "type": "object",
      "properties": {
        "object": {
          "type": "string",
          "const": "embedding",
          "default": "embedding",
          "description": "The object type, which will be \"embedding\""
        },
        "embedding": {
          "oneOf": [
            {
              "type": "array",
              "items": {
                "type": "number"
              }
            },
            {
              "type": "string"
            }
          ],
          "description": "The embedding vector as a list of floats (when encoding_format=\"float\") or as a base64-encoded string (when encoding_format=\"base64\")"
        },
        "index": {
          "type": "integer",
          "description": "The index of the embedding in the input list"
        }
      },
      "additionalProperties": false,
      "required": [
        "object",
        "embedding",
        "index"
      ],
      "title": "OpenAIEmbeddingData",
      "description": "A single embedding data object from an OpenAI-compatible embeddings response."
    },
    "OpenAIEmbeddingUsage": {
      "type": "object",
      "properties": {
        "prompt_tokens": {
          "type": "integer",
          "description": "The number of tokens in the input"
        },
        "total_tokens": {
          "type": "integer",
          "description": "The total number of tokens used"
        }
      },
      "additionalProperties": false,
      "required": [
        "prompt_tokens",
        "total_tokens"
      ],
      "title": "OpenAIEmbeddingUsage",
      "description": "Usage information for an OpenAI-compatible embeddings response."
    },
    "OpenAIEmbeddingsResponse": {
      "type": "object",
      "properties": {
        "object": {
          "type": "string",
          "const": "list",
          "default": "list",
          "description": "The object type, which will be \"list\""
        },
        "data": {
          "type": "array",
          "items": {
            "$ref": "#/components/schemas/OpenAIEmbeddingData"
          },
          "description": "List of embedding data objects"
        },
        "model": {
          "type": "string",
          "description": "The model that was used to generate the embeddings"
        },
        "usage": {
          "$ref": "#/components/schemas/OpenAIEmbeddingUsage",
          "description": "Usage information"
        }
      },
      "additionalProperties": false,
      "required": [
        "object",
        "data",
        "model",
        "usage"
      ],
      "title": "OpenAIEmbeddingsResponse",
      "description": "Response from an OpenAI-compatible embeddings request."
    },
    "OpenAIModel": {
      "type": "object",
      "properties": {
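The request and response schemas above map directly onto OpenAI's embeddings wire format. Below is a hedged sketch of a raw request using `requests`, assuming the same local server as the test plan; the field names come from `OpenaiEmbeddingsRequest` and `OpenAIEmbeddingsResponse`, and whether a given provider honors `dimensions` depends on the underlying model.

```python
# Sketch: raw POST against the new route, using only fields defined in
# OpenaiEmbeddingsRequest. The server URL and model name are assumptions.
import requests

payload = {
    "model": "text-embedding-3-small",
    "input": "The quick brown fox",
    "encoding_format": "float",   # or "base64"
    "dimensions": 256,            # optional; only text-embedding-3 and later models support it
    "user": "example-user-123",   # optional end-user identifier
}
resp = requests.post("http://localhost:8321/v1/openai/v1/embeddings", json=payload)
resp.raise_for_status()

body = resp.json()
# Expected shape per OpenAIEmbeddingsResponse:
#   {"object": "list",
#    "data": [{"object": "embedding", "embedding": [...], "index": 0}],
#    "model": "...",
#    "usage": {"prompt_tokens": ..., "total_tokens": ...}}
print(body["model"], len(body["data"][0]["embedding"]), body["usage"])
```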
docs/_static/llama-stack-spec.yaml (vendored), 144 additions
@@ -2520,6 +2520,38 @@ paths:
            schema:
              $ref: '#/components/schemas/OpenaiCompletionRequest'
        required: true
  /v1/openai/v1/embeddings:
    post:
      responses:
        '200':
          description: >-
            An OpenAIEmbeddingsResponse containing the embeddings.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/OpenAIEmbeddingsResponse'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Inference
      description: >-
        Generate OpenAI-compatible embeddings for the given input using the specified
        model.
      parameters: []
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/OpenaiEmbeddingsRequest'
        required: true
  /v1/openai/v1/models:
    get:
      responses:

@@ -8197,6 +8229,118 @@ components:
      title: OpenAICompletionChoice
      description: >-
        A choice from an OpenAI-compatible completion response.
    OpenaiEmbeddingsRequest:
      type: object
      properties:
        model:
          type: string
          description: >-
            The identifier of the model to use. The model must be an embedding model
            registered with Llama Stack and available via the /models endpoint.
        input:
          oneOf:
            - type: string
            - type: array
              items:
                type: string
          description: >-
            Input text to embed, encoded as a string or array of strings. To embed
            multiple inputs in a single request, pass an array of strings.
        encoding_format:
          type: string
          description: >-
            (Optional) The format to return the embeddings in. Can be either "float"
            or "base64". Defaults to "float".
        dimensions:
          type: integer
          description: >-
            (Optional) The number of dimensions the resulting output embeddings should
            have. Only supported in text-embedding-3 and later models.
        user:
          type: string
          description: >-
            (Optional) A unique identifier representing your end-user, which can help
            OpenAI to monitor and detect abuse.
      additionalProperties: false
      required:
        - model
        - input
      title: OpenaiEmbeddingsRequest
    OpenAIEmbeddingData:
      type: object
      properties:
        object:
          type: string
          const: embedding
          default: embedding
          description: >-
            The object type, which will be "embedding"
        embedding:
          oneOf:
            - type: array
              items:
                type: number
            - type: string
          description: >-
            The embedding vector as a list of floats (when encoding_format="float")
            or as a base64-encoded string (when encoding_format="base64")
        index:
          type: integer
          description: >-
            The index of the embedding in the input list
      additionalProperties: false
      required:
        - object
        - embedding
        - index
      title: OpenAIEmbeddingData
      description: >-
        A single embedding data object from an OpenAI-compatible embeddings response.
    OpenAIEmbeddingUsage:
      type: object
      properties:
        prompt_tokens:
          type: integer
          description: The number of tokens in the input
        total_tokens:
          type: integer
          description: The total number of tokens used
      additionalProperties: false
      required:
        - prompt_tokens
        - total_tokens
      title: OpenAIEmbeddingUsage
      description: >-
        Usage information for an OpenAI-compatible embeddings response.
    OpenAIEmbeddingsResponse:
      type: object
      properties:
        object:
          type: string
          const: list
          default: list
          description: The object type, which will be "list"
        data:
          type: array
          items:
            $ref: '#/components/schemas/OpenAIEmbeddingData'
          description: List of embedding data objects
        model:
          type: string
          description: >-
            The model that was used to generate the embeddings
        usage:
          $ref: '#/components/schemas/OpenAIEmbeddingUsage'
          description: Usage information
      additionalProperties: false
      required:
        - object
        - data
        - model
        - usage
      title: OpenAIEmbeddingsResponse
      description: >-
        Response from an OpenAI-compatible embeddings request.
    OpenAIModel:
      type: object
      properties:
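When `encoding_format` is "base64", the `embedding` field is a string rather than a list of numbers (the `oneOf` in `OpenAIEmbeddingData`). Below is a small sketch of decoding it back into floats, assuming the common OpenAI convention of base64-encoded, little-endian float32 values; individual providers could differ.

```python
# Sketch: decode a base64-encoded embedding into a list of floats.
# Assumes packed little-endian float32 values (the OpenAI convention).
import base64
import struct

def decode_embedding(b64_embedding: str) -> list[float]:
    raw = base64.b64decode(b64_embedding)
    count = len(raw) // 4  # 4 bytes per float32
    return list(struct.unpack(f"<{count}f", raw))

# Example with a tiny synthetic vector:
encoded = base64.b64encode(struct.pack("<3f", 0.1, 0.2, 0.3)).decode()
print(decode_embedding(encoded))  # approximately [0.1, 0.2, 0.3]
```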