diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 638f7bb7b..fab7c802e 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -4944,6 +4944,27 @@
}
],
"description": "List of contents to generate embeddings for. Each content can be a string or an InterleavedContentItem (and hence can be multimodal). The behavior depends on the model and provider. Some models may only support text."
+ },
+ "text_truncation": {
+ "type": "string",
+ "enum": [
+ "none",
+ "start",
+ "end"
+ ],
+ "description": "(Optional) Config for how to truncate text for embedding when text is longer than the model's max sequence length."
+ },
+ "output_dimension": {
+ "type": "integer",
+ "description": "(Optional) Output dimensionality for the embeddings. Only supported by Matryoshka models."
+ },
+ "task_type": {
+ "type": "string",
+ "enum": [
+ "query",
+ "document"
+ ],
+ "description": "(Optional) How is the embedding being used? This is only supported by asymmetric embedding models."
}
},
"additionalProperties": false,
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 08effe7cf..fc57bf258 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -3235,6 +3235,28 @@ components:
List of contents to generate embeddings for. Each content can be a string
or an InterleavedContentItem (and hence can be multimodal). The behavior
depends on the model and provider. Some models may only support text.
+ text_truncation:
+ type: string
+ enum:
+ - none
+ - start
+ - end
+ description: >-
+ (Optional) Config for how to truncate text for embedding when text is
+ longer than the model's max sequence length.
+ output_dimension:
+ type: integer
+ description: >-
+ (Optional) Output dimensionality for the embeddings. Only supported by
+ Matryoshka models.
+ task_type:
+ type: string
+ enum:
+ - query
+ - document
+ description: >-
+ (Optional) How is the embedding being used? This is only supported by
+ asymmetric embedding models.
additionalProperties: false
required:
- model_id
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index 2dfe55977..d83506dd4 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -402,6 +402,30 @@ class ModelStore(Protocol):
def get_model(self, identifier: str) -> Model: ...
+class TextTruncation(Enum):
+ """Config for how to truncate text for embedding when text is longer than the model's max sequence length. Start and End semantics depend on whether the language is left-to-right or right-to-left.
+
+ :cvar none: No truncation (default). If the text is longer than the model's max sequence length, you will get an error.
+ :cvar start: Truncate from the start
+ :cvar end: Truncate from the end
+ """
+
+ none = "none"
+ start = "start"
+ end = "end"
+
+
+class EmbeddingTaskType(Enum):
+ """How is the embedding being used? This is only supported by asymmetric embedding models.
+
+ :cvar query: Used for a query for semantic search.
+ :cvar document: Used at indexing time when ingesting documents.
+ """
+
+ query = "query"
+ document = "document"
+
+
@runtime_checkable
@trace_protocol
class Inference(Protocol):
@@ -482,11 +506,17 @@ class Inference(Protocol):
self,
model_id: str,
contents: List[str] | List[InterleavedContentItem],
+ text_truncation: Optional[TextTruncation] = TextTruncation.none,
+ output_dimension: Optional[int] = None,
+ task_type: Optional[EmbeddingTaskType] = None,
) -> EmbeddingsResponse:
"""Generate embeddings for content pieces using the specified model.
:param model_id: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint.
:param contents: List of contents to generate embeddings for. Each content can be a string or an InterleavedContentItem (and hence can be multimodal). The behavior depends on the model and provider. Some models may only support text.
+ :param text_truncation: (Optional) Config for how to truncate text for embedding when text is longer than the model's max sequence length.
+ :param output_dimension: (Optional) Output dimensionality for the embeddings. Only supported by Matryoshka models.
+ :param task_type: (Optional) How is the embedding being used? This is only supported by asymmetric embedding models.
:returns: An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}
"""
...