diff --git a/docs/docs/references/python_sdk_reference/index.md b/docs/docs/references/python_sdk_reference/index.md
index b1a9396fe..e0b29363e 100644
--- a/docs/docs/references/python_sdk_reference/index.md
+++ b/docs/docs/references/python_sdk_reference/index.md
@@ -139,18 +139,7 @@ Methods:
- client.agents.turn.create(session_id, \*, agent_id, \*\*params) -> TurnCreateResponse
- client.agents.turn.retrieve(turn_id, \*, agent_id, session_id) -> Turn
-## BatchInference
-Types:
-
-```python
-from llama_stack_client.types import BatchInferenceChatCompletionResponse
-```
-
-Methods:
-
-- client.batch_inference.chat_completion(\*\*params) -> BatchInferenceChatCompletionResponse
-- client.batch_inference.completion(\*\*params) -> BatchCompletion
## Datasets
diff --git a/docs/openapi_generator/pyopenapi/generator.py b/docs/openapi_generator/pyopenapi/generator.py
index cdbf1c60c..758fe7e8f 100644
--- a/docs/openapi_generator/pyopenapi/generator.py
+++ b/docs/openapi_generator/pyopenapi/generator.py
@@ -548,7 +548,6 @@ class Generator:
if op.defining_class.__name__ in [
"SyntheticDataGeneration",
"PostTraining",
- "BatchInference",
]:
op.defining_class.__name__ = f"{op.defining_class.__name__} (Coming Soon)"
print(op.defining_class.__name__)
diff --git a/docs/static/llama-stack-spec.html b/docs/static/llama-stack-spec.html
index dc433d380..457593729 100644
--- a/docs/static/llama-stack-spec.html
+++ b/docs/static/llama-stack-spec.html
@@ -161,6 +161,55 @@
}
}
},
+ "/v1/inference/chat-completion": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "If stream=False, returns a ChatCompletionResponse with the full completion. If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk.",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/ChatCompletionResponse"
+ }
+ },
+ "text/event-stream": {
+ "schema": {
+ "$ref": "#/components/schemas/ChatCompletionResponseStreamChunk"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Inference"
+ ],
+ "summary": "Generate a chat completion for the given messages using the specified model.",
+ "description": "Generate a chat completion for the given messages using the specified model.",
+ "parameters": [],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/ChatCompletionRequest"
+ }
+ }
+ },
+ "required": true
+ }
+ }
+ },
"/v1/agents": {
"get": {
"responses": {
@@ -986,6 +1035,50 @@
]
}
},
+ "/v1/inference/embeddings": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}.",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/EmbeddingsResponse"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Inference"
+ ],
+ "summary": "Generate embeddings for content pieces using the specified model.",
+ "description": "Generate embeddings for content pieces using the specified model.",
+ "parameters": [],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/EmbeddingsRequest"
+ }
+ }
+ },
+ "required": true
+ }
+ }
+ },
"/v1alpha/eval/benchmarks/{benchmark_id}/evaluations": {
"post": {
"responses": {
@@ -6130,6 +6223,1033 @@
],
"title": "CancelTrainingJobRequest"
},
+ "CompletionMessage": {
+ "type": "object",
+ "properties": {
+ "role": {
+ "type": "string",
+ "const": "assistant",
+ "default": "assistant",
+ "description": "Must be \"assistant\" to identify this as the model's response"
+ },
+ "content": {
+ "$ref": "#/components/schemas/InterleavedContent",
+ "description": "The content of the model's response"
+ },
+ "stop_reason": {
+ "type": "string",
+ "enum": [
+ "end_of_turn",
+ "end_of_message",
+ "out_of_tokens"
+ ],
+ "description": "Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: The model finished generating the entire response. - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response. - `StopReason.out_of_tokens`: The model ran out of token budget."
+ },
+ "tool_calls": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/ToolCall"
+ },
+ "description": "List of tool calls. Each tool call is a ToolCall object."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "role",
+ "content",
+ "stop_reason"
+ ],
+ "title": "CompletionMessage",
+ "description": "A message containing the model's (assistant) response in a chat conversation."
+ },
+ "GrammarResponseFormat": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "enum": [
+ "json_schema",
+ "grammar"
+ ],
+ "description": "Must be \"grammar\" to identify this format type",
+ "const": "grammar",
+ "default": "grammar"
+ },
+ "bnf": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ },
+ "description": "The BNF grammar specification the response should conform to"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "bnf"
+ ],
+ "title": "GrammarResponseFormat",
+ "description": "Configuration for grammar-guided response generation."
+ },
+ "GreedySamplingStrategy": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "greedy",
+ "default": "greedy",
+ "description": "Must be \"greedy\" to identify this sampling strategy"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type"
+ ],
+ "title": "GreedySamplingStrategy",
+ "description": "Greedy sampling strategy that selects the highest probability token at each step."
+ },
+ "ImageContentItem": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "image",
+ "default": "image",
+ "description": "Discriminator type of the content item. Always \"image\""
+ },
+ "image": {
+ "type": "object",
+ "properties": {
+ "url": {
+ "$ref": "#/components/schemas/URL",
+ "description": "A URL of the image or data URL in the format of data:image/{type};base64,{data}. Note that URL could have length limits."
+ },
+ "data": {
+ "type": "string",
+ "contentEncoding": "base64",
+ "description": "base64 encoded image data as string"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Image as a base64 encoded string or an URL"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "image"
+ ],
+ "title": "ImageContentItem",
+ "description": "A image content item"
+ },
+ "InterleavedContent": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "$ref": "#/components/schemas/InterleavedContentItem"
+ },
+ {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/InterleavedContentItem"
+ }
+ }
+ ]
+ },
+ "InterleavedContentItem": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/ImageContentItem"
+ },
+ {
+ "$ref": "#/components/schemas/TextContentItem"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "image": "#/components/schemas/ImageContentItem",
+ "text": "#/components/schemas/TextContentItem"
+ }
+ }
+ },
+ "JsonSchemaResponseFormat": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "enum": [
+ "json_schema",
+ "grammar"
+ ],
+ "description": "Must be \"json_schema\" to identify this format type",
+ "const": "json_schema",
+ "default": "json_schema"
+ },
+ "json_schema": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ },
+ "description": "The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "json_schema"
+ ],
+ "title": "JsonSchemaResponseFormat",
+ "description": "Configuration for JSON schema-guided response generation."
+ },
+ "Message": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/UserMessage"
+ },
+ {
+ "$ref": "#/components/schemas/SystemMessage"
+ },
+ {
+ "$ref": "#/components/schemas/ToolResponseMessage"
+ },
+ {
+ "$ref": "#/components/schemas/CompletionMessage"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "role",
+ "mapping": {
+ "user": "#/components/schemas/UserMessage",
+ "system": "#/components/schemas/SystemMessage",
+ "tool": "#/components/schemas/ToolResponseMessage",
+ "assistant": "#/components/schemas/CompletionMessage"
+ }
+ }
+ },
+ "ResponseFormat": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/JsonSchemaResponseFormat"
+ },
+ {
+ "$ref": "#/components/schemas/GrammarResponseFormat"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "json_schema": "#/components/schemas/JsonSchemaResponseFormat",
+ "grammar": "#/components/schemas/GrammarResponseFormat"
+ }
+ }
+ },
+ "SamplingParams": {
+ "type": "object",
+ "properties": {
+ "strategy": {
+ "$ref": "#/components/schemas/SamplingStrategy",
+ "description": "The sampling strategy."
+ },
+ "max_tokens": {
+ "type": "integer",
+ "default": 0,
+ "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
+ },
+ "repetition_penalty": {
+ "type": "number",
+ "default": 1.0,
+ "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics."
+ },
+ "stop": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ },
+ "description": "Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "strategy"
+ ],
+ "title": "SamplingParams",
+ "description": "Sampling parameters."
+ },
+ "SamplingStrategy": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/GreedySamplingStrategy"
+ },
+ {
+ "$ref": "#/components/schemas/TopPSamplingStrategy"
+ },
+ {
+ "$ref": "#/components/schemas/TopKSamplingStrategy"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "greedy": "#/components/schemas/GreedySamplingStrategy",
+ "top_p": "#/components/schemas/TopPSamplingStrategy",
+ "top_k": "#/components/schemas/TopKSamplingStrategy"
+ }
+ }
+ },
+ "SystemMessage": {
+ "type": "object",
+ "properties": {
+ "role": {
+ "type": "string",
+ "const": "system",
+ "default": "system",
+ "description": "Must be \"system\" to identify this as a system message"
+ },
+ "content": {
+ "$ref": "#/components/schemas/InterleavedContent",
+ "description": "The content of the \"system prompt\". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions)."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "role",
+ "content"
+ ],
+ "title": "SystemMessage",
+ "description": "A system message providing instructions or context to the model."
+ },
+ "TextContentItem": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "text",
+ "default": "text",
+ "description": "Discriminator type of the content item. Always \"text\""
+ },
+ "text": {
+ "type": "string",
+ "description": "Text content"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "text"
+ ],
+ "title": "TextContentItem",
+ "description": "A text content item"
+ },
+ "ToolCall": {
+ "type": "object",
+ "properties": {
+ "call_id": {
+ "type": "string"
+ },
+ "tool_name": {
+ "oneOf": [
+ {
+ "type": "string",
+ "enum": [
+ "brave_search",
+ "wolfram_alpha",
+ "photogen",
+ "code_interpreter"
+ ],
+ "title": "BuiltinTool"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "arguments": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "null"
+ },
+ {
+ "type": "array",
+ "items": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "null"
+ }
+ ]
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "null"
+ }
+ ]
+ }
+ }
+ ]
+ }
+ }
+ ]
+ },
+ "arguments_json": {
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "call_id",
+ "tool_name",
+ "arguments"
+ ],
+ "title": "ToolCall"
+ },
+ "ToolConfig": {
+ "type": "object",
+ "properties": {
+ "tool_choice": {
+ "oneOf": [
+ {
+ "type": "string",
+ "enum": [
+ "auto",
+ "required",
+ "none"
+ ],
+ "title": "ToolChoice",
+ "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model."
+ },
+ {
+ "type": "string"
+ }
+ ],
+ "default": "auto",
+ "description": "(Optional) Whether tool use is automatic, required, or none. Can also specify a tool name to use a specific tool. Defaults to ToolChoice.auto."
+ },
+ "tool_prompt_format": {
+ "type": "string",
+ "enum": [
+ "json",
+ "function_tag",
+ "python_list"
+ ],
+ "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls."
+ },
+ "system_message_behavior": {
+ "type": "string",
+ "enum": [
+ "append",
+ "replace"
+ ],
+ "description": "(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string '{{function_definitions}}' to indicate where the function definitions should be inserted.",
+ "default": "append"
+ }
+ },
+ "additionalProperties": false,
+ "title": "ToolConfig",
+ "description": "Configuration for tool use."
+ },
+ "ToolDefinition": {
+ "type": "object",
+ "properties": {
+ "tool_name": {
+ "oneOf": [
+ {
+ "type": "string",
+ "enum": [
+ "brave_search",
+ "wolfram_alpha",
+ "photogen",
+ "code_interpreter"
+ ],
+ "title": "BuiltinTool"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "description": {
+ "type": "string"
+ },
+ "parameters": {
+ "type": "object",
+ "additionalProperties": {
+ "$ref": "#/components/schemas/ToolParamDefinition"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "tool_name"
+ ],
+ "title": "ToolDefinition"
+ },
+ "ToolParamDefinition": {
+ "type": "object",
+ "properties": {
+ "param_type": {
+ "type": "string"
+ },
+ "description": {
+ "type": "string"
+ },
+ "required": {
+ "type": "boolean",
+ "default": true
+ },
+ "default": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "param_type"
+ ],
+ "title": "ToolParamDefinition"
+ },
+ "ToolResponseMessage": {
+ "type": "object",
+ "properties": {
+ "role": {
+ "type": "string",
+ "const": "tool",
+ "default": "tool",
+ "description": "Must be \"tool\" to identify this as a tool response"
+ },
+ "call_id": {
+ "type": "string",
+ "description": "Unique identifier for the tool call this response is for"
+ },
+ "content": {
+ "$ref": "#/components/schemas/InterleavedContent",
+ "description": "The response content from the tool"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "role",
+ "call_id",
+ "content"
+ ],
+ "title": "ToolResponseMessage",
+ "description": "A message representing the result of a tool invocation."
+ },
+ "TopKSamplingStrategy": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "top_k",
+ "default": "top_k",
+ "description": "Must be \"top_k\" to identify this sampling strategy"
+ },
+ "top_k": {
+ "type": "integer",
+ "description": "Number of top tokens to consider for sampling. Must be at least 1"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "top_k"
+ ],
+ "title": "TopKSamplingStrategy",
+ "description": "Top-k sampling strategy that restricts sampling to the k most likely tokens."
+ },
+ "TopPSamplingStrategy": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "top_p",
+ "default": "top_p",
+ "description": "Must be \"top_p\" to identify this sampling strategy"
+ },
+ "temperature": {
+ "type": "number",
+ "description": "Controls randomness in sampling. Higher values increase randomness"
+ },
+ "top_p": {
+ "type": "number",
+ "default": 0.95,
+ "description": "Cumulative probability threshold for nucleus sampling. Defaults to 0.95"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type"
+ ],
+ "title": "TopPSamplingStrategy",
+ "description": "Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p."
+ },
+ "URL": {
+ "type": "object",
+ "properties": {
+ "uri": {
+ "type": "string",
+ "description": "The URL string pointing to the resource"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "uri"
+ ],
+ "title": "URL",
+ "description": "A URL reference to external content."
+ },
+ "UserMessage": {
+ "type": "object",
+ "properties": {
+ "role": {
+ "type": "string",
+ "const": "user",
+ "default": "user",
+ "description": "Must be \"user\" to identify this as a user message"
+ },
+ "content": {
+ "$ref": "#/components/schemas/InterleavedContent",
+ "description": "The content of the message, which can include text and other media"
+ },
+ "context": {
+ "$ref": "#/components/schemas/InterleavedContent",
+ "description": "(Optional) This field is used internally by Llama Stack to pass RAG context. This field may be removed in the API in the future."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "role",
+ "content"
+ ],
+ "title": "UserMessage",
+ "description": "A message from the user in a chat conversation."
+ },
+ "ChatCompletionRequest": {
+ "type": "object",
+ "properties": {
+ "model_id": {
+ "type": "string",
+ "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
+ },
+ "messages": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Message"
+ },
+ "description": "List of messages in the conversation."
+ },
+ "sampling_params": {
+ "$ref": "#/components/schemas/SamplingParams",
+ "description": "Parameters to control the sampling strategy."
+ },
+ "tools": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/ToolDefinition"
+ },
+ "description": "(Optional) List of tool definitions available to the model."
+ },
+ "tool_choice": {
+ "type": "string",
+ "enum": [
+ "auto",
+ "required",
+ "none"
+ ],
+ "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead."
+ },
+ "tool_prompt_format": {
+ "type": "string",
+ "enum": [
+ "json",
+ "function_tag",
+ "python_list"
+ ],
+ "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls. .. deprecated:: Use tool_config instead."
+ },
+ "response_format": {
+ "$ref": "#/components/schemas/ResponseFormat",
+ "description": "(Optional) Grammar specification for guided (structured) decoding. There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it."
+ },
+ "stream": {
+ "type": "boolean",
+ "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
+ },
+ "logprobs": {
+ "type": "object",
+ "properties": {
+ "top_k": {
+ "type": "integer",
+ "default": 0,
+ "description": "How many tokens (for each position) to return log probabilities for."
+ }
+ },
+ "additionalProperties": false,
+ "description": "(Optional) If specified, log probabilities for each token position will be returned."
+ },
+ "tool_config": {
+ "$ref": "#/components/schemas/ToolConfig",
+ "description": "(Optional) Configuration for tool use."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "model_id",
+ "messages"
+ ],
+ "title": "ChatCompletionRequest"
+ },
+ "ChatCompletionResponse": {
+ "type": "object",
+ "properties": {
+ "metrics": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/MetricInResponse"
+ },
+ "description": "(Optional) List of metrics associated with the API response"
+ },
+ "completion_message": {
+ "$ref": "#/components/schemas/CompletionMessage",
+ "description": "The complete response message"
+ },
+ "logprobs": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/TokenLogProbs"
+ },
+ "description": "Optional log probabilities for generated tokens"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "completion_message"
+ ],
+ "title": "ChatCompletionResponse",
+ "description": "Response from a chat completion request."
+ },
+ "MetricInResponse": {
+ "type": "object",
+ "properties": {
+ "metric": {
+ "type": "string",
+ "description": "The name of the metric"
+ },
+ "value": {
+ "oneOf": [
+ {
+ "type": "integer"
+ },
+ {
+ "type": "number"
+ }
+ ],
+ "description": "The numeric value of the metric"
+ },
+ "unit": {
+ "type": "string",
+ "description": "(Optional) The unit of measurement for the metric value"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "metric",
+ "value"
+ ],
+ "title": "MetricInResponse",
+ "description": "A metric value included in API responses."
+ },
+ "TokenLogProbs": {
+ "type": "object",
+ "properties": {
+ "logprobs_by_token": {
+ "type": "object",
+ "additionalProperties": {
+ "type": "number"
+ },
+ "description": "Dictionary mapping tokens to their log probabilities"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "logprobs_by_token"
+ ],
+ "title": "TokenLogProbs",
+ "description": "Log probabilities for generated tokens."
+ },
+ "ChatCompletionResponseEvent": {
+ "type": "object",
+ "properties": {
+ "event_type": {
+ "type": "string",
+ "enum": [
+ "start",
+ "complete",
+ "progress"
+ ],
+ "description": "Type of the event"
+ },
+ "delta": {
+ "$ref": "#/components/schemas/ContentDelta",
+ "description": "Content generated since last event. This can be one or more tokens, or a tool call."
+ },
+ "logprobs": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/TokenLogProbs"
+ },
+ "description": "Optional log probabilities for generated tokens"
+ },
+ "stop_reason": {
+ "type": "string",
+ "enum": [
+ "end_of_turn",
+ "end_of_message",
+ "out_of_tokens"
+ ],
+ "description": "Optional reason why generation stopped, if complete"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "event_type",
+ "delta"
+ ],
+ "title": "ChatCompletionResponseEvent",
+ "description": "An event during chat completion generation."
+ },
+ "ChatCompletionResponseStreamChunk": {
+ "type": "object",
+ "properties": {
+ "metrics": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/MetricInResponse"
+ },
+ "description": "(Optional) List of metrics associated with the API response"
+ },
+ "event": {
+ "$ref": "#/components/schemas/ChatCompletionResponseEvent",
+ "description": "The event containing the new content"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "event"
+ ],
+ "title": "ChatCompletionResponseStreamChunk",
+ "description": "A chunk of a streamed chat completion response."
+ },
+ "ContentDelta": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/TextDelta"
+ },
+ {
+ "$ref": "#/components/schemas/ImageDelta"
+ },
+ {
+ "$ref": "#/components/schemas/ToolCallDelta"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "text": "#/components/schemas/TextDelta",
+ "image": "#/components/schemas/ImageDelta",
+ "tool_call": "#/components/schemas/ToolCallDelta"
+ }
+ }
+ },
+ "ImageDelta": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "image",
+ "default": "image",
+ "description": "Discriminator type of the delta. Always \"image\""
+ },
+ "image": {
+ "type": "string",
+ "contentEncoding": "base64",
+ "description": "The incremental image data as bytes"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "image"
+ ],
+ "title": "ImageDelta",
+ "description": "An image content delta for streaming responses."
+ },
+ "TextDelta": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "text",
+ "default": "text",
+ "description": "Discriminator type of the delta. Always \"text\""
+ },
+ "text": {
+ "type": "string",
+ "description": "The incremental text content"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "text"
+ ],
+ "title": "TextDelta",
+ "description": "A text content delta for streaming responses."
+ },
+ "ToolCallDelta": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "tool_call",
+ "default": "tool_call",
+ "description": "Discriminator type of the delta. Always \"tool_call\""
+ },
+ "tool_call": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "$ref": "#/components/schemas/ToolCall"
+ }
+ ],
+ "description": "Either an in-progress tool call string or the final parsed tool call"
+ },
+ "parse_status": {
+ "type": "string",
+ "enum": [
+ "started",
+ "in_progress",
+ "failed",
+ "succeeded"
+ ],
+ "description": "Current parsing status of the tool call"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "tool_call",
+ "parse_status"
+ ],
+ "title": "ToolCallDelta",
+ "description": "A tool call content delta for streaming responses."
+ },
"AgentConfig": {
"type": "object",
"properties": {
@@ -6265,234 +7385,6 @@
}
]
},
- "GrammarResponseFormat": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "enum": [
- "json_schema",
- "grammar"
- ],
- "description": "Must be \"grammar\" to identify this format type",
- "const": "grammar",
- "default": "grammar"
- },
- "bnf": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- },
- "description": "The BNF grammar specification the response should conform to"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "bnf"
- ],
- "title": "GrammarResponseFormat",
- "description": "Configuration for grammar-guided response generation."
- },
- "GreedySamplingStrategy": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "greedy",
- "default": "greedy",
- "description": "Must be \"greedy\" to identify this sampling strategy"
- }
- },
- "additionalProperties": false,
- "required": [
- "type"
- ],
- "title": "GreedySamplingStrategy",
- "description": "Greedy sampling strategy that selects the highest probability token at each step."
- },
- "JsonSchemaResponseFormat": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "enum": [
- "json_schema",
- "grammar"
- ],
- "description": "Must be \"json_schema\" to identify this format type",
- "const": "json_schema",
- "default": "json_schema"
- },
- "json_schema": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- },
- "description": "The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model."
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "json_schema"
- ],
- "title": "JsonSchemaResponseFormat",
- "description": "Configuration for JSON schema-guided response generation."
- },
- "ResponseFormat": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/JsonSchemaResponseFormat"
- },
- {
- "$ref": "#/components/schemas/GrammarResponseFormat"
- }
- ],
- "discriminator": {
- "propertyName": "type",
- "mapping": {
- "json_schema": "#/components/schemas/JsonSchemaResponseFormat",
- "grammar": "#/components/schemas/GrammarResponseFormat"
- }
- }
- },
- "SamplingParams": {
- "type": "object",
- "properties": {
- "strategy": {
- "$ref": "#/components/schemas/SamplingStrategy",
- "description": "The sampling strategy."
- },
- "max_tokens": {
- "type": "integer",
- "default": 0,
- "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
- },
- "repetition_penalty": {
- "type": "number",
- "default": 1.0,
- "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics."
- },
- "stop": {
- "type": "array",
- "items": {
- "type": "string"
- },
- "description": "Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence."
- }
- },
- "additionalProperties": false,
- "required": [
- "strategy"
- ],
- "title": "SamplingParams",
- "description": "Sampling parameters."
- },
- "SamplingStrategy": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/GreedySamplingStrategy"
- },
- {
- "$ref": "#/components/schemas/TopPSamplingStrategy"
- },
- {
- "$ref": "#/components/schemas/TopKSamplingStrategy"
- }
- ],
- "discriminator": {
- "propertyName": "type",
- "mapping": {
- "greedy": "#/components/schemas/GreedySamplingStrategy",
- "top_p": "#/components/schemas/TopPSamplingStrategy",
- "top_k": "#/components/schemas/TopKSamplingStrategy"
- }
- }
- },
- "ToolConfig": {
- "type": "object",
- "properties": {
- "tool_choice": {
- "oneOf": [
- {
- "type": "string",
- "enum": [
- "auto",
- "required",
- "none"
- ],
- "title": "ToolChoice",
- "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model."
- },
- {
- "type": "string"
- }
- ],
- "default": "auto",
- "description": "(Optional) Whether tool use is automatic, required, or none. Can also specify a tool name to use a specific tool. Defaults to ToolChoice.auto."
- },
- "tool_prompt_format": {
- "type": "string",
- "enum": [
- "json",
- "function_tag",
- "python_list"
- ],
- "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls."
- },
- "system_message_behavior": {
- "type": "string",
- "enum": [
- "append",
- "replace"
- ],
- "description": "(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string '{{function_definitions}}' to indicate where the function definitions should be inserted.",
- "default": "append"
- }
- },
- "additionalProperties": false,
- "title": "ToolConfig",
- "description": "Configuration for tool use."
- },
"ToolDef": {
"type": "object",
"properties": {
@@ -6599,54 +7491,6 @@
"title": "ToolParameter",
"description": "Parameter definition for a tool."
},
- "TopKSamplingStrategy": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "top_k",
- "default": "top_k",
- "description": "Must be \"top_k\" to identify this sampling strategy"
- },
- "top_k": {
- "type": "integer",
- "description": "Number of top tokens to consider for sampling. Must be at least 1"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "top_k"
- ],
- "title": "TopKSamplingStrategy",
- "description": "Top-k sampling strategy that restricts sampling to the k most likely tokens."
- },
- "TopPSamplingStrategy": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "top_p",
- "default": "top_p",
- "description": "Must be \"top_p\" to identify this sampling strategy"
- },
- "temperature": {
- "type": "number",
- "description": "Controls randomness in sampling. Higher values increase randomness"
- },
- "top_p": {
- "type": "number",
- "default": 0.95,
- "description": "Cumulative probability threshold for nucleus sampling. Defaults to 0.95"
- }
- },
- "additionalProperties": false,
- "required": [
- "type"
- ],
- "title": "TopPSamplingStrategy",
- "description": "Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p."
- },
"CreateAgentRequest": {
"type": "object",
"properties": {
@@ -6705,163 +7549,6 @@
"title": "AgentSessionCreateResponse",
"description": "Response returned when creating a new agent session."
},
- "ImageContentItem": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "image",
- "default": "image",
- "description": "Discriminator type of the content item. Always \"image\""
- },
- "image": {
- "type": "object",
- "properties": {
- "url": {
- "$ref": "#/components/schemas/URL",
- "description": "A URL of the image or data URL in the format of data:image/{type};base64,{data}. Note that URL could have length limits."
- },
- "data": {
- "type": "string",
- "contentEncoding": "base64",
- "description": "base64 encoded image data as string"
- }
- },
- "additionalProperties": false,
- "description": "Image as a base64 encoded string or an URL"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "image"
- ],
- "title": "ImageContentItem",
- "description": "A image content item"
- },
- "InterleavedContent": {
- "oneOf": [
- {
- "type": "string"
- },
- {
- "$ref": "#/components/schemas/InterleavedContentItem"
- },
- {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/InterleavedContentItem"
- }
- }
- ]
- },
- "InterleavedContentItem": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/ImageContentItem"
- },
- {
- "$ref": "#/components/schemas/TextContentItem"
- }
- ],
- "discriminator": {
- "propertyName": "type",
- "mapping": {
- "image": "#/components/schemas/ImageContentItem",
- "text": "#/components/schemas/TextContentItem"
- }
- }
- },
- "TextContentItem": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "text",
- "default": "text",
- "description": "Discriminator type of the content item. Always \"text\""
- },
- "text": {
- "type": "string",
- "description": "Text content"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "text"
- ],
- "title": "TextContentItem",
- "description": "A text content item"
- },
- "ToolResponseMessage": {
- "type": "object",
- "properties": {
- "role": {
- "type": "string",
- "const": "tool",
- "default": "tool",
- "description": "Must be \"tool\" to identify this as a tool response"
- },
- "call_id": {
- "type": "string",
- "description": "Unique identifier for the tool call this response is for"
- },
- "content": {
- "$ref": "#/components/schemas/InterleavedContent",
- "description": "The response content from the tool"
- }
- },
- "additionalProperties": false,
- "required": [
- "role",
- "call_id",
- "content"
- ],
- "title": "ToolResponseMessage",
- "description": "A message representing the result of a tool invocation."
- },
- "URL": {
- "type": "object",
- "properties": {
- "uri": {
- "type": "string",
- "description": "The URL string pointing to the resource"
- }
- },
- "additionalProperties": false,
- "required": [
- "uri"
- ],
- "title": "URL",
- "description": "A URL reference to external content."
- },
- "UserMessage": {
- "type": "object",
- "properties": {
- "role": {
- "type": "string",
- "const": "user",
- "default": "user",
- "description": "Must be \"user\" to identify this as a user message"
- },
- "content": {
- "$ref": "#/components/schemas/InterleavedContent",
- "description": "The content of the message, which can include text and other media"
- },
- "context": {
- "$ref": "#/components/schemas/InterleavedContent",
- "description": "(Optional) This field is used internally by Llama Stack to pass RAG context. This field may be removed in the API in the future."
- }
- },
- "additionalProperties": false,
- "required": [
- "role",
- "content"
- ],
- "title": "UserMessage",
- "description": "A message from the user in a chat conversation."
- },
"CreateAgentTurnRequest": {
"type": "object",
"properties": {
@@ -6941,45 +7628,6 @@
],
"title": "CreateAgentTurnRequest"
},
- "CompletionMessage": {
- "type": "object",
- "properties": {
- "role": {
- "type": "string",
- "const": "assistant",
- "default": "assistant",
- "description": "Must be \"assistant\" to identify this as the model's response"
- },
- "content": {
- "$ref": "#/components/schemas/InterleavedContent",
- "description": "The content of the model's response"
- },
- "stop_reason": {
- "type": "string",
- "enum": [
- "end_of_turn",
- "end_of_message",
- "out_of_tokens"
- ],
- "description": "Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: The model finished generating the entire response. - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response. - `StopReason.out_of_tokens`: The model ran out of token budget."
- },
- "tool_calls": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/ToolCall"
- },
- "description": "List of tool calls. Each tool call is a ToolCall object."
- }
- },
- "additionalProperties": false,
- "required": [
- "role",
- "content",
- "stop_reason"
- ],
- "title": "CompletionMessage",
- "description": "A message containing the model's (assistant) response in a chat conversation."
- },
"InferenceStep": {
"type": "object",
"properties": {
@@ -7177,114 +7825,6 @@
"title": "ShieldCallStep",
"description": "A shield call step in an agent turn."
},
- "ToolCall": {
- "type": "object",
- "properties": {
- "call_id": {
- "type": "string"
- },
- "tool_name": {
- "oneOf": [
- {
- "type": "string",
- "enum": [
- "brave_search",
- "wolfram_alpha",
- "photogen",
- "code_interpreter"
- ],
- "title": "BuiltinTool"
- },
- {
- "type": "string"
- }
- ]
- },
- "arguments": {
- "oneOf": [
- {
- "type": "string"
- },
- {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "string"
- },
- {
- "type": "integer"
- },
- {
- "type": "number"
- },
- {
- "type": "boolean"
- },
- {
- "type": "null"
- },
- {
- "type": "array",
- "items": {
- "oneOf": [
- {
- "type": "string"
- },
- {
- "type": "integer"
- },
- {
- "type": "number"
- },
- {
- "type": "boolean"
- },
- {
- "type": "null"
- }
- ]
- }
- },
- {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "string"
- },
- {
- "type": "integer"
- },
- {
- "type": "number"
- },
- {
- "type": "boolean"
- },
- {
- "type": "null"
- }
- ]
- }
- }
- ]
- }
- }
- ]
- },
- "arguments_json": {
- "type": "string"
- }
- },
- "additionalProperties": false,
- "required": [
- "call_id",
- "tool_name",
- "arguments"
- ],
- "title": "ToolCall"
- },
"ToolExecutionStep": {
"type": "object",
"properties": {
@@ -7875,112 +8415,6 @@
"title": "AgentTurnResponseTurnStartPayload",
"description": "Payload for turn start events in agent turn responses."
},
- "ContentDelta": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/TextDelta"
- },
- {
- "$ref": "#/components/schemas/ImageDelta"
- },
- {
- "$ref": "#/components/schemas/ToolCallDelta"
- }
- ],
- "discriminator": {
- "propertyName": "type",
- "mapping": {
- "text": "#/components/schemas/TextDelta",
- "image": "#/components/schemas/ImageDelta",
- "tool_call": "#/components/schemas/ToolCallDelta"
- }
- }
- },
- "ImageDelta": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "image",
- "default": "image",
- "description": "Discriminator type of the delta. Always \"image\""
- },
- "image": {
- "type": "string",
- "contentEncoding": "base64",
- "description": "The incremental image data as bytes"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "image"
- ],
- "title": "ImageDelta",
- "description": "An image content delta for streaming responses."
- },
- "TextDelta": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "text",
- "default": "text",
- "description": "Discriminator type of the delta. Always \"text\""
- },
- "text": {
- "type": "string",
- "description": "The incremental text content"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "text"
- ],
- "title": "TextDelta",
- "description": "A text content delta for streaming responses."
- },
- "ToolCallDelta": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "tool_call",
- "default": "tool_call",
- "description": "Discriminator type of the delta. Always \"tool_call\""
- },
- "tool_call": {
- "oneOf": [
- {
- "type": "string"
- },
- {
- "$ref": "#/components/schemas/ToolCall"
- }
- ],
- "description": "Either an in-progress tool call string or the final parsed tool call"
- },
- "parse_status": {
- "type": "string",
- "enum": [
- "started",
- "in_progress",
- "failed",
- "succeeded"
- ],
- "description": "Current parsing status of the tool call"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "tool_call",
- "parse_status"
- ],
- "title": "ToolCallDelta",
- "description": "A tool call content delta for streaming responses."
- },
"OpenAIResponseAnnotationCitation": {
"type": "object",
"properties": {
@@ -10080,6 +10514,80 @@
"title": "OpenAIDeleteResponseObject",
"description": "Response object confirming deletion of an OpenAI response."
},
+ "EmbeddingsRequest": {
+ "type": "object",
+ "properties": {
+ "model_id": {
+ "type": "string",
+ "description": "The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint."
+ },
+ "contents": {
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/InterleavedContentItem"
+ }
+ }
+ ],
+ "description": "List of contents to generate embeddings for. Each content can be a string or an InterleavedContentItem (and hence can be multimodal). The behavior depends on the model and provider. Some models may only support text."
+ },
+ "text_truncation": {
+ "type": "string",
+ "enum": [
+ "none",
+ "start",
+ "end"
+ ],
+ "description": "(Optional) Config for how to truncate text for embedding when text is longer than the model's max sequence length."
+ },
+ "output_dimension": {
+ "type": "integer",
+ "description": "(Optional) Output dimensionality for the embeddings. Only supported by Matryoshka models."
+ },
+ "task_type": {
+ "type": "string",
+ "enum": [
+ "query",
+ "document"
+ ],
+ "description": "(Optional) How is the embedding being used? This is only supported by asymmetric embedding models."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "model_id",
+ "contents"
+ ],
+ "title": "EmbeddingsRequest"
+ },
+ "EmbeddingsResponse": {
+ "type": "object",
+ "properties": {
+ "embeddings": {
+ "type": "array",
+ "items": {
+ "type": "array",
+ "items": {
+ "type": "number"
+ }
+ },
+ "description": "List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "embeddings"
+ ],
+ "title": "EmbeddingsResponse",
+ "description": "Response containing generated embeddings."
+ },
"AgentCandidate": {
"type": "object",
"properties": {
@@ -10318,28 +10826,6 @@
"title": "ScoringFnParamsType",
"description": "Types of scoring function parameter configurations."
},
- "SystemMessage": {
- "type": "object",
- "properties": {
- "role": {
- "type": "string",
- "const": "system",
- "default": "system",
- "description": "Must be \"system\" to identify this as a system message"
- },
- "content": {
- "$ref": "#/components/schemas/InterleavedContent",
- "description": "The content of the \"system prompt\". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions)."
- }
- },
- "additionalProperties": false,
- "required": [
- "role",
- "content"
- ],
- "title": "SystemMessage",
- "description": "A system message providing instructions or context to the model."
- },
"EvaluateRowsRequest": {
"type": "object",
"properties": {
@@ -17260,31 +17746,6 @@
"title": "ModerationObjectResults",
"description": "A moderation object."
},
- "Message": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/UserMessage"
- },
- {
- "$ref": "#/components/schemas/SystemMessage"
- },
- {
- "$ref": "#/components/schemas/ToolResponseMessage"
- },
- {
- "$ref": "#/components/schemas/CompletionMessage"
- }
- ],
- "discriminator": {
- "propertyName": "role",
- "mapping": {
- "user": "#/components/schemas/UserMessage",
- "system": "#/components/schemas/SystemMessage",
- "tool": "#/components/schemas/ToolResponseMessage",
- "assistant": "#/components/schemas/CompletionMessage"
- }
- }
- },
"RunShieldRequest": {
"type": "object",
"properties": {
diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml
index 5cf54da79..ad1329c2e 100644
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@@ -95,6 +95,43 @@ paths:
schema:
$ref: '#/components/schemas/CancelTrainingJobRequest'
required: true
+ /v1/inference/chat-completion:
+ post:
+ responses:
+ '200':
+ description: >-
+ If stream=False, returns a ChatCompletionResponse with the full completion.
+ If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ChatCompletionResponse'
+ text/event-stream:
+ schema:
+ $ref: '#/components/schemas/ChatCompletionResponseStreamChunk'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Inference
+ summary: >-
+ Generate a chat completion for the given messages using the specified model.
+ description: >-
+ Generate a chat completion for the given messages using the specified model.
+ parameters: []
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ChatCompletionRequest'
+ required: true
/v1/agents:
get:
responses:
@@ -683,6 +720,41 @@ paths:
required: true
schema:
type: string
+ /v1/inference/embeddings:
+ post:
+ responses:
+ '200':
+ description: >-
+ An array of embeddings, one for each content. Each embedding is a list
+ of floats. The dimensionality of the embedding is model-specific; you
+ can check model metadata using /models/{model_id}.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/EmbeddingsResponse'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Inference
+ summary: >-
+ Generate embeddings for content pieces using the specified model.
+ description: >-
+ Generate embeddings for content pieces using the specified model.
+ parameters: []
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/EmbeddingsRequest'
+ required: true
/v1alpha/eval/benchmarks/{benchmark_id}/evaluations:
post:
responses:
@@ -4394,6 +4466,795 @@ components:
required:
- job_uuid
title: CancelTrainingJobRequest
+ CompletionMessage:
+ type: object
+ properties:
+ role:
+ type: string
+ const: assistant
+ default: assistant
+ description: >-
+ Must be "assistant" to identify this as the model's response
+ content:
+ $ref: '#/components/schemas/InterleavedContent'
+ description: The content of the model's response
+ stop_reason:
+ type: string
+ enum:
+ - end_of_turn
+ - end_of_message
+ - out_of_tokens
+ description: >-
+ Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`:
+ The model finished generating the entire response. - `StopReason.end_of_message`:
+ The model finished generating but generated a partial response -- usually,
+ a tool call. The user may call the tool and continue the conversation
+ with the tool's response. - `StopReason.out_of_tokens`: The model ran
+ out of token budget.
+ tool_calls:
+ type: array
+ items:
+ $ref: '#/components/schemas/ToolCall'
+ description: >-
+ List of tool calls. Each tool call is a ToolCall object.
+ additionalProperties: false
+ required:
+ - role
+ - content
+ - stop_reason
+ title: CompletionMessage
+ description: >-
+ A message containing the model's (assistant) response in a chat conversation.
+ GrammarResponseFormat:
+ type: object
+ properties:
+ type:
+ type: string
+ enum:
+ - json_schema
+ - grammar
+ description: >-
+ Must be "grammar" to identify this format type
+ const: grammar
+ default: grammar
+ bnf:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ description: >-
+ The BNF grammar specification the response should conform to
+ additionalProperties: false
+ required:
+ - type
+ - bnf
+ title: GrammarResponseFormat
+ description: >-
+ Configuration for grammar-guided response generation.
+ GreedySamplingStrategy:
+ type: object
+ properties:
+ type:
+ type: string
+ const: greedy
+ default: greedy
+ description: >-
+ Must be "greedy" to identify this sampling strategy
+ additionalProperties: false
+ required:
+ - type
+ title: GreedySamplingStrategy
+ description: >-
+ Greedy sampling strategy that selects the highest probability token at each
+ step.
+ ImageContentItem:
+ type: object
+ properties:
+ type:
+ type: string
+ const: image
+ default: image
+ description: >-
+ Discriminator type of the content item. Always "image"
+ image:
+ type: object
+ properties:
+ url:
+ $ref: '#/components/schemas/URL'
+ description: >-
+ A URL of the image or data URL in the format of data:image/{type};base64,{data}.
+ Note that URL could have length limits.
+ data:
+ type: string
+ contentEncoding: base64
+ description: base64 encoded image data as string
+ additionalProperties: false
+ description: >-
+ Image as a base64 encoded string or a URL
+ additionalProperties: false
+ required:
+ - type
+ - image
+ title: ImageContentItem
+ description: An image content item
+ InterleavedContent:
+ oneOf:
+ - type: string
+ - $ref: '#/components/schemas/InterleavedContentItem'
+ - type: array
+ items:
+ $ref: '#/components/schemas/InterleavedContentItem'
+ InterleavedContentItem:
+ oneOf:
+ - $ref: '#/components/schemas/ImageContentItem'
+ - $ref: '#/components/schemas/TextContentItem'
+ discriminator:
+ propertyName: type
+ mapping:
+ image: '#/components/schemas/ImageContentItem'
+ text: '#/components/schemas/TextContentItem'
+ JsonSchemaResponseFormat:
+ type: object
+ properties:
+ type:
+ type: string
+ enum:
+ - json_schema
+ - grammar
+ description: >-
+ Must be "json_schema" to identify this format type
+ const: json_schema
+ default: json_schema
+ json_schema:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ description: >-
+ The JSON schema the response should conform to. In a Python SDK, this
+ is often a `pydantic` model.
+ additionalProperties: false
+ required:
+ - type
+ - json_schema
+ title: JsonSchemaResponseFormat
+ description: >-
+ Configuration for JSON schema-guided response generation.
+ Message:
+ oneOf:
+ - $ref: '#/components/schemas/UserMessage'
+ - $ref: '#/components/schemas/SystemMessage'
+ - $ref: '#/components/schemas/ToolResponseMessage'
+ - $ref: '#/components/schemas/CompletionMessage'
+ discriminator:
+ propertyName: role
+ mapping:
+ user: '#/components/schemas/UserMessage'
+ system: '#/components/schemas/SystemMessage'
+ tool: '#/components/schemas/ToolResponseMessage'
+ assistant: '#/components/schemas/CompletionMessage'
+ ResponseFormat:
+ oneOf:
+ - $ref: '#/components/schemas/JsonSchemaResponseFormat'
+ - $ref: '#/components/schemas/GrammarResponseFormat'
+ discriminator:
+ propertyName: type
+ mapping:
+ json_schema: '#/components/schemas/JsonSchemaResponseFormat'
+ grammar: '#/components/schemas/GrammarResponseFormat'
+ SamplingParams:
+ type: object
+ properties:
+ strategy:
+ $ref: '#/components/schemas/SamplingStrategy'
+ description: The sampling strategy.
+ max_tokens:
+ type: integer
+ default: 0
+ description: >-
+ The maximum number of tokens that can be generated in the completion.
+ The token count of your prompt plus max_tokens cannot exceed the model's
+ context length.
+ repetition_penalty:
+ type: number
+ default: 1.0
+ description: >-
+ Number between -2.0 and 2.0. Positive values penalize new tokens based
+ on whether they appear in the text so far, increasing the model's likelihood
+ to talk about new topics.
+ stop:
+ type: array
+ items:
+ type: string
+ description: >-
+ Up to 4 sequences where the API will stop generating further tokens. The
+ returned text will not contain the stop sequence.
+ additionalProperties: false
+ required:
+ - strategy
+ title: SamplingParams
+ description: Sampling parameters.
+ SamplingStrategy:
+ oneOf:
+ - $ref: '#/components/schemas/GreedySamplingStrategy'
+ - $ref: '#/components/schemas/TopPSamplingStrategy'
+ - $ref: '#/components/schemas/TopKSamplingStrategy'
+ discriminator:
+ propertyName: type
+ mapping:
+ greedy: '#/components/schemas/GreedySamplingStrategy'
+ top_p: '#/components/schemas/TopPSamplingStrategy'
+ top_k: '#/components/schemas/TopKSamplingStrategy'
+ SystemMessage:
+ type: object
+ properties:
+ role:
+ type: string
+ const: system
+ default: system
+ description: >-
+ Must be "system" to identify this as a system message
+ content:
+ $ref: '#/components/schemas/InterleavedContent'
+ description: >-
+ The content of the "system prompt". If multiple system messages are provided,
+ they are concatenated. The underlying Llama Stack code may also add other
+ system messages (for example, for formatting tool definitions).
+ additionalProperties: false
+ required:
+ - role
+ - content
+ title: SystemMessage
+ description: >-
+ A system message providing instructions or context to the model.
+ TextContentItem:
+ type: object
+ properties:
+ type:
+ type: string
+ const: text
+ default: text
+ description: >-
+ Discriminator type of the content item. Always "text"
+ text:
+ type: string
+ description: Text content
+ additionalProperties: false
+ required:
+ - type
+ - text
+ title: TextContentItem
+ description: A text content item
+ ToolCall:
+ type: object
+ properties:
+ call_id:
+ type: string
+ tool_name:
+ oneOf:
+ - type: string
+ enum:
+ - brave_search
+ - wolfram_alpha
+ - photogen
+ - code_interpreter
+ title: BuiltinTool
+ - type: string
+ arguments:
+ oneOf:
+ - type: string
+ - type: object
+ additionalProperties:
+ oneOf:
+ - type: string
+ - type: integer
+ - type: number
+ - type: boolean
+ - type: 'null'
+ - type: array
+ items:
+ oneOf:
+ - type: string
+ - type: integer
+ - type: number
+ - type: boolean
+ - type: 'null'
+ - type: object
+ additionalProperties:
+ oneOf:
+ - type: string
+ - type: integer
+ - type: number
+ - type: boolean
+ - type: 'null'
+ arguments_json:
+ type: string
+ additionalProperties: false
+ required:
+ - call_id
+ - tool_name
+ - arguments
+ title: ToolCall
+ ToolConfig:
+ type: object
+ properties:
+ tool_choice:
+ oneOf:
+ - type: string
+ enum:
+ - auto
+ - required
+ - none
+ title: ToolChoice
+ description: >-
+ Whether tool use is required or automatic. This is a hint to the model
+ which may not be followed. It depends on the Instruction Following
+ capabilities of the model.
+ - type: string
+ default: auto
+ description: >-
+ (Optional) Whether tool use is automatic, required, or none. Can also
+ specify a tool name to use a specific tool. Defaults to ToolChoice.auto.
+ tool_prompt_format:
+ type: string
+ enum:
+ - json
+ - function_tag
+ - python_list
+ description: >-
+ (Optional) Instructs the model how to format tool calls. By default, Llama
+ Stack will attempt to use a format that is best adapted to the model.
+ - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
+ - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
+ tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python
+ syntax -- a list of function calls.
+ system_message_behavior:
+ type: string
+ enum:
+ - append
+ - replace
+ description: >-
+ (Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`:
+ Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`:
+ Replaces the default system prompt with the provided system message. The
+ system message can include the string '{{function_definitions}}' to indicate
+ where the function definitions should be inserted.
+ default: append
+ additionalProperties: false
+ title: ToolConfig
+ description: Configuration for tool use.
+ ToolDefinition:
+ type: object
+ properties:
+ tool_name:
+ oneOf:
+ - type: string
+ enum:
+ - brave_search
+ - wolfram_alpha
+ - photogen
+ - code_interpreter
+ title: BuiltinTool
+ - type: string
+ description:
+ type: string
+ parameters:
+ type: object
+ additionalProperties:
+ $ref: '#/components/schemas/ToolParamDefinition'
+ additionalProperties: false
+ required:
+ - tool_name
+ title: ToolDefinition
+ ToolParamDefinition:
+ type: object
+ properties:
+ param_type:
+ type: string
+ description:
+ type: string
+ required:
+ type: boolean
+ default: true
+ default:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - param_type
+ title: ToolParamDefinition
+ ToolResponseMessage:
+ type: object
+ properties:
+ role:
+ type: string
+ const: tool
+ default: tool
+ description: >-
+ Must be "tool" to identify this as a tool response
+ call_id:
+ type: string
+ description: >-
+ Unique identifier for the tool call this response is for
+ content:
+ $ref: '#/components/schemas/InterleavedContent'
+ description: The response content from the tool
+ additionalProperties: false
+ required:
+ - role
+ - call_id
+ - content
+ title: ToolResponseMessage
+ description: >-
+ A message representing the result of a tool invocation.
+ TopKSamplingStrategy:
+ type: object
+ properties:
+ type:
+ type: string
+ const: top_k
+ default: top_k
+ description: >-
+ Must be "top_k" to identify this sampling strategy
+ top_k:
+ type: integer
+ description: >-
+ Number of top tokens to consider for sampling. Must be at least 1
+ additionalProperties: false
+ required:
+ - type
+ - top_k
+ title: TopKSamplingStrategy
+ description: >-
+ Top-k sampling strategy that restricts sampling to the k most likely tokens.
+ TopPSamplingStrategy:
+ type: object
+ properties:
+ type:
+ type: string
+ const: top_p
+ default: top_p
+ description: >-
+ Must be "top_p" to identify this sampling strategy
+ temperature:
+ type: number
+ description: >-
+ Controls randomness in sampling. Higher values increase randomness
+ top_p:
+ type: number
+ default: 0.95
+ description: >-
+ Cumulative probability threshold for nucleus sampling. Defaults to 0.95
+ additionalProperties: false
+ required:
+ - type
+ title: TopPSamplingStrategy
+ description: >-
+ Top-p (nucleus) sampling strategy that samples from the smallest set of tokens
+ with cumulative probability >= p.
+ URL:
+ type: object
+ properties:
+ uri:
+ type: string
+ description: The URL string pointing to the resource
+ additionalProperties: false
+ required:
+ - uri
+ title: URL
+ description: A URL reference to external content.
+ UserMessage:
+ type: object
+ properties:
+ role:
+ type: string
+ const: user
+ default: user
+ description: >-
+ Must be "user" to identify this as a user message
+ content:
+ $ref: '#/components/schemas/InterleavedContent'
+ description: >-
+ The content of the message, which can include text and other media
+ context:
+ $ref: '#/components/schemas/InterleavedContent'
+ description: >-
+ (Optional) This field is used internally by Llama Stack to pass RAG context.
+ This field may be removed in the API in the future.
+ additionalProperties: false
+ required:
+ - role
+ - content
+ title: UserMessage
+ description: >-
+ A message from the user in a chat conversation.
+ ChatCompletionRequest:
+ type: object
+ properties:
+ model_id:
+ type: string
+ description: >-
+ The identifier of the model to use. The model must be registered with
+ Llama Stack and available via the /models endpoint.
+ messages:
+ type: array
+ items:
+ $ref: '#/components/schemas/Message'
+ description: List of messages in the conversation.
+ sampling_params:
+ $ref: '#/components/schemas/SamplingParams'
+ description: >-
+ Parameters to control the sampling strategy.
+ tools:
+ type: array
+ items:
+ $ref: '#/components/schemas/ToolDefinition'
+ description: >-
+ (Optional) List of tool definitions available to the model.
+ tool_choice:
+ type: string
+ enum:
+ - auto
+ - required
+ - none
+ description: >-
+ (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
+ .. deprecated:: Use tool_config instead.
+ tool_prompt_format:
+ type: string
+ enum:
+ - json
+ - function_tag
+ - python_list
+ description: >-
+ (Optional) Instructs the model how to format tool calls. By default, Llama
+ Stack will attempt to use a format that is best adapted to the model.
+ - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
+ - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
+ tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python
+ syntax -- a list of function calls. .. deprecated:: Use tool_config instead.
+ response_format:
+ $ref: '#/components/schemas/ResponseFormat'
+ description: >-
+ (Optional) Grammar specification for guided (structured) decoding. There
+ are two options: - `ResponseFormat.json_schema`: The grammar is a JSON
+ schema. Most providers support this format. - `ResponseFormat.grammar`:
+ The grammar is a BNF grammar. This format is more flexible, but not all
+ providers support it.
+ stream:
+ type: boolean
+ description: >-
+ (Optional) If True, generate an SSE event stream of the response. Defaults
+ to False.
+ logprobs:
+ type: object
+ properties:
+ top_k:
+ type: integer
+ default: 0
+ description: >-
+ How many tokens (for each position) to return log probabilities for.
+ additionalProperties: false
+ description: >-
+ (Optional) If specified, log probabilities for each token position will
+ be returned.
+ tool_config:
+ $ref: '#/components/schemas/ToolConfig'
+ description: (Optional) Configuration for tool use.
+ additionalProperties: false
+ required:
+ - model_id
+ - messages
+ title: ChatCompletionRequest
+ ChatCompletionResponse:
+ type: object
+ properties:
+ metrics:
+ type: array
+ items:
+ $ref: '#/components/schemas/MetricInResponse'
+ description: >-
+ (Optional) List of metrics associated with the API response
+ completion_message:
+ $ref: '#/components/schemas/CompletionMessage'
+ description: The complete response message
+ logprobs:
+ type: array
+ items:
+ $ref: '#/components/schemas/TokenLogProbs'
+ description: >-
+ Optional log probabilities for generated tokens
+ additionalProperties: false
+ required:
+ - completion_message
+ title: ChatCompletionResponse
+ description: Response from a chat completion request.
+ MetricInResponse:
+ type: object
+ properties:
+ metric:
+ type: string
+ description: The name of the metric
+ value:
+ oneOf:
+ - type: integer
+ - type: number
+ description: The numeric value of the metric
+ unit:
+ type: string
+ description: >-
+ (Optional) The unit of measurement for the metric value
+ additionalProperties: false
+ required:
+ - metric
+ - value
+ title: MetricInResponse
+ description: >-
+ A metric value included in API responses.
+ TokenLogProbs:
+ type: object
+ properties:
+ logprobs_by_token:
+ type: object
+ additionalProperties:
+ type: number
+ description: >-
+ Dictionary mapping tokens to their log probabilities
+ additionalProperties: false
+ required:
+ - logprobs_by_token
+ title: TokenLogProbs
+ description: Log probabilities for generated tokens.
+ ChatCompletionResponseEvent:
+ type: object
+ properties:
+ event_type:
+ type: string
+ enum:
+ - start
+ - complete
+ - progress
+ description: Type of the event
+ delta:
+ $ref: '#/components/schemas/ContentDelta'
+ description: >-
+ Content generated since last event. This can be one or more tokens, or
+ a tool call.
+ logprobs:
+ type: array
+ items:
+ $ref: '#/components/schemas/TokenLogProbs'
+ description: >-
+ Optional log probabilities for generated tokens
+ stop_reason:
+ type: string
+ enum:
+ - end_of_turn
+ - end_of_message
+ - out_of_tokens
+ description: >-
+ Optional reason why generation stopped, if complete
+ additionalProperties: false
+ required:
+ - event_type
+ - delta
+ title: ChatCompletionResponseEvent
+ description: >-
+ An event during chat completion generation.
+ ChatCompletionResponseStreamChunk:
+ type: object
+ properties:
+ metrics:
+ type: array
+ items:
+ $ref: '#/components/schemas/MetricInResponse'
+ description: >-
+ (Optional) List of metrics associated with the API response
+ event:
+ $ref: '#/components/schemas/ChatCompletionResponseEvent'
+ description: The event containing the new content
+ additionalProperties: false
+ required:
+ - event
+ title: ChatCompletionResponseStreamChunk
+ description: >-
+ A chunk of a streamed chat completion response.
+ ContentDelta:
+ oneOf:
+ - $ref: '#/components/schemas/TextDelta'
+ - $ref: '#/components/schemas/ImageDelta'
+ - $ref: '#/components/schemas/ToolCallDelta'
+ discriminator:
+ propertyName: type
+ mapping:
+ text: '#/components/schemas/TextDelta'
+ image: '#/components/schemas/ImageDelta'
+ tool_call: '#/components/schemas/ToolCallDelta'
+ ImageDelta:
+ type: object
+ properties:
+ type:
+ type: string
+ const: image
+ default: image
+ description: >-
+ Discriminator type of the delta. Always "image"
+ image:
+ type: string
+ contentEncoding: base64
+ description: The incremental image data as bytes
+ additionalProperties: false
+ required:
+ - type
+ - image
+ title: ImageDelta
+ description: >-
+ An image content delta for streaming responses.
+ TextDelta:
+ type: object
+ properties:
+ type:
+ type: string
+ const: text
+ default: text
+ description: >-
+ Discriminator type of the delta. Always "text"
+ text:
+ type: string
+ description: The incremental text content
+ additionalProperties: false
+ required:
+ - type
+ - text
+ title: TextDelta
+ description: >-
+ A text content delta for streaming responses.
+ ToolCallDelta:
+ type: object
+ properties:
+ type:
+ type: string
+ const: tool_call
+ default: tool_call
+ description: >-
+ Discriminator type of the delta. Always "tool_call"
+ tool_call:
+ oneOf:
+ - type: string
+ - $ref: '#/components/schemas/ToolCall'
+ description: >-
+ Either an in-progress tool call string or the final parsed tool call
+ parse_status:
+ type: string
+ enum:
+ - started
+ - in_progress
+ - failed
+ - succeeded
+ description: Current parsing status of the tool call
+ additionalProperties: false
+ required:
+ - type
+ - tool_call
+ - parse_status
+ title: ToolCallDelta
+ description: >-
+ A tool call content delta for streaming responses.
AgentConfig:
type: object
properties:
@@ -4489,185 +5350,6 @@ components:
- name
- args
title: AgentToolGroupWithArgs
- GrammarResponseFormat:
- type: object
- properties:
- type:
- type: string
- enum:
- - json_schema
- - grammar
- description: >-
- Must be "grammar" to identify this format type
- const: grammar
- default: grammar
- bnf:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- description: >-
- The BNF grammar specification the response should conform to
- additionalProperties: false
- required:
- - type
- - bnf
- title: GrammarResponseFormat
- description: >-
- Configuration for grammar-guided response generation.
- GreedySamplingStrategy:
- type: object
- properties:
- type:
- type: string
- const: greedy
- default: greedy
- description: >-
- Must be "greedy" to identify this sampling strategy
- additionalProperties: false
- required:
- - type
- title: GreedySamplingStrategy
- description: >-
- Greedy sampling strategy that selects the highest probability token at each
- step.
- JsonSchemaResponseFormat:
- type: object
- properties:
- type:
- type: string
- enum:
- - json_schema
- - grammar
- description: >-
- Must be "json_schema" to identify this format type
- const: json_schema
- default: json_schema
- json_schema:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- description: >-
- The JSON schema the response should conform to. In a Python SDK, this
- is often a `pydantic` model.
- additionalProperties: false
- required:
- - type
- - json_schema
- title: JsonSchemaResponseFormat
- description: >-
- Configuration for JSON schema-guided response generation.
- ResponseFormat:
- oneOf:
- - $ref: '#/components/schemas/JsonSchemaResponseFormat'
- - $ref: '#/components/schemas/GrammarResponseFormat'
- discriminator:
- propertyName: type
- mapping:
- json_schema: '#/components/schemas/JsonSchemaResponseFormat'
- grammar: '#/components/schemas/GrammarResponseFormat'
- SamplingParams:
- type: object
- properties:
- strategy:
- $ref: '#/components/schemas/SamplingStrategy'
- description: The sampling strategy.
- max_tokens:
- type: integer
- default: 0
- description: >-
- The maximum number of tokens that can be generated in the completion.
- The token count of your prompt plus max_tokens cannot exceed the model's
- context length.
- repetition_penalty:
- type: number
- default: 1.0
- description: >-
- Number between -2.0 and 2.0. Positive values penalize new tokens based
- on whether they appear in the text so far, increasing the model's likelihood
- to talk about new topics.
- stop:
- type: array
- items:
- type: string
- description: >-
- Up to 4 sequences where the API will stop generating further tokens. The
- returned text will not contain the stop sequence.
- additionalProperties: false
- required:
- - strategy
- title: SamplingParams
- description: Sampling parameters.
- SamplingStrategy:
- oneOf:
- - $ref: '#/components/schemas/GreedySamplingStrategy'
- - $ref: '#/components/schemas/TopPSamplingStrategy'
- - $ref: '#/components/schemas/TopKSamplingStrategy'
- discriminator:
- propertyName: type
- mapping:
- greedy: '#/components/schemas/GreedySamplingStrategy'
- top_p: '#/components/schemas/TopPSamplingStrategy'
- top_k: '#/components/schemas/TopKSamplingStrategy'
- ToolConfig:
- type: object
- properties:
- tool_choice:
- oneOf:
- - type: string
- enum:
- - auto
- - required
- - none
- title: ToolChoice
- description: >-
- Whether tool use is required or automatic. This is a hint to the model
- which may not be followed. It depends on the Instruction Following
- capabilities of the model.
- - type: string
- default: auto
- description: >-
- (Optional) Whether tool use is automatic, required, or none. Can also
- specify a tool name to use a specific tool. Defaults to ToolChoice.auto.
- tool_prompt_format:
- type: string
- enum:
- - json
- - function_tag
- - python_list
- description: >-
- (Optional) Instructs the model how to format tool calls. By default, Llama
- Stack will attempt to use a format that is best adapted to the model.
- - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
- - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
- tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python
- syntax -- a list of function calls.
- system_message_behavior:
- type: string
- enum:
- - append
- - replace
- description: >-
- (Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`:
- Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`:
- Replaces the default system prompt with the provided system message. The
- system message can include the string '{{function_definitions}}' to indicate
- where the function definitions should be inserted.
- default: append
- additionalProperties: false
- title: ToolConfig
- description: Configuration for tool use.
ToolDef:
type: object
properties:
@@ -4739,51 +5421,6 @@ components:
- required
title: ToolParameter
description: Parameter definition for a tool.
- TopKSamplingStrategy:
- type: object
- properties:
- type:
- type: string
- const: top_k
- default: top_k
- description: >-
- Must be "top_k" to identify this sampling strategy
- top_k:
- type: integer
- description: >-
- Number of top tokens to consider for sampling. Must be at least 1
- additionalProperties: false
- required:
- - type
- - top_k
- title: TopKSamplingStrategy
- description: >-
- Top-k sampling strategy that restricts sampling to the k most likely tokens.
- TopPSamplingStrategy:
- type: object
- properties:
- type:
- type: string
- const: top_p
- default: top_p
- description: >-
- Must be "top_p" to identify this sampling strategy
- temperature:
- type: number
- description: >-
- Controls randomness in sampling. Higher values increase randomness
- top_p:
- type: number
- default: 0.95
- description: >-
- Cumulative probability threshold for nucleus sampling. Defaults to 0.95
- additionalProperties: false
- required:
- - type
- title: TopPSamplingStrategy
- description: >-
- Top-p (nucleus) sampling strategy that samples from the smallest set of tokens
- with cumulative probability >= p.
CreateAgentRequest:
type: object
properties:
@@ -4829,130 +5466,6 @@ components:
title: AgentSessionCreateResponse
description: >-
Response returned when creating a new agent session.
- ImageContentItem:
- type: object
- properties:
- type:
- type: string
- const: image
- default: image
- description: >-
- Discriminator type of the content item. Always "image"
- image:
- type: object
- properties:
- url:
- $ref: '#/components/schemas/URL'
- description: >-
- A URL of the image or data URL in the format of data:image/{type};base64,{data}.
- Note that URL could have length limits.
- data:
- type: string
- contentEncoding: base64
- description: base64 encoded image data as string
- additionalProperties: false
- description: >-
- Image as a base64 encoded string or an URL
- additionalProperties: false
- required:
- - type
- - image
- title: ImageContentItem
- description: A image content item
- InterleavedContent:
- oneOf:
- - type: string
- - $ref: '#/components/schemas/InterleavedContentItem'
- - type: array
- items:
- $ref: '#/components/schemas/InterleavedContentItem'
- InterleavedContentItem:
- oneOf:
- - $ref: '#/components/schemas/ImageContentItem'
- - $ref: '#/components/schemas/TextContentItem'
- discriminator:
- propertyName: type
- mapping:
- image: '#/components/schemas/ImageContentItem'
- text: '#/components/schemas/TextContentItem'
- TextContentItem:
- type: object
- properties:
- type:
- type: string
- const: text
- default: text
- description: >-
- Discriminator type of the content item. Always "text"
- text:
- type: string
- description: Text content
- additionalProperties: false
- required:
- - type
- - text
- title: TextContentItem
- description: A text content item
- ToolResponseMessage:
- type: object
- properties:
- role:
- type: string
- const: tool
- default: tool
- description: >-
- Must be "tool" to identify this as a tool response
- call_id:
- type: string
- description: >-
- Unique identifier for the tool call this response is for
- content:
- $ref: '#/components/schemas/InterleavedContent'
- description: The response content from the tool
- additionalProperties: false
- required:
- - role
- - call_id
- - content
- title: ToolResponseMessage
- description: >-
- A message representing the result of a tool invocation.
- URL:
- type: object
- properties:
- uri:
- type: string
- description: The URL string pointing to the resource
- additionalProperties: false
- required:
- - uri
- title: URL
- description: A URL reference to external content.
- UserMessage:
- type: object
- properties:
- role:
- type: string
- const: user
- default: user
- description: >-
- Must be "user" to identify this as a user message
- content:
- $ref: '#/components/schemas/InterleavedContent'
- description: >-
- The content of the message, which can include text and other media
- context:
- $ref: '#/components/schemas/InterleavedContent'
- description: >-
- (Optional) This field is used internally by Llama Stack to pass RAG context.
- This field may be removed in the API in the future.
- additionalProperties: false
- required:
- - role
- - content
- title: UserMessage
- description: >-
- A message from the user in a chat conversation.
CreateAgentTurnRequest:
type: object
properties:
@@ -5009,45 +5522,6 @@ components:
required:
- messages
title: CreateAgentTurnRequest
- CompletionMessage:
- type: object
- properties:
- role:
- type: string
- const: assistant
- default: assistant
- description: >-
- Must be "assistant" to identify this as the model's response
- content:
- $ref: '#/components/schemas/InterleavedContent'
- description: The content of the model's response
- stop_reason:
- type: string
- enum:
- - end_of_turn
- - end_of_message
- - out_of_tokens
- description: >-
- Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`:
- The model finished generating the entire response. - `StopReason.end_of_message`:
- The model finished generating but generated a partial response -- usually,
- a tool call. The user may call the tool and continue the conversation
- with the tool's response. - `StopReason.out_of_tokens`: The model ran
- out of token budget.
- tool_calls:
- type: array
- items:
- $ref: '#/components/schemas/ToolCall'
- description: >-
- List of tool calls. Each tool call is a ToolCall object.
- additionalProperties: false
- required:
- - role
- - content
- - stop_reason
- title: CompletionMessage
- description: >-
- A message containing the model's (assistant) response in a chat conversation.
InferenceStep:
type: object
properties:
@@ -5201,56 +5675,6 @@ components:
- step_type
title: ShieldCallStep
description: A shield call step in an agent turn.
- ToolCall:
- type: object
- properties:
- call_id:
- type: string
- tool_name:
- oneOf:
- - type: string
- enum:
- - brave_search
- - wolfram_alpha
- - photogen
- - code_interpreter
- title: BuiltinTool
- - type: string
- arguments:
- oneOf:
- - type: string
- - type: object
- additionalProperties:
- oneOf:
- - type: string
- - type: integer
- - type: number
- - type: boolean
- - type: 'null'
- - type: array
- items:
- oneOf:
- - type: string
- - type: integer
- - type: number
- - type: boolean
- - type: 'null'
- - type: object
- additionalProperties:
- oneOf:
- - type: string
- - type: integer
- - type: number
- - type: boolean
- - type: 'null'
- arguments_json:
- type: string
- additionalProperties: false
- required:
- - call_id
- - tool_name
- - arguments
- title: ToolCall
ToolExecutionStep:
type: object
properties:
@@ -5689,87 +6113,6 @@ components:
title: AgentTurnResponseTurnStartPayload
description: >-
Payload for turn start events in agent turn responses.
- ContentDelta:
- oneOf:
- - $ref: '#/components/schemas/TextDelta'
- - $ref: '#/components/schemas/ImageDelta'
- - $ref: '#/components/schemas/ToolCallDelta'
- discriminator:
- propertyName: type
- mapping:
- text: '#/components/schemas/TextDelta'
- image: '#/components/schemas/ImageDelta'
- tool_call: '#/components/schemas/ToolCallDelta'
- ImageDelta:
- type: object
- properties:
- type:
- type: string
- const: image
- default: image
- description: >-
- Discriminator type of the delta. Always "image"
- image:
- type: string
- contentEncoding: base64
- description: The incremental image data as bytes
- additionalProperties: false
- required:
- - type
- - image
- title: ImageDelta
- description: >-
- An image content delta for streaming responses.
- TextDelta:
- type: object
- properties:
- type:
- type: string
- const: text
- default: text
- description: >-
- Discriminator type of the delta. Always "text"
- text:
- type: string
- description: The incremental text content
- additionalProperties: false
- required:
- - type
- - text
- title: TextDelta
- description: >-
- A text content delta for streaming responses.
- ToolCallDelta:
- type: object
- properties:
- type:
- type: string
- const: tool_call
- default: tool_call
- description: >-
- Discriminator type of the delta. Always "tool_call"
- tool_call:
- oneOf:
- - type: string
- - $ref: '#/components/schemas/ToolCall'
- description: >-
- Either an in-progress tool call string or the final parsed tool call
- parse_status:
- type: string
- enum:
- - started
- - in_progress
- - failed
- - succeeded
- description: Current parsing status of the tool call
- additionalProperties: false
- required:
- - type
- - tool_call
- - parse_status
- title: ToolCallDelta
- description: >-
- A tool call content delta for streaming responses.
OpenAIResponseAnnotationCitation:
type: object
properties:
@@ -7435,6 +7778,72 @@ components:
title: OpenAIDeleteResponseObject
description: >-
Response object confirming deletion of an OpenAI response.
+ EmbeddingsRequest:
+ type: object
+ properties:
+ model_id:
+ type: string
+ description: >-
+ The identifier of the model to use. The model must be an embedding model
+ registered with Llama Stack and available via the /models endpoint.
+ contents:
+ oneOf:
+ - type: array
+ items:
+ type: string
+ - type: array
+ items:
+ $ref: '#/components/schemas/InterleavedContentItem'
+ description: >-
+ List of contents to generate embeddings for. Each content can be a string
+ or an InterleavedContentItem (and hence can be multimodal). The behavior
+ depends on the model and provider. Some models may only support text.
+ text_truncation:
+ type: string
+ enum:
+ - none
+ - start
+ - end
+ description: >-
+ (Optional) Config for how to truncate text for embedding when text is
+ longer than the model's max sequence length.
+ output_dimension:
+ type: integer
+ description: >-
+ (Optional) Output dimensionality for the embeddings. Only supported by
+ Matryoshka models.
+ task_type:
+ type: string
+ enum:
+ - query
+ - document
+ description: >-
+ (Optional) How is the embedding being used? This is only supported by
+ asymmetric embedding models.
+ additionalProperties: false
+ required:
+ - model_id
+ - contents
+ title: EmbeddingsRequest
+ EmbeddingsResponse:
+ type: object
+ properties:
+ embeddings:
+ type: array
+ items:
+ type: array
+ items:
+ type: number
+ description: >-
+ List of embedding vectors, one per input content. Each embedding is a
+ list of floats. The dimensionality of the embedding is model-specific;
+ you can check model metadata using /models/{model_id}
+ additionalProperties: false
+ required:
+ - embeddings
+ title: EmbeddingsResponse
+ description: >-
+ Response containing generated embeddings.
AgentCandidate:
type: object
properties:
@@ -7631,28 +8040,6 @@ components:
title: ScoringFnParamsType
description: >-
Types of scoring function parameter configurations.
- SystemMessage:
- type: object
- properties:
- role:
- type: string
- const: system
- default: system
- description: >-
- Must be "system" to identify this as a system message
- content:
- $ref: '#/components/schemas/InterleavedContent'
- description: >-
- The content of the "system prompt". If multiple system messages are provided,
- they are concatenated. The underlying Llama Stack code may also add other
- system messages (for example, for formatting tool definitions).
- additionalProperties: false
- required:
- - role
- - content
- title: SystemMessage
- description: >-
- A system message providing instructions or context to the model.
EvaluateRowsRequest:
type: object
properties:
@@ -12809,19 +13196,6 @@ components:
- metadata
title: ModerationObjectResults
description: A moderation object.
- Message:
- oneOf:
- - $ref: '#/components/schemas/UserMessage'
- - $ref: '#/components/schemas/SystemMessage'
- - $ref: '#/components/schemas/ToolResponseMessage'
- - $ref: '#/components/schemas/CompletionMessage'
- discriminator:
- propertyName: role
- mapping:
- user: '#/components/schemas/UserMessage'
- system: '#/components/schemas/SystemMessage'
- tool: '#/components/schemas/ToolResponseMessage'
- assistant: '#/components/schemas/CompletionMessage'
RunShieldRequest:
type: object
properties:
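The spec changes above re-publish the synchronous inference endpoints with fully specified request and response schemas. As a hedged sketch of what a raw HTTP call would look like against these schemas (the server address and the model identifiers below are assumptions, not taken from this patch):

```python
import requests

BASE_URL = "http://localhost:8321"  # assumed local Llama Stack server

# ChatCompletionRequest: only model_id and messages are required per the schema.
chat_body = {
    "model_id": "meta-llama/Llama-3.1-8B-Instruct",  # hypothetical registered model
    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
    "sampling_params": {"strategy": {"type": "greedy"}, "max_tokens": 128},
    "stream": False,
}
chat = requests.post(f"{BASE_URL}/v1/inference/chat-completion", json=chat_body)
print(chat.json()["completion_message"]["content"])

# EmbeddingsRequest: model_id and contents are required.
emb_body = {
    "model_id": "all-MiniLM-L6-v2",  # hypothetical embedding model
    "contents": ["first passage", "second passage"],
}
emb = requests.post(f"{BASE_URL}/v1/inference/embeddings", json=emb_body)
print(len(emb.json()["embeddings"]))
```

With `stream=True` instead, the same chat-completion endpoint returns an SSE stream of `ChatCompletionResponseStreamChunk` objects as described in the schemas above.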
diff --git a/llama_stack/apis/batch_inference/__init__.py b/llama_stack/apis/batch_inference/__init__.py
deleted file mode 100644
index b9b2944b2..000000000
--- a/llama_stack/apis/batch_inference/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .batch_inference import *
diff --git a/llama_stack/apis/batch_inference/batch_inference.py b/llama_stack/apis/batch_inference/batch_inference.py
deleted file mode 100644
index 43ade0221..000000000
--- a/llama_stack/apis/batch_inference/batch_inference.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Protocol, runtime_checkable
-
-from llama_stack.apis.common.job_types import Job
-from llama_stack.apis.inference import (
- InterleavedContent,
- LogProbConfig,
- Message,
- ResponseFormat,
- SamplingParams,
- ToolChoice,
- ToolDefinition,
- ToolPromptFormat,
-)
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.schema_utils import webmethod
-
-
-@runtime_checkable
-class BatchInference(Protocol):
- """Batch inference API for generating completions and chat completions.
-
- This is an asynchronous API. If the request is successful, the response will be a job which can be polled for completion.
-
- NOTE: This API is not yet implemented and is subject to change in concert with other asynchronous APIs
- including (post-training, evals, etc).
- """
-
- @webmethod(route="/batch-inference/completion", method="POST", level=LLAMA_STACK_API_V1)
- async def completion(
- self,
- model: str,
- content_batch: list[InterleavedContent],
- sampling_params: SamplingParams | None = None,
- response_format: ResponseFormat | None = None,
- logprobs: LogProbConfig | None = None,
- ) -> Job:
- """Generate completions for a batch of content.
-
- :param model: The model to use for the completion.
- :param content_batch: The content to complete.
- :param sampling_params: The sampling parameters to use for the completion.
- :param response_format: The response format to use for the completion.
- :param logprobs: The logprobs to use for the completion.
- :returns: A job for the completion.
- """
- ...
-
- @webmethod(route="/batch-inference/chat-completion", method="POST", level=LLAMA_STACK_API_V1)
- async def chat_completion(
- self,
- model: str,
- messages_batch: list[list[Message]],
- sampling_params: SamplingParams | None = None,
- # zero-shot tool definitions as input to the model
- tools: list[ToolDefinition] | None = None,
- tool_choice: ToolChoice | None = ToolChoice.auto,
- tool_prompt_format: ToolPromptFormat | None = None,
- response_format: ResponseFormat | None = None,
- logprobs: LogProbConfig | None = None,
- ) -> Job:
- """Generate chat completions for a batch of messages.
-
- :param model: The model to use for the chat completion.
- :param messages_batch: The messages to complete.
- :param sampling_params: The sampling parameters to use for the completion.
- :param tools: The tools to use for the chat completion.
- :param tool_choice: The tool choice to use for the chat completion.
- :param tool_prompt_format: The tool prompt format to use for the chat completion.
- :param response_format: The response format to use for the chat completion.
- :param logprobs: The logprobs to use for the chat completion.
- :returns: A job for the chat completion.
- """
- ...
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index 50be4a708..756896796 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -914,6 +914,7 @@ class OpenAIEmbeddingData(BaseModel):
"""
object: Literal["embedding"] = "embedding"
+ # TODO: consider dropping str and using openai.types.embeddings.Embedding instead of OpenAIEmbeddingData
embedding: list[float] | str
index: int
@@ -974,26 +975,6 @@ class EmbeddingTaskType(Enum):
document = "document"
-@json_schema_type
-class BatchCompletionResponse(BaseModel):
- """Response from a batch completion request.
-
- :param batch: List of completion responses, one for each input in the batch
- """
-
- batch: list[CompletionResponse]
-
-
-@json_schema_type
-class BatchChatCompletionResponse(BaseModel):
- """Response from a batch chat completion request.
-
- :param batch: List of chat completion responses, one for each conversation in the batch
- """
-
- batch: list[ChatCompletionResponse]
-
-
class OpenAICompletionWithInputMessages(OpenAIChatCompletion):
input_messages: list[OpenAIMessageParam]
@@ -1049,26 +1030,7 @@ class InferenceProvider(Protocol):
"""
...
- async def batch_completion(
- self,
- model_id: str,
- content_batch: list[InterleavedContent],
- sampling_params: SamplingParams | None = None,
- response_format: ResponseFormat | None = None,
- logprobs: LogProbConfig | None = None,
- ) -> BatchCompletionResponse:
- """Generate completions for a batch of content using the specified model.
-
- :param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
- :param content_batch: The content to generate completions for.
- :param sampling_params: (Optional) Parameters to control the sampling strategy.
- :param response_format: (Optional) Grammar specification for guided (structured) decoding.
- :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
- :returns: A BatchCompletionResponse with the full completions.
- """
- raise NotImplementedError("Batch completion is not implemented")
- return # this is so mypy's safe-super rule will consider the method concrete
-
+ @webmethod(route="/inference/chat-completion", method="POST", level=LLAMA_STACK_API_V1)
async def chat_completion(
self,
model_id: str,
@@ -1108,30 +1070,7 @@ class InferenceProvider(Protocol):
"""
...
- async def batch_chat_completion(
- self,
- model_id: str,
- messages_batch: list[list[Message]],
- sampling_params: SamplingParams | None = None,
- tools: list[ToolDefinition] | None = None,
- tool_config: ToolConfig | None = None,
- response_format: ResponseFormat | None = None,
- logprobs: LogProbConfig | None = None,
- ) -> BatchChatCompletionResponse:
- """Generate chat completions for a batch of messages using the specified model.
-
- :param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
- :param messages_batch: The messages to generate completions for.
- :param sampling_params: (Optional) Parameters to control the sampling strategy.
- :param tools: (Optional) List of tool definitions available to the model.
- :param tool_config: (Optional) Configuration for tool use.
- :param response_format: (Optional) Grammar specification for guided (structured) decoding.
- :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
- :returns: A BatchChatCompletionResponse with the full completions.
- """
- raise NotImplementedError("Batch chat completion is not implemented")
- return # this is so mypy's safe-super rule will consider the method concrete
-
+ @webmethod(route="/inference/embeddings", method="POST", level=LLAMA_STACK_API_V1)
async def embeddings(
self,
model_id: str,
diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py
index 762d7073e..fcf01a9c4 100644
--- a/llama_stack/core/routers/inference.py
+++ b/llama_stack/core/routers/inference.py
@@ -20,8 +20,6 @@ from llama_stack.apis.common.content_types import (
)
from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError
from llama_stack.apis.inference import (
- BatchChatCompletionResponse,
- BatchCompletionResponse,
ChatCompletionResponse,
ChatCompletionResponseEventType,
ChatCompletionResponseStreamChunk,
@@ -273,30 +271,6 @@ class InferenceRouter(Inference):
)
return response
- async def batch_chat_completion(
- self,
- model_id: str,
- messages_batch: list[list[Message]],
- tools: list[ToolDefinition] | None = None,
- tool_config: ToolConfig | None = None,
- sampling_params: SamplingParams | None = None,
- response_format: ResponseFormat | None = None,
- logprobs: LogProbConfig | None = None,
- ) -> BatchChatCompletionResponse:
- logger.debug(
- f"InferenceRouter.batch_chat_completion: {model_id=}, {len(messages_batch)=}, {sampling_params=}, {response_format=}, {logprobs=}",
- )
- provider = await self.routing_table.get_provider_impl(model_id)
- return await provider.batch_chat_completion(
- model_id=model_id,
- messages_batch=messages_batch,
- tools=tools,
- tool_config=tool_config,
- sampling_params=sampling_params,
- response_format=response_format,
- logprobs=logprobs,
- )
-
async def completion(
self,
model_id: str,
@@ -338,20 +312,6 @@ class InferenceRouter(Inference):
return response
- async def batch_completion(
- self,
- model_id: str,
- content_batch: list[InterleavedContent],
- sampling_params: SamplingParams | None = None,
- response_format: ResponseFormat | None = None,
- logprobs: LogProbConfig | None = None,
- ) -> BatchCompletionResponse:
- logger.debug(
- f"InferenceRouter.batch_completion: {model_id=}, {len(content_batch)=}, {sampling_params=}, {response_format=}, {logprobs=}",
- )
- provider = await self.routing_table.get_provider_impl(model_id)
- return await provider.batch_completion(model_id, content_batch, sampling_params, response_format, logprobs)
-
async def embeddings(
self,
model_id: str,
diff --git a/llama_stack/core/routing_tables/toolgroups.py b/llama_stack/core/routing_tables/toolgroups.py
index eeea406c1..8172b9b5f 100644
--- a/llama_stack/core/routing_tables/toolgroups.py
+++ b/llama_stack/core/routing_tables/toolgroups.py
@@ -9,7 +9,7 @@ from typing import Any
from llama_stack.apis.common.content_types import URL
from llama_stack.apis.common.errors import ToolGroupNotFoundError
from llama_stack.apis.tools import ListToolGroupsResponse, ListToolsResponse, Tool, ToolGroup, ToolGroups
-from llama_stack.core.datatypes import ToolGroupWithOwner
+from llama_stack.core.datatypes import AuthenticationRequiredError, ToolGroupWithOwner
from llama_stack.log import get_logger
from .common import CommonRoutingTableImpl
@@ -54,7 +54,18 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
all_tools = []
for toolgroup in toolgroups:
if toolgroup.identifier not in self.toolgroups_to_tools:
- await self._index_tools(toolgroup)
+ try:
+ await self._index_tools(toolgroup)
+ except AuthenticationRequiredError:
+ # Send authentication errors back to the client so it knows
+ # that it needs to supply credentials for remote MCP servers.
+ raise
+ except Exception as e:
+ # Other errors that the client cannot fix are logged and
+ # those specific toolgroups are skipped.
+ logger.warning(f"Error listing tools for toolgroup {toolgroup.identifier}: {e}")
+ logger.debug(e, exc_info=True)
+ continue
all_tools.extend(self.toolgroups_to_tools[toolgroup.identifier])
return ListToolsResponse(data=all_tools)
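The new guard distinguishes failures the client can fix from ones it cannot: authentication errors from remote MCP servers are re-raised so the caller can supply credentials, while any other indexing failure is logged and only that toolgroup is skipped. The same guard in isolation, as a hedged sketch with stand-in names rather than the real routing-table code:

```python
import logging

logger = logging.getLogger(__name__)


class AuthenticationRequiredError(Exception):
    """Stand-in for the error raised when a remote MCP server needs credentials."""


async def collect_tools(toolgroups, index_tools):
    """Index every toolgroup, surfacing auth errors and skipping broken groups."""
    all_tools = []
    for toolgroup in toolgroups:
        try:
            tools = await index_tools(toolgroup)
        except AuthenticationRequiredError:
            raise  # the client must supply credentials; let it see the error
        except Exception as e:
            logger.warning(f"Error listing tools for toolgroup {toolgroup}: {e}")
            continue
        all_tools.extend(tools)
    return all_tools
```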
diff --git a/llama_stack/core/stack.py b/llama_stack/core/stack.py
index a6c5093eb..3e14328a3 100644
--- a/llama_stack/core/stack.py
+++ b/llama_stack/core/stack.py
@@ -14,7 +14,6 @@ from typing import Any
import yaml
from llama_stack.apis.agents import Agents
-from llama_stack.apis.batch_inference import BatchInference
from llama_stack.apis.benchmarks import Benchmarks
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
@@ -54,7 +53,6 @@ class LlamaStack(
Providers,
VectorDBs,
Inference,
- BatchInference,
Agents,
Safety,
SyntheticDataGeneration,
diff --git a/llama_stack/core/store/registry.py b/llama_stack/core/store/registry.py
index a764d692a..5f4abe9aa 100644
--- a/llama_stack/core/store/registry.py
+++ b/llama_stack/core/store/registry.py
@@ -96,11 +96,9 @@ class DiskDistributionRegistry(DistributionRegistry):
async def register(self, obj: RoutableObjectWithProvider) -> bool:
existing_obj = await self.get(obj.type, obj.identifier)
- # warn if the object's providerid is different but proceed with registration
- if existing_obj and existing_obj.provider_id != obj.provider_id:
- logger.warning(
- f"Object {existing_obj.type}:{existing_obj.identifier}'s {existing_obj.provider_id} provider is being replaced with {obj.provider_id}"
- )
+ # don't register if an object with the same provider_id already exists
+ if existing_obj and existing_obj.provider_id == obj.provider_id:
+ return False
await self.kvstore.set(
KEY_FORMAT.format(type=obj.type, identifier=obj.identifier),
diff --git a/llama_stack/providers/inline/inference/meta_reference/inference.py b/llama_stack/providers/inline/inference/meta_reference/inference.py
index 88d7a98ec..f9e295014 100644
--- a/llama_stack/providers/inline/inference/meta_reference/inference.py
+++ b/llama_stack/providers/inline/inference/meta_reference/inference.py
@@ -18,8 +18,6 @@ from llama_stack.apis.common.content_types import (
ToolCallParseStatus,
)
from llama_stack.apis.inference import (
- BatchChatCompletionResponse,
- BatchCompletionResponse,
ChatCompletionRequest,
ChatCompletionResponse,
ChatCompletionResponseEvent,
@@ -219,41 +217,6 @@ class MetaReferenceInferenceImpl(
results = await self._nonstream_completion([request])
return results[0]
- async def batch_completion(
- self,
- model_id: str,
- content_batch: list[InterleavedContent],
- sampling_params: SamplingParams | None = None,
- response_format: ResponseFormat | None = None,
- stream: bool | None = False,
- logprobs: LogProbConfig | None = None,
- ) -> BatchCompletionResponse:
- if sampling_params is None:
- sampling_params = SamplingParams()
- if logprobs:
- assert logprobs.top_k == 1, f"Unexpected top_k={logprobs.top_k}"
-
- content_batch = [
- augment_content_with_response_format_prompt(response_format, content) for content in content_batch
- ]
-
- request_batch = []
- for content in content_batch:
- request = CompletionRequest(
- model=model_id,
- content=content,
- sampling_params=sampling_params,
- response_format=response_format,
- stream=stream,
- logprobs=logprobs,
- )
- self.check_model(request)
- request = await convert_request_to_raw(request)
- request_batch.append(request)
-
- results = await self._nonstream_completion(request_batch)
- return BatchCompletionResponse(batch=results)
-
async def _stream_completion(self, request: CompletionRequest) -> AsyncGenerator:
tokenizer = self.generator.formatter.tokenizer
@@ -399,49 +362,6 @@ class MetaReferenceInferenceImpl(
results = await self._nonstream_chat_completion([request])
return results[0]
- async def batch_chat_completion(
- self,
- model_id: str,
- messages_batch: list[list[Message]],
- sampling_params: SamplingParams | None = None,
- response_format: ResponseFormat | None = None,
- tools: list[ToolDefinition] | None = None,
- stream: bool | None = False,
- logprobs: LogProbConfig | None = None,
- tool_config: ToolConfig | None = None,
- ) -> BatchChatCompletionResponse:
- if sampling_params is None:
- sampling_params = SamplingParams()
- if logprobs:
- assert logprobs.top_k == 1, f"Unexpected top_k={logprobs.top_k}"
-
- # wrapper request to make it easier to pass around (internal only, not exposed to API)
- request_batch = []
- for messages in messages_batch:
- request = ChatCompletionRequest(
- model=model_id,
- messages=messages,
- sampling_params=sampling_params,
- tools=tools or [],
- response_format=response_format,
- logprobs=logprobs,
- tool_config=tool_config or ToolConfig(),
- )
- self.check_model(request)
-
- # augment and rewrite messages depending on the model
- request.messages = chat_completion_request_to_messages(request, self.llama_model.core_model_id.value)
- # download media and convert to raw content so we can send it to the model
- request = await convert_request_to_raw(request)
- request_batch.append(request)
-
- if self.config.create_distributed_process_group:
- if SEMAPHORE.locked():
- raise RuntimeError("Only one concurrent request is supported")
-
- results = await self._nonstream_chat_completion(request_batch)
- return BatchChatCompletionResponse(batch=results)
-
async def _nonstream_chat_completion(
self, request_batch: list[ChatCompletionRequest]
) -> list[ChatCompletionResponse]:
diff --git a/llama_stack/providers/remote/inference/databricks/databricks.py b/llama_stack/providers/remote/inference/databricks/databricks.py
index 25fd9f3b7..6eac6e4f4 100644
--- a/llama_stack/providers/remote/inference/databricks/databricks.py
+++ b/llama_stack/providers/remote/inference/databricks/databricks.py
@@ -24,7 +24,6 @@ from llama_stack.apis.inference import (
LogProbConfig,
Message,
Model,
- ModelType,
OpenAICompletion,
ResponseFormat,
SamplingParams,
@@ -34,6 +33,7 @@ from llama_stack.apis.inference import (
ToolDefinition,
ToolPromptFormat,
)
+from llama_stack.apis.models import ModelType
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py
index cf7e93974..1025bfb53 100644
--- a/llama_stack/providers/remote/inference/fireworks/fireworks.py
+++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py
@@ -64,6 +64,7 @@ class FireworksInferenceAdapter(OpenAIMixin, ModelRegistryHelper, Inference, Nee
}
def __init__(self, config: FireworksImplConfig) -> None:
+ ModelRegistryHelper.__init__(self)
self.config = config
self.allowed_models = config.allowed_models
diff --git a/llama_stack/providers/remote/inference/groq/__init__.py b/llama_stack/providers/remote/inference/groq/__init__.py
index 1506e0b06..cca333ccf 100644
--- a/llama_stack/providers/remote/inference/groq/__init__.py
+++ b/llama_stack/providers/remote/inference/groq/__init__.py
@@ -4,12 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
-from llama_stack.apis.inference import Inference
-
from .config import GroqConfig
-async def get_adapter_impl(config: GroqConfig, _deps) -> Inference:
+async def get_adapter_impl(config: GroqConfig, _deps):
# import dynamically so the import is used only when it is needed
from .groq import GroqInferenceAdapter
diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py
index 81a5fb9ad..3fb10445f 100644
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@@ -6,8 +6,7 @@
import asyncio
-import base64
-from collections.abc import AsyncGenerator, AsyncIterator
+from collections.abc import AsyncGenerator
from typing import Any
from ollama import AsyncClient as AsyncOllamaClient
@@ -33,10 +32,6 @@ from llama_stack.apis.inference import (
JsonSchemaResponseFormat,
LogProbConfig,
Message,
- OpenAIChatCompletion,
- OpenAIChatCompletionChunk,
- OpenAIMessageParam,
- OpenAIResponseFormatParam,
ResponseFormat,
SamplingParams,
TextTruncation,
@@ -62,7 +57,6 @@ from llama_stack.providers.utils.inference.openai_compat import (
OpenAICompatCompletionChoice,
OpenAICompatCompletionResponse,
get_sampling_options,
- prepare_openai_completion_params,
process_chat_completion_response,
process_chat_completion_stream_response,
process_completion_response,
@@ -75,7 +69,6 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
content_has_media,
convert_image_content_to_url,
interleaved_content_as_str,
- localize_image_content,
request_has_media,
)
@@ -84,6 +77,7 @@ logger = get_logger(name=__name__, category="inference::ollama")
class OllamaInferenceAdapter(
OpenAIMixin,
+ ModelRegistryHelper,
InferenceProvider,
ModelsProtocolPrivate,
):
@@ -129,6 +123,8 @@ class OllamaInferenceAdapter(
],
)
self.config = config
+ # Ollama does not support image URLs, so we need to download the image and convert it to base64
+ self.download_images = True
self._clients: dict[asyncio.AbstractEventLoop, AsyncOllamaClient] = {}
@property
@@ -173,9 +169,6 @@ class OllamaInferenceAdapter(
async def shutdown(self) -> None:
self._clients.clear()
- async def unregister_model(self, model_id: str) -> None:
- pass
-
async def _get_model(self, model_id: str) -> Model:
if not self.model_store:
raise ValueError("Model store not set")
@@ -403,75 +396,6 @@ class OllamaInferenceAdapter(
raise UnsupportedModelError(model.provider_model_id, list(self._model_cache.keys()))
- async def openai_chat_completion(
- self,
- model: str,
- messages: list[OpenAIMessageParam],
- frequency_penalty: float | None = None,
- function_call: str | dict[str, Any] | None = None,
- functions: list[dict[str, Any]] | None = None,
- logit_bias: dict[str, float] | None = None,
- logprobs: bool | None = None,
- max_completion_tokens: int | None = None,
- max_tokens: int | None = None,
- n: int | None = None,
- parallel_tool_calls: bool | None = None,
- presence_penalty: float | None = None,
- response_format: OpenAIResponseFormatParam | None = None,
- seed: int | None = None,
- stop: str | list[str] | None = None,
- stream: bool | None = None,
- stream_options: dict[str, Any] | None = None,
- temperature: float | None = None,
- tool_choice: str | dict[str, Any] | None = None,
- tools: list[dict[str, Any]] | None = None,
- top_logprobs: int | None = None,
- top_p: float | None = None,
- user: str | None = None,
- ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
- model_obj = await self._get_model(model)
-
- # Ollama does not support image urls, so we need to download the image and convert it to base64
- async def _convert_message(m: OpenAIMessageParam) -> OpenAIMessageParam:
- if isinstance(m.content, list):
- for c in m.content:
- if c.type == "image_url" and c.image_url and c.image_url.url:
- localize_result = await localize_image_content(c.image_url.url)
- if localize_result is None:
- raise ValueError(f"Failed to localize image content from {c.image_url.url}")
-
- content, format = localize_result
- c.image_url.url = f"data:image/{format};base64,{base64.b64encode(content).decode('utf-8')}"
- return m
-
- messages = [await _convert_message(m) for m in messages]
- params = await prepare_openai_completion_params(
- model=model_obj.provider_resource_id,
- messages=messages,
- frequency_penalty=frequency_penalty,
- function_call=function_call,
- functions=functions,
- logit_bias=logit_bias,
- logprobs=logprobs,
- max_completion_tokens=max_completion_tokens,
- max_tokens=max_tokens,
- n=n,
- parallel_tool_calls=parallel_tool_calls,
- presence_penalty=presence_penalty,
- response_format=response_format,
- seed=seed,
- stop=stop,
- stream=stream,
- stream_options=stream_options,
- temperature=temperature,
- tool_choice=tool_choice,
- tools=tools,
- top_logprobs=top_logprobs,
- top_p=top_p,
- user=user,
- )
- return await OpenAIMixin.openai_chat_completion(self, **params)
-
async def convert_message_to_openai_dict_for_ollama(message: Message) -> list[dict]:
async def _convert_content(content) -> dict:
diff --git a/llama_stack/providers/remote/inference/openai/openai.py b/llama_stack/providers/remote/inference/openai/openai.py
index 18530f20b..9b341ede2 100644
--- a/llama_stack/providers/remote/inference/openai/openai.py
+++ b/llama_stack/providers/remote/inference/openai/openai.py
@@ -21,8 +21,6 @@ logger = get_logger(name=__name__, category="inference::openai")
# | completion | LiteLLMOpenAIMixin |
# | chat_completion | LiteLLMOpenAIMixin |
# | embedding | LiteLLMOpenAIMixin |
-# | batch_completion | LiteLLMOpenAIMixin |
-# | batch_chat_completion | LiteLLMOpenAIMixin |
# | openai_completion | OpenAIMixin |
# | openai_chat_completion | OpenAIMixin |
# | openai_embeddings | OpenAIMixin |
diff --git a/llama_stack/providers/remote/inference/sambanova/__init__.py b/llama_stack/providers/remote/inference/sambanova/__init__.py
index a3a7b8fbd..2a5448041 100644
--- a/llama_stack/providers/remote/inference/sambanova/__init__.py
+++ b/llama_stack/providers/remote/inference/sambanova/__init__.py
@@ -4,12 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
-from llama_stack.apis.inference import Inference
-
from .config import SambaNovaImplConfig
-async def get_adapter_impl(config: SambaNovaImplConfig, _deps) -> Inference:
+async def get_adapter_impl(config: SambaNovaImplConfig, _deps):
from .sambanova import SambaNovaInferenceAdapter
assert isinstance(config, SambaNovaImplConfig), f"Unexpected config type: {type(config)}"
diff --git a/llama_stack/providers/remote/inference/sambanova/sambanova.py b/llama_stack/providers/remote/inference/sambanova/sambanova.py
index 6121e81f7..4d8fd11cd 100644
--- a/llama_stack/providers/remote/inference/sambanova/sambanova.py
+++ b/llama_stack/providers/remote/inference/sambanova/sambanova.py
@@ -25,7 +25,7 @@ class SambaNovaInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
def __init__(self, config: SambaNovaImplConfig):
self.config = config
- self.environment_available_models = []
+ self.environment_available_models: list[str] = []
LiteLLMOpenAIMixin.__init__(
self,
litellm_provider_name="sambanova",
diff --git a/llama_stack/providers/remote/inference/together/together.py b/llama_stack/providers/remote/inference/together/together.py
index 37973d635..c199677be 100644
--- a/llama_stack/providers/remote/inference/together/together.py
+++ b/llama_stack/providers/remote/inference/together/together.py
@@ -70,6 +70,7 @@ class TogetherInferenceAdapter(OpenAIMixin, ModelRegistryHelper, Inference, Need
}
def __init__(self, config: TogetherImplConfig) -> None:
+ ModelRegistryHelper.__init__(self)
self.config = config
self.allowed_models = config.allowed_models
self._model_cache: dict[str, Model] = {}
diff --git a/llama_stack/providers/utils/inference/model_registry.py b/llama_stack/providers/utils/inference/model_registry.py
index ff15b2d43..746ebd8f6 100644
--- a/llama_stack/providers/utils/inference/model_registry.py
+++ b/llama_stack/providers/utils/inference/model_registry.py
@@ -20,7 +20,7 @@ logger = get_logger(name=__name__, category="providers::utils")
class RemoteInferenceProviderConfig(BaseModel):
- allowed_models: list[str] | None = Field(
+ allowed_models: list[str] | None = Field( # TODO: make this non-optional and give a list() default
default=None,
description="List of models that should be registered with the model registry. If None, all models are allowed.",
)
diff --git a/llama_stack/providers/utils/inference/openai_mixin.py b/llama_stack/providers/utils/inference/openai_mixin.py
index 84211dc96..7da97e6b1 100644
--- a/llama_stack/providers/utils/inference/openai_mixin.py
+++ b/llama_stack/providers/utils/inference/openai_mixin.py
@@ -4,6 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
+import base64
import uuid
from abc import ABC, abstractmethod
from collections.abc import AsyncIterator
@@ -26,6 +27,7 @@ from llama_stack.apis.models import ModelType
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
from llama_stack.providers.utils.inference.openai_compat import prepare_openai_completion_params
+from llama_stack.providers.utils.inference.prompt_adapter import localize_image_content
logger = get_logger(name=__name__, category="providers::utils")
@@ -51,6 +53,10 @@ class OpenAIMixin(ModelRegistryHelper, ABC):
# This is useful for providers that do not return a unique id in the response.
overwrite_completion_id: bool = False
+ # Allow subclasses to control whether to download images and convert to base64
+ # for providers that require base64 encoded images instead of URLs.
+ download_images: bool = False
+
# Embedding model metadata for this provider
# Can be set by subclasses or instances to provide embedding models
# Format: {"model_id": {"embedding_dimension": 1536, "context_length": 8192}}
@@ -239,6 +245,24 @@ class OpenAIMixin(ModelRegistryHelper, ABC):
"""
Direct OpenAI chat completion API call.
"""
+ if self.download_images:
+
+ async def _localize_image_url(m: OpenAIMessageParam) -> OpenAIMessageParam:
+ if isinstance(m.content, list):
+ for c in m.content:
+ if c.type == "image_url" and c.image_url and c.image_url.url and "http" in c.image_url.url:
+ localize_result = await localize_image_content(c.image_url.url)
+ if localize_result is None:
+ raise ValueError(
+ f"Failed to localize image content from {c.image_url.url[:42]}{'...' if len(c.image_url.url) > 42 else ''}"
+ )
+ content, format = localize_result
+ c.image_url.url = f"data:image/{format};base64,{base64.b64encode(content).decode('utf-8')}"
+ # else it's a string and we don't need to modify it
+ return m
+
+ messages = [await _localize_image_url(m) for m in messages]
+
resp = await self.client.chat.completions.create(
**await prepare_openai_completion_params(
model=await self._get_provider_model_id(model),
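The new `download_images` switch lets a provider opt in to having the mixin fetch remote image URLs and re-embed them as `data:` URLs before the upstream call, for backends that only accept inline base64 images. A minimal sketch of how an adapter would opt in; the class name is hypothetical and the abstract hooks required by `OpenAIMixin` are elided:

```python
# Hypothetical adapter (not part of this patch) opting in to image localization.
# Abstract methods required by OpenAIMixin are omitted for brevity.
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin


class Base64OnlyAdapter(OpenAIMixin):
    # With this flag set, openai_chat_completion() rewrites any http(s)
    # image_url parts into "data:image/<fmt>;base64,..." strings.
    download_images = True
```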
diff --git a/llama_stack/providers/utils/kvstore/config.py b/llama_stack/providers/utils/kvstore/config.py
index d1747d65b..7b6a79350 100644
--- a/llama_stack/providers/utils/kvstore/config.py
+++ b/llama_stack/providers/utils/kvstore/config.py
@@ -28,7 +28,7 @@ class CommonConfig(BaseModel):
class RedisKVStoreConfig(CommonConfig):
- type: Literal[KVStoreType.redis.value] = KVStoreType.redis.value
+ type: Literal["redis"] = KVStoreType.redis.value
host: str = "localhost"
port: int = 6379
@@ -50,7 +50,7 @@ class RedisKVStoreConfig(CommonConfig):
class SqliteKVStoreConfig(CommonConfig):
- type: Literal[KVStoreType.sqlite.value] = KVStoreType.sqlite.value
+ type: Literal["sqlite"] = KVStoreType.sqlite.value
db_path: str = Field(
default=(RUNTIME_BASE_DIR / "kvstore.db").as_posix(),
description="File path for the sqlite database",
@@ -69,7 +69,7 @@ class SqliteKVStoreConfig(CommonConfig):
class PostgresKVStoreConfig(CommonConfig):
- type: Literal[KVStoreType.postgres.value] = KVStoreType.postgres.value
+ type: Literal["postgres"] = KVStoreType.postgres.value
host: str = "localhost"
port: int = 5432
db: str = "llamastack"
@@ -113,11 +113,11 @@ class PostgresKVStoreConfig(CommonConfig):
class MongoDBKVStoreConfig(CommonConfig):
- type: Literal[KVStoreType.mongodb.value] = KVStoreType.mongodb.value
+ type: Literal["mongodb"] = KVStoreType.mongodb.value
host: str = "localhost"
port: int = 27017
db: str = "llamastack"
- user: str = None
+ user: str | None = None
password: str | None = None
collection_name: str = "llamastack_kvstore"
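The `Literal[...]` changes are a typing fix rather than a behavior change: mypy only accepts literal expressions as `Literal` parameters, so `Literal[KVStoreType.redis.value]` is rejected as invalid, while the runtime default can still come from the enum. A minimal sketch with simplified stand-in models:

```python
# Simplified stand-ins for the kvstore configs, showing the accepted pattern.
from enum import Enum
from typing import Literal

from pydantic import BaseModel


class KVStoreType(Enum):
    redis = "redis"
    sqlite = "sqlite"


class SqliteConfigSketch(BaseModel):
    # The annotation uses a plain string literal (mypy-friendly); the default
    # still comes from the enum so the two cannot drift apart at runtime.
    type: Literal["sqlite"] = KVStoreType.sqlite.value


print(SqliteConfigSketch().type)  # "sqlite"
```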
diff --git a/llama_stack/providers/utils/kvstore/mongodb/mongodb.py b/llama_stack/providers/utils/kvstore/mongodb/mongodb.py
index bab87a4aa..4d60949c1 100644
--- a/llama_stack/providers/utils/kvstore/mongodb/mongodb.py
+++ b/llama_stack/providers/utils/kvstore/mongodb/mongodb.py
@@ -7,6 +7,7 @@
from datetime import datetime
from pymongo import AsyncMongoClient
+from pymongo.asynchronous.collection import AsyncCollection
from llama_stack.log import get_logger
from llama_stack.providers.utils.kvstore import KVStore
@@ -19,8 +20,13 @@ log = get_logger(name=__name__, category="providers::utils")
class MongoDBKVStoreImpl(KVStore):
def __init__(self, config: MongoDBKVStoreConfig):
self.config = config
- self.conn = None
- self.collection = None
+ self.conn: AsyncMongoClient | None = None
+
+ @property
+ def collection(self) -> AsyncCollection:
+ if self.conn is None:
+ raise RuntimeError("MongoDB connection is not initialized")
+ return self.conn[self.config.db][self.config.collection_name]
async def initialize(self) -> None:
try:
@@ -32,7 +38,6 @@ class MongoDBKVStoreImpl(KVStore):
}
conn_creds = {k: v for k, v in conn_creds.items() if v is not None}
self.conn = AsyncMongoClient(**conn_creds)
- self.collection = self.conn[self.config.db][self.config.collection_name]
except Exception as e:
log.exception("Could not connect to MongoDB database server")
raise RuntimeError("Could not connect to MongoDB database server") from e
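Turning `collection` into a property removes a nullable attribute that every call site would otherwise have to narrow, and it fails fast with a clear error if the store is used before `initialize()` has connected. A minimal sketch of the pattern in isolation, with generic types standing in for the real Mongo client:

```python
# Generic illustration of the lazy-handle property used above.
class LazyStore:
    def __init__(self) -> None:
        self._conn: dict[str, str] | None = None  # stand-in for AsyncMongoClient

    @property
    def collection(self) -> dict[str, str]:
        if self._conn is None:
            raise RuntimeError("connection is not initialized")
        return self._conn

    def initialize(self) -> None:
        self._conn = {}


store = LazyStore()
try:
    store.collection  # raises: initialize() has not run yet
except RuntimeError as err:
    print(err)
store.initialize()
print(store.collection)  # {} once connected
```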
diff --git a/llama_stack/providers/utils/kvstore/sqlite/sqlite.py b/llama_stack/providers/utils/kvstore/sqlite/sqlite.py
index 6a6a170dc..5b782902e 100644
--- a/llama_stack/providers/utils/kvstore/sqlite/sqlite.py
+++ b/llama_stack/providers/utils/kvstore/sqlite/sqlite.py
@@ -9,9 +9,13 @@ from datetime import datetime
import aiosqlite
+from llama_stack.log import get_logger
+
from ..api import KVStore
from ..config import SqliteKVStoreConfig
+logger = get_logger(name=__name__, category="providers::utils")
+
class SqliteKVStoreImpl(KVStore):
def __init__(self, config: SqliteKVStoreConfig):
@@ -50,6 +54,9 @@ class SqliteKVStoreImpl(KVStore):
if row is None:
return None
value, expiration = row
+ if not isinstance(value, str):
+ logger.warning(f"Expected string value for key {key}, got {type(value)}, returning None")
+ return None
return value
async def delete(self, key: str) -> None:
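SQLite columns are dynamically typed, so a read can hand back `bytes` (or another type) where the code expects TEXT; the added guard logs a warning and returns `None` instead of letting an unexpected type escape. The check in isolation, as a quick sanity test:

```python
# The defensive read added above, extracted for illustration.
def coerce_text(value: object) -> str | None:
    if not isinstance(value, str):
        return None  # the real code also logs a warning here
    return value


assert coerce_text("hello") == "hello"
assert coerce_text(b"raw-bytes") is None
```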
diff --git a/llama_stack/ui/package-lock.json b/llama_stack/ui/package-lock.json
index 638655639..21dec59c3 100644
--- a/llama_stack/ui/package-lock.json
+++ b/llama_stack/ui/package-lock.json
@@ -18,7 +18,7 @@
"class-variance-authority": "^0.7.1",
"clsx": "^2.1.1",
"framer-motion": "^12.23.12",
- "llama-stack-client": "^0.2.22",
+ "llama-stack-client": "^0.2.23",
"lucide-react": "^0.542.0",
"next": "15.5.3",
"next-auth": "^4.24.11",
@@ -10172,9 +10172,9 @@
"license": "MIT"
},
"node_modules/llama-stack-client": {
- "version": "0.2.22",
- "resolved": "https://registry.npmjs.org/llama-stack-client/-/llama-stack-client-0.2.22.tgz",
- "integrity": "sha512-7aW3UQj5MwjV73Brd+yQ1e4W1W33nhozyeHM5tzOgbsVZ88tL78JNiNvyFqDR5w6V9XO4/uSGGiQVG6v83yR4w==",
+ "version": "0.2.23",
+ "resolved": "https://registry.npmjs.org/llama-stack-client/-/llama-stack-client-0.2.23.tgz",
+ "integrity": "sha512-J3YFH1HW2K70capejQxGlCyTgKdfx+sQf8Ab+HFi1j2Q00KtpHXB79RxejvBxjWC3X2E++P9iU57KdU2Tp/rIQ==",
"license": "MIT",
"dependencies": {
"@types/node": "^18.11.18",
diff --git a/llama_stack/ui/package.json b/llama_stack/ui/package.json
index 2ba81ea84..70462b534 100644
--- a/llama_stack/ui/package.json
+++ b/llama_stack/ui/package.json
@@ -23,7 +23,7 @@
"class-variance-authority": "^0.7.1",
"clsx": "^2.1.1",
"framer-motion": "^12.23.12",
- "llama-stack-client": "^0.2.22",
+ "llama-stack-client": "^0.2.23",
"lucide-react": "^0.542.0",
"next": "15.5.3",
"next-auth": "^4.24.11",
diff --git a/pyproject.toml b/pyproject.toml
index 86a32f978..98bae47c5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ required-version = ">=0.7.0"
[project]
name = "llama_stack"
-version = "0.2.22"
+version = "0.2.23"
authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }]
description = "Llama Stack"
readme = "README.md"
@@ -31,7 +31,7 @@ dependencies = [
"huggingface-hub>=0.34.0,<1.0",
"jinja2>=3.1.6",
"jsonschema",
- "llama-stack-client>=0.2.22",
+ "llama-stack-client>=0.2.23",
"openai>=1.100.0", # for expires_after support
"prompt-toolkit",
"python-dotenv",
@@ -55,7 +55,7 @@ dependencies = [
ui = [
"streamlit",
"pandas",
- "llama-stack-client>=0.2.22",
+ "llama-stack-client>=0.2.23",
"streamlit-option-menu",
]
@@ -259,15 +259,12 @@ exclude = [
"^llama_stack/models/llama/llama3/tokenizer\\.py$",
"^llama_stack/models/llama/llama3/tool_utils\\.py$",
"^llama_stack/providers/inline/agents/meta_reference/",
- "^llama_stack/providers/inline/agents/meta_reference/agent_instance\\.py$",
- "^llama_stack/providers/inline/agents/meta_reference/agents\\.py$",
"^llama_stack/providers/inline/datasetio/localfs/",
"^llama_stack/providers/inline/eval/meta_reference/eval\\.py$",
"^llama_stack/providers/inline/inference/meta_reference/inference\\.py$",
"^llama_stack/models/llama/llama3/generation\\.py$",
"^llama_stack/models/llama/llama3/multimodal/model\\.py$",
"^llama_stack/models/llama/llama4/",
- "^llama_stack/providers/inline/inference/meta_reference/quantization/fp8_impls\\.py$",
"^llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers\\.py$",
"^llama_stack/providers/inline/post_training/common/validator\\.py$",
"^llama_stack/providers/inline/safety/code_scanner/",
@@ -278,19 +275,13 @@ exclude = [
"^llama_stack/providers/remote/agents/sample/",
"^llama_stack/providers/remote/datasetio/huggingface/",
"^llama_stack/providers/remote/datasetio/nvidia/",
- "^llama_stack/providers/remote/inference/anthropic/",
"^llama_stack/providers/remote/inference/bedrock/",
"^llama_stack/providers/remote/inference/cerebras/",
"^llama_stack/providers/remote/inference/databricks/",
"^llama_stack/providers/remote/inference/fireworks/",
- "^llama_stack/providers/remote/inference/gemini/",
- "^llama_stack/providers/remote/inference/groq/",
"^llama_stack/providers/remote/inference/nvidia/",
- "^llama_stack/providers/remote/inference/openai/",
"^llama_stack/providers/remote/inference/passthrough/",
"^llama_stack/providers/remote/inference/runpod/",
- "^llama_stack/providers/remote/inference/sambanova/",
- "^llama_stack/providers/remote/inference/sample/",
"^llama_stack/providers/remote/inference/tgi/",
"^llama_stack/providers/remote/inference/together/",
"^llama_stack/providers/remote/inference/watsonx/",
@@ -310,7 +301,6 @@ exclude = [
"^llama_stack/providers/remote/vector_io/qdrant/",
"^llama_stack/providers/remote/vector_io/sample/",
"^llama_stack/providers/remote/vector_io/weaviate/",
- "^llama_stack/providers/tests/conftest\\.py$",
"^llama_stack/providers/utils/bedrock/client\\.py$",
"^llama_stack/providers/utils/bedrock/refreshable_boto_session\\.py$",
"^llama_stack/providers/utils/inference/embedding_mixin\\.py$",
@@ -318,12 +308,9 @@ exclude = [
"^llama_stack/providers/utils/inference/model_registry\\.py$",
"^llama_stack/providers/utils/inference/openai_compat\\.py$",
"^llama_stack/providers/utils/inference/prompt_adapter\\.py$",
- "^llama_stack/providers/utils/kvstore/config\\.py$",
"^llama_stack/providers/utils/kvstore/kvstore\\.py$",
- "^llama_stack/providers/utils/kvstore/mongodb/mongodb\\.py$",
"^llama_stack/providers/utils/kvstore/postgres/postgres\\.py$",
"^llama_stack/providers/utils/kvstore/redis/redis\\.py$",
- "^llama_stack/providers/utils/kvstore/sqlite/sqlite\\.py$",
"^llama_stack/providers/utils/memory/vector_store\\.py$",
"^llama_stack/providers/utils/scoring/aggregation_utils\\.py$",
"^llama_stack/providers/utils/scoring/base_scoring_fn\\.py$",
@@ -331,13 +318,6 @@ exclude = [
"^llama_stack/providers/utils/telemetry/trace_protocol\\.py$",
"^llama_stack/providers/utils/telemetry/tracing\\.py$",
"^llama_stack/strong_typing/auxiliary\\.py$",
- "^llama_stack/strong_typing/deserializer\\.py$",
- "^llama_stack/strong_typing/inspection\\.py$",
- "^llama_stack/strong_typing/schema\\.py$",
- "^llama_stack/strong_typing/serializer\\.py$",
- "^llama_stack/distributions/groq/groq\\.py$",
- "^llama_stack/distributions/llama_api/llama_api\\.py$",
- "^llama_stack/distributions/sambanova/sambanova\\.py$",
"^llama_stack/distributions/template\\.py$",
]
diff --git a/tests/integration/inference/test_openai_vision_inference.py b/tests/integration/inference/test_openai_vision_inference.py
new file mode 100644
index 000000000..02a41c633
--- /dev/null
+++ b/tests/integration/inference/test_openai_vision_inference.py
@@ -0,0 +1,77 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+
+import base64
+import pathlib
+
+import pytest
+
+
+@pytest.fixture
+def image_path():
+ return pathlib.Path(__file__).parent / "dog.png"
+
+
+@pytest.fixture
+def base64_image_data(image_path):
+ return base64.b64encode(image_path.read_bytes()).decode("utf-8")
+
+
+async def test_openai_chat_completion_image_url(openai_client, vision_model_id):
+ message = {
+ "role": "user",
+ "content": [
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png"
+ },
+ },
+ {
+ "type": "text",
+ "text": "Describe what is in this image.",
+ },
+ ],
+ }
+
+ response = openai_client.chat.completions.create(
+ model=vision_model_id,
+ messages=[message],
+ stream=False,
+ )
+
+ message_content = response.choices[0].message.content.lower().strip()
+ assert len(message_content) > 0
+ assert any(expected in message_content for expected in {"dog", "puppy", "pup"})
+
+
+async def test_openai_chat_completion_image_data(openai_client, vision_model_id, base64_image_data):
+ message = {
+ "role": "user",
+ "content": [
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/png;base64,{base64_image_data}",
+ },
+ },
+ {
+ "type": "text",
+ "text": "Describe what is in this image.",
+ },
+ ],
+ }
+
+ response = openai_client.chat.completions.create(
+ model=vision_model_id,
+ messages=[message],
+ stream=False,
+ )
+
+ message_content = response.choices[0].message.content.lower().strip()
+ assert len(message_content) > 0
+ assert any(expected in message_content for expected in {"dog", "puppy", "pup"})
diff --git a/tests/unit/distribution/routers/test_routing_tables.py b/tests/unit/distribution/routers/test_routing_tables.py
index bbfea3f46..456a5d041 100644
--- a/tests/unit/distribution/routers/test_routing_tables.py
+++ b/tests/unit/distribution/routers/test_routing_tables.py
@@ -10,6 +10,7 @@ from unittest.mock import AsyncMock
import pytest
+from llama_stack.apis.common.content_types import URL
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.datasets.datasets import Dataset, DatasetPurpose, URIDataSource
from llama_stack.apis.datatypes import Api
@@ -645,3 +646,25 @@ async def test_models_source_interaction_cleanup_provider_models(cached_disk_dis
# Cleanup
await table.shutdown()
+
+
+async def test_tool_groups_routing_table_exception_handling(cached_disk_dist_registry):
+ """Test that the tool group routing table handles exceptions when listing tools, like if an MCP server is unreachable."""
+
+ exception_throwing_tool_groups_impl = ToolGroupsImpl()
+ exception_throwing_tool_groups_impl.list_runtime_tools = AsyncMock(side_effect=Exception("Test exception"))
+
+ table = ToolGroupsRoutingTable(
+ {"test_provider": exception_throwing_tool_groups_impl}, cached_disk_dist_registry, {}
+ )
+ await table.initialize()
+
+ await table.register_tool_group(
+ toolgroup_id="test-toolgroup-exceptions",
+ provider_id="test_provider",
+ mcp_endpoint=URL(uri="http://localhost:8479/foo/bar"),
+ )
+
+ tools = await table.list_tools(toolgroup_id="test-toolgroup-exceptions")
+
+ assert len(tools.data) == 0
diff --git a/tests/unit/providers/utils/inference/test_openai_mixin.py b/tests/unit/providers/utils/inference/test_openai_mixin.py
index d62292542..b55f206b9 100644
--- a/tests/unit/providers/utils/inference/test_openai_mixin.py
+++ b/tests/unit/providers/utils/inference/test_openai_mixin.py
@@ -4,11 +4,11 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
-from unittest.mock import MagicMock, PropertyMock, patch
+from unittest.mock import AsyncMock, MagicMock, PropertyMock, patch
import pytest
-from llama_stack.apis.inference import Model
+from llama_stack.apis.inference import Model, OpenAIUserMessageParam
from llama_stack.apis.models import ModelType
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
@@ -43,8 +43,17 @@ class OpenAIMixinWithEmbeddingsImpl(OpenAIMixin):
@pytest.fixture
def mixin():
- """Create a test instance of OpenAIMixin"""
- return OpenAIMixinImpl()
+ """Create a test instance of OpenAIMixin with mocked model_store"""
+ mixin_instance = OpenAIMixinImpl()
+
+ # just enough to satisfy _get_provider_model_id calls
+ mock_model_store = MagicMock()
+ mock_model = MagicMock()
+ mock_model.provider_resource_id = "test-provider-resource-id"
+ mock_model_store.get_model = AsyncMock(return_value=mock_model)
+ mixin_instance.model_store = mock_model_store
+
+ return mixin_instance
@pytest.fixture
@@ -205,6 +214,74 @@ class TestOpenAIMixinCacheBehavior:
assert "final-mock-model-id" in mixin._model_cache
+class TestOpenAIMixinImagePreprocessing:
+ """Test cases for image preprocessing functionality"""
+
+ async def test_openai_chat_completion_with_image_preprocessing_enabled(self, mixin):
+ """Test that image URLs are converted to base64 when download_images is True"""
+ mixin.download_images = True
+
+ message = OpenAIUserMessageParam(
+ role="user",
+ content=[
+ {"type": "text", "text": "What's in this image?"},
+ {"type": "image_url", "image_url": {"url": "http://example.com/image.jpg"}},
+ ],
+ )
+
+ mock_client = MagicMock()
+ mock_response = MagicMock()
+ mock_client.chat.completions.create = AsyncMock(return_value=mock_response)
+
+ with patch.object(type(mixin), "client", new_callable=PropertyMock, return_value=mock_client):
+ with patch("llama_stack.providers.utils.inference.openai_mixin.localize_image_content") as mock_localize:
+ mock_localize.return_value = (b"fake_image_data", "jpeg")
+
+ await mixin.openai_chat_completion(model="test-model", messages=[message])
+
+ mock_localize.assert_called_once_with("http://example.com/image.jpg")
+
+ mock_client.chat.completions.create.assert_called_once()
+ call_args = mock_client.chat.completions.create.call_args
+ processed_messages = call_args[1]["messages"]
+ assert len(processed_messages) == 1
+ content = processed_messages[0]["content"]
+ assert len(content) == 2
+ assert content[0]["type"] == "text"
+ assert content[1]["type"] == "image_url"
+ assert content[1]["image_url"]["url"] == "data:image/jpeg;base64,ZmFrZV9pbWFnZV9kYXRh"
+
+ async def test_openai_chat_completion_with_image_preprocessing_disabled(self, mixin):
+ """Test that image URLs are not modified when download_images is False"""
+ mixin.download_images = False # explicitly set to False
+
+ message = OpenAIUserMessageParam(
+ role="user",
+ content=[
+ {"type": "text", "text": "What's in this image?"},
+ {"type": "image_url", "image_url": {"url": "http://example.com/image.jpg"}},
+ ],
+ )
+
+ mock_client = MagicMock()
+ mock_response = MagicMock()
+ mock_client.chat.completions.create = AsyncMock(return_value=mock_response)
+
+ with patch.object(type(mixin), "client", new_callable=PropertyMock, return_value=mock_client):
+ with patch("llama_stack.providers.utils.inference.openai_mixin.localize_image_content") as mock_localize:
+ await mixin.openai_chat_completion(model="test-model", messages=[message])
+
+ mock_localize.assert_not_called()
+
+ mock_client.chat.completions.create.assert_called_once()
+ call_args = mock_client.chat.completions.create.call_args
+ processed_messages = call_args[1]["messages"]
+ assert len(processed_messages) == 1
+ content = processed_messages[0]["content"]
+ assert len(content) == 2
+ assert content[1]["image_url"]["url"] == "http://example.com/image.jpg"
+
+
class TestOpenAIMixinEmbeddingModelMetadata:
"""Test cases for embedding_model_metadata attribute functionality"""
diff --git a/tests/unit/registry/test_registry.py b/tests/unit/registry/test_registry.py
index 9873bec5b..4ea4a20b9 100644
--- a/tests/unit/registry/test_registry.py
+++ b/tests/unit/registry/test_registry.py
@@ -129,7 +129,7 @@ async def test_duplicate_provider_registration(cached_disk_dist_registry):
result = await cached_disk_dist_registry.get("vector_db", "test_vector_db_2")
assert result is not None
- assert result.embedding_model == duplicate_vector_db.embedding_model # Original values preserved
+ assert result.embedding_model == original_vector_db.embedding_model # Original values preserved
async def test_get_all_objects(cached_disk_dist_registry):
@@ -174,14 +174,10 @@ async def test_parse_registry_values_error_handling(sqlite_kvstore):
)
await sqlite_kvstore.set(
- KEY_FORMAT.format(type="vector_db", identifier="valid_vector_db"),
- valid_db.model_dump_json(),
+ KEY_FORMAT.format(type="vector_db", identifier="valid_vector_db"), valid_db.model_dump_json()
)
- await sqlite_kvstore.set(
- KEY_FORMAT.format(type="vector_db", identifier="corrupted_json"),
- "{not valid json",
- )
+ await sqlite_kvstore.set(KEY_FORMAT.format(type="vector_db", identifier="corrupted_json"), "{not valid json")
await sqlite_kvstore.set(
KEY_FORMAT.format(type="vector_db", identifier="missing_fields"),
@@ -216,8 +212,7 @@ async def test_cached_registry_error_handling(sqlite_kvstore):
)
await sqlite_kvstore.set(
- KEY_FORMAT.format(type="vector_db", identifier="valid_cached_db"),
- valid_db.model_dump_json(),
+ KEY_FORMAT.format(type="vector_db", identifier="valid_cached_db"), valid_db.model_dump_json()
)
await sqlite_kvstore.set(
diff --git a/uv.lock b/uv.lock
index 0833a9d77..63639ee4a 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1749,7 +1749,7 @@ wheels = [
[[package]]
name = "llama-stack"
-version = "0.2.22"
+version = "0.2.23"
source = { editable = "." }
dependencies = [
{ name = "aiohttp" },
@@ -1885,8 +1885,8 @@ requires-dist = [
{ name = "huggingface-hub", specifier = ">=0.34.0,<1.0" },
{ name = "jinja2", specifier = ">=3.1.6" },
{ name = "jsonschema" },
- { name = "llama-stack-client", specifier = ">=0.2.22" },
- { name = "llama-stack-client", marker = "extra == 'ui'", specifier = ">=0.2.22" },
+ { name = "llama-stack-client", specifier = ">=0.2.23" },
+ { name = "llama-stack-client", marker = "extra == 'ui'", specifier = ">=0.2.23" },
{ name = "openai", specifier = ">=1.100.0" },
{ name = "opentelemetry-exporter-otlp-proto-http", specifier = ">=1.30.0" },
{ name = "opentelemetry-sdk", specifier = ">=1.30.0" },
@@ -1993,7 +1993,7 @@ unit = [
[[package]]
name = "llama-stack-client"
-version = "0.2.22"
+version = "0.2.23"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "anyio" },
@@ -2012,9 +2012,9 @@ dependencies = [
{ name = "tqdm" },
{ name = "typing-extensions" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/60/80/4260816bfaaa889d515206c9df4906d08d405bf94c9b4d1be399b1923e46/llama_stack_client-0.2.22.tar.gz", hash = "sha256:9a0bc756b91ebd539858eeaf1f231c5e5c6900e1ea4fcced726c6717f3d27ca7", size = 318309, upload-time = "2025-09-16T19:43:33.212Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/9f/8f/306d5fcf2f97b3a6251219b03c194836a2ff4e0fcc8146c9970e50a72cd3/llama_stack_client-0.2.23.tar.gz", hash = "sha256:68f34e8ac8eea6a73ed9d4977d849992b2d8bd835804d770a11843431cd5bf74", size = 322288, upload-time = "2025-09-26T21:11:08.342Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/d1/8e/1ebf6ac0dbb62b81038e856ed00768e283d927b14fcd614e3018a227092b/llama_stack_client-0.2.22-py3-none-any.whl", hash = "sha256:b260d73aec56fcfd8fa601b3b34c2f83c4fbcfb7261a246b02bbdf6c2da184fe", size = 369901, upload-time = "2025-09-16T19:43:32.089Z" },
+ { url = "https://files.pythonhosted.org/packages/fa/75/3eb58e092a681804013dbec7b7f549d18f55acf6fd6e6b27de7e249766d8/llama_stack_client-0.2.23-py3-none-any.whl", hash = "sha256:eee42c74eee8f218f9455e5a06d5d4be43f8a8c82a7937ef51ce367f916df847", size = 379809, upload-time = "2025-09-26T21:11:06.856Z" },
]
[[package]]