diff --git a/docs/openapi_generator/generate.py b/docs/openapi_generator/generate.py
index 1a59369cb..48109e5d8 100644
--- a/docs/openapi_generator/generate.py
+++ b/docs/openapi_generator/generate.py
@@ -36,6 +36,16 @@ from .pyopenapi.specification import Info, Server # noqa: E402
from .pyopenapi.utility import Specification # noqa: E402
+def str_presenter(dumper, data):
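+ """Fold long or multi-line strings in the YAML output, keeping API paths and schema $refs inline."""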
+ if data.startswith(f"/{LLAMA_STACK_API_VERSION}") or data.startswith(
+ "#/components/schemas/"
+ ):
+ style = None
+ else:
+ style = ">" if "\n" in data or len(data) > 40 else None
+ return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=style)
+
+
def main(output_dir: str):
output_dir = Path(output_dir)
if not output_dir.exists():
@@ -69,7 +79,8 @@ def main(output_dir: str):
y.sequence_dash_offset = 2
y.width = 80
y.allow_unicode = True
- y.explicit_start = True
+ y.representer.add_representer(str, str_presenter)
+
y.dump(
spec.get_json(),
fp,
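
For reference, a minimal standalone sketch of how the presenter behaves once registered with ruamel.yaml; `LLAMA_STACK_API_VERSION` is stubbed here and the sample data is hypothetical:

import sys

from ruamel.yaml import YAML  # assumes `pip install ruamel.yaml`

LLAMA_STACK_API_VERSION = "v1"  # stand-in for the real constant

def str_presenter(dumper, data):
    # Keep endpoint paths and $ref targets as plain scalars; fold long text.
    if data.startswith(f"/{LLAMA_STACK_API_VERSION}") or data.startswith(
        "#/components/schemas/"
    ):
        style = None
    else:
        style = ">" if "\n" in data or len(data) > 40 else None
    return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=style)

y = YAML()
y.width = 80
y.representer.add_representer(str, str_presenter)
y.dump(
    {
        "ref": "#/components/schemas/ChatCompletionResponse",  # stays on one line
        "description": "A long description over forty characters is emitted in folded (>-) style.",
    },
    sys.stdout,
)
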
diff --git a/docs/openapi_generator/pyopenapi/generator.py b/docs/openapi_generator/pyopenapi/generator.py
index 317b895b5..202d3732b 100644
--- a/docs/openapi_generator/pyopenapi/generator.py
+++ b/docs/openapi_generator/pyopenapi/generator.py
@@ -4,10 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
-import collections
import hashlib
import ipaddress
import typing
+from dataclasses import make_dataclass
from typing import Any, Dict, Set, Union
from ..strong_typing.core import JsonType
@@ -276,6 +276,20 @@ class StatusResponse:
examples: List[Any] = dataclasses.field(default_factory=list)
+def create_docstring_for_request(
+ request_name: str, fields: List[Tuple[str, type]], doc_params: Dict[str, str]
+) -> str:
+ """Creates a ReST-style docstring for a dynamically generated request dataclass."""
+ lines = ["\n"] # blank line in place of a short description
+
+ # Add parameter documentation in ReST format
+ for name, _ in fields:
+ desc = doc_params.get(name, "")
+ lines.append(f":param {name}: {desc}")
+
+ return "\n".join(lines)
+
+
class ResponseBuilder:
content_builder: ContentBuilder
@@ -493,11 +507,24 @@ class Generator:
first = next(iter(op.request_params))
request_name, request_type = first
- from dataclasses import make_dataclass
-
op_name = "".join(word.capitalize() for word in op.name.split("_"))
request_name = f"{op_name}Request"
- request_type = make_dataclass(request_name, op.request_params)
+ fields = [(name, type_) for name, type_ in op.request_params]
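+ # Attach a ReST docstring so per-parameter docs survive into the generated schema.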
+ request_type = make_dataclass(
+ request_name,
+ fields,
+ namespace={
+ "__doc__": create_docstring_for_request(
+ request_name, fields, doc_params
+ )
+ },
+ )
requestBody = RequestBody(
content={
@@ -650,12 +677,6 @@ class Generator:
)
)
- # types that are produced/consumed by operations
- type_tags = [
- self._build_type_tag(ref, schema)
- for ref, schema in self.schema_builder.schemas.items()
- ]
-
# types that are emitted by events
event_tags: List[Tag] = []
events = get_endpoint_events(self.endpoint)
@@ -682,7 +703,6 @@ class Generator:
# list all operations and types
tags: List[Tag] = []
tags.extend(operation_tags)
- tags.extend(type_tags)
tags.extend(event_tags)
for extra_tag_group in extra_tag_groups.values():
tags.extend(extra_tag_group)
@@ -697,13 +717,6 @@ class Generator:
tags=sorted(tag.name for tag in operation_tags),
)
)
- if type_tags:
- tag_groups.append(
- TagGroup(
- name=self.options.map("Types"),
- tags=sorted(tag.name for tag in type_tags),
- )
- )
if event_tags:
tag_groups.append(
TagGroup(
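
As an aside, a runnable sketch of the pattern introduced above (the field names and docs are illustrative): `make_dataclass` accepts a `namespace` mapping, and a `__doc__` supplied that way is preserved because `@dataclass` only fills in `__doc__` when it is None.

from dataclasses import make_dataclass
from typing import Dict, List, Tuple

def create_docstring_for_request(
    request_name: str, fields: List[Tuple[str, type]], doc_params: Dict[str, str]
) -> str:
    lines = ["\n"]  # empty short description, then the parameter list
    for name, _ in fields:
        lines.append(f":param {name}: {doc_params.get(name, '')}")
    return "\n".join(lines)

fields = [("model_id", str), ("stream", bool)]
doc_params = {"model_id": "The identifier of the model to use."}
ChatCompletionRequest = make_dataclass(
    "ChatCompletionRequest",
    fields,
    namespace={
        "__doc__": create_docstring_for_request(
            "ChatCompletionRequest", fields, doc_params
        )
    },
)
print(ChatCompletionRequest.__doc__)  # ReST :param: lines, ready for docstring parsing
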
diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html
index 5998963d2..0454e22ec 100644
--- a/docs/resources/llama-stack-spec.html
+++ b/docs/resources/llama-stack-spec.html
@@ -190,7 +190,7 @@
"post": {
"responses": {
"200": {
- "description": "Chat completion response. **OR** SSE-stream of these events.",
+ "description": "If stream=False, returns a ChatCompletionResponse with the full completion. If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk",
"content": {
"text/event-stream": {
"schema": {
@@ -210,6 +210,7 @@
"tags": [
"Inference"
],
+ "summary": "Generate a chat completion for the given messages using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@@ -227,7 +228,7 @@
"post": {
"responses": {
"200": {
- "description": "Completion response. **OR** streamed completion response.",
+ "description": "If stream=False, returns a CompletionResponse with the full completion. If stream=True, returns an SSE event stream of CompletionResponseStreamChunk",
"content": {
"text/event-stream": {
"schema": {
@@ -247,6 +248,7 @@
"tags": [
"Inference"
],
+ "summary": "Generate a completion for the given content using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@@ -485,7 +487,7 @@
"post": {
"responses": {
"200": {
- "description": "OK",
+ "description": "An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}",
"content": {
"application/json": {
"schema": {
@@ -498,6 +500,7 @@
"tags": [
"Inference"
],
+ "summary": "Generate embeddings for content pieces using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@@ -2334,34 +2337,34 @@
"rows"
]
},
- "BuiltinTool": {
- "type": "string",
- "enum": [
- "brave_search",
- "wolfram_alpha",
- "photogen",
- "code_interpreter"
- ]
- },
"CompletionMessage": {
"type": "object",
"properties": {
"role": {
"type": "string",
"const": "assistant",
- "default": "assistant"
+ "default": "assistant",
+ "description": "Must be \"assistant\" to identify this as the model's response"
},
"content": {
- "$ref": "#/components/schemas/InterleavedContent"
+ "$ref": "#/components/schemas/InterleavedContent",
+ "description": "The content of the model's response"
},
"stop_reason": {
- "$ref": "#/components/schemas/StopReason"
+ "type": "string",
+ "enum": [
+ "end_of_turn",
+ "end_of_message",
+ "out_of_tokens"
+ ],
+ "description": "Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: The model finished generating the entire response. - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response. - `StopReason.out_of_tokens`: The model ran out of token budget."
},
"tool_calls": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ToolCall"
- }
+ },
+ "description": "List of tool calls. Each tool call is a ToolCall object."
}
},
"additionalProperties": false,
@@ -2370,7 +2373,51 @@
"content",
"stop_reason",
"tool_calls"
- ]
+ ],
+ "title": "A message containing the model's (assistant) response in a chat conversation."
+ },
+ "GrammarResponseFormat": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "grammar",
+ "default": "grammar",
+ "description": "Must be \"grammar\" to identify this format type"
+ },
+ "bnf": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ },
+ "description": "The BNF grammar specification the response should conform to"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "bnf"
+ ],
+ "title": "Configuration for grammar-guided response generation."
},
"GreedySamplingStrategy": {
"type": "object",
@@ -2447,6 +2494,49 @@
}
}
},
+ "JsonSchemaResponseFormat": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "json_schema",
+ "default": "json_schema",
+ "description": "Must be \"json_schema\" to identify this format type"
+ },
+ "json_schema": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ },
+ "description": "The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "json_schema"
+ ],
+ "title": "Configuration for JSON schema-guided response generation."
+ },
"Message": {
"oneOf": [
{
@@ -2472,29 +2562,28 @@
}
}
},
+ "ResponseFormat": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/JsonSchemaResponseFormat"
+ },
+ {
+ "$ref": "#/components/schemas/GrammarResponseFormat"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "json_schema": "#/components/schemas/JsonSchemaResponseFormat",
+ "grammar": "#/components/schemas/GrammarResponseFormat"
+ }
+ }
+ },
"SamplingParams": {
"type": "object",
"properties": {
"strategy": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/GreedySamplingStrategy"
- },
- {
- "$ref": "#/components/schemas/TopPSamplingStrategy"
- },
- {
- "$ref": "#/components/schemas/TopKSamplingStrategy"
- }
- ],
- "discriminator": {
- "propertyName": "type",
- "mapping": {
- "greedy": "#/components/schemas/GreedySamplingStrategy",
- "top_p": "#/components/schemas/TopPSamplingStrategy",
- "top_k": "#/components/schemas/TopKSamplingStrategy"
- }
- }
+ "$ref": "#/components/schemas/SamplingStrategy"
},
"max_tokens": {
"type": "integer",
@@ -2510,13 +2599,26 @@
"strategy"
]
},
- "StopReason": {
- "type": "string",
- "enum": [
- "end_of_turn",
- "end_of_message",
- "out_of_tokens"
- ]
+ "SamplingStrategy": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/GreedySamplingStrategy"
+ },
+ {
+ "$ref": "#/components/schemas/TopPSamplingStrategy"
+ },
+ {
+ "$ref": "#/components/schemas/TopKSamplingStrategy"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "greedy": "#/components/schemas/GreedySamplingStrategy",
+ "top_p": "#/components/schemas/TopPSamplingStrategy",
+ "top_k": "#/components/schemas/TopKSamplingStrategy"
+ }
+ }
},
"SystemMessage": {
"type": "object",
@@ -2524,17 +2626,20 @@
"role": {
"type": "string",
"const": "system",
- "default": "system"
+ "default": "system",
+ "description": "Must be \"system\" to identify this as a system message"
},
"content": {
- "$ref": "#/components/schemas/InterleavedContent"
+ "$ref": "#/components/schemas/InterleavedContent",
+ "description": "The content of the \"system prompt\". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions)."
}
},
"additionalProperties": false,
"required": [
"role",
"content"
- ]
+ ],
+ "title": "A system message providing instructions or context to the model."
},
"TextContentItem": {
"type": "object",
@@ -2563,7 +2668,13 @@
"tool_name": {
"oneOf": [
{
- "$ref": "#/components/schemas/BuiltinTool"
+ "type": "string",
+ "enum": [
+ "brave_search",
+ "wolfram_alpha",
+ "photogen",
+ "code_interpreter"
+ ]
},
{
"type": "string"
@@ -2644,20 +2755,19 @@
"arguments"
]
},
- "ToolChoice": {
- "type": "string",
- "enum": [
- "auto",
- "required"
- ]
- },
"ToolDefinition": {
"type": "object",
"properties": {
"tool_name": {
"oneOf": [
{
- "$ref": "#/components/schemas/BuiltinTool"
+ "type": "string",
+ "enum": [
+ "brave_search",
+ "wolfram_alpha",
+ "photogen",
+ "code_interpreter"
+ ]
},
{
"type": "string"
@@ -2720,39 +2830,39 @@
"param_type"
]
},
- "ToolPromptFormat": {
- "type": "string",
- "enum": [
- "json",
- "function_tag",
- "python_list"
- ],
- "title": "This Enum refers to the prompt format for calling custom / zero shot tools",
- "description": "`json` --\n Refers to the json format for calling tools.\n The json format takes the form like\n {\n \"type\": \"function\",\n \"function\" : {\n \"name\": \"function_name\",\n \"description\": \"function_description\",\n \"parameters\": {...}\n }\n }\n\n`function_tag` --\n This is an example of how you could define\n your own user defined format for making tool calls.\n The function_tag format looks like this,\n (parameters)\n\nThe detailed prompts for each of these formats are added to llama cli"
- },
"ToolResponseMessage": {
"type": "object",
"properties": {
"role": {
"type": "string",
"const": "tool",
- "default": "tool"
+ "default": "tool",
+ "description": "Must be \"tool\" to identify this as a tool response"
},
"call_id": {
- "type": "string"
+ "type": "string",
+ "description": "Unique identifier for the tool call this response is for"
},
"tool_name": {
"oneOf": [
{
- "$ref": "#/components/schemas/BuiltinTool"
+ "type": "string",
+ "enum": [
+ "brave_search",
+ "wolfram_alpha",
+ "photogen",
+ "code_interpreter"
+ ]
},
{
"type": "string"
}
- ]
+ ],
+ "description": "Name of the tool that was called"
},
"content": {
- "$ref": "#/components/schemas/InterleavedContent"
+ "$ref": "#/components/schemas/InterleavedContent",
+ "description": "The response content from the tool"
}
},
"additionalProperties": false,
@@ -2761,7 +2871,8 @@
"call_id",
"tool_name",
"content"
- ]
+ ],
+ "title": "A message representing the result of a tool invocation."
},
"TopKSamplingStrategy": {
"type": "object",
@@ -2820,20 +2931,24 @@
"role": {
"type": "string",
"const": "user",
- "default": "user"
+ "default": "user",
+ "description": "Must be \"user\" to identify this as a user message"
},
"content": {
- "$ref": "#/components/schemas/InterleavedContent"
+ "$ref": "#/components/schemas/InterleavedContent",
+ "description": "The content of the message, which can include text and other media"
},
"context": {
- "$ref": "#/components/schemas/InterleavedContent"
+ "$ref": "#/components/schemas/InterleavedContent",
+ "description": "(Optional) This field is used internally by Llama Stack to pass RAG context. This field may be removed in the API in the future."
}
},
"additionalProperties": false,
"required": [
"role",
"content"
- ]
+ ],
+ "title": "A message from the user in a chat conversation."
},
"BatchChatCompletionRequest": {
"type": "object",
@@ -2860,17 +2975,32 @@
}
},
"tool_choice": {
- "$ref": "#/components/schemas/ToolChoice"
+ "type": "string",
+ "enum": [
+ "auto",
+ "required"
+ ],
+ "title": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model."
},
"tool_prompt_format": {
- "$ref": "#/components/schemas/ToolPromptFormat"
+ "type": "string",
+ "enum": [
+ "json",
+ "function_tag",
+ "python_list"
+ ],
+ "title": "Prompt format for calling custom / zero shot tools."
+ },
+ "response_format": {
+ "$ref": "#/components/schemas/ResponseFormat"
},
"logprobs": {
"type": "object",
"properties": {
"top_k": {
"type": "integer",
- "default": 0
+ "default": 0,
+ "description": "How many tokens (for each position) to return log probabilities for."
}
},
"additionalProperties": false
@@ -2885,18 +3015,56 @@
"BatchChatCompletionResponse": {
"type": "object",
"properties": {
- "completion_message_batch": {
+ "batch": {
"type": "array",
"items": {
- "$ref": "#/components/schemas/CompletionMessage"
+ "$ref": "#/components/schemas/ChatCompletionResponse"
}
}
},
"additionalProperties": false,
"required": [
- "completion_message_batch"
+ "batch"
]
},
+ "ChatCompletionResponse": {
+ "type": "object",
+ "properties": {
+ "completion_message": {
+ "$ref": "#/components/schemas/CompletionMessage",
+ "description": "The complete response message"
+ },
+ "logprobs": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/TokenLogProbs"
+ },
+ "description": "Optional log probabilities for generated tokens"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "completion_message"
+ ],
+ "title": "Response from a chat completion request."
+ },
+ "TokenLogProbs": {
+ "type": "object",
+ "properties": {
+ "logprobs_by_token": {
+ "type": "object",
+ "additionalProperties": {
+ "type": "number"
+ },
+ "description": "Dictionary mapping tokens to their log probabilities"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "logprobs_by_token"
+ ],
+ "title": "Log probabilities for generated tokens."
+ },
"BatchCompletionRequest": {
"type": "object",
"properties": {
@@ -2912,12 +3080,16 @@
"sampling_params": {
"$ref": "#/components/schemas/SamplingParams"
},
+ "response_format": {
+ "$ref": "#/components/schemas/ResponseFormat"
+ },
"logprobs": {
"type": "object",
"properties": {
"top_k": {
"type": "integer",
- "default": 0
+ "default": 0,
+ "description": "How many tokens (for each position) to return log probabilities for."
}
},
"additionalProperties": false
@@ -2932,18 +3104,49 @@
"BatchCompletionResponse": {
"type": "object",
"properties": {
- "completion_message_batch": {
+ "batch": {
"type": "array",
"items": {
- "$ref": "#/components/schemas/CompletionMessage"
+ "$ref": "#/components/schemas/CompletionResponse"
}
}
},
"additionalProperties": false,
"required": [
- "completion_message_batch"
+ "batch"
]
},
+ "CompletionResponse": {
+ "type": "object",
+ "properties": {
+ "content": {
+ "type": "string",
+ "description": "The generated completion text"
+ },
+ "stop_reason": {
+ "type": "string",
+ "enum": [
+ "end_of_turn",
+ "end_of_message",
+ "out_of_tokens"
+ ],
+ "description": "Reason why generation stopped"
+ },
+ "logprobs": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/TokenLogProbs"
+ },
+ "description": "Optional log probabilities for generated tokens"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "content",
+ "stop_reason"
+ ],
+ "title": "Response from a completion request."
+ },
"CancelTrainingJobRequest": {
"type": "object",
"properties": {
@@ -2956,145 +3159,67 @@
"job_uuid"
]
},
- "GrammarResponseFormat": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "grammar",
- "default": "grammar"
- },
- "bnf": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "bnf"
- ]
- },
- "JsonSchemaResponseFormat": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "json_schema",
- "default": "json_schema"
- },
- "json_schema": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "json_schema"
- ]
- },
- "ResponseFormat": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/JsonSchemaResponseFormat"
- },
- {
- "$ref": "#/components/schemas/GrammarResponseFormat"
- }
- ],
- "discriminator": {
- "propertyName": "type",
- "mapping": {
- "json_schema": "#/components/schemas/JsonSchemaResponseFormat",
- "grammar": "#/components/schemas/GrammarResponseFormat"
- }
- }
- },
"ChatCompletionRequest": {
"type": "object",
"properties": {
"model_id": {
- "type": "string"
+ "type": "string",
+ "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
},
"messages": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Message"
- }
+ },
+ "description": "List of messages in the conversation"
},
"sampling_params": {
- "$ref": "#/components/schemas/SamplingParams"
+ "$ref": "#/components/schemas/SamplingParams",
+ "description": "Parameters to control the sampling strategy"
},
"tools": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ToolDefinition"
- }
+ },
+ "description": "(Optional) List of tool definitions available to the model"
},
"tool_choice": {
- "$ref": "#/components/schemas/ToolChoice"
+ "type": "string",
+ "enum": [
+ "auto",
+ "required"
+ ],
+ "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto."
},
"tool_prompt_format": {
- "$ref": "#/components/schemas/ToolPromptFormat"
+ "type": "string",
+ "enum": [
+ "json",
+ "function_tag",
+ "python_list"
+ ],
+ "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls."
},
"response_format": {
- "$ref": "#/components/schemas/ResponseFormat"
+ "$ref": "#/components/schemas/ResponseFormat",
+ "description": "(Optional) Grammar specification for guided (structured) decoding. There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it."
},
"stream": {
- "type": "boolean"
+ "type": "boolean",
+ "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
},
"logprobs": {
"type": "object",
"properties": {
"top_k": {
"type": "integer",
- "default": 0
+ "default": 0,
+ "description": "How many tokens (for each position) to return log probabilities for."
}
},
- "additionalProperties": false
+ "additionalProperties": false,
+ "description": "(Optional) If specified, log probabilities for each token position will be returned."
}
},
"additionalProperties": false,
@@ -3103,42 +3228,37 @@
"messages"
]
},
- "ChatCompletionResponse": {
- "type": "object",
- "properties": {
- "completion_message": {
- "$ref": "#/components/schemas/CompletionMessage"
- },
- "logprobs": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/TokenLogProbs"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "completion_message"
- ],
- "title": "Chat completion response."
- },
"ChatCompletionResponseEvent": {
"type": "object",
"properties": {
"event_type": {
- "$ref": "#/components/schemas/ChatCompletionResponseEventType"
+ "type": "string",
+ "enum": [
+ "start",
+ "complete",
+ "progress"
+ ],
+ "description": "Type of the event"
},
"delta": {
- "$ref": "#/components/schemas/ContentDelta"
+ "$ref": "#/components/schemas/ContentDelta",
+ "description": "Content generated since last event. This can be one or more tokens, or a tool call."
},
"logprobs": {
"type": "array",
"items": {
"$ref": "#/components/schemas/TokenLogProbs"
- }
+ },
+ "description": "Optional log probabilities for generated tokens"
},
"stop_reason": {
- "$ref": "#/components/schemas/StopReason"
+ "type": "string",
+ "enum": [
+ "end_of_turn",
+ "end_of_message",
+ "out_of_tokens"
+ ],
+ "description": "Optional reason why generation stopped, if complete"
}
},
"additionalProperties": false,
@@ -3146,28 +3266,21 @@
"event_type",
"delta"
],
- "title": "Chat completion response event."
- },
- "ChatCompletionResponseEventType": {
- "type": "string",
- "enum": [
- "start",
- "complete",
- "progress"
- ]
+ "title": "An event during chat completion generation."
},
"ChatCompletionResponseStreamChunk": {
"type": "object",
"properties": {
"event": {
- "$ref": "#/components/schemas/ChatCompletionResponseEvent"
+ "$ref": "#/components/schemas/ChatCompletionResponseEvent",
+ "description": "The event containing the new content"
}
},
"additionalProperties": false,
"required": [
"event"
],
- "title": "SSE-stream of these events."
+ "title": "A chunk of a streamed chat completion response."
},
"ContentDelta": {
"oneOf": [
@@ -3227,21 +3340,6 @@
"text"
]
},
- "TokenLogProbs": {
- "type": "object",
- "properties": {
- "logprobs_by_token": {
- "type": "object",
- "additionalProperties": {
- "type": "number"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "logprobs_by_token"
- ]
- },
"ToolCallDelta": {
"type": "object",
"properties": {
@@ -3261,7 +3359,13 @@
]
},
"parse_status": {
- "$ref": "#/components/schemas/ToolCallParseStatus"
+ "type": "string",
+ "enum": [
+ "started",
+ "in_progress",
+ "failed",
+ "succeeded"
+ ]
}
},
"additionalProperties": false,
@@ -3271,42 +3375,40 @@
"parse_status"
]
},
- "ToolCallParseStatus": {
- "type": "string",
- "enum": [
- "started",
- "in_progress",
- "failed",
- "succeeded"
- ]
- },
"CompletionRequest": {
"type": "object",
"properties": {
"model_id": {
- "type": "string"
+ "type": "string",
+ "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
},
"content": {
- "$ref": "#/components/schemas/InterleavedContent"
+ "$ref": "#/components/schemas/InterleavedContent",
+ "description": "The content to generate a completion for"
},
"sampling_params": {
- "$ref": "#/components/schemas/SamplingParams"
+ "$ref": "#/components/schemas/SamplingParams",
+ "description": "(Optional) Parameters to control the sampling strategy"
},
"response_format": {
- "$ref": "#/components/schemas/ResponseFormat"
+ "$ref": "#/components/schemas/ResponseFormat",
+ "description": "(Optional) Grammar specification for guided (structured) decoding"
},
"stream": {
- "type": "boolean"
+ "type": "boolean",
+ "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
},
"logprobs": {
"type": "object",
"properties": {
"top_k": {
"type": "integer",
- "default": 0
+ "default": 0,
+ "description": "How many tokens (for each position) to return log probabilities for."
}
},
- "additionalProperties": false
+ "additionalProperties": false,
+ "description": "(Optional) If specified, log probabilities for each token position will be returned."
}
},
"additionalProperties": false,
@@ -3315,50 +3417,35 @@
"content"
]
},
- "CompletionResponse": {
- "type": "object",
- "properties": {
- "content": {
- "type": "string"
- },
- "stop_reason": {
- "$ref": "#/components/schemas/StopReason"
- },
- "logprobs": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/TokenLogProbs"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "content",
- "stop_reason"
- ],
- "title": "Completion response."
- },
"CompletionResponseStreamChunk": {
"type": "object",
"properties": {
"delta": {
- "type": "string"
+ "type": "string",
+ "description": "New content generated since last chunk. This can be one or more tokens."
},
"stop_reason": {
- "$ref": "#/components/schemas/StopReason"
+ "type": "string",
+ "enum": [
+ "end_of_turn",
+ "end_of_message",
+ "out_of_tokens"
+ ],
+ "description": "Optional reason why generation stopped, if complete"
},
"logprobs": {
"type": "array",
"items": {
"$ref": "#/components/schemas/TokenLogProbs"
- }
+ },
+ "description": "Optional log probabilities for generated tokens"
}
},
"additionalProperties": false,
"required": [
"delta"
],
- "title": "streamed completion response."
+ "title": "A chunk of a streamed completion response."
},
"AgentConfig": {
"type": "object",
@@ -3391,11 +3478,22 @@
}
},
"tool_choice": {
- "$ref": "#/components/schemas/ToolChoice",
+ "type": "string",
+ "enum": [
+ "auto",
+ "required"
+ ],
+ "title": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model.",
"default": "auto"
},
"tool_prompt_format": {
- "$ref": "#/components/schemas/ToolPromptFormat"
+ "type": "string",
+ "enum": [
+ "json",
+ "function_tag",
+ "python_list"
+ ],
+ "title": "Prompt format for calling custom / zero shot tools."
},
"max_infer_iters": {
"type": "integer",
@@ -4102,7 +4200,13 @@
"tool_name": {
"oneOf": [
{
- "$ref": "#/components/schemas/BuiltinTool"
+ "type": "string",
+ "enum": [
+ "brave_search",
+ "wolfram_alpha",
+ "photogen",
+ "code_interpreter"
+ ]
},
{
"type": "string"
@@ -4241,13 +4345,15 @@
"type": "object",
"properties": {
"model_id": {
- "type": "string"
+ "type": "string",
+ "description": "The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint."
},
"contents": {
"type": "array",
"items": {
"$ref": "#/components/schemas/InterleavedContent"
- }
+ },
+ "description": "List of contents to generate embeddings for. Note that content can be multimodal. The behavior depends on the model and provider. Some models may only support text."
}
},
"additionalProperties": false,
@@ -4266,13 +4372,15 @@
"items": {
"type": "number"
}
- }
+ },
+ "description": "List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}"
}
},
"additionalProperties": false,
"required": [
"embeddings"
- ]
+ ],
+ "title": "Response containing generated embeddings."
},
"AgentCandidate": {
"type": "object",
@@ -7732,824 +7840,68 @@
],
"tags": [
{
- "name": "AgentCandidate",
- "description": ""
- },
- {
- "name": "AgentConfig",
- "description": ""
- },
- {
- "name": "AgentCreateResponse",
- "description": ""
- },
- {
- "name": "AgentSessionCreateResponse",
- "description": ""
- },
- {
- "name": "AgentStepResponse",
- "description": ""
- },
- {
- "name": "AgentTool",
- "description": ""
- },
- {
- "name": "AgentTurnInputType",
- "description": ""
- },
- {
- "name": "AgentTurnResponseEvent",
- "description": ""
- },
- {
- "name": "AgentTurnResponseEventPayload",
- "description": ""
- },
- {
- "name": "AgentTurnResponseStepCompletePayload",
- "description": ""
- },
- {
- "name": "AgentTurnResponseStepProgressPayload",
- "description": ""
- },
- {
- "name": "AgentTurnResponseStepStartPayload",
- "description": ""
- },
- {
- "name": "AgentTurnResponseStreamChunk",
- "description": "streamed agent turn completion response."
- },
- {
- "name": "AgentTurnResponseTurnCompletePayload",
- "description": ""
- },
- {
- "name": "AgentTurnResponseTurnStartPayload",
- "description": ""
- },
- {
- "name": "Agents"
- },
- {
- "name": "AggregationFunctionType",
- "description": ""
- },
- {
- "name": "AlgorithmConfig",
- "description": ""
- },
- {
- "name": "AppEvalTaskConfig",
- "description": ""
- },
- {
- "name": "AppendRowsRequest",
- "description": ""
- },
- {
- "name": "ArrayType",
- "description": ""
- },
- {
- "name": "BasicScoringFnParams",
- "description": ""
- },
- {
- "name": "BatchChatCompletionRequest",
- "description": ""
- },
- {
- "name": "BatchChatCompletionResponse",
- "description": ""
- },
- {
- "name": "BatchCompletionRequest",
- "description": ""
- },
- {
- "name": "BatchCompletionResponse",
- "description": ""
+ "name": "Agents",
+ "description": "Main functionalities provided by this API:\n- Create agents with specific instructions and ability to use tools.\n- Interactions with agents are grouped into sessions (\"threads\"), and each interaction is called a \"turn\".\n- Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).\n- Agents can be provided with various shields (see the Safety API for more details).\n- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.",
+ "x-displayName": "Agents API for creating and interacting with agentic systems."
},
{
"name": "BatchInference (Coming Soon)"
},
- {
- "name": "BenchmarkEvalTaskConfig",
- "description": ""
- },
- {
- "name": "BooleanType",
- "description": ""
- },
- {
- "name": "BuiltinTool",
- "description": ""
- },
- {
- "name": "CancelTrainingJobRequest",
- "description": ""
- },
- {
- "name": "ChatCompletionInputType",
- "description": ""
- },
- {
- "name": "ChatCompletionRequest",
- "description": ""
- },
- {
- "name": "ChatCompletionResponse",
- "description": "Chat completion response."
- },
- {
- "name": "ChatCompletionResponseEvent",
- "description": "Chat completion response event."
- },
- {
- "name": "ChatCompletionResponseEventType",
- "description": ""
- },
- {
- "name": "ChatCompletionResponseStreamChunk",
- "description": "SSE-stream of these events."
- },
- {
- "name": "Checkpoint",
- "description": "Checkpoint created during training runs"
- },
- {
- "name": "CompletionInputType",
- "description": ""
- },
- {
- "name": "CompletionMessage",
- "description": ""
- },
- {
- "name": "CompletionRequest",
- "description": ""
- },
- {
- "name": "CompletionResponse",
- "description": "Completion response."
- },
- {
- "name": "CompletionResponseStreamChunk",
- "description": "streamed completion response."
- },
- {
- "name": "ContentDelta",
- "description": ""
- },
- {
- "name": "CreateAgentRequest",
- "description": ""
- },
- {
- "name": "CreateAgentSessionRequest",
- "description": ""
- },
- {
- "name": "CreateAgentTurnRequest",
- "description": ""
- },
- {
- "name": "DPOAlignmentConfig",
- "description": ""
- },
- {
- "name": "DataConfig",
- "description": ""
- },
- {
- "name": "Dataset",
- "description": ""
- },
- {
- "name": "DatasetFormat",
- "description": ""
- },
{
"name": "DatasetIO"
},
{
"name": "Datasets"
},
- {
- "name": "DefaultRAGQueryGeneratorConfig",
- "description": ""
- },
- {
- "name": "EfficiencyConfig",
- "description": ""
- },
- {
- "name": "EmbeddingsRequest",
- "description": ""
- },
- {
- "name": "EmbeddingsResponse",
- "description": ""
- },
{
"name": "Eval"
},
- {
- "name": "EvalCandidate",
- "description": ""
- },
- {
- "name": "EvalTask",
- "description": ""
- },
- {
- "name": "EvalTaskConfig",
- "description": ""
- },
{
"name": "EvalTasks"
},
{
- "name": "EvaluateResponse",
- "description": ""
- },
- {
- "name": "EvaluateRowsRequest",
- "description": ""
- },
- {
- "name": "Event",
- "description": ""
- },
- {
- "name": "GrammarResponseFormat",
- "description": ""
- },
- {
- "name": "GreedySamplingStrategy",
- "description": ""
- },
- {
- "name": "HealthInfo",
- "description": ""
- },
- {
- "name": "ImageContentItem",
- "description": ""
- },
- {
- "name": "ImageDelta",
- "description": ""
- },
- {
- "name": "Inference"
- },
- {
- "name": "InferenceStep",
- "description": ""
- },
- {
- "name": "InsertChunksRequest",
- "description": ""
- },
- {
- "name": "InsertRequest",
- "description": ""
+ "name": "Inference",
+ "description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
+ "x-displayName": "Llama Stack Inference API for generating completions, chat completions, and embeddings."
},
{
"name": "Inspect"
},
- {
- "name": "InterleavedContent",
- "description": ""
- },
- {
- "name": "InterleavedContentItem",
- "description": ""
- },
- {
- "name": "InvokeToolRequest",
- "description": ""
- },
- {
- "name": "Job",
- "description": ""
- },
- {
- "name": "JobStatus",
- "description": ""
- },
- {
- "name": "JsonSchemaResponseFormat",
- "description": ""
- },
- {
- "name": "JsonType",
- "description": ""
- },
- {
- "name": "LLMAsJudgeScoringFnParams",
- "description": ""
- },
- {
- "name": "LLMRAGQueryGeneratorConfig",
- "description": ""
- },
- {
- "name": "ListDatasetsResponse",
- "description": ""
- },
- {
- "name": "ListEvalTasksResponse",
- "description": ""
- },
- {
- "name": "ListModelsResponse",
- "description": ""
- },
- {
- "name": "ListPostTrainingJobsResponse",
- "description": ""
- },
- {
- "name": "ListProvidersResponse",
- "description": ""
- },
- {
- "name": "ListRoutesResponse",
- "description": ""
- },
- {
- "name": "ListScoringFunctionsResponse",
- "description": ""
- },
- {
- "name": "ListShieldsResponse",
- "description": ""
- },
- {
- "name": "ListToolGroupsResponse",
- "description": ""
- },
- {
- "name": "ListToolsResponse",
- "description": ""
- },
- {
- "name": "ListVectorDBsResponse",
- "description": ""
- },
- {
- "name": "LogEventRequest",
- "description": ""
- },
- {
- "name": "LogSeverity",
- "description": ""
- },
- {
- "name": "LoraFinetuningConfig",
- "description": ""
- },
- {
- "name": "MemoryRetrievalStep",
- "description": ""
- },
- {
- "name": "Message",
- "description": ""
- },
- {
- "name": "MetricEvent",
- "description": ""
- },
- {
- "name": "Model",
- "description": ""
- },
- {
- "name": "ModelCandidate",
- "description": ""
- },
- {
- "name": "ModelType",
- "description": ""
- },
{
"name": "Models"
},
- {
- "name": "NumberType",
- "description": ""
- },
- {
- "name": "ObjectType",
- "description": ""
- },
- {
- "name": "OptimizerConfig",
- "description": ""
- },
- {
- "name": "OptimizerType",
- "description": ""
- },
- {
- "name": "PaginatedRowsResult",
- "description": ""
- },
- {
- "name": "ParamType",
- "description": ""
- },
{
"name": "PostTraining (Coming Soon)"
},
- {
- "name": "PostTrainingJob",
- "description": ""
- },
- {
- "name": "PostTrainingJobArtifactsResponse",
- "description": "Artifacts of a finetuning job."
- },
- {
- "name": "PostTrainingJobStatusResponse",
- "description": "Status of a finetuning job."
- },
- {
- "name": "PreferenceOptimizeRequest",
- "description": ""
- },
- {
- "name": "ProviderInfo",
- "description": ""
- },
- {
- "name": "QATFinetuningConfig",
- "description": ""
- },
- {
- "name": "QueryChunksRequest",
- "description": ""
- },
- {
- "name": "QueryChunksResponse",
- "description": ""
- },
- {
- "name": "QueryCondition",
- "description": ""
- },
- {
- "name": "QueryConditionOp",
- "description": ""
- },
- {
- "name": "QueryRequest",
- "description": ""
- },
- {
- "name": "QuerySpanTreeResponse",
- "description": ""
- },
- {
- "name": "QuerySpansResponse",
- "description": ""
- },
- {
- "name": "QueryTracesResponse",
- "description": ""
- },
- {
- "name": "RAGDocument",
- "description": ""
- },
- {
- "name": "RAGQueryConfig",
- "description": ""
- },
- {
- "name": "RAGQueryGeneratorConfig",
- "description": ""
- },
- {
- "name": "RAGQueryResult",
- "description": ""
- },
- {
- "name": "RegexParserScoringFnParams",
- "description": ""
- },
- {
- "name": "RegisterDatasetRequest",
- "description": ""
- },
- {
- "name": "RegisterEvalTaskRequest",
- "description": ""
- },
- {
- "name": "RegisterModelRequest",
- "description": ""
- },
- {
- "name": "RegisterScoringFunctionRequest",
- "description": ""
- },
- {
- "name": "RegisterShieldRequest",
- "description": ""
- },
- {
- "name": "RegisterToolGroupRequest",
- "description": ""
- },
- {
- "name": "RegisterVectorDbRequest",
- "description": ""
- },
- {
- "name": "ResponseFormat",
- "description": ""
- },
- {
- "name": "RouteInfo",
- "description": ""
- },
- {
- "name": "RunEvalRequest",
- "description": ""
- },
- {
- "name": "RunShieldRequest",
- "description": ""
- },
- {
- "name": "RunShieldResponse",
- "description": ""
- },
{
"name": "Safety"
},
- {
- "name": "SafetyViolation",
- "description": ""
- },
- {
- "name": "SamplingParams",
- "description": ""
- },
- {
- "name": "SaveSpansToDatasetRequest",
- "description": ""
- },
- {
- "name": "ScoreBatchRequest",
- "description": ""
- },
- {
- "name": "ScoreBatchResponse",
- "description": ""
- },
- {
- "name": "ScoreRequest",
- "description": ""
- },
- {
- "name": "ScoreResponse",
- "description": ""
- },
{
"name": "Scoring"
},
- {
- "name": "ScoringFn",
- "description": ""
- },
- {
- "name": "ScoringFnParams",
- "description": ""
- },
{
"name": "ScoringFunctions"
},
- {
- "name": "ScoringResult",
- "description": ""
- },
- {
- "name": "Session",
- "description": "A single session of an interaction with an Agentic System."
- },
- {
- "name": "Shield",
- "description": "A safety shield resource that can be used to check content"
- },
- {
- "name": "ShieldCallStep",
- "description": ""
- },
{
"name": "Shields"
},
- {
- "name": "Span",
- "description": ""
- },
- {
- "name": "SpanEndPayload",
- "description": ""
- },
- {
- "name": "SpanStartPayload",
- "description": ""
- },
- {
- "name": "SpanStatus",
- "description": ""
- },
- {
- "name": "SpanWithStatus",
- "description": ""
- },
- {
- "name": "StopReason",
- "description": ""
- },
- {
- "name": "StringType",
- "description": ""
- },
- {
- "name": "StructuredLogEvent",
- "description": ""
- },
- {
- "name": "StructuredLogPayload",
- "description": ""
- },
- {
- "name": "SupervisedFineTuneRequest",
- "description": ""
- },
- {
- "name": "SyntheticDataGenerateRequest",
- "description": ""
- },
{
"name": "SyntheticDataGeneration (Coming Soon)"
},
- {
- "name": "SyntheticDataGenerationResponse",
- "description": "Response from the synthetic data generation. Batch of (prompt, response, score) tuples that pass the threshold."
- },
- {
- "name": "SystemMessage",
- "description": ""
- },
{
"name": "Telemetry"
},
- {
- "name": "TextContentItem",
- "description": ""
- },
- {
- "name": "TextDelta",
- "description": ""
- },
- {
- "name": "TokenLogProbs",
- "description": ""
- },
- {
- "name": "Tool",
- "description": ""
- },
- {
- "name": "ToolCall",
- "description": ""
- },
- {
- "name": "ToolCallDelta",
- "description": ""
- },
- {
- "name": "ToolCallParseStatus",
- "description": ""
- },
- {
- "name": "ToolChoice",
- "description": ""
- },
- {
- "name": "ToolDef",
- "description": ""
- },
- {
- "name": "ToolDefinition",
- "description": ""
- },
- {
- "name": "ToolExecutionStep",
- "description": ""
- },
- {
- "name": "ToolGroup",
- "description": ""
- },
{
"name": "ToolGroups"
},
- {
- "name": "ToolHost",
- "description": ""
- },
- {
- "name": "ToolInvocationResult",
- "description": ""
- },
- {
- "name": "ToolParamDefinition",
- "description": ""
- },
- {
- "name": "ToolParameter",
- "description": ""
- },
- {
- "name": "ToolPromptFormat",
- "description": "This Enum refers to the prompt format for calling custom / zero shot tools\n\n`json` --\n Refers to the json format for calling tools.\n The json format takes the form like\n {\n \"type\": \"function\",\n \"function\" : {\n \"name\": \"function_name\",\n \"description\": \"function_description\",\n \"parameters\": {...}\n }\n }\n\n`function_tag` --\n This is an example of how you could define\n your own user defined format for making tool calls.\n The function_tag format looks like this,\n (parameters)\n\nThe detailed prompts for each of these formats are added to llama cli"
- },
- {
- "name": "ToolResponse",
- "description": ""
- },
- {
- "name": "ToolResponseMessage",
- "description": ""
- },
{
"name": "ToolRuntime"
},
- {
- "name": "TopKSamplingStrategy",
- "description": ""
- },
- {
- "name": "TopPSamplingStrategy",
- "description": ""
- },
- {
- "name": "Trace",
- "description": ""
- },
- {
- "name": "TrainingConfig",
- "description": ""
- },
- {
- "name": "Turn",
- "description": "A single turn in an interaction with an Agentic System."
- },
- {
- "name": "URL",
- "description": ""
- },
- {
- "name": "UnionType",
- "description": ""
- },
- {
- "name": "UnstructuredLogEvent",
- "description": ""
- },
- {
- "name": "UserMessage",
- "description": ""
- },
- {
- "name": "VectorDB",
- "description": ""
- },
{
"name": "VectorDBs"
},
{
"name": "VectorIO"
- },
- {
- "name": "VersionInfo",
- "description": ""
- },
- {
- "name": "ViolationLevel",
- "description": ""
}
],
"x-tagGroups": [
@@ -8577,201 +7929,6 @@
"VectorDBs",
"VectorIO"
]
- },
- {
- "name": "Types",
- "tags": [
- "AgentCandidate",
- "AgentConfig",
- "AgentCreateResponse",
- "AgentSessionCreateResponse",
- "AgentStepResponse",
- "AgentTool",
- "AgentTurnInputType",
- "AgentTurnResponseEvent",
- "AgentTurnResponseEventPayload",
- "AgentTurnResponseStepCompletePayload",
- "AgentTurnResponseStepProgressPayload",
- "AgentTurnResponseStepStartPayload",
- "AgentTurnResponseStreamChunk",
- "AgentTurnResponseTurnCompletePayload",
- "AgentTurnResponseTurnStartPayload",
- "AggregationFunctionType",
- "AlgorithmConfig",
- "AppEvalTaskConfig",
- "AppendRowsRequest",
- "ArrayType",
- "BasicScoringFnParams",
- "BatchChatCompletionRequest",
- "BatchChatCompletionResponse",
- "BatchCompletionRequest",
- "BatchCompletionResponse",
- "BenchmarkEvalTaskConfig",
- "BooleanType",
- "BuiltinTool",
- "CancelTrainingJobRequest",
- "ChatCompletionInputType",
- "ChatCompletionRequest",
- "ChatCompletionResponse",
- "ChatCompletionResponseEvent",
- "ChatCompletionResponseEventType",
- "ChatCompletionResponseStreamChunk",
- "Checkpoint",
- "CompletionInputType",
- "CompletionMessage",
- "CompletionRequest",
- "CompletionResponse",
- "CompletionResponseStreamChunk",
- "ContentDelta",
- "CreateAgentRequest",
- "CreateAgentSessionRequest",
- "CreateAgentTurnRequest",
- "DPOAlignmentConfig",
- "DataConfig",
- "Dataset",
- "DatasetFormat",
- "DefaultRAGQueryGeneratorConfig",
- "EfficiencyConfig",
- "EmbeddingsRequest",
- "EmbeddingsResponse",
- "EvalCandidate",
- "EvalTask",
- "EvalTaskConfig",
- "EvaluateResponse",
- "EvaluateRowsRequest",
- "Event",
- "GrammarResponseFormat",
- "GreedySamplingStrategy",
- "HealthInfo",
- "ImageContentItem",
- "ImageDelta",
- "InferenceStep",
- "InsertChunksRequest",
- "InsertRequest",
- "InterleavedContent",
- "InterleavedContentItem",
- "InvokeToolRequest",
- "Job",
- "JobStatus",
- "JsonSchemaResponseFormat",
- "JsonType",
- "LLMAsJudgeScoringFnParams",
- "LLMRAGQueryGeneratorConfig",
- "ListDatasetsResponse",
- "ListEvalTasksResponse",
- "ListModelsResponse",
- "ListPostTrainingJobsResponse",
- "ListProvidersResponse",
- "ListRoutesResponse",
- "ListScoringFunctionsResponse",
- "ListShieldsResponse",
- "ListToolGroupsResponse",
- "ListToolsResponse",
- "ListVectorDBsResponse",
- "LogEventRequest",
- "LogSeverity",
- "LoraFinetuningConfig",
- "MemoryRetrievalStep",
- "Message",
- "MetricEvent",
- "Model",
- "ModelCandidate",
- "ModelType",
- "NumberType",
- "ObjectType",
- "OptimizerConfig",
- "OptimizerType",
- "PaginatedRowsResult",
- "ParamType",
- "PostTrainingJob",
- "PostTrainingJobArtifactsResponse",
- "PostTrainingJobStatusResponse",
- "PreferenceOptimizeRequest",
- "ProviderInfo",
- "QATFinetuningConfig",
- "QueryChunksRequest",
- "QueryChunksResponse",
- "QueryCondition",
- "QueryConditionOp",
- "QueryRequest",
- "QuerySpanTreeResponse",
- "QuerySpansResponse",
- "QueryTracesResponse",
- "RAGDocument",
- "RAGQueryConfig",
- "RAGQueryGeneratorConfig",
- "RAGQueryResult",
- "RegexParserScoringFnParams",
- "RegisterDatasetRequest",
- "RegisterEvalTaskRequest",
- "RegisterModelRequest",
- "RegisterScoringFunctionRequest",
- "RegisterShieldRequest",
- "RegisterToolGroupRequest",
- "RegisterVectorDbRequest",
- "ResponseFormat",
- "RouteInfo",
- "RunEvalRequest",
- "RunShieldRequest",
- "RunShieldResponse",
- "SafetyViolation",
- "SamplingParams",
- "SaveSpansToDatasetRequest",
- "ScoreBatchRequest",
- "ScoreBatchResponse",
- "ScoreRequest",
- "ScoreResponse",
- "ScoringFn",
- "ScoringFnParams",
- "ScoringResult",
- "Session",
- "Shield",
- "ShieldCallStep",
- "Span",
- "SpanEndPayload",
- "SpanStartPayload",
- "SpanStatus",
- "SpanWithStatus",
- "StopReason",
- "StringType",
- "StructuredLogEvent",
- "StructuredLogPayload",
- "SupervisedFineTuneRequest",
- "SyntheticDataGenerateRequest",
- "SyntheticDataGenerationResponse",
- "SystemMessage",
- "TextContentItem",
- "TextDelta",
- "TokenLogProbs",
- "Tool",
- "ToolCall",
- "ToolCallDelta",
- "ToolCallParseStatus",
- "ToolChoice",
- "ToolDef",
- "ToolDefinition",
- "ToolExecutionStep",
- "ToolGroup",
- "ToolHost",
- "ToolInvocationResult",
- "ToolParamDefinition",
- "ToolParameter",
- "ToolPromptFormat",
- "ToolResponse",
- "ToolResponseMessage",
- "TopKSamplingStrategy",
- "TopPSamplingStrategy",
- "Trace",
- "TrainingConfig",
- "Turn",
- "URL",
- "UnionType",
- "UnstructuredLogEvent",
- "UserMessage",
- "VectorDB",
- "VersionInfo",
- "ViolationLevel"
- ]
}
]
};
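
To make the reworked ResponseFormat and SamplingStrategy unions concrete, a hedged client-side sketch (the endpoint URL, port, and model id are assumptions, not taken from this spec):

import requests  # assumes `pip install requests`

body = {
    "model_id": "meta-llama/Llama-3.1-8B-Instruct",  # hypothetical registered model
    "messages": [
        {"role": "user", "content": "Reply with a JSON object containing a 'city' key."}
    ],
    # The "type" discriminator selects JsonSchemaResponseFormat or GrammarResponseFormat.
    "response_format": {
        "type": "json_schema",
        "json_schema": {"type": "object", "properties": {"city": {"type": "string"}}},
    },
    # SamplingParams.strategy is now a $ref to the SamplingStrategy union.
    "sampling_params": {"strategy": {"type": "greedy"}},
    "stream": False,
}
resp = requests.post("http://localhost:8321/v1/inference/chat-completion", json=body)
resp.raise_for_status()
print(resp.json()["completion_message"]["content"])  # non-streaming: ChatCompletionResponse
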
diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml
index 1d7c4f113..0734ef236 100644
--- a/docs/resources/llama-stack-spec.yaml
+++ b/docs/resources/llama-stack-spec.yaml
@@ -1,11 +1,12 @@
----
openapi: 3.1.0
info:
title: Llama Stack Specification
version: v1
- description: "This is the specification of the Llama Stack that provides\n \
- \ a set of endpoints and their corresponding interfaces that are tailored
- to\n best leverage Llama Models."
+ description: >-
+ This is the specification of the Llama Stack that provides a set of
+ endpoints and their corresponding interfaces that are tailored to best
+ leverage Llama Models.
servers:
- url: http://any-hosted-llama-stack.com
paths:
@@ -108,7 +109,9 @@ paths:
post:
responses:
'200':
- description: Chat completion response. **OR** SSE-stream of these events.
+ description: >-
+ If stream=False, returns a ChatCompletionResponse with the full completion.
+ If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk.
content:
text/event-stream:
schema:
@@ -117,6 +120,8 @@ paths:
- $ref: '#/components/schemas/ChatCompletionResponseStreamChunk'
tags:
- Inference
+ summary: >-
+ Generate a chat completion for the given messages using the specified model.
parameters: []
requestBody:
content:
@@ -128,7 +133,9 @@ paths:
post:
responses:
'200':
- description: Completion response. **OR** streamed completion response.
+ description: >-
+ If stream=False, returns a CompletionResponse with the full completion.
+ If stream=True, returns an SSE event stream of CompletionResponseStreamChunk.
content:
text/event-stream:
schema:
@@ -137,6 +144,8 @@ paths:
- $ref: '#/components/schemas/CompletionResponseStreamChunk'
tags:
- Inference
+ summary: >-
+ Generate a completion for the given content using the specified model.
parameters: []
requestBody:
content:
@@ -189,8 +198,9 @@ paths:
post:
responses:
'200':
- description: A single turn in an interaction with an Agentic System. **OR**
- streamed agent turn completion response.
+ description: >-
+ A single turn in an interaction with an Agentic System. **OR** streamed
+ agent turn completion response.
content:
text/event-stream:
schema:
@@ -279,13 +289,18 @@ paths:
post:
responses:
'200':
- description: OK
+ description: >-
+ An array of embeddings, one for each content. Each embedding is a list
+ of floats. The dimensionality of the embedding is model-specific; you
+ can check model metadata using /models/{model_id}.
content:
application/json:
schema:
$ref: '#/components/schemas/EmbeddingsResponse'
tags:
- Inference
+ summary: >-
+ Generate embeddings for content pieces using the specified model.
parameters: []
requestBody:
content:
@@ -709,7 +724,8 @@ paths:
description: OK
tags:
- ToolRuntime
- summary: Index documents so they can be used by the RAG system
+ summary: >-
+ Index documents so they can be used by the RAG system
parameters: []
requestBody:
content:
@@ -1109,7 +1125,8 @@ paths:
$ref: '#/components/schemas/RAGQueryResult'
tags:
- ToolRuntime
- summary: Query the RAG system for context; typically invoked by the agent
+ summary: >-
+ Query the RAG system for context; typically invoked by the agent
parameters: []
requestBody:
content:
@@ -1341,7 +1358,8 @@ paths:
tags:
- Inspect
parameters: []
-jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema
+jsonSchemaDialect: >-
+ https://json-schema.org/draft/2020-12/schema
components:
schemas:
AppendRowsRequest:
@@ -1365,13 +1383,6 @@ components:
required:
- dataset_id
- rows
- BuiltinTool:
- type: string
- enum:
- - brave_search
- - wolfram_alpha
- - photogen
- - code_interpreter
CompletionMessage:
type: object
properties:
@@ -1379,20 +1390,65 @@ components:
type: string
const: assistant
default: assistant
+ description: >-
+ Must be "assistant" to identify this as the model's response
content:
$ref: '#/components/schemas/InterleavedContent'
+ description: The content of the model's response
stop_reason:
- $ref: '#/components/schemas/StopReason'
+ type: string
+ enum:
+ - end_of_turn
+ - end_of_message
+ - out_of_tokens
+ description: >-
+ Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`:
+ The model finished generating the entire response. - `StopReason.end_of_message`:
+ The model finished generating but generated a partial response -- usually,
+ a tool call. The user may call the tool and continue the conversation
+ with the tool's response. - `StopReason.out_of_tokens`: The model ran
+ out of token budget.
tool_calls:
type: array
items:
$ref: '#/components/schemas/ToolCall'
+ description: >-
+ List of tool calls. Each tool call is a ToolCall object.
additionalProperties: false
required:
- role
- content
- stop_reason
- tool_calls
+ title: >-
+ A message containing the model's (assistant) response in a chat conversation.
+ GrammarResponseFormat:
+ type: object
+ properties:
+ type:
+ type: string
+ const: grammar
+ default: grammar
+ description: >-
+ Must be "grammar" to identify this format type
+ bnf:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ description: >-
+ The BNF grammar specification the response should conform to
+ additionalProperties: false
+ required:
+ - type
+ - bnf
+ title: >-
+ Configuration for grammar-guided response generation.
GreedySamplingStrategy:
type: object
properties:
@@ -1439,6 +1495,34 @@ components:
mapping:
image: '#/components/schemas/ImageContentItem'
text: '#/components/schemas/TextContentItem'
+ JsonSchemaResponseFormat:
+ type: object
+ properties:
+ type:
+ type: string
+ const: json_schema
+ default: json_schema
+ description: >-
+ Must be "json_schema" to identify this format type
+ json_schema:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ description: >-
+ The JSON schema the response should conform to. In a Python SDK, this
+ is often a `pydantic` model.
+ additionalProperties: false
+ required:
+ - type
+ - json_schema
+ title: >-
+ Configuration for JSON schema-guided response generation.
Message:
oneOf:
- $ref: '#/components/schemas/UserMessage'
@@ -1452,20 +1536,20 @@ components:
system: '#/components/schemas/SystemMessage'
tool: '#/components/schemas/ToolResponseMessage'
assistant: '#/components/schemas/CompletionMessage'
+ ResponseFormat:
+ oneOf:
+ - $ref: '#/components/schemas/JsonSchemaResponseFormat'
+ - $ref: '#/components/schemas/GrammarResponseFormat'
+ discriminator:
+ propertyName: type
+ mapping:
+ json_schema: '#/components/schemas/JsonSchemaResponseFormat'
+ grammar: '#/components/schemas/GrammarResponseFormat'
SamplingParams:
type: object
properties:
strategy:
- oneOf:
- - $ref: '#/components/schemas/GreedySamplingStrategy'
- - $ref: '#/components/schemas/TopPSamplingStrategy'
- - $ref: '#/components/schemas/TopKSamplingStrategy'
- discriminator:
- propertyName: type
- mapping:
- greedy: '#/components/schemas/GreedySamplingStrategy'
- top_p: '#/components/schemas/TopPSamplingStrategy'
- top_k: '#/components/schemas/TopKSamplingStrategy'
+ $ref: '#/components/schemas/SamplingStrategy'
max_tokens:
type: integer
default: 0
@@ -1475,12 +1559,17 @@ components:
additionalProperties: false
required:
- strategy
- StopReason:
- type: string
- enum:
- - end_of_turn
- - end_of_message
- - out_of_tokens
+ SamplingStrategy:
+ oneOf:
+ - $ref: '#/components/schemas/GreedySamplingStrategy'
+ - $ref: '#/components/schemas/TopPSamplingStrategy'
+ - $ref: '#/components/schemas/TopKSamplingStrategy'
+ discriminator:
+ propertyName: type
+ mapping:
+ greedy: '#/components/schemas/GreedySamplingStrategy'
+ top_p: '#/components/schemas/TopPSamplingStrategy'
+ top_k: '#/components/schemas/TopKSamplingStrategy'
SystemMessage:
type: object
properties:
@@ -1488,12 +1577,20 @@ components:
type: string
const: system
default: system
+ description: >-
+ Must be "system" to identify this as a system message
content:
$ref: '#/components/schemas/InterleavedContent'
+ description: >-
+ The content of the "system prompt". If multiple system messages are provided,
+ they are concatenated. The underlying Llama Stack code may also add other
+ system messages (for example, for formatting tool definitions).
additionalProperties: false
required:
- role
- content
+ title: >-
+ A system message providing instructions or context to the model.
TextContentItem:
type: object
properties:
@@ -1514,7 +1611,12 @@ components:
type: string
tool_name:
oneOf:
- - $ref: '#/components/schemas/BuiltinTool'
+ - type: string
+ enum:
+ - brave_search
+ - wolfram_alpha
+ - photogen
+ - code_interpreter
- type: string
arguments:
type: object
@@ -1546,17 +1648,17 @@ components:
- call_id
- tool_name
- arguments
- ToolChoice:
- type: string
- enum:
- - auto
- - required
ToolDefinition:
type: object
properties:
tool_name:
oneOf:
- - $ref: '#/components/schemas/BuiltinTool'
+ - type: string
+ enum:
+ - brave_search
+ - wolfram_alpha
+ - photogen
+ - code_interpreter
- type: string
description:
type: string
@@ -1588,22 +1690,6 @@ components:
additionalProperties: false
required:
- param_type
- ToolPromptFormat:
- type: string
- enum:
- - json
- - function_tag
- - python_list
- title: This Enum refers to the prompt format for calling custom / zero shot
- tools
- description: "`json` --\n Refers to the json format for calling tools.\n\
- \ The json format takes the form like\n {\n \"type\": \"function\"\
- ,\n \"function\" : {\n \"name\": \"function_name\",\n \
- \ \"description\": \"function_description\",\n \"parameters\"\
- : {...}\n }\n }\n\n`function_tag` --\n This is an example of
- how you could define\n your own user defined format for making tool calls.\n\
- \ The function_tag format looks like this,\n (parameters)\n
- \nThe detailed prompts for each of these formats are added to llama cli"
ToolResponseMessage:
type: object
properties:
@@ -1611,20 +1697,33 @@ components:
type: string
const: tool
default: tool
+ description: >-
+ Must be "tool" to identify this as a tool response
call_id:
type: string
+ description: >-
+ Unique identifier for the tool call this response is for
tool_name:
oneOf:
- - $ref: '#/components/schemas/BuiltinTool'
- type: string
+ enum:
+ - brave_search
+ - wolfram_alpha
+ - photogen
+ - code_interpreter
+ - type: string
+ description: Name of the tool that was called
content:
$ref: '#/components/schemas/InterleavedContent'
+ description: The response content from the tool
additionalProperties: false
required:
- role
- call_id
- tool_name
- content
+ title: >-
+ A message representing the result of a tool invocation.
TopKSamplingStrategy:
type: object
properties:
@@ -1668,14 +1767,23 @@ components:
type: string
const: user
default: user
+ description: >-
+ Must be "user" to identify this as a user message
content:
$ref: '#/components/schemas/InterleavedContent'
+ description: >-
+ The content of the message, which can include text and other media
context:
$ref: '#/components/schemas/InterleavedContent'
+ description: >-
+ (Optional) This field is used internally by Llama Stack to pass RAG context.
+ This field may be removed in the API in the future.
additionalProperties: false
required:
- role
- content
+ title: >-
+ A message from the user in a chat conversation.
BatchChatCompletionRequest:
type: object
properties:
@@ -1694,15 +1802,32 @@ components:
items:
$ref: '#/components/schemas/ToolDefinition'
tool_choice:
- $ref: '#/components/schemas/ToolChoice'
+ type: string
+ enum:
+ - auto
+ - required
+ title: >-
+        Whether tool use is required or automatic. This is a hint to the model;
+        it may not be followed. It depends on the Instruction Following capabilities
+ of the model.
tool_prompt_format:
- $ref: '#/components/schemas/ToolPromptFormat'
+ type: string
+ enum:
+ - json
+ - function_tag
+ - python_list
+ title: >-
+ Prompt format for calling custom / zero shot tools.
+ response_format:
+ $ref: '#/components/schemas/ResponseFormat'
logprobs:
type: object
properties:
top_k:
type: integer
default: 0
+ description: >-
+ How many tokens (for each position) to return log probabilities for.
additionalProperties: false
additionalProperties: false
required:
@@ -1711,13 +1836,42 @@ components:
BatchChatCompletionResponse:
type: object
properties:
- completion_message_batch:
+ batch:
type: array
items:
- $ref: '#/components/schemas/CompletionMessage'
+ $ref: '#/components/schemas/ChatCompletionResponse'
additionalProperties: false
required:
- - completion_message_batch
+ - batch
+ ChatCompletionResponse:
+ type: object
+ properties:
+ completion_message:
+ $ref: '#/components/schemas/CompletionMessage'
+ description: The complete response message
+ logprobs:
+ type: array
+ items:
+ $ref: '#/components/schemas/TokenLogProbs'
+ description: >-
+ Optional log probabilities for generated tokens
+ additionalProperties: false
+ required:
+ - completion_message
+ title: Response from a chat completion request.
+ TokenLogProbs:
+ type: object
+ properties:
+ logprobs_by_token:
+ type: object
+ additionalProperties:
+ type: number
+ description: >-
+ Dictionary mapping tokens to their log probabilities
+ additionalProperties: false
+ required:
+ - logprobs_by_token
+ title: Log probabilities for generated tokens.
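Note the shape change here: `completion_message_batch` (a list of bare CompletionMessages) becomes `batch` (a list of full ChatCompletionResponses, each with optional logprobs). A hedged sketch of walking the new shape:

```python
# A hypothetical response dict shaped like the new BatchChatCompletionResponse.
response = {
    "batch": [
        {
            "completion_message": {
                "role": "assistant",
                "content": "Hello!",
                "stop_reason": "end_of_turn",
                "tool_calls": [],
            },
            "logprobs": None,
        }
    ]
}

for item in response["batch"]:
    message = item["completion_message"]
    print(message["content"], message["stop_reason"])
```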
BatchCompletionRequest:
type: object
properties:
@@ -1729,12 +1883,16 @@ components:
$ref: '#/components/schemas/InterleavedContent'
sampling_params:
$ref: '#/components/schemas/SamplingParams'
+ response_format:
+ $ref: '#/components/schemas/ResponseFormat'
logprobs:
type: object
properties:
top_k:
type: integer
default: 0
+ description: >-
+ How many tokens (for each position) to return log probabilities for.
additionalProperties: false
additionalProperties: false
required:
@@ -1743,13 +1901,37 @@ components:
BatchCompletionResponse:
type: object
properties:
- completion_message_batch:
+ batch:
type: array
items:
- $ref: '#/components/schemas/CompletionMessage'
+ $ref: '#/components/schemas/CompletionResponse'
additionalProperties: false
required:
- - completion_message_batch
+ - batch
+ CompletionResponse:
+ type: object
+ properties:
+ content:
+ type: string
+ description: The generated completion text
+ stop_reason:
+ type: string
+ enum:
+ - end_of_turn
+ - end_of_message
+ - out_of_tokens
+ description: Reason why generation stopped
+ logprobs:
+ type: array
+ items:
+ $ref: '#/components/schemas/TokenLogProbs'
+ description: >-
+ Optional log probabilities for generated tokens
+ additionalProperties: false
+ required:
+ - content
+ - stop_reason
+ title: Response from a completion request.
CancelTrainingJobRequest:
type: object
properties:
@@ -1758,137 +1940,124 @@ components:
additionalProperties: false
required:
- job_uuid
- GrammarResponseFormat:
- type: object
- properties:
- type:
- type: string
- const: grammar
- default: grammar
- bnf:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- additionalProperties: false
- required:
- - type
- - bnf
- JsonSchemaResponseFormat:
- type: object
- properties:
- type:
- type: string
- const: json_schema
- default: json_schema
- json_schema:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- additionalProperties: false
- required:
- - type
- - json_schema
- ResponseFormat:
- oneOf:
- - $ref: '#/components/schemas/JsonSchemaResponseFormat'
- - $ref: '#/components/schemas/GrammarResponseFormat'
- discriminator:
- propertyName: type
- mapping:
- json_schema: '#/components/schemas/JsonSchemaResponseFormat'
- grammar: '#/components/schemas/GrammarResponseFormat'
ChatCompletionRequest:
type: object
properties:
model_id:
type: string
+ description: >-
+ The identifier of the model to use. The model must be registered with
+ Llama Stack and available via the /models endpoint.
messages:
type: array
items:
$ref: '#/components/schemas/Message'
+ description: List of messages in the conversation
sampling_params:
$ref: '#/components/schemas/SamplingParams'
+ description: >-
+ Parameters to control the sampling strategy
tools:
type: array
items:
$ref: '#/components/schemas/ToolDefinition'
+ description: >-
+ (Optional) List of tool definitions available to the model
tool_choice:
- $ref: '#/components/schemas/ToolChoice'
+ type: string
+ enum:
+ - auto
+ - required
+ description: >-
+ (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
tool_prompt_format:
- $ref: '#/components/schemas/ToolPromptFormat'
+ type: string
+ enum:
+ - json
+ - function_tag
+ - python_list
+ description: >-
+ (Optional) Instructs the model how to format tool calls. By default, Llama
+ Stack will attempt to use a format that is best adapted to the model.
+ - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
+        - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name>
+ tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python
+ syntax -- a list of function calls.
response_format:
$ref: '#/components/schemas/ResponseFormat'
+ description: >-
+ (Optional) Grammar specification for guided (structured) decoding. There
+ are two options: - `ResponseFormat.json_schema`: The grammar is a JSON
+ schema. Most providers support this format. - `ResponseFormat.grammar`:
+ The grammar is a BNF grammar. This format is more flexible, but not all
+ providers support it.
stream:
type: boolean
+ description: >-
+ (Optional) If True, generate an SSE event stream of the response. Defaults
+ to False.
logprobs:
type: object
properties:
top_k:
type: integer
default: 0
+ description: >-
+ How many tokens (for each position) to return log probabilities for.
additionalProperties: false
+ description: >-
+ (Optional) If specified, log probabilities for each token position will
+ be returned.
additionalProperties: false
required:
- model_id
- messages
- ChatCompletionResponse:
- type: object
- properties:
- completion_message:
- $ref: '#/components/schemas/CompletionMessage'
- logprobs:
- type: array
- items:
- $ref: '#/components/schemas/TokenLogProbs'
- additionalProperties: false
- required:
- - completion_message
- title: Chat completion response.
ChatCompletionResponseEvent:
type: object
properties:
event_type:
- $ref: '#/components/schemas/ChatCompletionResponseEventType'
+ type: string
+ enum:
+ - start
+ - complete
+ - progress
+ description: Type of the event
delta:
$ref: '#/components/schemas/ContentDelta'
+ description: >-
+ Content generated since last event. This can be one or more tokens, or
+ a tool call.
logprobs:
type: array
items:
$ref: '#/components/schemas/TokenLogProbs'
+ description: >-
+ Optional log probabilities for generated tokens
stop_reason:
- $ref: '#/components/schemas/StopReason'
+ type: string
+ enum:
+ - end_of_turn
+ - end_of_message
+ - out_of_tokens
+ description: >-
+ Optional reason why generation stopped, if complete
additionalProperties: false
required:
- event_type
- delta
- title: Chat completion response event.
- ChatCompletionResponseEventType:
- type: string
- enum:
- - start
- - complete
- - progress
+ title: >-
+ An event during chat completion generation.
ChatCompletionResponseStreamChunk:
type: object
properties:
event:
$ref: '#/components/schemas/ChatCompletionResponseEvent'
+ description: The event containing the new content
additionalProperties: false
required:
- event
- title: SSE-stream of these events.
+ title: >-
+ A chunk of a streamed chat completion response.
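A minimal consumer of the renamed stream chunk type; how the async iterator is obtained (e.g. from a streaming chat-completion call) is assumed:

```python
async def print_chat_stream(chunks):
    # chunks: async iterator of ChatCompletionResponseStreamChunk-like objects.
    async for chunk in chunks:
        event = chunk.event
        delta = event.delta  # a ContentDelta: text, image, or tool_call
        if getattr(delta, "type", None) == "text":
            print(delta.text, end="", flush=True)
        if event.stop_reason is not None:
            print(f"\n[stopped: {event.stop_reason}]")
```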
ContentDelta:
oneOf:
- $ref: '#/components/schemas/TextDelta'
@@ -1927,16 +2096,6 @@ components:
required:
- type
- text
- TokenLogProbs:
- type: object
- properties:
- logprobs_by_token:
- type: object
- additionalProperties:
- type: number
- additionalProperties: false
- required:
- - logprobs_by_token
ToolCallDelta:
type: object
properties:
@@ -1949,74 +2108,83 @@ components:
- type: string
- $ref: '#/components/schemas/ToolCall'
parse_status:
- $ref: '#/components/schemas/ToolCallParseStatus'
+ type: string
+ enum:
+ - started
+ - in_progress
+ - failed
+ - succeeded
additionalProperties: false
required:
- type
- tool_call
- parse_status
- ToolCallParseStatus:
- type: string
- enum:
- - started
- - in_progress
- - failed
- - succeeded
CompletionRequest:
type: object
properties:
model_id:
type: string
+ description: >-
+ The identifier of the model to use. The model must be registered with
+ Llama Stack and available via the /models endpoint.
content:
$ref: '#/components/schemas/InterleavedContent'
+ description: The content to generate a completion for
sampling_params:
$ref: '#/components/schemas/SamplingParams'
+ description: >-
+ (Optional) Parameters to control the sampling strategy
response_format:
$ref: '#/components/schemas/ResponseFormat'
+ description: >-
+ (Optional) Grammar specification for guided (structured) decoding
stream:
type: boolean
+ description: >-
+ (Optional) If True, generate an SSE event stream of the response. Defaults
+ to False.
logprobs:
type: object
properties:
top_k:
type: integer
default: 0
+ description: >-
+ How many tokens (for each position) to return log probabilities for.
additionalProperties: false
+ description: >-
+ (Optional) If specified, log probabilities for each token position will
+ be returned.
additionalProperties: false
required:
- model_id
- content
- CompletionResponse:
- type: object
- properties:
- content:
- type: string
- stop_reason:
- $ref: '#/components/schemas/StopReason'
- logprobs:
- type: array
- items:
- $ref: '#/components/schemas/TokenLogProbs'
- additionalProperties: false
- required:
- - content
- - stop_reason
- title: Completion response.
CompletionResponseStreamChunk:
type: object
properties:
delta:
type: string
+ description: >-
+ New content generated since last chunk. This can be one or more tokens.
stop_reason:
- $ref: '#/components/schemas/StopReason'
+ type: string
+ enum:
+ - end_of_turn
+ - end_of_message
+ - out_of_tokens
+ description: >-
+ Optional reason why generation stopped, if complete
logprobs:
type: array
items:
$ref: '#/components/schemas/TokenLogProbs'
+ description: >-
+ Optional log probabilities for generated tokens
additionalProperties: false
required:
- delta
- title: streamed completion response.
+ title: >-
+ A chunk of a streamed completion response.
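Unlike the chat variant, the raw completion chunk carries a plain string `delta`, so accumulation is simple concatenation (sketch; the source of the iterator is assumed):

```python
async def collect_completion(chunks):
    # chunks: async iterator of CompletionResponseStreamChunk-like objects.
    text = ""
    async for chunk in chunks:
        text += chunk.delta  # delta is a plain string for raw completions
        if chunk.stop_reason is not None:
            break
    return text
```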
AgentConfig:
type: object
properties:
@@ -2039,10 +2207,23 @@ components:
items:
$ref: '#/components/schemas/ToolDef'
tool_choice:
- $ref: '#/components/schemas/ToolChoice'
+ type: string
+ enum:
+ - auto
+ - required
+ title: >-
+        Whether tool use is required or automatic. This is a hint to the model;
+        it may not be followed. It depends on the Instruction Following capabilities
+ of the model.
default: auto
tool_prompt_format:
- $ref: '#/components/schemas/ToolPromptFormat'
+ type: string
+ enum:
+ - json
+ - function_tag
+ - python_list
+ title: >-
+ Prompt format for calling custom / zero shot tools.
max_infer_iters:
type: integer
default: 10
@@ -2485,7 +2666,12 @@ components:
type: string
tool_name:
oneOf:
- - $ref: '#/components/schemas/BuiltinTool'
+ - type: string
+ enum:
+ - brave_search
+ - wolfram_alpha
+ - photogen
+ - code_interpreter
- type: string
content:
$ref: '#/components/schemas/InterleavedContent'
@@ -2558,7 +2744,8 @@ components:
- output_message
- output_attachments
- started_at
- title: A single turn in an interaction with an Agentic System.
+ title: >-
+ A single turn in an interaction with an Agentic System.
ViolationLevel:
type: string
enum:
@@ -2570,10 +2757,17 @@ components:
properties:
model_id:
type: string
+ description: >-
+ The identifier of the model to use. The model must be an embedding model
+ registered with Llama Stack and available via the /models endpoint.
contents:
type: array
items:
$ref: '#/components/schemas/InterleavedContent'
+ description: >-
+ List of contents to generate embeddings for. Note that content can be
+ multimodal. The behavior depends on the model and provider. Some models
+ may only support text.
additionalProperties: false
required:
- model_id
@@ -2587,9 +2781,15 @@ components:
type: array
items:
type: number
+ description: >-
+ List of embedding vectors, one per input content. Each embedding is a
+ list of floats. The dimensionality of the embedding is model-specific;
+ you can check model metadata using /models/{model_id}
additionalProperties: false
required:
- embeddings
+ title: >-
+ Response containing generated embeddings.
AgentCandidate:
type: object
properties:
@@ -2845,7 +3045,8 @@ components:
- session_name
- turns
- started_at
- title: A single session of an interaction with an Agentic System.
+ title: >-
+ A single session of an interaction with an Agentic System.
AgentStepResponse:
type: object
properties:
@@ -3194,7 +3395,8 @@ components:
- provider_resource_id
- provider_id
- type
- title: A safety shield resource that can be used to check content
+ title: >-
+ A safety shield resource that can be used to check content
Span:
type: object
properties:
@@ -4684,8 +4886,9 @@ components:
additionalProperties: false
required:
- synthetic_data
- title: Response from the synthetic data generation. Batch of (prompt, response,
- score) tuples that pass the threshold.
+ title: >-
+ Response from the synthetic data generation. Batch of (prompt, response, score)
+ tuples that pass the threshold.
VersionInfo:
type: object
properties:
@@ -4698,415 +4901,54 @@ components:
security:
- Default: []
tags:
- - name: AgentCandidate
- description: ''
- - name: AgentConfig
- description: ''
- - name: AgentCreateResponse
- description: ''
- - name: AgentSessionCreateResponse
- description: ''
- - name: AgentStepResponse
- description: ''
- - name: AgentTool
- description: ''
- - name: AgentTurnInputType
- description: ''
- - name: AgentTurnResponseEvent
- description: ''
- - name: AgentTurnResponseEventPayload
- description: ''
- - name: AgentTurnResponseStepCompletePayload
- description: ''
- - name: AgentTurnResponseStepProgressPayload
- description: ''
- - name: AgentTurnResponseStepStartPayload
- description: ''
- - name: AgentTurnResponseStreamChunk
- description: streamed agent turn completion response.
- - name: AgentTurnResponseTurnCompletePayload
- description: ''
- - name: AgentTurnResponseTurnStartPayload
- description: ''
- name: Agents
- - name: AggregationFunctionType
- description: ''
- - name: AlgorithmConfig
- description: ''
- - name: AppEvalTaskConfig
- description: ''
- - name: AppendRowsRequest
- description: ''
- - name: ArrayType
- description: ''
- - name: BasicScoringFnParams
- description: ''
- - name: BatchChatCompletionRequest
- description: ''
- - name: BatchChatCompletionResponse
- description: ''
- - name: BatchCompletionRequest
- description: ''
- - name: BatchCompletionResponse
- description: ''
+ description: >-
+ Main functionalities provided by this API:
+
+ - Create agents with specific instructions and ability to use tools.
+
+ - Interactions with agents are grouped into sessions ("threads"), and each interaction
+ is called a "turn".
+
+ - Agents can be provided with various tools (see the ToolGroups and ToolRuntime
+ APIs for more details).
+
+ - Agents can be provided with various shields (see the Safety API for more details).
+
+ - Agents can also use Memory to retrieve information from knowledge bases. See
+ the RAG Tool and Vector IO APIs for more details.
+ x-displayName: >-
+ Agents API for creating and interacting with agentic systems.
- name: BatchInference (Coming Soon)
- - name: BenchmarkEvalTaskConfig
- description: ''
- - name: BooleanType
- description: ''
- - name: BuiltinTool
- description: ''
- - name: CancelTrainingJobRequest
- description: ''
- - name: ChatCompletionInputType
- description: ''
- - name: ChatCompletionRequest
- description: ''
- - name: ChatCompletionResponse
- description: Chat completion response.
- - name: ChatCompletionResponseEvent
- description: Chat completion response event.
- - name: ChatCompletionResponseEventType
- description: ''
- - name: ChatCompletionResponseStreamChunk
- description: SSE-stream of these events.
- - name: Checkpoint
- description: Checkpoint created during training runs
- - name: CompletionInputType
- description: ''
- - name: CompletionMessage
- description: ''
- - name: CompletionRequest
- description: ''
- - name: CompletionResponse
- description: Completion response.
- - name: CompletionResponseStreamChunk
- description: streamed completion response.
- - name: ContentDelta
- description: ''
- - name: CreateAgentRequest
- description: ''
- - name: CreateAgentSessionRequest
- description: ''
- - name: CreateAgentTurnRequest
- description: ''
- - name: DPOAlignmentConfig
- description: ''
- - name: DataConfig
- description: ''
- - name: Dataset
- description: ''
- - name: DatasetFormat
- description: ''
- name: DatasetIO
- name: Datasets
- - name: DefaultRAGQueryGeneratorConfig
- description: ''
- - name: EfficiencyConfig
- description: ''
- - name: EmbeddingsRequest
- description: ''
- - name: EmbeddingsResponse
- description: ''
- name: Eval
- - name: EvalCandidate
- description: ''
- - name: EvalTask
- description: ''
- - name: EvalTaskConfig
- description: ''
- name: EvalTasks
- - name: EvaluateResponse
- description: ''
- - name: EvaluateRowsRequest
- description: ''
- - name: Event
- description: ''
- - name: GrammarResponseFormat
- description: ''
- - name: GreedySamplingStrategy
- description: ''
- - name: HealthInfo
- description: ''
- - name: ImageContentItem
- description: ''
- - name: ImageDelta
- description: ''
- name: Inference
- - name: InferenceStep
- description: ''
- - name: InsertChunksRequest
- description: ''
- - name: InsertRequest
- description: ''
+ description: >-
+ This API provides the raw interface to the underlying models. Two kinds of models
+ are supported:
+
+ - LLM models: these models generate "raw" and "chat" (conversational) completions.
+
+ - Embedding models: these models generate embeddings to be used for semantic
+ search.
+ x-displayName: >-
+ Llama Stack Inference API for generating completions, chat completions, and
+ embeddings.
- name: Inspect
- - name: InterleavedContent
- description: ''
- - name: InterleavedContentItem
- description: ''
- - name: InvokeToolRequest
- description: ''
- - name: Job
- description: ''
- - name: JobStatus
- description: ''
- - name: JsonSchemaResponseFormat
- description: ''
- - name: JsonType
- description: ''
- - name: LLMAsJudgeScoringFnParams
- description: ''
- - name: LLMRAGQueryGeneratorConfig
- description: ''
- - name: ListDatasetsResponse
- description: ''
- - name: ListEvalTasksResponse
- description: ''
- - name: ListModelsResponse
- description: ''
- - name: ListPostTrainingJobsResponse
- description: ''
- - name: ListProvidersResponse
- description: ''
- - name: ListRoutesResponse
- description: ''
- - name: ListScoringFunctionsResponse
- description: ''
- - name: ListShieldsResponse
- description: ''
- - name: ListToolGroupsResponse
- description: ''
- - name: ListToolsResponse
- description: ''
- - name: ListVectorDBsResponse
- description: ''
- - name: LogEventRequest
- description: ''
- - name: LogSeverity
- description: ''
- - name: LoraFinetuningConfig
- description: ''
- - name: MemoryRetrievalStep
- description: ''
- - name: Message
- description: ''
- - name: MetricEvent
- description: ''
- - name: Model
- description: ''
- - name: ModelCandidate
- description: ''
- - name: ModelType
- description: ''
- name: Models
- - name: NumberType
- description: ''
- - name: ObjectType
- description: ''
- - name: OptimizerConfig
- description: ''
- - name: OptimizerType
- description: ''
- - name: PaginatedRowsResult
- description: ''
- - name: ParamType
- description: ''
- name: PostTraining (Coming Soon)
- - name: PostTrainingJob
- description: ''
- - name: PostTrainingJobArtifactsResponse
- description: Artifacts of a finetuning job.
- - name: PostTrainingJobStatusResponse
- description: Status of a finetuning job.
- - name: PreferenceOptimizeRequest
- description: ''
- - name: ProviderInfo
- description: ''
- - name: QATFinetuningConfig
- description: ''
- - name: QueryChunksRequest
- description: ''
- - name: QueryChunksResponse
- description: ''
- - name: QueryCondition
- description: ''
- - name: QueryConditionOp
- description: ''
- - name: QueryRequest
- description: ''
- - name: QuerySpanTreeResponse
- description: ''
- - name: QuerySpansResponse
- description: ''
- - name: QueryTracesResponse
- description: ''
- - name: RAGDocument
- description: ''
- - name: RAGQueryConfig
- description: ''
- - name: RAGQueryGeneratorConfig
- description: ''
- - name: RAGQueryResult
- description: ''
- - name: RegexParserScoringFnParams
- description: ''
- - name: RegisterDatasetRequest
- description: ''
- - name: RegisterEvalTaskRequest
- description: ''
- - name: RegisterModelRequest
- description: ''
- - name: RegisterScoringFunctionRequest
- description: ''
- - name: RegisterShieldRequest
- description: ''
- - name: RegisterToolGroupRequest
- description: ''
- - name: RegisterVectorDbRequest
- description: ''
- - name: ResponseFormat
- description: ''
- - name: RouteInfo
- description: ''
- - name: RunEvalRequest
- description: ''
- - name: RunShieldRequest
- description: ''
- - name: RunShieldResponse
- description: ''
- name: Safety
- - name: SafetyViolation
- description: ''
- - name: SamplingParams
- description: ''
- - name: SaveSpansToDatasetRequest
- description: ''
- - name: ScoreBatchRequest
- description: ''
- - name: ScoreBatchResponse
- description: ''
- - name: ScoreRequest
- description: ''
- - name: ScoreResponse
- description: ''
- name: Scoring
- - name: ScoringFn
- description: ''
- - name: ScoringFnParams
- description: ''
- name: ScoringFunctions
- - name: ScoringResult
- description: ''
- - name: Session
- description: A single session of an interaction with an Agentic System.
- - name: Shield
- description: A safety shield resource that can be used to check content
- - name: ShieldCallStep
- description: ''
- name: Shields
- - name: Span
- description: ''
- - name: SpanEndPayload
- description: ''
- - name: SpanStartPayload
- description: ''
- - name: SpanStatus
- description: ''
- - name: SpanWithStatus
- description: ''
- - name: StopReason
- description: ''
- - name: StringType
- description: ''
- - name: StructuredLogEvent
- description: ''
- - name: StructuredLogPayload
- description: ''
- - name: SupervisedFineTuneRequest
- description: ''
- - name: SyntheticDataGenerateRequest
- description: ''
- name: SyntheticDataGeneration (Coming Soon)
- - name: SyntheticDataGenerationResponse
- description: Response from the synthetic data generation. Batch of (prompt, response,
- score) tuples that pass the threshold.
- - name: SystemMessage
- description: ''
- name: Telemetry
- - name: TextContentItem
- description: ''
- - name: TextDelta
- description: ''
- - name: TokenLogProbs
- description: ''
- - name: Tool
- description: ''
- - name: ToolCall
- description: ''
- - name: ToolCallDelta
- description: ''
- - name: ToolCallParseStatus
- description: ''
- - name: ToolChoice
- description: ''
- - name: ToolDef
- description: ''
- - name: ToolDefinition
- description: ''
- - name: ToolExecutionStep
- description: ''
- - name: ToolGroup
- description: ''
- name: ToolGroups
- - name: ToolHost
- description: ''
- - name: ToolInvocationResult
- description: ''
- - name: ToolParamDefinition
- description: ''
- - name: ToolParameter
- description: ''
- - name: ToolPromptFormat
- description: "This Enum refers to the prompt format for calling custom / zero
- shot tools\n\n`json` --\n Refers to the json format for calling tools.\n\
- \ The json format takes the form like\n {\n \"type\": \"function\"\
- ,\n \"function\" : {\n \"name\": \"function_name\",\n \
- \ \"description\": \"function_description\",\n \"parameters\"\
- : {...}\n }\n }\n\n`function_tag` --\n This is an example of how
- you could define\n your own user defined format for making tool calls.\n\
- \ The function_tag format looks like this,\n (parameters)\n
- \nThe detailed prompts for each of these formats are added to llama cli"
- - name: ToolResponse
- description: ''
- - name: ToolResponseMessage
- description: ''
- name: ToolRuntime
- - name: TopKSamplingStrategy
- description: ''
- - name: TopPSamplingStrategy
- description: ''
- - name: Trace
- description: ''
- - name: TrainingConfig
- description: ''
- - name: Turn
- description: A single turn in an interaction with an Agentic System.
- - name: URL
- description: ''
- - name: UnionType
- description: ''
- - name: UnstructuredLogEvent
- description: ''
- - name: UserMessage
- description: ''
- - name: VectorDB
- description: ''
- name: VectorDBs
- name: VectorIO
- - name: VersionInfo
- description: ''
- - name: ViolationLevel
- description: ''
x-tagGroups:
- name: Operations
tags:
@@ -5130,195 +4972,3 @@ x-tagGroups:
- ToolRuntime
- VectorDBs
- VectorIO
- - name: Types
- tags:
- - AgentCandidate
- - AgentConfig
- - AgentCreateResponse
- - AgentSessionCreateResponse
- - AgentStepResponse
- - AgentTool
- - AgentTurnInputType
- - AgentTurnResponseEvent
- - AgentTurnResponseEventPayload
- - AgentTurnResponseStepCompletePayload
- - AgentTurnResponseStepProgressPayload
- - AgentTurnResponseStepStartPayload
- - AgentTurnResponseStreamChunk
- - AgentTurnResponseTurnCompletePayload
- - AgentTurnResponseTurnStartPayload
- - AggregationFunctionType
- - AlgorithmConfig
- - AppEvalTaskConfig
- - AppendRowsRequest
- - ArrayType
- - BasicScoringFnParams
- - BatchChatCompletionRequest
- - BatchChatCompletionResponse
- - BatchCompletionRequest
- - BatchCompletionResponse
- - BenchmarkEvalTaskConfig
- - BooleanType
- - BuiltinTool
- - CancelTrainingJobRequest
- - ChatCompletionInputType
- - ChatCompletionRequest
- - ChatCompletionResponse
- - ChatCompletionResponseEvent
- - ChatCompletionResponseEventType
- - ChatCompletionResponseStreamChunk
- - Checkpoint
- - CompletionInputType
- - CompletionMessage
- - CompletionRequest
- - CompletionResponse
- - CompletionResponseStreamChunk
- - ContentDelta
- - CreateAgentRequest
- - CreateAgentSessionRequest
- - CreateAgentTurnRequest
- - DPOAlignmentConfig
- - DataConfig
- - Dataset
- - DatasetFormat
- - DefaultRAGQueryGeneratorConfig
- - EfficiencyConfig
- - EmbeddingsRequest
- - EmbeddingsResponse
- - EvalCandidate
- - EvalTask
- - EvalTaskConfig
- - EvaluateResponse
- - EvaluateRowsRequest
- - Event
- - GrammarResponseFormat
- - GreedySamplingStrategy
- - HealthInfo
- - ImageContentItem
- - ImageDelta
- - InferenceStep
- - InsertChunksRequest
- - InsertRequest
- - InterleavedContent
- - InterleavedContentItem
- - InvokeToolRequest
- - Job
- - JobStatus
- - JsonSchemaResponseFormat
- - JsonType
- - LLMAsJudgeScoringFnParams
- - LLMRAGQueryGeneratorConfig
- - ListDatasetsResponse
- - ListEvalTasksResponse
- - ListModelsResponse
- - ListPostTrainingJobsResponse
- - ListProvidersResponse
- - ListRoutesResponse
- - ListScoringFunctionsResponse
- - ListShieldsResponse
- - ListToolGroupsResponse
- - ListToolsResponse
- - ListVectorDBsResponse
- - LogEventRequest
- - LogSeverity
- - LoraFinetuningConfig
- - MemoryRetrievalStep
- - Message
- - MetricEvent
- - Model
- - ModelCandidate
- - ModelType
- - NumberType
- - ObjectType
- - OptimizerConfig
- - OptimizerType
- - PaginatedRowsResult
- - ParamType
- - PostTrainingJob
- - PostTrainingJobArtifactsResponse
- - PostTrainingJobStatusResponse
- - PreferenceOptimizeRequest
- - ProviderInfo
- - QATFinetuningConfig
- - QueryChunksRequest
- - QueryChunksResponse
- - QueryCondition
- - QueryConditionOp
- - QueryRequest
- - QuerySpanTreeResponse
- - QuerySpansResponse
- - QueryTracesResponse
- - RAGDocument
- - RAGQueryConfig
- - RAGQueryGeneratorConfig
- - RAGQueryResult
- - RegexParserScoringFnParams
- - RegisterDatasetRequest
- - RegisterEvalTaskRequest
- - RegisterModelRequest
- - RegisterScoringFunctionRequest
- - RegisterShieldRequest
- - RegisterToolGroupRequest
- - RegisterVectorDbRequest
- - ResponseFormat
- - RouteInfo
- - RunEvalRequest
- - RunShieldRequest
- - RunShieldResponse
- - SafetyViolation
- - SamplingParams
- - SaveSpansToDatasetRequest
- - ScoreBatchRequest
- - ScoreBatchResponse
- - ScoreRequest
- - ScoreResponse
- - ScoringFn
- - ScoringFnParams
- - ScoringResult
- - Session
- - Shield
- - ShieldCallStep
- - Span
- - SpanEndPayload
- - SpanStartPayload
- - SpanStatus
- - SpanWithStatus
- - StopReason
- - StringType
- - StructuredLogEvent
- - StructuredLogPayload
- - SupervisedFineTuneRequest
- - SyntheticDataGenerateRequest
- - SyntheticDataGenerationResponse
- - SystemMessage
- - TextContentItem
- - TextDelta
- - TokenLogProbs
- - Tool
- - ToolCall
- - ToolCallDelta
- - ToolCallParseStatus
- - ToolChoice
- - ToolDef
- - ToolDefinition
- - ToolExecutionStep
- - ToolGroup
- - ToolHost
- - ToolInvocationResult
- - ToolParamDefinition
- - ToolParameter
- - ToolPromptFormat
- - ToolResponse
- - ToolResponseMessage
- - TopKSamplingStrategy
- - TopPSamplingStrategy
- - Trace
- - TrainingConfig
- - Turn
- - URL
- - UnionType
- - UnstructuredLogEvent
- - UserMessage
- - VectorDB
- - VersionInfo
- - ViolationLevel
diff --git a/llama_stack/apis/agents/agents.py b/llama_stack/apis/agents/agents.py
index d41abc846..68eecaccb 100644
--- a/llama_stack/apis/agents/agents.py
+++ b/llama_stack/apis/agents/agents.py
@@ -297,6 +297,16 @@ class AgentStepResponse(BaseModel):
@runtime_checkable
@trace_protocol
class Agents(Protocol):
+ """Agents API for creating and interacting with agentic systems.
+
+ Main functionalities provided by this API:
+ - Create agents with specific instructions and ability to use tools.
+ - Interactions with agents are grouped into sessions ("threads"), and each interaction is called a "turn".
+ - Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).
+ - Agents can be provided with various shields (see the Safety API for more details).
+ - Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.
+ """
+
@webmethod(route="/agents", method="POST")
async def create_agent(
self,
diff --git a/llama_stack/apis/batch_inference/batch_inference.py b/llama_stack/apis/batch_inference/batch_inference.py
index ca5ba059f..413c81c5a 100644
--- a/llama_stack/apis/batch_inference/batch_inference.py
+++ b/llama_stack/apis/batch_inference/batch_inference.py
@@ -7,13 +7,15 @@
from typing import List, Optional, Protocol, runtime_checkable
from llama_models.schema_utils import json_schema_type, webmethod
-from pydantic import BaseModel, Field
+from pydantic import BaseModel
from llama_stack.apis.inference import (
- CompletionMessage,
+ ChatCompletionResponse,
+ CompletionResponse,
InterleavedContent,
LogProbConfig,
Message,
+ ResponseFormat,
SamplingParams,
ToolChoice,
ToolDefinition,
@@ -21,35 +23,14 @@ from llama_stack.apis.inference import (
)
-@json_schema_type
-class BatchCompletionRequest(BaseModel):
- model: str
- content_batch: List[InterleavedContent]
- sampling_params: Optional[SamplingParams] = SamplingParams()
- logprobs: Optional[LogProbConfig] = None
-
-
@json_schema_type
class BatchCompletionResponse(BaseModel):
- completion_message_batch: List[CompletionMessage]
-
-
-@json_schema_type
-class BatchChatCompletionRequest(BaseModel):
- model: str
- messages_batch: List[List[Message]]
- sampling_params: Optional[SamplingParams] = SamplingParams()
-
- # zero-shot tool definitions as input to the model
- tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
- tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
- tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
- logprobs: Optional[LogProbConfig] = None
+ batch: List[CompletionResponse]
@json_schema_type
class BatchChatCompletionResponse(BaseModel):
- completion_message_batch: List[CompletionMessage]
+ batch: List[ChatCompletionResponse]
@runtime_checkable
@@ -60,6 +41,7 @@ class BatchInference(Protocol):
model: str,
content_batch: List[InterleavedContent],
sampling_params: Optional[SamplingParams] = SamplingParams(),
+ response_format: Optional[ResponseFormat] = None,
logprobs: Optional[LogProbConfig] = None,
) -> BatchCompletionResponse: ...
@@ -73,5 +55,6 @@ class BatchInference(Protocol):
tools: Optional[List[ToolDefinition]] = list,
tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = None,
+ response_format: Optional[ResponseFormat] = None,
logprobs: Optional[LogProbConfig] = None,
) -> BatchChatCompletionResponse: ...
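A hedged sketch of calling the revised protocol; `client` is assumed to be any object implementing BatchInference, and the model id is illustrative:

```python
from llama_stack.apis.inference import SamplingParams, UserMessage

async def run_batch(client):
    response = await client.batch_chat_completion(
        model="meta-llama/Llama-3.1-8B-Instruct",  # illustrative model id
        messages_batch=[
            [UserMessage(content="What is the capital of France?")],
            [UserMessage(content="Name one prime number.")],
        ],
        sampling_params=SamplingParams(),
    )
    # Each entry is now a full ChatCompletionResponse, not a bare message.
    for chat in response.batch:
        print(chat.completion_message.content)
```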
diff --git a/llama_stack/apis/common/content_types.py b/llama_stack/apis/common/content_types.py
index 1d8cea567..0b27a0196 100644
--- a/llama_stack/apis/common/content_types.py
+++ b/llama_stack/apis/common/content_types.py
@@ -77,7 +77,6 @@ class ImageDelta(BaseModel):
image: bytes
-@json_schema_type
class ToolCallParseStatus(Enum):
started = "started"
in_progress = "in_progress"
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index 871f1f633..2debce1a7 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -35,11 +35,22 @@ from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
class LogProbConfig(BaseModel):
+ """
+
+ :param top_k: How many tokens (for each position) to return log probabilities for.
+ """
+
top_k: Optional[int] = 0
-@json_schema_type
class QuantizationType(Enum):
+ """Type of model quantization to run inference with.
+
+    :cvar bf16: BFloat16; typically this means _no_ quantization
+ :cvar fp8: 8-bit floating point quantization
+ :cvar int4: 4-bit integer quantization
+ """
+
bf16 = "bf16"
fp8 = "fp8"
int4 = "int4"
@@ -57,6 +68,12 @@ class Bf16QuantizationConfig(BaseModel):
@json_schema_type
class Int4QuantizationConfig(BaseModel):
+ """Configuration for 4-bit integer quantization.
+
+ :param type: Must be "int4" to identify this quantization type
+ :param scheme: Quantization scheme to use. Defaults to "int4_weight_int8_dynamic_activation"
+ """
+
type: Literal["int4"] = "int4"
scheme: Optional[str] = "int4_weight_int8_dynamic_activation"
@@ -69,6 +86,13 @@ QuantizationConfig = Annotated[
@json_schema_type
class UserMessage(BaseModel):
+ """A message from the user in a chat conversation.
+
+ :param role: Must be "user" to identify this as a user message
+ :param content: The content of the message, which can include text and other media
+ :param context: (Optional) This field is used internally by Llama Stack to pass RAG context. This field may be removed in the API in the future.
+ """
+
role: Literal["user"] = "user"
content: InterleavedContent
context: Optional[InterleavedContent] = None
@@ -76,15 +100,27 @@ class UserMessage(BaseModel):
@json_schema_type
class SystemMessage(BaseModel):
+ """A system message providing instructions or context to the model.
+
+ :param role: Must be "system" to identify this as a system message
+ :param content: The content of the "system prompt". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions).
+ """
+
role: Literal["system"] = "system"
content: InterleavedContent
@json_schema_type
class ToolResponseMessage(BaseModel):
+ """A message representing the result of a tool invocation.
+
+ :param role: Must be "tool" to identify this as a tool response
+ :param call_id: Unique identifier for the tool call this response is for
+ :param tool_name: Name of the tool that was called
+ :param content: The response content from the tool
+ """
+
role: Literal["tool"] = "tool"
- # it was nice to re-use the ToolResponse type, but having all messages
- # have a `content` type makes things nicer too
call_id: str
tool_name: Union[BuiltinTool, str]
content: InterleavedContent
@@ -92,6 +128,17 @@ class ToolResponseMessage(BaseModel):
@json_schema_type
class CompletionMessage(BaseModel):
+ """A message containing the model's (assistant) response in a chat conversation.
+
+ :param role: Must be "assistant" to identify this as the model's response
+ :param content: The content of the model's response
+ :param stop_reason: Reason why the model stopped generating. Options are:
+ - `StopReason.end_of_turn`: The model finished generating the entire response.
+ - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response.
+ - `StopReason.out_of_tokens`: The model ran out of token budget.
+ :param tool_calls: List of tool calls. Each tool call is a ToolCall object.
+ """
+
role: Literal["assistant"] = "assistant"
content: InterleavedContent
stop_reason: StopReason
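Taken together, the documented message types compose a conversation like this (a sketch using the models defined in this module; the import paths mirror the file's own imports):

```python
from llama_models.llama3.api.datatypes import StopReason

from llama_stack.apis.inference import (
    CompletionMessage,
    SystemMessage,
    UserMessage,
)

messages = [
    SystemMessage(content="You are a terse assistant."),
    UserMessage(content="What is 2 + 2?"),
    CompletionMessage(
        content="4",
        stop_reason=StopReason.end_of_turn,
        tool_calls=[],
    ),
]
```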
@@ -129,19 +176,35 @@ class ToolResponse(BaseModel):
return v
-@json_schema_type
class ToolChoice(Enum):
+ """Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model.
+
+ :cvar auto: The model may use tools if it determines that is appropriate.
+ :cvar required: The model must use tools.
+ """
+
auto = "auto"
required = "required"
@json_schema_type
class TokenLogProbs(BaseModel):
+ """Log probabilities for generated tokens.
+
+ :param logprobs_by_token: Dictionary mapping tokens to their log probabilities
+ """
+
logprobs_by_token: Dict[str, float]
-@json_schema_type
class ChatCompletionResponseEventType(Enum):
+ """Types of events that can occur during chat completion.
+
+ :cvar start: Inference has started
+ :cvar complete: Inference is complete and a full response is available
+ :cvar progress: Inference is in progress and a partial response is available
+ """
+
start = "start"
complete = "complete"
progress = "progress"
@@ -149,7 +212,13 @@ class ChatCompletionResponseEventType(Enum):
@json_schema_type
class ChatCompletionResponseEvent(BaseModel):
- """Chat completion response event."""
+ """An event during chat completion generation.
+
+ :param event_type: Type of the event
+ :param delta: Content generated since last event. This can be one or more tokens, or a tool call.
+ :param logprobs: Optional log probabilities for generated tokens
+ :param stop_reason: Optional reason why generation stopped, if complete
+ """
event_type: ChatCompletionResponseEventType
delta: ContentDelta
@@ -157,14 +226,25 @@ class ChatCompletionResponseEvent(BaseModel):
stop_reason: Optional[StopReason] = None
-@json_schema_type
class ResponseFormatType(Enum):
+ """Types of formats for structured (guided) decoding.
+
+ :cvar json_schema: Response should conform to a JSON schema. In a Python SDK, this is often a `pydantic` model.
+ :cvar grammar: Response should conform to a BNF grammar
+ """
+
json_schema = "json_schema"
grammar = "grammar"
@json_schema_type
class JsonSchemaResponseFormat(BaseModel):
+ """Configuration for JSON schema-guided response generation.
+
+ :param type: Must be "json_schema" to identify this format type
+ :param json_schema: The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model.
+ """
+
type: Literal[ResponseFormatType.json_schema.value] = (
ResponseFormatType.json_schema.value
)
@@ -173,6 +253,12 @@ class JsonSchemaResponseFormat(BaseModel):
@json_schema_type
class GrammarResponseFormat(BaseModel):
+ """Configuration for grammar-guided response generation.
+
+ :param type: Must be "grammar" to identify this format type
+ :param bnf: The BNF grammar specification the response should conform to
+ """
+
type: Literal[ResponseFormatType.grammar.value] = ResponseFormatType.grammar.value
bnf: Dict[str, Any]
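Since the docstring notes that `json_schema` is often derived from a pydantic model, a sketch of wiring one through (assumes pydantic v2's `model_json_schema`):

```python
from pydantic import BaseModel

from llama_stack.apis.inference import JsonSchemaResponseFormat

class Answer(BaseModel):
    city: str
    confidence: float

# type defaults to "json_schema", so only the schema itself is needed.
response_format = JsonSchemaResponseFormat(
    json_schema=Answer.model_json_schema()
)
```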
@@ -186,20 +272,24 @@ ResponseFormat = register_schema(
)
-@json_schema_type
+# This is an internally used class
class CompletionRequest(BaseModel):
model: str
content: InterleavedContent
sampling_params: Optional[SamplingParams] = SamplingParams()
response_format: Optional[ResponseFormat] = None
-
stream: Optional[bool] = False
logprobs: Optional[LogProbConfig] = None
@json_schema_type
class CompletionResponse(BaseModel):
- """Completion response."""
+ """Response from a completion request.
+
+ :param content: The generated completion text
+ :param stop_reason: Reason why generation stopped
+ :param logprobs: Optional log probabilities for generated tokens
+ """
content: str
stop_reason: StopReason
@@ -208,80 +298,60 @@ class CompletionResponse(BaseModel):
@json_schema_type
class CompletionResponseStreamChunk(BaseModel):
- """streamed completion response."""
+ """A chunk of a streamed completion response.
+
+ :param delta: New content generated since last chunk. This can be one or more tokens.
+ :param stop_reason: Optional reason why generation stopped, if complete
+ :param logprobs: Optional log probabilities for generated tokens
+ """
delta: str
stop_reason: Optional[StopReason] = None
logprobs: Optional[List[TokenLogProbs]] = None
-@json_schema_type
-class BatchCompletionRequest(BaseModel):
- model: str
- content_batch: List[InterleavedContent]
- sampling_params: Optional[SamplingParams] = SamplingParams()
- response_format: Optional[ResponseFormat] = None
- logprobs: Optional[LogProbConfig] = None
-
-
-@json_schema_type
-class BatchCompletionResponse(BaseModel):
- """Batch completion response."""
-
- batch: List[CompletionResponse]
-
-
-@json_schema_type
+# This is an internally used class
class ChatCompletionRequest(BaseModel):
model: str
messages: List[Message]
sampling_params: Optional[SamplingParams] = SamplingParams()
-
- # zero-shot tool definitions as input to the model
tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
response_format: Optional[ResponseFormat] = None
-
stream: Optional[bool] = False
logprobs: Optional[LogProbConfig] = None
@json_schema_type
class ChatCompletionResponseStreamChunk(BaseModel):
- """SSE-stream of these events."""
+ """A chunk of a streamed chat completion response.
+
+ :param event: The event containing the new content
+ """
event: ChatCompletionResponseEvent
@json_schema_type
class ChatCompletionResponse(BaseModel):
- """Chat completion response."""
+ """Response from a chat completion request.
+
+ :param completion_message: The complete response message
+ :param logprobs: Optional log probabilities for generated tokens
+ """
completion_message: CompletionMessage
logprobs: Optional[List[TokenLogProbs]] = None
-@json_schema_type
-class BatchChatCompletionRequest(BaseModel):
- model: str
- messages_batch: List[List[Message]]
- sampling_params: Optional[SamplingParams] = SamplingParams()
-
- # zero-shot tool definitions as input to the model
- tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
- tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
- tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
- logprobs: Optional[LogProbConfig] = None
-
-
-@json_schema_type
-class BatchChatCompletionResponse(BaseModel):
- batch: List[ChatCompletionResponse]
-
-
@json_schema_type
class EmbeddingsResponse(BaseModel):
+ """Response containing generated embeddings.
+
+ :param embeddings: List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}
+ """
+
embeddings: List[List[float]]
@@ -292,6 +362,13 @@ class ModelStore(Protocol):
@runtime_checkable
@trace_protocol
class Inference(Protocol):
+ """Llama Stack Inference API for generating completions, chat completions, and embeddings.
+
+ This API provides the raw interface to the underlying models. Two kinds of models are supported:
+ - LLM models: these models generate "raw" and "chat" (conversational) completions.
+ - Embedding models: these models generate embeddings to be used for semantic search.
+ """
+
model_store: ModelStore
@webmethod(route="/inference/completion", method="POST")
@@ -303,7 +380,19 @@ class Inference(Protocol):
response_format: Optional[ResponseFormat] = None,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
- ) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]: ...
+ ) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]:
+ """Generate a completion for the given content using the specified model.
+
+ :param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
+ :param content: The content to generate a completion for
+ :param sampling_params: (Optional) Parameters to control the sampling strategy
+ :param response_format: (Optional) Grammar specification for guided (structured) decoding
+ :param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
+ :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
+ :returns: If stream=False, returns a CompletionResponse with the full completion.
+ If stream=True, returns an SSE event stream of CompletionResponseStreamChunk
+ """
+ ...
@webmethod(route="/inference/chat-completion", method="POST")
async def chat_completion(
@@ -311,7 +400,6 @@ class Inference(Protocol):
model_id: str,
messages: List[Message],
sampling_params: Optional[SamplingParams] = SamplingParams(),
- # zero-shot tool definitions as input to the model
tools: Optional[List[ToolDefinition]] = None,
tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = None,
@@ -320,11 +408,38 @@ class Inference(Protocol):
logprobs: Optional[LogProbConfig] = None,
) -> Union[
ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]
- ]: ...
+ ]:
+ """Generate a chat completion for the given messages using the specified model.
+
+ :param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
+ :param messages: List of messages in the conversation
+ :param sampling_params: Parameters to control the sampling strategy
+ :param tools: (Optional) List of tool definitions available to the model
+ :param tool_choice: (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
+ :param tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model.
+ - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
+            - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag.
+ - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls.
+ :param response_format: (Optional) Grammar specification for guided (structured) decoding. There are two options:
+ - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format.
+ - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it.
+ :param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
+ :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
+ :returns: If stream=False, returns a ChatCompletionResponse with the full completion.
+ If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk
+ """
+ ...
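A hedged end-to-end sketch of the documented parameters; `client` is assumed to implement this protocol, and the tool definition is illustrative:

```python
from llama_models.llama3.api.datatypes import (
    ToolDefinition,
    ToolParamDefinition,
)

from llama_stack.apis.inference import ToolChoice, UserMessage

async def ask_with_tools(client):
    response = await client.chat_completion(
        model_id="meta-llama/Llama-3.1-8B-Instruct",  # illustrative
        messages=[UserMessage(content="What's the weather in Paris?")],
        tools=[
            ToolDefinition(
                tool_name="get_weather",  # hypothetical custom tool
                description="Look up current weather for a city",
                parameters={
                    "city": ToolParamDefinition(
                        param_type="string",
                        description="City name",
                        required=True,
                    )
                },
            )
        ],
        tool_choice=ToolChoice.auto,
    )
    return response.completion_message
```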
@webmethod(route="/inference/embeddings", method="POST")
async def embeddings(
self,
model_id: str,
contents: List[InterleavedContent],
- ) -> EmbeddingsResponse: ...
+ ) -> EmbeddingsResponse:
+ """Generate embeddings for content pieces using the specified model.
+
+ :param model_id: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint.
+ :param contents: List of contents to generate embeddings for. Note that content can be multimodal. The behavior depends on the model and provider. Some models may only support text.
+ :returns: An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}
+ """
+ ...
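And a matching sketch for the embeddings endpoint; the model id is illustrative and `client` is assumed as above:

```python
async def embed(client, texts):
    response = await client.embeddings(
        model_id="all-MiniLM-L6-v2",  # illustrative embedding model id
        contents=texts,  # plain strings are valid InterleavedContent
    )
    # One vector per input content; dimensionality is model-specific.
    return response.embeddings
```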
diff --git a/llama_stack/apis/resource.py b/llama_stack/apis/resource.py
index d0ce72644..b84c619e4 100644
--- a/llama_stack/apis/resource.py
+++ b/llama_stack/apis/resource.py
@@ -6,11 +6,9 @@
from enum import Enum
-from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel, Field
-@json_schema_type
class ResourceType(Enum):
model = "model"
shield = "shield"
diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py
index b2b290c66..fc9ee816c 100644
--- a/llama_stack/distribution/library_client.py
+++ b/llama_stack/distribution/library_client.py
@@ -339,7 +339,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
method=options.method,
url=options.url,
params=options.params,
- headers=options.headers,
+ headers=options.headers or {},
json=options.json_data,
),
)
@@ -388,7 +388,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
method=options.method,
url=options.url,
params=options.params,
- headers=options.headers,
+ headers=options.headers or {},
json=options.json_data,
),
)