Add and review more documentation for inference.py

Ashwin Bharambe 2025-01-29 06:32:54 -08:00
parent ebfa8ad4fb
commit 62c3c5bb7e
4 changed files with 415 additions and 117 deletions

@ -4,11 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import collections
import hashlib
import ipaddress
import typing
from dataclasses import field, make_dataclass
from dataclasses import make_dataclass
from typing import Any, Dict, Set, Union
from ..strong_typing.core import JsonType

@ -487,7 +487,7 @@
"post": {
"responses": {
"200": {
"description": "An array of embeddings, one for each content. Each embedding is a list of floats.",
"description": "An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}",
"content": {
"application/json": {
"schema": {
@ -2352,19 +2352,23 @@
"role": {
"type": "string",
"const": "assistant",
"default": "assistant"
"default": "assistant",
"description": "Must be \"assistant\" to identify this as the model's response"
},
"content": {
"$ref": "#/components/schemas/InterleavedContent"
"$ref": "#/components/schemas/InterleavedContent",
"description": "The content of the model's response"
},
"stop_reason": {
"$ref": "#/components/schemas/StopReason"
"$ref": "#/components/schemas/StopReason",
"description": "Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: The model finished generating the entire response. - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response. - `StopReason.out_of_tokens`: The model ran out of token budget."
},
"tool_calls": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ToolCall"
}
},
"description": "List of tool calls. Each tool call is a ToolCall object."
}
},
"additionalProperties": false,
@ -2373,7 +2377,8 @@
"content",
"stop_reason",
"tool_calls"
]
],
"title": "A message containing the model's (assistant) response in a chat conversation."
},
"GrammarResponseFormat": {
"type": "object",
@ -2381,7 +2386,8 @@
"type": {
"type": "string",
"const": "grammar",
"default": "grammar"
"default": "grammar",
"description": "Must be \"grammar\" to identify this format type"
},
"bnf": {
"type": "object",
@ -2406,14 +2412,16 @@
"type": "object"
}
]
}
},
"description": "The BNF grammar specification the response should conform to"
}
},
"additionalProperties": false,
"required": [
"type",
"bnf"
]
],
"title": "Configuration for grammar-guided response generation."
},
"GreedySamplingStrategy": {
"type": "object",
@ -2496,7 +2504,8 @@
"type": {
"type": "string",
"const": "json_schema",
"default": "json_schema"
"default": "json_schema",
"description": "Must be \"json_schema\" to identify this format type"
},
"json_schema": {
"type": "object",
@ -2521,14 +2530,16 @@
"type": "object"
}
]
}
},
"description": "The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model."
}
},
"additionalProperties": false,
"required": [
"type",
"json_schema"
]
],
"title": "Configuration for JSON schema-guided response generation."
},
"Message": {
"oneOf": [
@ -2624,17 +2635,20 @@
"role": {
"type": "string",
"const": "system",
"default": "system"
"default": "system",
"description": "Must be \"system\" to identify this as a system message"
},
"content": {
"$ref": "#/components/schemas/InterleavedContent"
"$ref": "#/components/schemas/InterleavedContent",
"description": "The content of the \"system prompt\". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions)."
}
},
"additionalProperties": false,
"required": [
"role",
"content"
]
],
"title": "A system message providing instructions or context to the model."
},
"TextContentItem": {
"type": "object",
@ -2749,7 +2763,8 @@
"enum": [
"auto",
"required"
]
],
"title": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model."
},
"ToolDefinition": {
"type": "object",
@ -2836,10 +2851,12 @@
"role": {
"type": "string",
"const": "tool",
"default": "tool"
"default": "tool",
"description": "Must be \"tool\" to identify this as a tool response"
},
"call_id": {
"type": "string"
"type": "string",
"description": "Unique identifier for the tool call this response is for"
},
"tool_name": {
"oneOf": [
@ -2849,10 +2866,12 @@
{
"type": "string"
}
]
],
"description": "Name of the tool that was called"
},
"content": {
"$ref": "#/components/schemas/InterleavedContent"
"$ref": "#/components/schemas/InterleavedContent",
"description": "The response content from the tool"
}
},
"additionalProperties": false,
@ -2861,7 +2880,8 @@
"call_id",
"tool_name",
"content"
]
],
"title": "A message representing the result of a tool invocation."
},
"TopKSamplingStrategy": {
"type": "object",
@ -2920,20 +2940,24 @@
"role": {
"type": "string",
"const": "user",
"default": "user"
"default": "user",
"description": "Must be \"user\" to identify this as a user message"
},
"content": {
"$ref": "#/components/schemas/InterleavedContent"
"$ref": "#/components/schemas/InterleavedContent",
"description": "The content of the message, which can include text and other media"
},
"context": {
"$ref": "#/components/schemas/InterleavedContent"
"$ref": "#/components/schemas/InterleavedContent",
"description": "(Optional) This field is used internally by Llama Stack to pass RAG context. This field may be removed in the API in the future."
}
},
"additionalProperties": false,
"required": [
"role",
"content"
]
],
"title": "A message from the user in a chat conversation."
},
"BatchChatCompletionRequest": {
"type": "object",
@ -2973,7 +2997,8 @@
"properties": {
"top_k": {
"type": "integer",
"default": 0
"default": 0,
"description": "How many tokens (for each position) to return log probabilities for."
}
},
"additionalProperties": false
@ -3004,19 +3029,22 @@
"type": "object",
"properties": {
"completion_message": {
"$ref": "#/components/schemas/CompletionMessage"
"$ref": "#/components/schemas/CompletionMessage",
"description": "The complete response message"
},
"logprobs": {
"type": "array",
"items": {
"$ref": "#/components/schemas/TokenLogProbs"
}
},
"description": "Optional log probabilities for generated tokens"
}
},
"additionalProperties": false,
"required": [
"completion_message"
]
],
"title": "Response from a chat completion request."
},
"TokenLogProbs": {
"type": "object",
@ -3025,13 +3053,15 @@
"type": "object",
"additionalProperties": {
"type": "number"
}
},
"description": "Dictionary mapping tokens to their log probabilities"
}
},
"additionalProperties": false,
"required": [
"logprobs_by_token"
]
],
"title": "Log probabilities for generated tokens."
},
"BatchCompletionRequest": {
"type": "object",
@ -3056,7 +3086,8 @@
"properties": {
"top_k": {
"type": "integer",
"default": 0
"default": 0,
"description": "How many tokens (for each position) to return log probabilities for."
}
},
"additionalProperties": false
@ -3087,16 +3118,19 @@
"type": "object",
"properties": {
"content": {
"type": "string"
"type": "string",
"description": "The generated completion text"
},
"stop_reason": {
"$ref": "#/components/schemas/StopReason"
"$ref": "#/components/schemas/StopReason",
"description": "Reason why generation stopped"
},
"logprobs": {
"type": "array",
"items": {
"$ref": "#/components/schemas/TokenLogProbs"
}
},
"description": "Optional log probabilities for generated tokens"
}
},
"additionalProperties": false,
@ -3104,7 +3138,7 @@
"content",
"stop_reason"
],
"title": "Completion response."
"title": "Response from a completion request."
},
"CancelTrainingJobRequest": {
"type": "object",
@ -3123,7 +3157,7 @@
"properties": {
"model_id": {
"type": "string",
"description": "The identifier of the model to use"
"description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
},
"messages": {
"type": "array",
@ -3149,11 +3183,11 @@
},
"tool_prompt_format": {
"$ref": "#/components/schemas/ToolPromptFormat",
"description": "(Optional) Specifies how tool definitions are formatted when presenting to the model"
"description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls."
},
"response_format": {
"$ref": "#/components/schemas/ResponseFormat",
"description": "(Optional) Grammar specification for guided (structured) decoding"
"description": "(Optional) Grammar specification for guided (structured) decoding. There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it."
},
"stream": {
"type": "boolean",
@ -3164,7 +3198,8 @@
"properties": {
"top_k": {
"type": "integer",
"default": 0
"default": 0,
"description": "How many tokens (for each position) to return log probabilities for."
}
},
"additionalProperties": false,
@ -3181,19 +3216,23 @@
"type": "object",
"properties": {
"event_type": {
"$ref": "#/components/schemas/ChatCompletionResponseEventType"
"$ref": "#/components/schemas/ChatCompletionResponseEventType",
"description": "Type of the event"
},
"delta": {
"$ref": "#/components/schemas/ContentDelta"
"$ref": "#/components/schemas/ContentDelta",
"description": "Content generated since last event. This can be one or more tokens, or a tool call."
},
"logprobs": {
"type": "array",
"items": {
"$ref": "#/components/schemas/TokenLogProbs"
}
},
"description": "Optional log probabilities for generated tokens"
},
"stop_reason": {
"$ref": "#/components/schemas/StopReason"
"$ref": "#/components/schemas/StopReason",
"description": "Optional reason why generation stopped, if complete"
}
},
"additionalProperties": false,
@ -3201,7 +3240,7 @@
"event_type",
"delta"
],
"title": "Chat completion response event."
"title": "An event during chat completion generation."
},
"ChatCompletionResponseEventType": {
"type": "string",
@ -3209,19 +3248,22 @@
"start",
"complete",
"progress"
]
],
"title": "Types of events that can occur during chat completion."
},
"ChatCompletionResponseStreamChunk": {
"type": "object",
"properties": {
"event": {
"$ref": "#/components/schemas/ChatCompletionResponseEvent"
"$ref": "#/components/schemas/ChatCompletionResponseEvent",
"description": "The event containing the new content"
}
},
"additionalProperties": false,
"required": [
"event"
]
],
"title": "A chunk of a streamed chat completion response."
},
"ContentDelta": {
"oneOf": [
@ -3324,7 +3366,7 @@
"properties": {
"model_id": {
"type": "string",
"description": "The identifier of the model to use"
"description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
},
"content": {
"$ref": "#/components/schemas/InterleavedContent",
@ -3347,7 +3389,8 @@
"properties": {
"top_k": {
"type": "integer",
"default": 0
"default": 0,
"description": "How many tokens (for each position) to return log probabilities for."
}
},
"additionalProperties": false,
@ -3364,23 +3407,26 @@
"type": "object",
"properties": {
"delta": {
"type": "string"
"type": "string",
"description": "New content generated since last chunk. This can be one or more tokens."
},
"stop_reason": {
"$ref": "#/components/schemas/StopReason"
"$ref": "#/components/schemas/StopReason",
"description": "Optional reason why generation stopped, if complete"
},
"logprobs": {
"type": "array",
"items": {
"$ref": "#/components/schemas/TokenLogProbs"
}
},
"description": "Optional log probabilities for generated tokens"
}
},
"additionalProperties": false,
"required": [
"delta"
],
"title": "streamed completion response."
"title": "A chunk of a streamed completion response."
},
"AgentConfig": {
"type": "object",
@ -4264,14 +4310,14 @@
"properties": {
"model_id": {
"type": "string",
"description": "The identifier of the model to use"
"description": "The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint."
},
"contents": {
"type": "array",
"items": {
"$ref": "#/components/schemas/InterleavedContent"
},
"description": "List of contents to generate embeddings for. Note that content can be multimodal."
"description": "List of contents to generate embeddings for. Note that content can be multimodal. The behavior depends on the model and provider. Some models may only support text."
}
},
"additionalProperties": false,
@ -4290,13 +4336,15 @@
"items": {
"type": "number"
}
}
},
"description": "List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}"
}
},
"additionalProperties": false,
"required": [
"embeddings"
]
],
"title": "Response containing generated embeddings."
},
"AgentCandidate": {
"type": "object",
@ -7887,19 +7935,19 @@
},
{
"name": "ChatCompletionResponse",
"description": ""
"description": "Response from a chat completion request."
},
{
"name": "ChatCompletionResponseEvent",
"description": "Chat completion response event."
"description": "An event during chat completion generation."
},
{
"name": "ChatCompletionResponseEventType",
"description": ""
"description": "Types of events that can occur during chat completion."
},
{
"name": "ChatCompletionResponseStreamChunk",
"description": ""
"description": "A chunk of a streamed chat completion response."
},
{
"name": "Checkpoint",
@ -7911,7 +7959,7 @@
},
{
"name": "CompletionMessage",
"description": ""
"description": "A message containing the model's (assistant) response in a chat conversation."
},
{
"name": "CompletionRequest",
@ -7919,11 +7967,11 @@
},
{
"name": "CompletionResponse",
"description": "Completion response."
"description": "Response from a completion request."
},
{
"name": "CompletionResponseStreamChunk",
"description": "streamed completion response."
"description": "A chunk of a streamed completion response."
},
{
"name": "ContentDelta",
@ -7977,7 +8025,7 @@
},
{
"name": "EmbeddingsResponse",
"description": ""
"description": "Response containing generated embeddings."
},
{
"name": "Eval"
@ -8011,7 +8059,7 @@
},
{
"name": "GrammarResponseFormat",
"description": ""
"description": "Configuration for grammar-guided response generation."
},
{
"name": "GreedySamplingStrategy",
@ -8069,7 +8117,7 @@
},
{
"name": "JsonSchemaResponseFormat",
"description": ""
"description": "Configuration for JSON schema-guided response generation."
},
{
"name": "JsonType",
@ -8434,7 +8482,7 @@
},
{
"name": "SystemMessage",
"description": ""
"description": "A system message providing instructions or context to the model."
},
{
"name": "Telemetry"
@ -8449,7 +8497,7 @@
},
{
"name": "TokenLogProbs",
"description": ""
"description": "Log probabilities for generated tokens."
},
{
"name": "Tool",
@ -8469,7 +8517,7 @@
},
{
"name": "ToolChoice",
"description": ""
"description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model."
},
{
"name": "ToolDef",
@ -8516,7 +8564,7 @@
},
{
"name": "ToolResponseMessage",
"description": ""
"description": "A message representing the result of a tool invocation."
},
{
"name": "ToolRuntime"
@ -8555,7 +8603,7 @@
},
{
"name": "UserMessage",
"description": ""
"description": "A message from the user in a chat conversation."
},
{
"name": "VectorDB",

@ -291,7 +291,8 @@ paths:
'200':
description: >-
An array of embeddings, one for each content. Each embedding is a list
of floats.
of floats. The dimensionality of the embedding is model-specific; you
can check model metadata using /models/{model_id}
content:
application/json:
schema:
@ -1396,20 +1397,34 @@ components:
type: string
const: assistant
default: assistant
description: >-
Must be "assistant" to identify this as the model's response
content:
$ref: '#/components/schemas/InterleavedContent'
description: The content of the model's response
stop_reason:
$ref: '#/components/schemas/StopReason'
description: >-
Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`:
The model finished generating the entire response. - `StopReason.end_of_message`:
The model finished generating but generated a partial response -- usually,
a tool call. The user may call the tool and continue the conversation
with the tool's response. - `StopReason.out_of_tokens`: The model ran
out of token budget.
tool_calls:
type: array
items:
$ref: '#/components/schemas/ToolCall'
description: >-
List of tool calls. Each tool call is a ToolCall object.
additionalProperties: false
required:
- role
- content
- stop_reason
- tool_calls
title: >-
A message containing the model's (assistant) response in a chat conversation.
GrammarResponseFormat:
type: object
properties:
@ -1417,6 +1432,8 @@ components:
type: string
const: grammar
default: grammar
description: >-
Must be "grammar" to identify this format type
bnf:
type: object
additionalProperties:
@ -1427,10 +1444,14 @@ components:
- type: string
- type: array
- type: object
description: >-
The BNF grammar specification the response should conform to
additionalProperties: false
required:
- type
- bnf
title: >-
Configuration for grammar-guided response generation.
GreedySamplingStrategy:
type: object
properties:
@ -1484,6 +1505,8 @@ components:
type: string
const: json_schema
default: json_schema
description: >-
Must be "json_schema" to identify this format type
json_schema:
type: object
additionalProperties:
@ -1494,10 +1517,15 @@ components:
- type: string
- type: array
- type: object
description: >-
The JSON schema the response should conform to. In a Python SDK, this
is often a `pydantic` model.
additionalProperties: false
required:
- type
- json_schema
title: >-
Configuration for JSON schema-guided response generation.
Message:
oneOf:
- $ref: '#/components/schemas/UserMessage'
@ -1556,12 +1584,20 @@ components:
type: string
const: system
default: system
description: >-
Must be "system" to identify this as a system message
content:
$ref: '#/components/schemas/InterleavedContent'
description: >-
The content of the "system prompt". If multiple system messages are provided,
they are concatenated. The underlying Llama Stack code may also add other
system messages (for example, for formatting tool definitions).
additionalProperties: false
required:
- role
- content
title: >-
A system message providing instructions or context to the model.
TextContentItem:
type: object
properties:
@ -1619,6 +1655,10 @@ components:
enum:
- auto
- required
title: >-
Whether tool use is required or automatic. This is a hint to the model which
may not be followed. It depends on the Instruction Following capabilities
of the model.
ToolDefinition:
type: object
properties:
@ -1691,20 +1731,28 @@ components:
type: string
const: tool
default: tool
description: >-
Must be "tool" to identify this as a tool response
call_id:
type: string
description: >-
Unique identifier for the tool call this response is for
tool_name:
oneOf:
- $ref: '#/components/schemas/BuiltinTool'
- type: string
description: Name of the tool that was called
content:
$ref: '#/components/schemas/InterleavedContent'
description: The response content from the tool
additionalProperties: false
required:
- role
- call_id
- tool_name
- content
title: >-
A message representing the result of a tool invocation.
TopKSamplingStrategy:
type: object
properties:
@ -1748,14 +1796,23 @@ components:
type: string
const: user
default: user
description: >-
Must be "user" to identify this as a user message
content:
$ref: '#/components/schemas/InterleavedContent'
description: >-
The content of the message, which can include text and other media
context:
$ref: '#/components/schemas/InterleavedContent'
description: >-
(Optional) This field is used internally by Llama Stack to pass RAG context.
This field may be removed in the API in the future.
additionalProperties: false
required:
- role
- content
title: >-
A message from the user in a chat conversation.
BatchChatCompletionRequest:
type: object
properties:
@ -1785,6 +1842,8 @@ components:
top_k:
type: integer
default: 0
description: >-
How many tokens (for each position) to return log probabilities for.
additionalProperties: false
additionalProperties: false
required:
@ -1805,13 +1864,17 @@ components:
properties:
completion_message:
$ref: '#/components/schemas/CompletionMessage'
description: The complete response message
logprobs:
type: array
items:
$ref: '#/components/schemas/TokenLogProbs'
description: >-
Optional log probabilities for generated tokens
additionalProperties: false
required:
- completion_message
title: Response from a chat completion request.
TokenLogProbs:
type: object
properties:
@ -1819,9 +1882,12 @@ components:
type: object
additionalProperties:
type: number
description: >-
Dictionary mapping tokens to their log probabilities
additionalProperties: false
required:
- logprobs_by_token
title: Log probabilities for generated tokens.
BatchCompletionRequest:
type: object
properties:
@ -1841,6 +1907,8 @@ components:
top_k:
type: integer
default: 0
description: >-
How many tokens (for each position) to return log probabilities for.
additionalProperties: false
additionalProperties: false
required:
@ -1861,17 +1929,21 @@ components:
properties:
content:
type: string
description: The generated completion text
stop_reason:
$ref: '#/components/schemas/StopReason'
description: Reason why generation stopped
logprobs:
type: array
items:
$ref: '#/components/schemas/TokenLogProbs'
description: >-
Optional log probabilities for generated tokens
additionalProperties: false
required:
- content
- stop_reason
title: Completion response.
title: Response from a completion request.
CancelTrainingJobRequest:
type: object
properties:
@ -1885,7 +1957,9 @@ components:
properties:
model_id:
type: string
description: The identifier of the model to use
description: >-
The identifier of the model to use. The model must be registered with
Llama Stack and available via the /models endpoint.
messages:
type: array
items:
@ -1908,12 +1982,20 @@ components:
tool_prompt_format:
$ref: '#/components/schemas/ToolPromptFormat'
description: >-
(Optional) Specifies how tool definitions are formatted when presenting
to the model
(Optional) Instructs the model how to format tool calls. By default, Llama
Stack will attempt to use a format that is best adapted to the model.
- `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name>
tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python
syntax -- a list of function calls.
response_format:
$ref: '#/components/schemas/ResponseFormat'
description: >-
(Optional) Grammar specification for guided (structured) decoding
(Optional) Grammar specification for guided (structured) decoding. There
are two options: - `ResponseFormat.json_schema`: The grammar is a JSON
schema. Most providers support this format. - `ResponseFormat.grammar`:
The grammar is a BNF grammar. This format is more flexible, but not all
providers support it.
stream:
type: boolean
description: >-
@ -1925,6 +2007,8 @@ components:
top_k:
type: integer
default: 0
description: >-
How many tokens (for each position) to return log probabilities for.
additionalProperties: false
description: >-
(Optional) If specified, log probabilities for each token position will
@ -1938,33 +2022,47 @@ components:
properties:
event_type:
$ref: '#/components/schemas/ChatCompletionResponseEventType'
description: Type of the event
delta:
$ref: '#/components/schemas/ContentDelta'
description: >-
Content generated since last event. This can be one or more tokens, or
a tool call.
logprobs:
type: array
items:
$ref: '#/components/schemas/TokenLogProbs'
description: >-
Optional log probabilities for generated tokens
stop_reason:
$ref: '#/components/schemas/StopReason'
description: >-
Optional reason why generation stopped, if complete
additionalProperties: false
required:
- event_type
- delta
title: Chat completion response event.
title: >-
An event during chat completion generation.
ChatCompletionResponseEventType:
type: string
enum:
- start
- complete
- progress
title: >-
Types of events that can occur during chat completion.
ChatCompletionResponseStreamChunk:
type: object
properties:
event:
$ref: '#/components/schemas/ChatCompletionResponseEvent'
description: The event containing the new content
additionalProperties: false
required:
- event
title: >-
A chunk of a streamed chat completion response.
ContentDelta:
oneOf:
- $ref: '#/components/schemas/TextDelta'
@ -2033,7 +2131,9 @@ components:
properties:
model_id:
type: string
description: The identifier of the model to use
description: >-
The identifier of the model to use. The model must be registered with
Llama Stack and available via the /models endpoint.
content:
$ref: '#/components/schemas/InterleavedContent'
description: The content to generate a completion for
@ -2056,6 +2156,8 @@ components:
top_k:
type: integer
default: 0
description: >-
How many tokens (for each position) to return log probabilities for.
additionalProperties: false
description: >-
(Optional) If specified, log probabilities for each token position will
@ -2069,16 +2171,23 @@ components:
properties:
delta:
type: string
description: >-
New content generated since last chunk. This can be one or more tokens.
stop_reason:
$ref: '#/components/schemas/StopReason'
description: >-
Optional reason why generation stopped, if complete
logprobs:
type: array
items:
$ref: '#/components/schemas/TokenLogProbs'
description: >-
Optional log probabilities for generated tokens
additionalProperties: false
required:
- delta
title: streamed completion response.
title: >-
A chunk of a streamed completion response.
AgentConfig:
type: object
properties:
@ -2633,14 +2742,17 @@ components:
properties:
model_id:
type: string
description: The identifier of the model to use
description: >-
The identifier of the model to use. The model must be an embedding model
registered with Llama Stack and available via the /models endpoint.
contents:
type: array
items:
$ref: '#/components/schemas/InterleavedContent'
description: >-
List of contents to generate embeddings for. Note that content can be
multimodal.
multimodal. The behavior depends on the model and provider. Some models
may only support text.
additionalProperties: false
required:
- model_id
@ -2654,9 +2766,15 @@ components:
type: array
items:
type: number
description: >-
List of embedding vectors, one per input content. Each embedding is a
list of floats. The dimensionality of the embedding is model-specific;
you can check model metadata using /models/{model_id}
additionalProperties: false
required:
- embeddings
title: >-
Response containing generated embeddings.
AgentCandidate:
type: object
properties:
@ -4833,25 +4951,30 @@ tags:
- name: ChatCompletionRequest
description: ''
- name: ChatCompletionResponse
description: ''
description: Response from a chat completion request.
- name: ChatCompletionResponseEvent
description: Chat completion response event.
description: >-
An event during chat completion generation.
- name: ChatCompletionResponseEventType
description: ''
description: >-
Types of events that can occur during chat completion.
- name: ChatCompletionResponseStreamChunk
description: ''
description: >-
A chunk of a streamed chat completion response.
- name: Checkpoint
description: Checkpoint created during training runs
- name: CompletionInputType
description: ''
- name: CompletionMessage
description: ''
description: >-
A message containing the model's (assistant) response in a chat conversation.
- name: CompletionRequest
description: ''
- name: CompletionResponse
description: Completion response.
description: Response from a completion request.
- name: CompletionResponseStreamChunk
description: streamed completion response.
description: >-
A chunk of a streamed completion response.
- name: ContentDelta
description: ''
- name: CreateAgentRequest
@ -4877,7 +5000,8 @@ tags:
- name: EmbeddingsRequest
description: ''
- name: EmbeddingsResponse
description: ''
description: >-
Response containing generated embeddings.
- name: Eval
- name: EvalCandidate
description: ''
@ -4893,7 +5017,8 @@ tags:
- name: Event
description: ''
- name: GrammarResponseFormat
description: ''
description: >-
Configuration for grammar-guided response generation.
- name: GreedySamplingStrategy
description: ''
- name: HealthInfo
@ -4921,7 +5046,8 @@ tags:
- name: JobStatus
description: ''
- name: JsonSchemaResponseFormat
description: ''
description: >-
Configuration for JSON schema-guided response generation.
- name: JsonType
description: ''
- name: LLMAsJudgeScoringFnParams
@ -5104,14 +5230,15 @@ tags:
Response from the synthetic data generation. Batch of (prompt, response, score)
tuples that pass the threshold.
- name: SystemMessage
description: ''
description: >-
A system message providing instructions or context to the model.
- name: Telemetry
- name: TextContentItem
description: ''
- name: TextDelta
description: ''
- name: TokenLogProbs
description: ''
description: Log probabilities for generated tokens.
- name: Tool
description: ''
- name: ToolCall
@ -5121,7 +5248,10 @@ tags:
- name: ToolCallParseStatus
description: ''
- name: ToolChoice
description: ''
description: >-
Whether tool use is required or automatic. This is a hint to the model which
may not be followed. It depends on the Instruction Following capabilities of
the model.
- name: ToolDef
description: ''
- name: ToolDefinition
@ -5166,7 +5296,8 @@ tags:
- name: ToolResponse
description: ''
- name: ToolResponseMessage
description: ''
description: >-
A message representing the result of a tool invocation.
- name: ToolRuntime
- name: TopKSamplingStrategy
description: ''
@ -5186,7 +5317,8 @@ tags:
- name: UnstructuredLogEvent
description: ''
- name: UserMessage
description: ''
description: >-
A message from the user in a chat conversation.
- name: VectorDB
description: ''
- name: VectorDBs

@ -35,11 +35,23 @@ from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
class LogProbConfig(BaseModel):
"""
:param top_k: How many tokens (for each position) to return log probabilities for.
"""
top_k: Optional[int] = 0
@json_schema_type
class QuantizationType(Enum):
"""Type of model quantization to run inference with.
:cvar bf16: BFloat16; typically this means _no_ quantization
:cvar fp8: 8-bit floating point quantization
:cvar int4: 4-bit integer quantization
"""
bf16 = "bf16"
fp8 = "fp8"
int4 = "int4"
@ -57,6 +69,12 @@ class Bf16QuantizationConfig(BaseModel):
@json_schema_type
class Int4QuantizationConfig(BaseModel):
"""Configuration for 4-bit integer quantization.
:param type: Must be "int4" to identify this quantization type
:param scheme: Quantization scheme to use. Defaults to "int4_weight_int8_dynamic_activation"
"""
type: Literal["int4"] = "int4"
scheme: Optional[str] = "int4_weight_int8_dynamic_activation"
@ -69,6 +87,13 @@ QuantizationConfig = Annotated[
@json_schema_type
class UserMessage(BaseModel):
"""A message from the user in a chat conversation.
:param role: Must be "user" to identify this as a user message
:param content: The content of the message, which can include text and other media
:param context: (Optional) This field is used internally by Llama Stack to pass RAG context. This field may be removed in the API in the future.
"""
role: Literal["user"] = "user"
content: InterleavedContent
context: Optional[InterleavedContent] = None
@ -76,15 +101,27 @@ class UserMessage(BaseModel):
@json_schema_type
class SystemMessage(BaseModel):
"""A system message providing instructions or context to the model.
:param role: Must be "system" to identify this as a system message
:param content: The content of the "system prompt". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions).
"""
role: Literal["system"] = "system"
content: InterleavedContent
@json_schema_type
class ToolResponseMessage(BaseModel):
"""A message representing the result of a tool invocation.
:param role: Must be "tool" to identify this as a tool response
:param call_id: Unique identifier for the tool call this response is for
:param tool_name: Name of the tool that was called
:param content: The response content from the tool
"""
role: Literal["tool"] = "tool"
# it was nice to re-use the ToolResponse type, but having all messages
# have a `content` type makes things nicer too
call_id: str
tool_name: Union[BuiltinTool, str]
content: InterleavedContent
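
A minimal sketch of how these message types compose a conversation turn; it assumes the models are importable from llama_stack.apis.inference, and the call_id and tool_name values are hypothetical:

from llama_stack.apis.inference import SystemMessage, ToolResponseMessage, UserMessage

messages = [
    SystemMessage(content="You are a terse assistant."),
    UserMessage(content="What is the weather in Tokyo?"),
    # After the model emits a tool call, the caller runs the tool and replies
    # with a ToolResponseMessage keyed by the call_id echoed from that ToolCall.
    ToolResponseMessage(
        call_id="call-123",        # hypothetical id taken from the model's ToolCall
        tool_name="get_weather",   # hypothetical tool name
        content='{"temp_c": 21}',
    ),
]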
@ -92,6 +129,17 @@ class ToolResponseMessage(BaseModel):
@json_schema_type
class CompletionMessage(BaseModel):
"""A message containing the model's (assistant) response in a chat conversation.
:param role: Must be "assistant" to identify this as the model's response
:param content: The content of the model's response
:param stop_reason: Reason why the model stopped generating. Options are:
- `StopReason.end_of_turn`: The model finished generating the entire response.
- `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response.
- `StopReason.out_of_tokens`: The model ran out of token budget.
:param tool_calls: List of tool calls. Each tool call is a ToolCall object.
"""
role: Literal["assistant"] = "assistant"
content: InterleavedContent
stop_reason: StopReason
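
A hedged sketch of branching on stop_reason; `msg` is assumed to be a CompletionMessage (for example, response.completion_message from a ChatCompletionResponse defined later in this file), and the StopReason import path is an assumption:

from llama_stack.apis.inference import StopReason  # assumed import path

def handle_completion_message(msg) -> None:
    # `msg` is assumed to be a CompletionMessage.
    if msg.stop_reason == StopReason.end_of_message and msg.tool_calls:
        # The model paused to call a tool: run each ToolCall in msg.tool_calls and
        # continue the conversation with a ToolResponseMessage per call.
        ...
    elif msg.stop_reason == StopReason.out_of_tokens:
        # The response was cut off by the token budget.
        ...
    else:  # StopReason.end_of_turn
        print(msg.content)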
@ -131,17 +179,35 @@ class ToolResponse(BaseModel):
@json_schema_type
class ToolChoice(Enum):
"""Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model.
:cvar auto: The model may use tools if it determines that is appropriate.
:cvar required: The model must use tools.
"""
auto = "auto"
required = "required"
@json_schema_type
class TokenLogProbs(BaseModel):
"""Log probabilities for generated tokens.
:param logprobs_by_token: Dictionary mapping tokens to their log probabilities
"""
logprobs_by_token: Dict[str, float]
@json_schema_type
class ChatCompletionResponseEventType(Enum):
"""Types of events that can occur during chat completion.
:cvar start: Inference has started
:cvar complete: Inference is complete and a full response is available
:cvar progress: Inference is in progress and a partial response is available
"""
start = "start"
complete = "complete"
progress = "progress"
@ -149,7 +215,13 @@ class ChatCompletionResponseEventType(Enum):
@json_schema_type
class ChatCompletionResponseEvent(BaseModel):
"""Chat completion response event."""
"""An event during chat completion generation.
:param event_type: Type of the event
:param delta: Content generated since last event. This can be one or more tokens, or a tool call.
:param logprobs: Optional log probabilities for generated tokens
:param stop_reason: Optional reason why generation stopped, if complete
"""
event_type: ChatCompletionResponseEventType
delta: ContentDelta
@ -159,12 +231,24 @@ class ChatCompletionResponseEvent(BaseModel):
@json_schema_type
class ResponseFormatType(Enum):
"""Types of formats for structured (guided) decoding.
:cvar json_schema: Response should conform to a JSON schema. In a Python SDK, this is often a `pydantic` model.
:cvar grammar: Response should conform to a BNF grammar
"""
json_schema = "json_schema"
grammar = "grammar"
@json_schema_type
class JsonSchemaResponseFormat(BaseModel):
"""Configuration for JSON schema-guided response generation.
:param type: Must be "json_schema" to identify this format type
:param json_schema: The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model.
"""
type: Literal[ResponseFormatType.json_schema.value] = (
ResponseFormatType.json_schema.value
)
@ -173,6 +257,12 @@ class JsonSchemaResponseFormat(BaseModel):
@json_schema_type
class GrammarResponseFormat(BaseModel):
"""Configuration for grammar-guided response generation.
:param type: Must be "grammar" to identify this format type
:param bnf: The BNF grammar specification the response should conform to
"""
type: Literal[ResponseFormatType.grammar.value] = ResponseFormatType.grammar.value
bnf: Dict[str, Any]
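
A short sketch of constructing both response formats; the Weather pydantic model and the grammar value are illustrative:

from pydantic import BaseModel

from llama_stack.apis.inference import GrammarResponseFormat, JsonSchemaResponseFormat

class Weather(BaseModel):
    city: str
    temp_c: float

# JSON-schema guided decoding: supported by most providers.
json_format = JsonSchemaResponseFormat(json_schema=Weather.model_json_schema())

# BNF-grammar guided decoding: more flexible, but not supported everywhere.
bnf_format = GrammarResponseFormat(bnf={"root": '"yes" | "no"'})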
@ -186,19 +276,24 @@ ResponseFormat = register_schema(
)
# This is an internally used class
class CompletionRequest(BaseModel):
model: str
content: InterleavedContent
sampling_params: Optional[SamplingParams] = SamplingParams()
response_format: Optional[ResponseFormat] = None
stream: Optional[bool] = False
logprobs: Optional[LogProbConfig] = None
@json_schema_type
class CompletionResponse(BaseModel):
"""Completion response."""
"""Response from a completion request.
:param content: The generated completion text
:param stop_reason: Reason why generation stopped
:param logprobs: Optional log probabilities for generated tokens
"""
content: str
stop_reason: StopReason
@ -207,41 +302,60 @@ class CompletionResponse(BaseModel):
@json_schema_type
class CompletionResponseStreamChunk(BaseModel):
"""streamed completion response."""
"""A chunk of a streamed completion response.
:param delta: New content generated since last chunk. This can be one or more tokens.
:param stop_reason: Optional reason why generation stopped, if complete
:param logprobs: Optional log probabilities for generated tokens
"""
delta: str
stop_reason: Optional[StopReason] = None
logprobs: Optional[List[TokenLogProbs]] = None
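
A sketch of consuming the streamed variant against the Inference protocol defined at the end of this file; `inference` is assumed to be an implementation of that protocol:

async def collect_completion(inference, model_id: str, prompt: str) -> str:
    # With stream=True the protocol returns an async iterator of
    # CompletionResponseStreamChunk objects.
    stream = await inference.completion(model_id=model_id, content=prompt, stream=True)
    text = ""
    async for chunk in stream:
        text += chunk.delta                  # one or more new tokens
        if chunk.stop_reason is not None:    # set on the final chunk
            break
    return text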
# This is an internally used class
class ChatCompletionRequest(BaseModel):
model: str
messages: List[Message]
sampling_params: Optional[SamplingParams] = SamplingParams()
# zero-shot tool definitions as input to the model
tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
response_format: Optional[ResponseFormat] = None
stream: Optional[bool] = False
logprobs: Optional[LogProbConfig] = None
@json_schema_type
class ChatCompletionResponseStreamChunk(BaseModel):
"""A chunk of a streamed chat completion response.
:param event: The event containing the new content
"""
event: ChatCompletionResponseEvent
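
A sketch of unpacking streamed chat chunks; `chunks` is assumed to be the async iterator returned by chat_completion(..., stream=True):

from llama_stack.apis.inference import ChatCompletionResponseEventType

async def print_chat_stream(chunks) -> None:
    async for chunk in chunks:
        event = chunk.event
        if event.event_type == ChatCompletionResponseEventType.progress:
            # event.delta is a ContentDelta: new text tokens or a partial tool call.
            print(event.delta)
        elif event.event_type == ChatCompletionResponseEventType.complete:
            print("stop_reason:", event.stop_reason)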
@json_schema_type
class ChatCompletionResponse(BaseModel):
"""Response from a chat completion request.
:param completion_message: The complete response message
:param logprobs: Optional log probabilities for generated tokens
"""
completion_message: CompletionMessage
logprobs: Optional[List[TokenLogProbs]] = None
@json_schema_type
class EmbeddingsResponse(BaseModel):
"""Response containing generated embeddings.
:param embeddings: List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}
"""
embeddings: List[List[float]]
@ -266,7 +380,7 @@ class Inference(Protocol):
) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]:
"""Generate a completion for the given content using the specified model.
:param model_id: The identifier of the model to use
:param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param content: The content to generate a completion for
:param sampling_params: (Optional) Parameters to control the sampling strategy
:param response_format: (Optional) Grammar specification for guided (structured) decoding
@ -294,13 +408,18 @@ class Inference(Protocol):
]:
"""Generate a chat completion for the given messages using the specified model.
:param model_id: The identifier of the model to use
:param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param messages: List of messages in the conversation
:param sampling_params: Parameters to control the sampling strategy
:param tools: (Optional) List of tool definitions available to the model
:param tool_choice: (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
:param tool_prompt_format: (Optional) Specifies how tool definitions are formatted when presenting to the model
:param response_format: (Optional) Grammar specification for guided (structured) decoding
:param tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model.
- `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag.
- `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls.
:param response_format: (Optional) Grammar specification for guided (structured) decoding. There are two options:
- `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format.
- `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it.
:param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
:param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
:returns: If stream=False, returns a ChatCompletionResponse with the full completion.
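
Putting the pieces together, a hedged sketch of one non-streaming call to this method; `inference` is assumed to implement the protocol, and the model identifier is illustrative:

async def ask(inference, messages, response_format=None):
    response = await inference.chat_completion(
        model_id="meta-llama/Llama-3.1-8B-Instruct",  # must be registered via /models
        messages=messages,                            # e.g. the list built earlier
        response_format=response_format,              # e.g. json_format from earlier
        stream=False,
    )
    print(response.completion_message.content)
    return response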
@ -316,8 +435,8 @@ class Inference(Protocol):
) -> EmbeddingsResponse:
"""Generate embeddings for content pieces using the specified model.
:param model_id: The identifier of the model to use
:param contents: List of contents to generate embeddings for. Note that content can be multimodal.
:returns: An array of embeddings, one for each content. Each embedding is a list of floats.
:param model_id: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint.
:param contents: List of contents to generate embeddings for. Note that content can be multimodal. The behavior depends on the model and provider. Some models may only support text.
:returns: An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}
"""
...
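
Finally, a sketch of the embeddings call; `inference` is again assumed to implement the protocol, and the model identifier is illustrative and must resolve to an embedding model registered with the stack:

async def embed(inference, texts):
    result = await inference.embeddings(
        model_id="all-MiniLM-L6-v2",    # illustrative; must be an embedding model in /models
        contents=texts,
    )
    dim = len(result.embeddings[0])     # dimensionality is model-specific
    return result.embeddings, dim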