diff --git a/docs/openapi_generator/generate.py b/docs/openapi_generator/generate.py
index 1a59369cb..48109e5d8 100644
--- a/docs/openapi_generator/generate.py
+++ b/docs/openapi_generator/generate.py
@@ -36,6 +36,16 @@ from .pyopenapi.specification import Info, Server # noqa: E402
from .pyopenapi.utility import Specification # noqa: E402
+def str_presenter(dumper, data):
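+    # Use the default scalar style for API paths and schema $refs so they
+    # stay on one line; fold other long or multi-line strings for readability.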
+ if data.startswith(f"/{LLAMA_STACK_API_VERSION}") or data.startswith(
+ "#/components/schemas/"
+ ):
+ style = None
+ else:
+ style = ">" if "\n" in data or len(data) > 40 else None
+ return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=style)
+
+
def main(output_dir: str):
output_dir = Path(output_dir)
if not output_dir.exists():
@@ -69,7 +79,8 @@ def main(output_dir: str):
y.sequence_dash_offset = 2
y.width = 80
y.allow_unicode = True
- y.explicit_start = True
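+    # Register the custom presenter (defined above) so long strings are
+    # dumped in YAML folded style rather than as unwieldy one-line scalars.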
+ y.representer.add_representer(str, str_presenter)
+
y.dump(
spec.get_json(),
fp,
diff --git a/docs/openapi_generator/pyopenapi/generator.py b/docs/openapi_generator/pyopenapi/generator.py
index 317b895b5..d8e0d81ed 100644
--- a/docs/openapi_generator/pyopenapi/generator.py
+++ b/docs/openapi_generator/pyopenapi/generator.py
@@ -8,6 +8,7 @@ import collections
import hashlib
import ipaddress
import typing
+from dataclasses import make_dataclass
from typing import Any, Dict, Set, Union
from ..strong_typing.core import JsonType
@@ -276,6 +277,20 @@ class StatusResponse:
examples: List[Any] = dataclasses.field(default_factory=list)
+def create_docstring_for_request(
+    request_name: str, fields: List[Tuple[str, type]], doc_params: Dict[str, str]
+) -> str:
+ """Creates a ReST-style docstring for a dynamically generated request dataclass."""
+ lines = ["\n"] # Short description
+
+ # Add parameter documentation in ReST format
+    for name, _ in fields:
+ desc = doc_params.get(name, "")
+ lines.append(f":param {name}: {desc}")
+
+ return "\n".join(lines)
+
+
class ResponseBuilder:
content_builder: ContentBuilder
@@ -493,11 +508,24 @@ class Generator:
first = next(iter(op.request_params))
request_name, request_type = first
- from dataclasses import make_dataclass
-
op_name = "".join(word.capitalize() for word in op.name.split("_"))
request_name = f"{op_name}Request"
- request_type = make_dataclass(request_name, op.request_params)
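+            # Attach a synthesized docstring so the schema generator emits
+            # per-field descriptions for the request body.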
+            fields = [(name, type_) for name, type_ in op.request_params]
+ request_type = make_dataclass(
+ request_name,
+ fields,
+ namespace={
+ "__doc__": create_docstring_for_request(
+ request_name, fields, doc_params
+ )
+ },
+ )
requestBody = RequestBody(
content={
diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html
index 5998963d2..b720bef21 100644
--- a/docs/resources/llama-stack-spec.html
+++ b/docs/resources/llama-stack-spec.html
@@ -190,7 +190,7 @@
"post": {
"responses": {
"200": {
- "description": "Chat completion response. **OR** SSE-stream of these events.",
+          "description": "If stream=False, returns a ChatCompletionResponse with the full completion. If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk.",
"content": {
"text/event-stream": {
"schema": {
@@ -210,6 +210,7 @@
"tags": [
"Inference"
],
+ "summary": "Generate a chat completion for the given messages using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@@ -227,7 +228,7 @@
"post": {
"responses": {
"200": {
- "description": "Completion response. **OR** streamed completion response.",
+          "description": "If stream=False, returns a CompletionResponse with the full completion. If stream=True, returns an SSE event stream of CompletionResponseStreamChunk.",
"content": {
"text/event-stream": {
"schema": {
@@ -247,6 +248,7 @@
"tags": [
"Inference"
],
+ "summary": "Generate a completion for the given content using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@@ -485,7 +487,7 @@
"post": {
"responses": {
"200": {
- "description": "OK",
+ "description": "An array of embeddings, one for each content. Each embedding is a list of floats.",
"content": {
"application/json": {
"schema": {
@@ -498,6 +500,7 @@
"tags": [
"Inference"
],
+ "summary": "Generate embeddings for content pieces using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@@ -2372,6 +2375,46 @@
"tool_calls"
]
},
+ "GrammarResponseFormat": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "grammar",
+ "default": "grammar"
+ },
+ "bnf": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "bnf"
+ ]
+ },
"GreedySamplingStrategy": {
"type": "object",
"properties": {
@@ -2447,6 +2490,46 @@
}
}
},
+ "JsonSchemaResponseFormat": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "json_schema",
+ "default": "json_schema"
+ },
+ "json_schema": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "json_schema"
+ ]
+ },
"Message": {
"oneOf": [
{
@@ -2472,6 +2555,23 @@
}
}
},
+ "ResponseFormat": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/JsonSchemaResponseFormat"
+ },
+ {
+ "$ref": "#/components/schemas/GrammarResponseFormat"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "json_schema": "#/components/schemas/JsonSchemaResponseFormat",
+ "grammar": "#/components/schemas/GrammarResponseFormat"
+ }
+ }
+ },
"SamplingParams": {
"type": "object",
"properties": {
@@ -2865,6 +2965,9 @@
"tool_prompt_format": {
"$ref": "#/components/schemas/ToolPromptFormat"
},
+ "response_format": {
+ "$ref": "#/components/schemas/ResponseFormat"
+ },
"logprobs": {
"type": "object",
"properties": {
@@ -2885,16 +2988,49 @@
"BatchChatCompletionResponse": {
"type": "object",
"properties": {
- "completion_message_batch": {
+ "batch": {
"type": "array",
"items": {
- "$ref": "#/components/schemas/CompletionMessage"
+ "$ref": "#/components/schemas/ChatCompletionResponse"
}
}
},
"additionalProperties": false,
"required": [
- "completion_message_batch"
+ "batch"
+ ]
+ },
+ "ChatCompletionResponse": {
+ "type": "object",
+ "properties": {
+ "completion_message": {
+ "$ref": "#/components/schemas/CompletionMessage"
+ },
+ "logprobs": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/TokenLogProbs"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "completion_message"
+ ]
+ },
+ "TokenLogProbs": {
+ "type": "object",
+ "properties": {
+ "logprobs_by_token": {
+ "type": "object",
+ "additionalProperties": {
+ "type": "number"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "logprobs_by_token"
]
},
"BatchCompletionRequest": {
@@ -2912,6 +3048,9 @@
"sampling_params": {
"$ref": "#/components/schemas/SamplingParams"
},
+ "response_format": {
+ "$ref": "#/components/schemas/ResponseFormat"
+ },
"logprobs": {
"type": "object",
"properties": {
@@ -2932,18 +3071,41 @@
"BatchCompletionResponse": {
"type": "object",
"properties": {
- "completion_message_batch": {
+ "batch": {
"type": "array",
"items": {
- "$ref": "#/components/schemas/CompletionMessage"
+ "$ref": "#/components/schemas/CompletionResponse"
}
}
},
"additionalProperties": false,
"required": [
- "completion_message_batch"
+ "batch"
]
},
+ "CompletionResponse": {
+ "type": "object",
+ "properties": {
+ "content": {
+ "type": "string"
+ },
+ "stop_reason": {
+ "$ref": "#/components/schemas/StopReason"
+ },
+ "logprobs": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/TokenLogProbs"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "content",
+ "stop_reason"
+ ],
+ "title": "Completion response."
+ },
"CancelTrainingJobRequest": {
"type": "object",
"properties": {
@@ -2956,135 +3118,46 @@
"job_uuid"
]
},
- "GrammarResponseFormat": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "grammar",
- "default": "grammar"
- },
- "bnf": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "bnf"
- ]
- },
- "JsonSchemaResponseFormat": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "json_schema",
- "default": "json_schema"
- },
- "json_schema": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "json_schema"
- ]
- },
- "ResponseFormat": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/JsonSchemaResponseFormat"
- },
- {
- "$ref": "#/components/schemas/GrammarResponseFormat"
- }
- ],
- "discriminator": {
- "propertyName": "type",
- "mapping": {
- "json_schema": "#/components/schemas/JsonSchemaResponseFormat",
- "grammar": "#/components/schemas/GrammarResponseFormat"
- }
- }
- },
"ChatCompletionRequest": {
"type": "object",
"properties": {
"model_id": {
- "type": "string"
+ "type": "string",
+ "description": "The identifier of the model to use"
},
"messages": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Message"
- }
+ },
+ "description": "List of messages in the conversation"
},
"sampling_params": {
- "$ref": "#/components/schemas/SamplingParams"
+ "$ref": "#/components/schemas/SamplingParams",
+          "description": "(Optional) Parameters to control the sampling strategy"
},
"tools": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ToolDefinition"
- }
+ },
+ "description": "(Optional) List of tool definitions available to the model"
},
"tool_choice": {
- "$ref": "#/components/schemas/ToolChoice"
+ "$ref": "#/components/schemas/ToolChoice",
+ "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto."
},
"tool_prompt_format": {
- "$ref": "#/components/schemas/ToolPromptFormat"
+ "$ref": "#/components/schemas/ToolPromptFormat",
+ "description": "(Optional) Specifies how tool definitions are formatted when presenting to the model"
},
"response_format": {
- "$ref": "#/components/schemas/ResponseFormat"
+ "$ref": "#/components/schemas/ResponseFormat",
+ "description": "(Optional) Grammar specification for guided (structured) decoding"
},
"stream": {
- "type": "boolean"
+ "type": "boolean",
+ "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
},
"logprobs": {
"type": "object",
@@ -3094,7 +3167,8 @@
"default": 0
}
},
- "additionalProperties": false
+ "additionalProperties": false,
+ "description": "(Optional) If specified, log probabilities for each token position will be returned."
}
},
"additionalProperties": false,
@@ -3103,25 +3177,6 @@
"messages"
]
},
- "ChatCompletionResponse": {
- "type": "object",
- "properties": {
- "completion_message": {
- "$ref": "#/components/schemas/CompletionMessage"
- },
- "logprobs": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/TokenLogProbs"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "completion_message"
- ],
- "title": "Chat completion response."
- },
"ChatCompletionResponseEvent": {
"type": "object",
"properties": {
@@ -3166,8 +3221,7 @@
"additionalProperties": false,
"required": [
"event"
- ],
- "title": "SSE-stream of these events."
+ ]
},
"ContentDelta": {
"oneOf": [
@@ -3227,21 +3281,6 @@
"text"
]
},
- "TokenLogProbs": {
- "type": "object",
- "properties": {
- "logprobs_by_token": {
- "type": "object",
- "additionalProperties": {
- "type": "number"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "logprobs_by_token"
- ]
- },
"ToolCallDelta": {
"type": "object",
"properties": {
@@ -3284,19 +3323,24 @@
"type": "object",
"properties": {
"model_id": {
- "type": "string"
+ "type": "string",
+ "description": "The identifier of the model to use"
},
"content": {
- "$ref": "#/components/schemas/InterleavedContent"
+ "$ref": "#/components/schemas/InterleavedContent",
+ "description": "The content to generate a completion for"
},
"sampling_params": {
- "$ref": "#/components/schemas/SamplingParams"
+ "$ref": "#/components/schemas/SamplingParams",
+ "description": "(Optional) Parameters to control the sampling strategy"
},
"response_format": {
- "$ref": "#/components/schemas/ResponseFormat"
+ "$ref": "#/components/schemas/ResponseFormat",
+ "description": "(Optional) Grammar specification for guided (structured) decoding"
},
"stream": {
- "type": "boolean"
+ "type": "boolean",
+ "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
},
"logprobs": {
"type": "object",
@@ -3306,7 +3350,8 @@
"default": 0
}
},
- "additionalProperties": false
+ "additionalProperties": false,
+ "description": "(Optional) If specified, log probabilities for each token position will be returned."
}
},
"additionalProperties": false,
@@ -3315,29 +3360,6 @@
"content"
]
},
- "CompletionResponse": {
- "type": "object",
- "properties": {
- "content": {
- "type": "string"
- },
- "stop_reason": {
- "$ref": "#/components/schemas/StopReason"
- },
- "logprobs": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/TokenLogProbs"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "content",
- "stop_reason"
- ],
- "title": "Completion response."
- },
"CompletionResponseStreamChunk": {
"type": "object",
"properties": {
@@ -4241,13 +4263,15 @@
"type": "object",
"properties": {
"model_id": {
- "type": "string"
+ "type": "string",
+ "description": "The identifier of the model to use"
},
"contents": {
"type": "array",
"items": {
"$ref": "#/components/schemas/InterleavedContent"
- }
+ },
+ "description": "List of contents to generate embeddings for. Note that content can be multimodal."
}
},
"additionalProperties": false,
@@ -7863,7 +7887,7 @@
},
{
"name": "ChatCompletionResponse",
- "description": "Chat completion response."
+ "description": ""
},
{
"name": "ChatCompletionResponseEvent",
@@ -7875,7 +7899,7 @@
},
{
"name": "ChatCompletionResponseStreamChunk",
- "description": "SSE-stream of these events."
+ "description": ""
},
{
"name": "Checkpoint",
diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml
index 1d7c4f113..353d99d00 100644
--- a/docs/resources/llama-stack-spec.yaml
+++ b/docs/resources/llama-stack-spec.yaml
@@ -1,11 +1,12 @@
----
openapi: 3.1.0
info:
title: Llama Stack Specification
version: v1
- description: "This is the specification of the Llama Stack that provides\n \
- \ a set of endpoints and their corresponding interfaces that are tailored
- to\n best leverage Llama Models."
+ description: >-
+ This is the specification of the Llama Stack that provides
+ a set of endpoints and their corresponding interfaces that are
+ tailored to
+ best leverage Llama Models.
servers:
- url: http://any-hosted-llama-stack.com
paths:
@@ -108,7 +109,9 @@ paths:
post:
responses:
'200':
- description: Chat completion response. **OR** SSE-stream of these events.
+ description: >-
+ If stream=False, returns a ChatCompletionResponse with the full completion.
+          If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk.
content:
text/event-stream:
schema:
@@ -117,6 +120,8 @@ paths:
- $ref: '#/components/schemas/ChatCompletionResponseStreamChunk'
tags:
- Inference
+ summary: >-
+ Generate a chat completion for the given messages using the specified model.
parameters: []
requestBody:
content:
@@ -128,7 +133,9 @@ paths:
post:
responses:
'200':
- description: Completion response. **OR** streamed completion response.
+ description: >-
+ If stream=False, returns a CompletionResponse with the full completion.
+          If stream=True, returns an SSE event stream of CompletionResponseStreamChunk.
content:
text/event-stream:
schema:
@@ -137,6 +144,8 @@ paths:
- $ref: '#/components/schemas/CompletionResponseStreamChunk'
tags:
- Inference
+ summary: >-
+ Generate a completion for the given content using the specified model.
parameters: []
requestBody:
content:
@@ -189,8 +198,9 @@ paths:
post:
responses:
'200':
- description: A single turn in an interaction with an Agentic System. **OR**
- streamed agent turn completion response.
+ description: >-
+ A single turn in an interaction with an Agentic System. **OR** streamed
+ agent turn completion response.
content:
text/event-stream:
schema:
@@ -279,13 +289,17 @@ paths:
post:
responses:
'200':
- description: OK
+ description: >-
+ An array of embeddings, one for each content. Each embedding is a list
+ of floats.
content:
application/json:
schema:
$ref: '#/components/schemas/EmbeddingsResponse'
tags:
- Inference
+ summary: >-
+ Generate embeddings for content pieces using the specified model.
parameters: []
requestBody:
content:
@@ -709,7 +723,8 @@ paths:
description: OK
tags:
- ToolRuntime
- summary: Index documents so they can be used by the RAG system
+ summary: >-
+ Index documents so they can be used by the RAG system
parameters: []
requestBody:
content:
@@ -1109,7 +1124,8 @@ paths:
$ref: '#/components/schemas/RAGQueryResult'
tags:
- ToolRuntime
- summary: Query the RAG system for context; typically invoked by the agent
+ summary: >-
+ Query the RAG system for context; typically invoked by the agent
parameters: []
requestBody:
content:
@@ -1341,7 +1357,8 @@ paths:
tags:
- Inspect
parameters: []
-jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema
+jsonSchemaDialect: >-
+ https://json-schema.org/draft/2020-12/schema
components:
schemas:
AppendRowsRequest:
@@ -1393,6 +1410,27 @@ components:
- content
- stop_reason
- tool_calls
+ GrammarResponseFormat:
+ type: object
+ properties:
+ type:
+ type: string
+ const: grammar
+ default: grammar
+ bnf:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - type
+ - bnf
GreedySamplingStrategy:
type: object
properties:
@@ -1439,6 +1477,27 @@ components:
mapping:
image: '#/components/schemas/ImageContentItem'
text: '#/components/schemas/TextContentItem'
+ JsonSchemaResponseFormat:
+ type: object
+ properties:
+ type:
+ type: string
+ const: json_schema
+ default: json_schema
+ json_schema:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - type
+ - json_schema
Message:
oneOf:
- $ref: '#/components/schemas/UserMessage'
@@ -1452,6 +1511,15 @@ components:
system: '#/components/schemas/SystemMessage'
tool: '#/components/schemas/ToolResponseMessage'
assistant: '#/components/schemas/CompletionMessage'
+ ResponseFormat:
+ oneOf:
+ - $ref: '#/components/schemas/JsonSchemaResponseFormat'
+ - $ref: '#/components/schemas/GrammarResponseFormat'
+ discriminator:
+ propertyName: type
+ mapping:
+ json_schema: '#/components/schemas/JsonSchemaResponseFormat'
+ grammar: '#/components/schemas/GrammarResponseFormat'
SamplingParams:
type: object
properties:
@@ -1594,16 +1662,28 @@ components:
- json
- function_tag
- python_list
- title: This Enum refers to the prompt format for calling custom / zero shot
- tools
- description: "`json` --\n Refers to the json format for calling tools.\n\
- \ The json format takes the form like\n {\n \"type\": \"function\"\
- ,\n \"function\" : {\n \"name\": \"function_name\",\n \
- \ \"description\": \"function_description\",\n \"parameters\"\
- : {...}\n }\n }\n\n`function_tag` --\n This is an example of
- how you could define\n your own user defined format for making tool calls.\n\
- \ The function_tag format looks like this,\n (parameters)\n
- \nThe detailed prompts for each of these formats are added to llama cli"
+ title: >-
+ This Enum refers to the prompt format for calling custom / zero shot tools
+ description: >-
+ `json` --
+ Refers to the json format for calling tools.
+ The json format takes the form like
+ {
+ "type": "function",
+ "function" : {
+ "name": "function_name",
+ "description": "function_description",
+ "parameters": {...}
+ }
+ }
+
+ `function_tag` --
+ This is an example of how you could define
+ your own user defined format for making tool calls.
+ The function_tag format looks like this,
+ (parameters)
+
+ The detailed prompts for each of these formats are added to llama cli
ToolResponseMessage:
type: object
properties:
@@ -1697,6 +1777,8 @@ components:
$ref: '#/components/schemas/ToolChoice'
tool_prompt_format:
$ref: '#/components/schemas/ToolPromptFormat'
+ response_format:
+ $ref: '#/components/schemas/ResponseFormat'
logprobs:
type: object
properties:
@@ -1711,13 +1793,35 @@ components:
BatchChatCompletionResponse:
type: object
properties:
- completion_message_batch:
+ batch:
type: array
items:
- $ref: '#/components/schemas/CompletionMessage'
+ $ref: '#/components/schemas/ChatCompletionResponse'
additionalProperties: false
required:
- - completion_message_batch
+ - batch
+ ChatCompletionResponse:
+ type: object
+ properties:
+ completion_message:
+ $ref: '#/components/schemas/CompletionMessage'
+ logprobs:
+ type: array
+ items:
+ $ref: '#/components/schemas/TokenLogProbs'
+ additionalProperties: false
+ required:
+ - completion_message
+ TokenLogProbs:
+ type: object
+ properties:
+ logprobs_by_token:
+ type: object
+ additionalProperties:
+ type: number
+ additionalProperties: false
+ required:
+ - logprobs_by_token
BatchCompletionRequest:
type: object
properties:
@@ -1729,6 +1833,8 @@ components:
$ref: '#/components/schemas/InterleavedContent'
sampling_params:
$ref: '#/components/schemas/SamplingParams'
+ response_format:
+ $ref: '#/components/schemas/ResponseFormat'
logprobs:
type: object
properties:
@@ -1743,13 +1849,29 @@ components:
BatchCompletionResponse:
type: object
properties:
- completion_message_batch:
+ batch:
type: array
items:
- $ref: '#/components/schemas/CompletionMessage'
+ $ref: '#/components/schemas/CompletionResponse'
additionalProperties: false
required:
- - completion_message_batch
+ - batch
+ CompletionResponse:
+ type: object
+ properties:
+ content:
+ type: string
+ stop_reason:
+ $ref: '#/components/schemas/StopReason'
+ logprobs:
+ type: array
+ items:
+ $ref: '#/components/schemas/TokenLogProbs'
+ additionalProperties: false
+ required:
+ - content
+ - stop_reason
+ title: Completion response.
CancelTrainingJobRequest:
type: object
properties:
@@ -1758,80 +1880,45 @@ components:
additionalProperties: false
required:
- job_uuid
- GrammarResponseFormat:
- type: object
- properties:
- type:
- type: string
- const: grammar
- default: grammar
- bnf:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- additionalProperties: false
- required:
- - type
- - bnf
- JsonSchemaResponseFormat:
- type: object
- properties:
- type:
- type: string
- const: json_schema
- default: json_schema
- json_schema:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- additionalProperties: false
- required:
- - type
- - json_schema
- ResponseFormat:
- oneOf:
- - $ref: '#/components/schemas/JsonSchemaResponseFormat'
- - $ref: '#/components/schemas/GrammarResponseFormat'
- discriminator:
- propertyName: type
- mapping:
- json_schema: '#/components/schemas/JsonSchemaResponseFormat'
- grammar: '#/components/schemas/GrammarResponseFormat'
ChatCompletionRequest:
type: object
properties:
model_id:
type: string
+ description: The identifier of the model to use
messages:
type: array
items:
$ref: '#/components/schemas/Message'
+ description: List of messages in the conversation
sampling_params:
$ref: '#/components/schemas/SamplingParams'
+ description: >-
+            (Optional) Parameters to control the sampling strategy
tools:
type: array
items:
$ref: '#/components/schemas/ToolDefinition'
+ description: >-
+ (Optional) List of tool definitions available to the model
tool_choice:
$ref: '#/components/schemas/ToolChoice'
+ description: >-
+ (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
tool_prompt_format:
$ref: '#/components/schemas/ToolPromptFormat'
+ description: >-
+ (Optional) Specifies how tool definitions are formatted when presenting
+ to the model
response_format:
$ref: '#/components/schemas/ResponseFormat'
+ description: >-
+ (Optional) Grammar specification for guided (structured) decoding
stream:
type: boolean
+ description: >-
+ (Optional) If True, generate an SSE event stream of the response. Defaults
+ to False.
logprobs:
type: object
properties:
@@ -1839,23 +1926,13 @@ components:
type: integer
default: 0
additionalProperties: false
+ description: >-
+ (Optional) If specified, log probabilities for each token position will
+ be returned.
additionalProperties: false
required:
- model_id
- messages
- ChatCompletionResponse:
- type: object
- properties:
- completion_message:
- $ref: '#/components/schemas/CompletionMessage'
- logprobs:
- type: array
- items:
- $ref: '#/components/schemas/TokenLogProbs'
- additionalProperties: false
- required:
- - completion_message
- title: Chat completion response.
ChatCompletionResponseEvent:
type: object
properties:
@@ -1888,7 +1965,6 @@ components:
additionalProperties: false
required:
- event
- title: SSE-stream of these events.
ContentDelta:
oneOf:
- $ref: '#/components/schemas/TextDelta'
@@ -1927,16 +2003,6 @@ components:
required:
- type
- text
- TokenLogProbs:
- type: object
- properties:
- logprobs_by_token:
- type: object
- additionalProperties:
- type: number
- additionalProperties: false
- required:
- - logprobs_by_token
ToolCallDelta:
type: object
properties:
@@ -1967,14 +2033,23 @@ components:
properties:
model_id:
type: string
+ description: The identifier of the model to use
content:
$ref: '#/components/schemas/InterleavedContent'
+ description: The content to generate a completion for
sampling_params:
$ref: '#/components/schemas/SamplingParams'
+ description: >-
+ (Optional) Parameters to control the sampling strategy
response_format:
$ref: '#/components/schemas/ResponseFormat'
+ description: >-
+ (Optional) Grammar specification for guided (structured) decoding
stream:
type: boolean
+ description: >-
+ (Optional) If True, generate an SSE event stream of the response. Defaults
+ to False.
logprobs:
type: object
properties:
@@ -1982,26 +2057,13 @@ components:
type: integer
default: 0
additionalProperties: false
+ description: >-
+ (Optional) If specified, log probabilities for each token position will
+ be returned.
additionalProperties: false
required:
- model_id
- content
- CompletionResponse:
- type: object
- properties:
- content:
- type: string
- stop_reason:
- $ref: '#/components/schemas/StopReason'
- logprobs:
- type: array
- items:
- $ref: '#/components/schemas/TokenLogProbs'
- additionalProperties: false
- required:
- - content
- - stop_reason
- title: Completion response.
CompletionResponseStreamChunk:
type: object
properties:
@@ -2558,7 +2620,8 @@ components:
- output_message
- output_attachments
- started_at
- title: A single turn in an interaction with an Agentic System.
+ title: >-
+ A single turn in an interaction with an Agentic System.
ViolationLevel:
type: string
enum:
@@ -2570,10 +2633,14 @@ components:
properties:
model_id:
type: string
+ description: The identifier of the model to use
contents:
type: array
items:
$ref: '#/components/schemas/InterleavedContent'
+ description: >-
+ List of contents to generate embeddings for. Note that content can be
+ multimodal.
additionalProperties: false
required:
- model_id
@@ -2845,7 +2912,8 @@ components:
- session_name
- turns
- started_at
- title: A single session of an interaction with an Agentic System.
+ title: >-
+ A single session of an interaction with an Agentic System.
AgentStepResponse:
type: object
properties:
@@ -3194,7 +3262,8 @@ components:
- provider_resource_id
- provider_id
- type
- title: A safety shield resource that can be used to check content
+ title: >-
+ A safety shield resource that can be used to check content
Span:
type: object
properties:
@@ -4684,8 +4753,9 @@ components:
additionalProperties: false
required:
- synthetic_data
- title: Response from the synthetic data generation. Batch of (prompt, response,
- score) tuples that pass the threshold.
+ title: >-
+ Response from the synthetic data generation. Batch of (prompt, response, score)
+ tuples that pass the threshold.
VersionInfo:
type: object
properties:
@@ -4763,13 +4833,13 @@ tags:
- name: ChatCompletionRequest
description: ''
- name: ChatCompletionResponse
- description: Chat completion response.
+ description: ''
- name: ChatCompletionResponseEvent
description: Chat completion response event.
- name: ChatCompletionResponseEventType
description: ''
- name: ChatCompletionResponseStreamChunk
- description: SSE-stream of these events.
+ description: ''
- name: Checkpoint
description: Checkpoint created during training runs
- name: CompletionInputType
@@ -4998,9 +5068,11 @@ tags:
- name: ScoringResult
description: ''
- name: Session
- description: A single session of an interaction with an Agentic System.
+ description: >-
+ A single session of an interaction with an Agentic System.
- name: Shield
- description: A safety shield resource that can be used to check content
+ description: >-
+ A safety shield resource that can be used to check content
- name: ShieldCallStep
description: ''
- name: Shields
@@ -5028,8 +5100,9 @@ tags:
description: ''
- name: SyntheticDataGeneration (Coming Soon)
- name: SyntheticDataGenerationResponse
- description: Response from the synthetic data generation. Batch of (prompt, response,
- score) tuples that pass the threshold.
+ description: >-
+ Response from the synthetic data generation. Batch of (prompt, response, score)
+ tuples that pass the threshold.
- name: SystemMessage
description: ''
- name: Telemetry
@@ -5067,15 +5140,29 @@ tags:
- name: ToolParameter
description: ''
- name: ToolPromptFormat
- description: "This Enum refers to the prompt format for calling custom / zero
- shot tools\n\n`json` --\n Refers to the json format for calling tools.\n\
- \ The json format takes the form like\n {\n \"type\": \"function\"\
- ,\n \"function\" : {\n \"name\": \"function_name\",\n \
- \ \"description\": \"function_description\",\n \"parameters\"\
- : {...}\n }\n }\n\n`function_tag` --\n This is an example of how
- you could define\n your own user defined format for making tool calls.\n\
- \ The function_tag format looks like this,\n (parameters)\n
- \nThe detailed prompts for each of these formats are added to llama cli"
+ description: >-
+ This Enum refers to the prompt format for calling custom / zero shot tools
+
+
+ `json` --
+ Refers to the json format for calling tools.
+ The json format takes the form like
+ {
+ "type": "function",
+ "function" : {
+ "name": "function_name",
+ "description": "function_description",
+ "parameters": {...}
+ }
+ }
+
+ `function_tag` --
+ This is an example of how you could define
+ your own user defined format for making tool calls.
+ The function_tag format looks like this,
+ (parameters)
+
+ The detailed prompts for each of these formats are added to llama cli
- name: ToolResponse
description: ''
- name: ToolResponseMessage
@@ -5090,7 +5177,8 @@ tags:
- name: TrainingConfig
description: ''
- name: Turn
- description: A single turn in an interaction with an Agentic System.
+ description: >-
+ A single turn in an interaction with an Agentic System.
- name: URL
description: ''
- name: UnionType
diff --git a/llama_stack/apis/batch_inference/batch_inference.py b/llama_stack/apis/batch_inference/batch_inference.py
index ca5ba059f..413c81c5a 100644
--- a/llama_stack/apis/batch_inference/batch_inference.py
+++ b/llama_stack/apis/batch_inference/batch_inference.py
@@ -7,13 +7,15 @@
from typing import List, Optional, Protocol, runtime_checkable
from llama_models.schema_utils import json_schema_type, webmethod
-from pydantic import BaseModel, Field
+from pydantic import BaseModel
from llama_stack.apis.inference import (
- CompletionMessage,
+ ChatCompletionResponse,
+ CompletionResponse,
InterleavedContent,
LogProbConfig,
Message,
+ ResponseFormat,
SamplingParams,
ToolChoice,
ToolDefinition,
@@ -21,35 +23,14 @@ from llama_stack.apis.inference import (
)
-@json_schema_type
-class BatchCompletionRequest(BaseModel):
- model: str
- content_batch: List[InterleavedContent]
- sampling_params: Optional[SamplingParams] = SamplingParams()
- logprobs: Optional[LogProbConfig] = None
-
-
@json_schema_type
class BatchCompletionResponse(BaseModel):
- completion_message_batch: List[CompletionMessage]
-
-
-@json_schema_type
-class BatchChatCompletionRequest(BaseModel):
- model: str
- messages_batch: List[List[Message]]
- sampling_params: Optional[SamplingParams] = SamplingParams()
-
- # zero-shot tool definitions as input to the model
- tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
- tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
- tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
- logprobs: Optional[LogProbConfig] = None
+ batch: List[CompletionResponse]
@json_schema_type
class BatchChatCompletionResponse(BaseModel):
- completion_message_batch: List[CompletionMessage]
+ batch: List[ChatCompletionResponse]
@runtime_checkable
@@ -60,6 +41,7 @@ class BatchInference(Protocol):
model: str,
content_batch: List[InterleavedContent],
sampling_params: Optional[SamplingParams] = SamplingParams(),
+ response_format: Optional[ResponseFormat] = None,
logprobs: Optional[LogProbConfig] = None,
) -> BatchCompletionResponse: ...
@@ -73,5 +55,6 @@ class BatchInference(Protocol):
tools: Optional[List[ToolDefinition]] = list,
tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = None,
+ response_format: Optional[ResponseFormat] = None,
logprobs: Optional[LogProbConfig] = None,
) -> BatchChatCompletionResponse: ...
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index 871f1f633..36f385eb2 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -186,7 +186,6 @@ ResponseFormat = register_schema(
)
-@json_schema_type
class CompletionRequest(BaseModel):
model: str
content: InterleavedContent
@@ -215,23 +214,6 @@ class CompletionResponseStreamChunk(BaseModel):
logprobs: Optional[List[TokenLogProbs]] = None
-@json_schema_type
-class BatchCompletionRequest(BaseModel):
- model: str
- content_batch: List[InterleavedContent]
- sampling_params: Optional[SamplingParams] = SamplingParams()
- response_format: Optional[ResponseFormat] = None
- logprobs: Optional[LogProbConfig] = None
-
-
-@json_schema_type
-class BatchCompletionResponse(BaseModel):
- """Batch completion response."""
-
- batch: List[CompletionResponse]
-
-
-@json_schema_type
class ChatCompletionRequest(BaseModel):
model: str
messages: List[Message]
@@ -249,37 +231,15 @@ class ChatCompletionRequest(BaseModel):
@json_schema_type
class ChatCompletionResponseStreamChunk(BaseModel):
- """SSE-stream of these events."""
-
event: ChatCompletionResponseEvent
@json_schema_type
class ChatCompletionResponse(BaseModel):
- """Chat completion response."""
-
completion_message: CompletionMessage
logprobs: Optional[List[TokenLogProbs]] = None
-@json_schema_type
-class BatchChatCompletionRequest(BaseModel):
- model: str
- messages_batch: List[List[Message]]
- sampling_params: Optional[SamplingParams] = SamplingParams()
-
- # zero-shot tool definitions as input to the model
- tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
- tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
- tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
- logprobs: Optional[LogProbConfig] = None
-
-
-@json_schema_type
-class BatchChatCompletionResponse(BaseModel):
- batch: List[ChatCompletionResponse]
-
-
@json_schema_type
class EmbeddingsResponse(BaseModel):
embeddings: List[List[float]]
@@ -303,7 +263,19 @@ class Inference(Protocol):
response_format: Optional[ResponseFormat] = None,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
- ) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]: ...
+ ) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]:
+ """Generate a completion for the given content using the specified model.
+
+ :param model_id: The identifier of the model to use
+ :param content: The content to generate a completion for
+ :param sampling_params: (Optional) Parameters to control the sampling strategy
+ :param response_format: (Optional) Grammar specification for guided (structured) decoding
+ :param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
+ :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
+ :returns: If stream=False, returns a CompletionResponse with the full completion.
+        If stream=True, returns an SSE event stream of CompletionResponseStreamChunk.
+ """
+ ...
@webmethod(route="/inference/chat-completion", method="POST")
async def chat_completion(
@@ -311,7 +283,6 @@ class Inference(Protocol):
model_id: str,
messages: List[Message],
sampling_params: Optional[SamplingParams] = SamplingParams(),
- # zero-shot tool definitions as input to the model
tools: Optional[List[ToolDefinition]] = None,
tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = None,
@@ -320,11 +291,33 @@ class Inference(Protocol):
logprobs: Optional[LogProbConfig] = None,
) -> Union[
ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]
- ]: ...
+ ]:
+ """Generate a chat completion for the given messages using the specified model.
+
+ :param model_id: The identifier of the model to use
+ :param messages: List of messages in the conversation
+        :param sampling_params: (Optional) Parameters to control the sampling strategy
+ :param tools: (Optional) List of tool definitions available to the model
+ :param tool_choice: (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
+ :param tool_prompt_format: (Optional) Specifies how tool definitions are formatted when presenting to the model
+ :param response_format: (Optional) Grammar specification for guided (structured) decoding
+ :param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
+ :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
+ :returns: If stream=False, returns a ChatCompletionResponse with the full completion.
+            If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk.
+ """
+ ...
@webmethod(route="/inference/embeddings", method="POST")
async def embeddings(
self,
model_id: str,
contents: List[InterleavedContent],
- ) -> EmbeddingsResponse: ...
+ ) -> EmbeddingsResponse:
+ """Generate embeddings for content pieces using the specified model.
+
+ :param model_id: The identifier of the model to use
+ :param contents: List of contents to generate embeddings for. Note that content can be multimodal.
+ :returns: An array of embeddings, one for each content. Each embedding is a list of floats.
+ """
+ ...