Update OpenAPI generator to add param and field documentation

Ashwin Bharambe 2025-01-28 12:27:21 -08:00
parent 9f709387e2
commit ebfa8ad4fb
7 changed files with 525 additions and 397 deletions


@@ -36,6 +36,16 @@ from .pyopenapi.specification import Info, Server  # noqa: E402
from .pyopenapi.utility import Specification  # noqa: E402

+def str_presenter(dumper, data):
+    if data.startswith(f"/{LLAMA_STACK_API_VERSION}") or data.startswith(
+        "#/components/schemas/"
+    ):
+        style = None
+    else:
+        style = ">" if "\n" in data or len(data) > 40 else None
+    return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=style)

def main(output_dir: str):
    output_dir = Path(output_dir)
    if not output_dir.exists():
@@ -69,7 +79,8 @@ def main(output_dir: str):
        y.sequence_dash_offset = 2
        y.width = 80
        y.allow_unicode = True
-        y.explicit_start = True
+        y.representer.add_representer(str, str_presenter)
+
        y.dump(
            spec.get_json(),
            fp,
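The presenter's effect is easiest to see in isolation: long or multi-line strings get YAML's folded ">" style, while route paths and schema refs stay on one plain line. A minimal, self-contained sketch of the same idea, assuming ruamel.yaml (whose YAML() API the code above matches); LLAMA_STACK_API_VERSION is stubbed in here:

import sys

from ruamel.yaml import YAML

LLAMA_STACK_API_VERSION = "v1"  # stand-in for the real constant

def str_presenter(dumper, data):
    # keep route paths and schema refs on one line so they stay grep-able
    if data.startswith(f"/{LLAMA_STACK_API_VERSION}") or data.startswith(
        "#/components/schemas/"
    ):
        style = None
    else:
        # fold anything long or multi-line into a ">" block scalar
        style = ">" if "\n" in data or len(data) > 40 else None
    return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=style)

y = YAML(typ="rt")
y.width = 80
y.representer.add_representer(str, str_presenter)
y.dump(
    {
        "path": "/v1/inference/chat-completion",
        "description": "A long description that will be emitted as a folded block scalar instead of a quoted flow scalar.",
    },
    sys.stdout,
)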


@@ -8,6 +8,7 @@ import collections
import hashlib
import ipaddress
import typing
+from dataclasses import field, make_dataclass
from typing import Any, Dict, Set, Union

from ..strong_typing.core import JsonType
@@ -276,6 +277,20 @@ class StatusResponse:
    examples: List[Any] = dataclasses.field(default_factory=list)

+def create_docstring_for_request(
+    request_name: str, fields: List[Tuple[str, type, Any]], doc_params: Dict[str, str]
+) -> str:
+    """Creates a ReST-style docstring for a dynamically generated request dataclass."""
+    lines = ["\n"]  # Short description
+
+    # Add parameter documentation in ReST format
+    for name, type_ in fields:
+        desc = doc_params.get(name, "")
+        lines.append(f":param {name}: {desc}")
+
+    return "\n".join(lines)

class ResponseBuilder:
    content_builder: ContentBuilder
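Given doc_params harvested from the protocol method's docstring, the helper emits one :param line per request field. A hypothetical illustration of its output (names taken from the inference API later in this commit):

doc_params = {
    "model_id": "The identifier of the model to use",
    "messages": "List of messages in the conversation",
}
fields = [("model_id", str), ("messages", list)]

print(create_docstring_for_request("ChatCompletionRequest", fields, doc_params))
#
# :param model_id: The identifier of the model to use
# :param messages: List of messages in the conversation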
@@ -493,11 +508,24 @@ class Generator:
            first = next(iter(op.request_params))
            request_name, request_type = first

-            from dataclasses import make_dataclass
-
            op_name = "".join(word.capitalize() for word in op.name.split("_"))
            request_name = f"{op_name}Request"
-            request_type = make_dataclass(request_name, op.request_params)
+            fields = [
+                (
+                    name,
+                    type_,
+                )
+                for name, type_ in op.request_params
+            ]
+            request_type = make_dataclass(
+                request_name,
+                fields,
+                namespace={
+                    "__doc__": create_docstring_for_request(
+                        request_name, fields, doc_params
+                    )
+                },
+            )

        requestBody = RequestBody(
            content={
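Passing namespace={"__doc__": ...} is the load-bearing detail: the dataclass machinery only synthesizes a default __doc__ when the class has none, so the ReST parameter docs survive on the generated request type and can later be turned into per-property descriptions. A self-contained sketch of that mechanism (field names are illustrative):

import dataclasses
from dataclasses import make_dataclass

def create_docstring_for_request(request_name, fields, doc_params):
    lines = ["\n"]
    for name, _type in fields:
        lines.append(f":param {name}: {doc_params.get(name, '')}")
    return "\n".join(lines)

fields = [("model_id", str), ("stream", bool)]
doc_params = {"model_id": "The identifier of the model to use"}
ChatCompletionRequest = make_dataclass(
    "ChatCompletionRequest",
    fields,
    namespace={
        "__doc__": create_docstring_for_request(
            "ChatCompletionRequest", fields, doc_params
        )
    },
)

print(ChatCompletionRequest.__doc__)  # the :param lines, not the default signature doc
print([f.name for f in dataclasses.fields(ChatCompletionRequest)])  # ['model_id', 'stream']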


@@ -531,6 +531,7 @@ class JsonSchemaGenerator:
            # add property docstring if available
            property_doc = property_docstrings.get(property_name)
            if property_doc:
+                # print(output_name, property_doc)
                property_def.pop("title", None)
                property_def["description"] = property_doc


@@ -190,7 +190,7 @@
      "post": {
        "responses": {
          "200": {
-           "description": "Chat completion response. **OR** SSE-stream of these events.",
+           "description": "If stream=False, returns a ChatCompletionResponse with the full completion. If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk",
            "content": {
              "text/event-stream": {
                "schema": {
@@ -210,6 +210,7 @@
        "tags": [
          "Inference"
        ],
+       "summary": "Generate a chat completion for the given messages using the specified model.",
        "parameters": [],
        "requestBody": {
          "content": {
@@ -227,7 +228,7 @@
      "post": {
        "responses": {
          "200": {
-           "description": "Completion response. **OR** streamed completion response.",
+           "description": "If stream=False, returns a CompletionResponse with the full completion. If stream=True, returns an SSE event stream of CompletionResponseStreamChunk",
            "content": {
              "text/event-stream": {
                "schema": {
@@ -247,6 +248,7 @@
        "tags": [
          "Inference"
        ],
+       "summary": "Generate a completion for the given content using the specified model.",
        "parameters": [],
        "requestBody": {
          "content": {
@@ -485,7 +487,7 @@
      "post": {
        "responses": {
          "200": {
-           "description": "OK",
+           "description": "An array of embeddings, one for each content. Each embedding is a list of floats.",
            "content": {
              "application/json": {
                "schema": {
@@ -498,6 +500,7 @@
        "tags": [
          "Inference"
        ],
+       "summary": "Generate embeddings for content pieces using the specified model.",
        "parameters": [],
        "requestBody": {
          "content": {
@@ -2372,6 +2375,46 @@
          "tool_calls"
        ]
      },
+     "GrammarResponseFormat": {
+       "type": "object",
+       "properties": {
+         "type": {
+           "type": "string",
+           "const": "grammar",
+           "default": "grammar"
+         },
+         "bnf": {
+           "type": "object",
+           "additionalProperties": {
+             "oneOf": [
+               {
+                 "type": "null"
+               },
+               {
+                 "type": "boolean"
+               },
+               {
+                 "type": "number"
+               },
+               {
+                 "type": "string"
+               },
+               {
+                 "type": "array"
+               },
+               {
+                 "type": "object"
+               }
+             ]
+           }
+         }
+       },
+       "additionalProperties": false,
+       "required": [
+         "type",
+         "bnf"
+       ]
+     },
      "GreedySamplingStrategy": {
        "type": "object",
        "properties": {
@@ -2447,6 +2490,46 @@
          }
        }
      },
+     "JsonSchemaResponseFormat": {
+       "type": "object",
+       "properties": {
+         "type": {
+           "type": "string",
+           "const": "json_schema",
+           "default": "json_schema"
+         },
+         "json_schema": {
+           "type": "object",
+           "additionalProperties": {
+             "oneOf": [
+               {
+                 "type": "null"
+               },
+               {
+                 "type": "boolean"
+               },
+               {
+                 "type": "number"
+               },
+               {
+                 "type": "string"
+               },
+               {
+                 "type": "array"
+               },
+               {
+                 "type": "object"
+               }
+             ]
+           }
+         }
+       },
+       "additionalProperties": false,
+       "required": [
+         "type",
+         "json_schema"
+       ]
+     },
      "Message": {
        "oneOf": [
          {
@@ -2472,6 +2555,23 @@
          }
        }
      },
+     "ResponseFormat": {
+       "oneOf": [
+         {
+           "$ref": "#/components/schemas/JsonSchemaResponseFormat"
+         },
+         {
+           "$ref": "#/components/schemas/GrammarResponseFormat"
+         }
+       ],
+       "discriminator": {
+         "propertyName": "type",
+         "mapping": {
+           "json_schema": "#/components/schemas/JsonSchemaResponseFormat",
+           "grammar": "#/components/schemas/GrammarResponseFormat"
+         }
+       }
+     },
      "SamplingParams": {
        "type": "object",
        "properties": {
@@ -2865,6 +2965,9 @@
        "tool_prompt_format": {
          "$ref": "#/components/schemas/ToolPromptFormat"
        },
+       "response_format": {
+         "$ref": "#/components/schemas/ResponseFormat"
+       },
        "logprobs": {
          "type": "object",
          "properties": {
@@ -2885,16 +2988,49 @@
      "BatchChatCompletionResponse": {
        "type": "object",
        "properties": {
-         "completion_message_batch": {
+         "batch": {
            "type": "array",
            "items": {
-             "$ref": "#/components/schemas/CompletionMessage"
+             "$ref": "#/components/schemas/ChatCompletionResponse"
            }
          }
        },
        "additionalProperties": false,
        "required": [
-         "completion_message_batch"
+         "batch"
        ]
      },
+     "ChatCompletionResponse": {
+       "type": "object",
+       "properties": {
+         "completion_message": {
+           "$ref": "#/components/schemas/CompletionMessage"
+         },
+         "logprobs": {
+           "type": "array",
+           "items": {
+             "$ref": "#/components/schemas/TokenLogProbs"
+           }
+         }
+       },
+       "additionalProperties": false,
+       "required": [
+         "completion_message"
+       ]
+     },
+     "TokenLogProbs": {
+       "type": "object",
+       "properties": {
+         "logprobs_by_token": {
+           "type": "object",
+           "additionalProperties": {
+             "type": "number"
+           }
+         }
+       },
+       "additionalProperties": false,
+       "required": [
+         "logprobs_by_token"
+       ]
+     },
      "BatchCompletionRequest": {
@@ -2912,6 +3048,9 @@
        "sampling_params": {
          "$ref": "#/components/schemas/SamplingParams"
        },
+       "response_format": {
+         "$ref": "#/components/schemas/ResponseFormat"
+       },
        "logprobs": {
          "type": "object",
          "properties": {
@@ -2932,18 +3071,41 @@
      "BatchCompletionResponse": {
        "type": "object",
        "properties": {
-         "completion_message_batch": {
+         "batch": {
            "type": "array",
            "items": {
-             "$ref": "#/components/schemas/CompletionMessage"
+             "$ref": "#/components/schemas/CompletionResponse"
            }
          }
        },
        "additionalProperties": false,
        "required": [
-         "completion_message_batch"
+         "batch"
        ]
      },
+     "CompletionResponse": {
+       "type": "object",
+       "properties": {
+         "content": {
+           "type": "string"
+         },
+         "stop_reason": {
+           "$ref": "#/components/schemas/StopReason"
+         },
+         "logprobs": {
+           "type": "array",
+           "items": {
+             "$ref": "#/components/schemas/TokenLogProbs"
+           }
+         }
+       },
+       "additionalProperties": false,
+       "required": [
+         "content",
+         "stop_reason"
+       ],
+       "title": "Completion response."
+     },
      "CancelTrainingJobRequest": {
        "type": "object",
        "properties": {
@@ -2956,135 +3118,46 @@
          "job_uuid"
        ]
      },
-     "GrammarResponseFormat": {
-       "type": "object",
-       "properties": {
-         "type": {
-           "type": "string",
-           "const": "grammar",
-           "default": "grammar"
-         },
-         "bnf": {
-           "type": "object",
-           "additionalProperties": {
-             "oneOf": [
-               {
-                 "type": "null"
-               },
-               {
-                 "type": "boolean"
-               },
-               {
-                 "type": "number"
-               },
-               {
-                 "type": "string"
-               },
-               {
-                 "type": "array"
-               },
-               {
-                 "type": "object"
-               }
-             ]
-           }
-         }
-       },
-       "additionalProperties": false,
-       "required": [
-         "type",
-         "bnf"
-       ]
-     },
-     "JsonSchemaResponseFormat": {
-       "type": "object",
-       "properties": {
-         "type": {
-           "type": "string",
-           "const": "json_schema",
-           "default": "json_schema"
-         },
-         "json_schema": {
-           "type": "object",
-           "additionalProperties": {
-             "oneOf": [
-               {
-                 "type": "null"
-               },
-               {
-                 "type": "boolean"
-               },
-               {
-                 "type": "number"
-               },
-               {
-                 "type": "string"
-               },
-               {
-                 "type": "array"
-               },
-               {
-                 "type": "object"
-               }
-             ]
-           }
-         }
-       },
-       "additionalProperties": false,
-       "required": [
-         "type",
-         "json_schema"
-       ]
-     },
-     "ResponseFormat": {
-       "oneOf": [
-         {
-           "$ref": "#/components/schemas/JsonSchemaResponseFormat"
-         },
-         {
-           "$ref": "#/components/schemas/GrammarResponseFormat"
-         }
-       ],
-       "discriminator": {
-         "propertyName": "type",
-         "mapping": {
-           "json_schema": "#/components/schemas/JsonSchemaResponseFormat",
-           "grammar": "#/components/schemas/GrammarResponseFormat"
-         }
-       }
-     },
      "ChatCompletionRequest": {
        "type": "object",
        "properties": {
          "model_id": {
-           "type": "string"
+           "type": "string",
+           "description": "The identifier of the model to use"
          },
          "messages": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/Message"
-           }
+           },
+           "description": "List of messages in the conversation"
          },
          "sampling_params": {
-           "$ref": "#/components/schemas/SamplingParams"
+           "$ref": "#/components/schemas/SamplingParams",
+           "description": "Parameters to control the sampling strategy"
          },
          "tools": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/ToolDefinition"
-           }
+           },
+           "description": "(Optional) List of tool definitions available to the model"
          },
          "tool_choice": {
-           "$ref": "#/components/schemas/ToolChoice"
+           "$ref": "#/components/schemas/ToolChoice",
+           "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto."
          },
          "tool_prompt_format": {
-           "$ref": "#/components/schemas/ToolPromptFormat"
+           "$ref": "#/components/schemas/ToolPromptFormat",
+           "description": "(Optional) Specifies how tool definitions are formatted when presenting to the model"
          },
          "response_format": {
-           "$ref": "#/components/schemas/ResponseFormat"
+           "$ref": "#/components/schemas/ResponseFormat",
+           "description": "(Optional) Grammar specification for guided (structured) decoding"
          },
          "stream": {
-           "type": "boolean"
+           "type": "boolean",
+           "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
          },
          "logprobs": {
            "type": "object",
@@ -3094,7 +3167,8 @@
              "default": 0
            }
          },
-         "additionalProperties": false
+         "additionalProperties": false,
+         "description": "(Optional) If specified, log probabilities for each token position will be returned."
        }
      },
      "additionalProperties": false,
@@ -3103,25 +3177,6 @@
        "messages"
      ]
    },
-   "ChatCompletionResponse": {
-     "type": "object",
-     "properties": {
-       "completion_message": {
-         "$ref": "#/components/schemas/CompletionMessage"
-       },
-       "logprobs": {
-         "type": "array",
-         "items": {
-           "$ref": "#/components/schemas/TokenLogProbs"
-         }
-       }
-     },
-     "additionalProperties": false,
-     "required": [
-       "completion_message"
-     ],
-     "title": "Chat completion response."
-   },
    "ChatCompletionResponseEvent": {
      "type": "object",
      "properties": {
@@ -3166,8 +3221,7 @@
      "additionalProperties": false,
      "required": [
        "event"
-     ],
-     "title": "SSE-stream of these events."
+     ]
    },
    "ContentDelta": {
      "oneOf": [
@@ -3227,21 +3281,6 @@
        "text"
      ]
    },
-   "TokenLogProbs": {
-     "type": "object",
-     "properties": {
-       "logprobs_by_token": {
-         "type": "object",
-         "additionalProperties": {
-           "type": "number"
-         }
-       }
-     },
-     "additionalProperties": false,
-     "required": [
-       "logprobs_by_token"
-     ]
-   },
    "ToolCallDelta": {
      "type": "object",
      "properties": {
@@ -3284,19 +3323,24 @@
      "type": "object",
      "properties": {
        "model_id": {
-         "type": "string"
+         "type": "string",
+         "description": "The identifier of the model to use"
        },
        "content": {
-         "$ref": "#/components/schemas/InterleavedContent"
+         "$ref": "#/components/schemas/InterleavedContent",
+         "description": "The content to generate a completion for"
        },
        "sampling_params": {
-         "$ref": "#/components/schemas/SamplingParams"
+         "$ref": "#/components/schemas/SamplingParams",
+         "description": "(Optional) Parameters to control the sampling strategy"
        },
        "response_format": {
-         "$ref": "#/components/schemas/ResponseFormat"
+         "$ref": "#/components/schemas/ResponseFormat",
+         "description": "(Optional) Grammar specification for guided (structured) decoding"
        },
        "stream": {
-         "type": "boolean"
+         "type": "boolean",
+         "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
        },
        "logprobs": {
          "type": "object",
@@ -3306,7 +3350,8 @@
            "default": 0
          }
        },
-       "additionalProperties": false
+       "additionalProperties": false,
+       "description": "(Optional) If specified, log probabilities for each token position will be returned."
      }
    },
    "additionalProperties": false,
@@ -3315,29 +3360,6 @@
      "content"
    ]
  },
- "CompletionResponse": {
-   "type": "object",
-   "properties": {
-     "content": {
-       "type": "string"
-     },
-     "stop_reason": {
-       "$ref": "#/components/schemas/StopReason"
-     },
-     "logprobs": {
-       "type": "array",
-       "items": {
-         "$ref": "#/components/schemas/TokenLogProbs"
-       }
-     }
-   },
-   "additionalProperties": false,
-   "required": [
-     "content",
-     "stop_reason"
-   ],
-   "title": "Completion response."
- },
  "CompletionResponseStreamChunk": {
    "type": "object",
    "properties": {
@@ -4241,13 +4263,15 @@
    "type": "object",
    "properties": {
      "model_id": {
-       "type": "string"
+       "type": "string",
+       "description": "The identifier of the model to use"
      },
      "contents": {
        "type": "array",
        "items": {
          "$ref": "#/components/schemas/InterleavedContent"
-       }
+       },
+       "description": "List of contents to generate embeddings for. Note that content can be multimodal."
      }
    },
    "additionalProperties": false,
@@ -7863,7 +7887,7 @@
    },
    {
      "name": "ChatCompletionResponse",
-     "description": "Chat completion response."
+     "description": ""
    },
    {
      "name": "ChatCompletionResponseEvent",
@@ -7875,7 +7899,7 @@
    },
    {
      "name": "ChatCompletionResponseStreamChunk",
-     "description": "SSE-stream of these events."
+     "description": ""
    },
    {
      "name": "Checkpoint",


@@ -1,11 +1,12 @@
----
openapi: 3.1.0
info:
  title: Llama Stack Specification
  version: v1
-  description: "This is the specification of the Llama Stack that provides\n    \
-    \ a set of endpoints and their corresponding interfaces that are tailored to\n\
-    \    best leverage Llama Models."
+  description: >-
+    This is the specification of the Llama Stack that provides
+    a set of endpoints and their corresponding interfaces that are
+    tailored to
+    best leverage Llama Models.
servers:
- url: http://any-hosted-llama-stack.com
paths:
@@ -108,7 +109,9 @@ paths:
    post:
      responses:
        '200':
-         description: Chat completion response. **OR** SSE-stream of these events.
+         description: >-
+           If stream=False, returns a ChatCompletionResponse with the full completion.
+           If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk
          content:
            text/event-stream:
              schema:
@@ -117,6 +120,8 @@ paths:
                - $ref: '#/components/schemas/ChatCompletionResponseStreamChunk'
      tags:
        - Inference
+     summary: >-
+       Generate a chat completion for the given messages using the specified model.
      parameters: []
      requestBody:
        content:
@@ -128,7 +133,9 @@ paths:
    post:
      responses:
        '200':
-         description: Completion response. **OR** streamed completion response.
+         description: >-
+           If stream=False, returns a CompletionResponse with the full completion.
+           If stream=True, returns an SSE event stream of CompletionResponseStreamChunk
          content:
            text/event-stream:
              schema:
@@ -137,6 +144,8 @@ paths:
                - $ref: '#/components/schemas/CompletionResponseStreamChunk'
      tags:
        - Inference
+     summary: >-
+       Generate a completion for the given content using the specified model.
      parameters: []
      requestBody:
        content:
@@ -189,8 +198,9 @@ paths:
    post:
      responses:
        '200':
-         description: A single turn in an interaction with an Agentic System. **OR**
-           streamed agent turn completion response.
+         description: >-
+           A single turn in an interaction with an Agentic System. **OR** streamed
+           agent turn completion response.
          content:
            text/event-stream:
              schema:
@@ -279,13 +289,17 @@ paths:
    post:
      responses:
        '200':
-         description: OK
+         description: >-
+           An array of embeddings, one for each content. Each embedding is a list
+           of floats.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/EmbeddingsResponse'
      tags:
        - Inference
+     summary: >-
+       Generate embeddings for content pieces using the specified model.
      parameters: []
      requestBody:
        content:
@@ -709,7 +723,8 @@ paths:
          description: OK
      tags:
        - ToolRuntime
-     summary: Index documents so they can be used by the RAG system
+     summary: >-
+       Index documents so they can be used by the RAG system
      parameters: []
      requestBody:
        content:
@@ -1109,7 +1124,8 @@ paths:
                $ref: '#/components/schemas/RAGQueryResult'
      tags:
        - ToolRuntime
-     summary: Query the RAG system for context; typically invoked by the agent
+     summary: >-
+       Query the RAG system for context; typically invoked by the agent
      parameters: []
      requestBody:
        content:
@@ -1341,7 +1357,8 @@ paths:
      tags:
        - Inspect
      parameters: []
-jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema
+jsonSchemaDialect: >-
+  https://json-schema.org/draft/2020-12/schema
components:
  schemas:
    AppendRowsRequest:
@@ -1393,6 +1410,27 @@ components:
        - content
        - stop_reason
        - tool_calls
+   GrammarResponseFormat:
+     type: object
+     properties:
+       type:
+         type: string
+         const: grammar
+         default: grammar
+       bnf:
+         type: object
+         additionalProperties:
+           oneOf:
+             - type: 'null'
+             - type: boolean
+             - type: number
+             - type: string
+             - type: array
+             - type: object
+     additionalProperties: false
+     required:
+       - type
+       - bnf
    GreedySamplingStrategy:
      type: object
      properties:
@@ -1439,6 +1477,27 @@ components:
        mapping:
          image: '#/components/schemas/ImageContentItem'
          text: '#/components/schemas/TextContentItem'
+   JsonSchemaResponseFormat:
+     type: object
+     properties:
+       type:
+         type: string
+         const: json_schema
+         default: json_schema
+       json_schema:
+         type: object
+         additionalProperties:
+           oneOf:
+             - type: 'null'
+             - type: boolean
+             - type: number
+             - type: string
+             - type: array
+             - type: object
+     additionalProperties: false
+     required:
+       - type
+       - json_schema
    Message:
      oneOf:
        - $ref: '#/components/schemas/UserMessage'
@@ -1452,6 +1511,15 @@ components:
          system: '#/components/schemas/SystemMessage'
          tool: '#/components/schemas/ToolResponseMessage'
          assistant: '#/components/schemas/CompletionMessage'
+   ResponseFormat:
+     oneOf:
+       - $ref: '#/components/schemas/JsonSchemaResponseFormat'
+       - $ref: '#/components/schemas/GrammarResponseFormat'
+     discriminator:
+       propertyName: type
+       mapping:
+         json_schema: '#/components/schemas/JsonSchemaResponseFormat'
+         grammar: '#/components/schemas/GrammarResponseFormat'
    SamplingParams:
      type: object
      properties:
@@ -1594,16 +1662,28 @@ components:
        - json
        - function_tag
        - python_list
-     title: This Enum refers to the prompt format for calling custom / zero shot
-       tools
-     description: "`json` --\n    Refers to the json format for calling tools.\n\
-       \    The json format takes the form like\n    {\n        \"type\": \"function\",\n\
-       \        \"function\" : {\n            \"name\": \"function_name\",\n     \
-       \       \"description\": \"function_description\",\n            \"parameters\"\
-       : {...}\n        }\n    }\n\n`function_tag` --\n    This is an example of\
-       \ how you could define\n    your own user defined format for making tool calls.\n\
-       \    The function_tag format looks like this,\n    <function=function_name>(parameters)</function>\n\
-       \nThe detailed prompts for each of these formats are added to llama cli"
+     title: >-
+       This Enum refers to the prompt format for calling custom / zero shot tools
+     description: >-
+       `json` --
+       Refers to the json format for calling tools.
+       The json format takes the form like
+       {
+           "type": "function",
+           "function" : {
+               "name": "function_name",
+               "description": "function_description",
+               "parameters": {...}
+           }
+       }
+       `function_tag` --
+       This is an example of how you could define
+       your own user defined format for making tool calls.
+       The function_tag format looks like this,
+       <function=function_name>(parameters)</function>
+       The detailed prompts for each of these formats are added to llama cli
    ToolResponseMessage:
      type: object
      properties:
@@ -1697,6 +1777,8 @@ components:
          $ref: '#/components/schemas/ToolChoice'
        tool_prompt_format:
          $ref: '#/components/schemas/ToolPromptFormat'
+       response_format:
+         $ref: '#/components/schemas/ResponseFormat'
        logprobs:
          type: object
          properties:
@@ -1711,13 +1793,35 @@ components:
    BatchChatCompletionResponse:
      type: object
      properties:
-       completion_message_batch:
+       batch:
          type: array
          items:
-           $ref: '#/components/schemas/CompletionMessage'
+           $ref: '#/components/schemas/ChatCompletionResponse'
      additionalProperties: false
      required:
-       - completion_message_batch
+       - batch
+   ChatCompletionResponse:
+     type: object
+     properties:
+       completion_message:
+         $ref: '#/components/schemas/CompletionMessage'
+       logprobs:
+         type: array
+         items:
+           $ref: '#/components/schemas/TokenLogProbs'
+     additionalProperties: false
+     required:
+       - completion_message
+   TokenLogProbs:
+     type: object
+     properties:
+       logprobs_by_token:
+         type: object
+         additionalProperties:
+           type: number
+     additionalProperties: false
+     required:
+       - logprobs_by_token
    BatchCompletionRequest:
      type: object
      properties:
@@ -1729,6 +1833,8 @@ components:
        $ref: '#/components/schemas/InterleavedContent'
      sampling_params:
        $ref: '#/components/schemas/SamplingParams'
+     response_format:
+       $ref: '#/components/schemas/ResponseFormat'
      logprobs:
        type: object
        properties:
@@ -1743,13 +1849,29 @@ components:
    BatchCompletionResponse:
      type: object
      properties:
-       completion_message_batch:
+       batch:
          type: array
          items:
-           $ref: '#/components/schemas/CompletionMessage'
+           $ref: '#/components/schemas/CompletionResponse'
      additionalProperties: false
      required:
-       - completion_message_batch
+       - batch
+   CompletionResponse:
+     type: object
+     properties:
+       content:
+         type: string
+       stop_reason:
+         $ref: '#/components/schemas/StopReason'
+       logprobs:
+         type: array
+         items:
+           $ref: '#/components/schemas/TokenLogProbs'
+     additionalProperties: false
+     required:
+       - content
+       - stop_reason
+     title: Completion response.
    CancelTrainingJobRequest:
      type: object
      properties:
@@ -1758,80 +1880,45 @@ components:
      additionalProperties: false
      required:
        - job_uuid
-   GrammarResponseFormat:
-     type: object
-     properties:
-       type:
-         type: string
-         const: grammar
-         default: grammar
-       bnf:
-         type: object
-         additionalProperties:
-           oneOf:
-             - type: 'null'
-             - type: boolean
-             - type: number
-             - type: string
-             - type: array
-             - type: object
-     additionalProperties: false
-     required:
-       - type
-       - bnf
-   JsonSchemaResponseFormat:
-     type: object
-     properties:
-       type:
-         type: string
-         const: json_schema
-         default: json_schema
-       json_schema:
-         type: object
-         additionalProperties:
-           oneOf:
-             - type: 'null'
-             - type: boolean
-             - type: number
-             - type: string
-             - type: array
-             - type: object
-     additionalProperties: false
-     required:
-       - type
-       - json_schema
-   ResponseFormat:
-     oneOf:
-       - $ref: '#/components/schemas/JsonSchemaResponseFormat'
-       - $ref: '#/components/schemas/GrammarResponseFormat'
-     discriminator:
-       propertyName: type
-       mapping:
-         json_schema: '#/components/schemas/JsonSchemaResponseFormat'
-         grammar: '#/components/schemas/GrammarResponseFormat'
    ChatCompletionRequest:
      type: object
      properties:
        model_id:
          type: string
+         description: The identifier of the model to use
        messages:
          type: array
          items:
            $ref: '#/components/schemas/Message'
+         description: List of messages in the conversation
        sampling_params:
          $ref: '#/components/schemas/SamplingParams'
+         description: >-
+           Parameters to control the sampling strategy
        tools:
          type: array
          items:
            $ref: '#/components/schemas/ToolDefinition'
+         description: >-
+           (Optional) List of tool definitions available to the model
        tool_choice:
          $ref: '#/components/schemas/ToolChoice'
+         description: >-
+           (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
        tool_prompt_format:
          $ref: '#/components/schemas/ToolPromptFormat'
+         description: >-
+           (Optional) Specifies how tool definitions are formatted when presenting
+           to the model
        response_format:
          $ref: '#/components/schemas/ResponseFormat'
+         description: >-
+           (Optional) Grammar specification for guided (structured) decoding
        stream:
          type: boolean
+         description: >-
+           (Optional) If True, generate an SSE event stream of the response. Defaults
+           to False.
        logprobs:
          type: object
          properties:
@@ -1839,23 +1926,13 @@ components:
            type: integer
            default: 0
        additionalProperties: false
+       description: >-
+         (Optional) If specified, log probabilities for each token position will
+         be returned.
    additionalProperties: false
    required:
      - model_id
      - messages
- ChatCompletionResponse:
-   type: object
-   properties:
-     completion_message:
-       $ref: '#/components/schemas/CompletionMessage'
-     logprobs:
-       type: array
-       items:
-         $ref: '#/components/schemas/TokenLogProbs'
-   additionalProperties: false
-   required:
-     - completion_message
-   title: Chat completion response.
  ChatCompletionResponseEvent:
    type: object
    properties:
@@ -1888,7 +1965,6 @@ components:
      additionalProperties: false
      required:
        - event
-     title: SSE-stream of these events.
    ContentDelta:
      oneOf:
        - $ref: '#/components/schemas/TextDelta'
@@ -1927,16 +2003,6 @@ components:
      required:
        - type
        - text
-   TokenLogProbs:
-     type: object
-     properties:
-       logprobs_by_token:
-         type: object
-         additionalProperties:
-           type: number
-     additionalProperties: false
-     required:
-       - logprobs_by_token
    ToolCallDelta:
      type: object
      properties:
@@ -1967,14 +2033,23 @@ components:
      properties:
        model_id:
          type: string
+         description: The identifier of the model to use
        content:
          $ref: '#/components/schemas/InterleavedContent'
+         description: The content to generate a completion for
        sampling_params:
          $ref: '#/components/schemas/SamplingParams'
+         description: >-
+           (Optional) Parameters to control the sampling strategy
        response_format:
          $ref: '#/components/schemas/ResponseFormat'
+         description: >-
+           (Optional) Grammar specification for guided (structured) decoding
        stream:
          type: boolean
+         description: >-
+           (Optional) If True, generate an SSE event stream of the response. Defaults
+           to False.
        logprobs:
          type: object
          properties:
@@ -1982,26 +2057,13 @@ components:
            type: integer
            default: 0
        additionalProperties: false
+       description: >-
+         (Optional) If specified, log probabilities for each token position will
+         be returned.
    additionalProperties: false
    required:
      - model_id
      - content
- CompletionResponse:
-   type: object
-   properties:
-     content:
-       type: string
-     stop_reason:
-       $ref: '#/components/schemas/StopReason'
-     logprobs:
-       type: array
-       items:
-         $ref: '#/components/schemas/TokenLogProbs'
-   additionalProperties: false
-   required:
-     - content
-     - stop_reason
-   title: Completion response.
  CompletionResponseStreamChunk:
    type: object
    properties:
@@ -2558,7 +2620,8 @@ components:
        - output_message
        - output_attachments
        - started_at
-     title: A single turn in an interaction with an Agentic System.
+     title: >-
+       A single turn in an interaction with an Agentic System.
    ViolationLevel:
      type: string
      enum:
@@ -2570,10 +2633,14 @@ components:
    properties:
      model_id:
        type: string
+       description: The identifier of the model to use
      contents:
        type: array
        items:
          $ref: '#/components/schemas/InterleavedContent'
+       description: >-
+         List of contents to generate embeddings for. Note that content can be
+         multimodal.
    additionalProperties: false
    required:
      - model_id
@@ -2845,7 +2912,8 @@ components:
        - session_name
        - turns
        - started_at
-     title: A single session of an interaction with an Agentic System.
+     title: >-
+       A single session of an interaction with an Agentic System.
    AgentStepResponse:
      type: object
      properties:
@@ -3194,7 +3262,8 @@ components:
        - provider_resource_id
        - provider_id
        - type
-     title: A safety shield resource that can be used to check content
+     title: >-
+       A safety shield resource that can be used to check content
    Span:
      type: object
      properties:
@@ -4684,8 +4753,9 @@ components:
      additionalProperties: false
      required:
        - synthetic_data
-     title: Response from the synthetic data generation. Batch of (prompt, response,
-       score) tuples that pass the threshold.
+     title: >-
+       Response from the synthetic data generation. Batch of (prompt, response, score)
+       tuples that pass the threshold.
    VersionInfo:
      type: object
      properties:
@@ -4763,13 +4833,13 @@ tags:
- name: ChatCompletionRequest
  description: ''
- name: ChatCompletionResponse
-  description: Chat completion response.
+  description: ''
- name: ChatCompletionResponseEvent
  description: Chat completion response event.
- name: ChatCompletionResponseEventType
  description: ''
- name: ChatCompletionResponseStreamChunk
-  description: SSE-stream of these events.
+  description: ''
- name: Checkpoint
  description: Checkpoint created during training runs
- name: CompletionInputType
@@ -4998,9 +5068,11 @@ tags:
- name: ScoringResult
  description: ''
- name: Session
-  description: A single session of an interaction with an Agentic System.
+  description: >-
+    A single session of an interaction with an Agentic System.
- name: Shield
-  description: A safety shield resource that can be used to check content
+  description: >-
+    A safety shield resource that can be used to check content
- name: ShieldCallStep
  description: ''
- name: Shields
@@ -5028,8 +5100,9 @@ tags:
  description: ''
- name: SyntheticDataGeneration (Coming Soon)
- name: SyntheticDataGenerationResponse
-  description: Response from the synthetic data generation. Batch of (prompt, response,
-    score) tuples that pass the threshold.
+  description: >-
+    Response from the synthetic data generation. Batch of (prompt, response, score)
+    tuples that pass the threshold.
- name: SystemMessage
  description: ''
- name: Telemetry
@@ -5067,15 +5140,29 @@ tags:
- name: ToolParameter
  description: ''
- name: ToolPromptFormat
-  description: "This Enum refers to the prompt format for calling custom / zero\
-    \ shot tools\n\n`json` --\n    Refers to the json format for calling tools.\n\
-    \    The json format takes the form like\n    {\n        \"type\": \"function\",\n\
-    \        \"function\" : {\n            \"name\": \"function_name\",\n       \
-    \     \"description\": \"function_description\",\n            \"parameters\"\
-    : {...}\n        }\n    }\n\n`function_tag` --\n    This is an example of how\
-    \ you could define\n    your own user defined format for making tool calls.\n\
-    \    The function_tag format looks like this,\n    <function=function_name>(parameters)</function>\n\
-    \nThe detailed prompts for each of these formats are added to llama cli"
+  description: >-
+    This Enum refers to the prompt format for calling custom / zero shot tools
+    `json` --
+    Refers to the json format for calling tools.
+    The json format takes the form like
+    {
+        "type": "function",
+        "function" : {
+            "name": "function_name",
+            "description": "function_description",
+            "parameters": {...}
+        }
+    }
+    `function_tag` --
+    This is an example of how you could define
+    your own user defined format for making tool calls.
+    The function_tag format looks like this,
+    <function=function_name>(parameters)</function>
+    The detailed prompts for each of these formats are added to llama cli
- name: ToolResponse
  description: ''
- name: ToolResponseMessage
@@ -5090,7 +5177,8 @@ tags:
- name: TrainingConfig
  description: ''
- name: Turn
-  description: A single turn in an interaction with an Agentic System.
+  description: >-
+    A single turn in an interaction with an Agentic System.
- name: URL
  description: ''
- name: UnionType
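The rewritten descriptions pin down the streaming contract: with stream=False the endpoint returns one JSON ChatCompletionResponse, with stream=True it returns an SSE stream of ChatCompletionResponseStreamChunk events. A hypothetical client sketch against that contract (base URL, versioned path prefix, and payload fields are assumptions based on the spec above):

import json

import httpx

BASE = "http://any-hosted-llama-stack.com"

def chat_completion(messages, stream=False):
    body = {"model_id": "my-model", "messages": messages, "stream": stream}
    url = f"{BASE}/v1/inference/chat-completion"  # version prefix assumed
    if not stream:
        # stream=False: a single ChatCompletionResponse object
        return httpx.post(url, json=body).json()
    # stream=True: text/event-stream of ChatCompletionResponseStreamChunk
    chunks = []
    with httpx.stream("POST", url, json=body) as resp:
        for line in resp.iter_lines():
            if line.startswith("data: "):
                chunks.append(json.loads(line[len("data: "):]))
    return chunks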


@@ -7,13 +7,15 @@
from typing import List, Optional, Protocol, runtime_checkable

from llama_models.schema_utils import json_schema_type, webmethod
-from pydantic import BaseModel, Field
+from pydantic import BaseModel

from llama_stack.apis.inference import (
-    CompletionMessage,
+    ChatCompletionResponse,
+    CompletionResponse,
    InterleavedContent,
    LogProbConfig,
    Message,
+    ResponseFormat,
    SamplingParams,
    ToolChoice,
    ToolDefinition,
@@ -21,35 +23,14 @@ from llama_stack.apis.inference import (
)

-@json_schema_type
-class BatchCompletionRequest(BaseModel):
-    model: str
-    content_batch: List[InterleavedContent]
-    sampling_params: Optional[SamplingParams] = SamplingParams()
-    logprobs: Optional[LogProbConfig] = None

@json_schema_type
class BatchCompletionResponse(BaseModel):
-    completion_message_batch: List[CompletionMessage]
+    batch: List[CompletionResponse]

-@json_schema_type
-class BatchChatCompletionRequest(BaseModel):
-    model: str
-    messages_batch: List[List[Message]]
-    sampling_params: Optional[SamplingParams] = SamplingParams()
-
-    # zero-shot tool definitions as input to the model
-    tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
-    tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
-    tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
-    logprobs: Optional[LogProbConfig] = None

@json_schema_type
class BatchChatCompletionResponse(BaseModel):
-    completion_message_batch: List[CompletionMessage]
+    batch: List[ChatCompletionResponse]

@runtime_checkable
@@ -60,6 +41,7 @@ class BatchInference(Protocol):
        model: str,
        content_batch: List[InterleavedContent],
        sampling_params: Optional[SamplingParams] = SamplingParams(),
+        response_format: Optional[ResponseFormat] = None,
        logprobs: Optional[LogProbConfig] = None,
    ) -> BatchCompletionResponse: ...

@@ -73,5 +55,6 @@ class BatchInference(Protocol):
        tools: Optional[List[ToolDefinition]] = list,
        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
        tool_prompt_format: Optional[ToolPromptFormat] = None,
+        response_format: Optional[ResponseFormat] = None,
        logprobs: Optional[LogProbConfig] = None,
    ) -> BatchChatCompletionResponse: ...
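Note the element type changed along with the field name: batch now holds full response objects (with optional per-item logprobs) rather than bare CompletionMessages. A rough sketch of the new shape, assuming the imports above:

from llama_stack.apis.inference import CompletionResponse, StopReason

item = CompletionResponse(content="Hello!", stop_reason=StopReason.end_of_turn)
resp = BatchCompletionResponse(batch=[item])  # was: completion_message_batch
print(resp.batch[0].logprobs)  # per-item logprobs now ride along (None here)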


@@ -186,7 +186,6 @@ ResponseFormat = register_schema(
)

-@json_schema_type
class CompletionRequest(BaseModel):
    model: str
    content: InterleavedContent
@@ -215,23 +214,6 @@ class CompletionResponseStreamChunk(BaseModel):
    logprobs: Optional[List[TokenLogProbs]] = None

-@json_schema_type
-class BatchCompletionRequest(BaseModel):
-    model: str
-    content_batch: List[InterleavedContent]
-    sampling_params: Optional[SamplingParams] = SamplingParams()
-    response_format: Optional[ResponseFormat] = None
-    logprobs: Optional[LogProbConfig] = None
-
-
-@json_schema_type
-class BatchCompletionResponse(BaseModel):
-    """Batch completion response."""
-
-    batch: List[CompletionResponse]

@json_schema_type
class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[Message]
@@ -249,37 +231,15 @@ class ChatCompletionRequest(BaseModel):

@json_schema_type
class ChatCompletionResponseStreamChunk(BaseModel):
-    """SSE-stream of these events."""
-
    event: ChatCompletionResponseEvent

@json_schema_type
class ChatCompletionResponse(BaseModel):
-    """Chat completion response."""
-
    completion_message: CompletionMessage
    logprobs: Optional[List[TokenLogProbs]] = None

-@json_schema_type
-class BatchChatCompletionRequest(BaseModel):
-    model: str
-    messages_batch: List[List[Message]]
-    sampling_params: Optional[SamplingParams] = SamplingParams()
-
-    # zero-shot tool definitions as input to the model
-    tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
-    tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
-    tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
-    logprobs: Optional[LogProbConfig] = None
-
-
-@json_schema_type
-class BatchChatCompletionResponse(BaseModel):
-    batch: List[ChatCompletionResponse]

@json_schema_type
class EmbeddingsResponse(BaseModel):
    embeddings: List[List[float]]
@@ -303,7 +263,19 @@ class Inference(Protocol):
        response_format: Optional[ResponseFormat] = None,
        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
-    ) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]: ...
+    ) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]:
+        """Generate a completion for the given content using the specified model.
+
+        :param model_id: The identifier of the model to use
+        :param content: The content to generate a completion for
+        :param sampling_params: (Optional) Parameters to control the sampling strategy
+        :param response_format: (Optional) Grammar specification for guided (structured) decoding
+        :param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
+        :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
+        :returns: If stream=False, returns a CompletionResponse with the full completion.
+                 If stream=True, returns an SSE event stream of CompletionResponseStreamChunk
+        """
+        ...

    @webmethod(route="/inference/chat-completion", method="POST")
    async def chat_completion(
@@ -311,7 +283,6 @@ class Inference(Protocol):
        model_id: str,
        messages: List[Message],
        sampling_params: Optional[SamplingParams] = SamplingParams(),
-        # zero-shot tool definitions as input to the model
        tools: Optional[List[ToolDefinition]] = None,
        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
        tool_prompt_format: Optional[ToolPromptFormat] = None,
@@ -320,11 +291,33 @@ class Inference(Protocol):
        logprobs: Optional[LogProbConfig] = None,
    ) -> Union[
        ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]
-    ]: ...
+    ]:
+        """Generate a chat completion for the given messages using the specified model.
+
+        :param model_id: The identifier of the model to use
+        :param messages: List of messages in the conversation
+        :param sampling_params: Parameters to control the sampling strategy
+        :param tools: (Optional) List of tool definitions available to the model
+        :param tool_choice: (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
+        :param tool_prompt_format: (Optional) Specifies how tool definitions are formatted when presenting to the model
+        :param response_format: (Optional) Grammar specification for guided (structured) decoding
+        :param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
+        :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
+        :returns: If stream=False, returns a ChatCompletionResponse with the full completion.
+                 If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk
+        """
+        ...

    @webmethod(route="/inference/embeddings", method="POST")
    async def embeddings(
        self,
        model_id: str,
        contents: List[InterleavedContent],
-    ) -> EmbeddingsResponse: ...
+    ) -> EmbeddingsResponse:
+        """Generate embeddings for content pieces using the specified model.
+
+        :param model_id: The identifier of the model to use
+        :param contents: List of contents to generate embeddings for. Note that content can be multimodal.
+        :returns: An array of embeddings, one for each content. Each embedding is a list of floats.
+        """
+        ...
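These docstrings are what the generator mines for doc_params when it builds the request dataclasses above. The project's actual extraction lives in its docstring tooling; a minimal stand-in parser for the ReST :param convention could look like this (hypothetical helper, not the repository's code):

import inspect
import re
from typing import Dict

def parse_doc_params(func) -> Dict[str, str]:
    """Collect :param name: descriptions from a ReST-style docstring."""
    doc = inspect.getdoc(func) or ""
    params: Dict[str, str] = {}
    current = None
    for line in doc.splitlines():
        match = re.match(r"\s*:param\s+(\w+):\s*(.*)", line)
        if match:
            current = match.group(1)
            params[current] = match.group(2)
        elif current and line.strip() and not line.lstrip().startswith(":"):
            params[current] += " " + line.strip()  # wrapped continuation line
        else:
            current = None
    return params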