Update OpenAPI generator to add param and field documentation

Ashwin Bharambe 2025-01-28 12:27:21 -08:00
parent 9f709387e2
commit ebfa8ad4fb
7 changed files with 525 additions and 397 deletions

View file

@ -36,6 +36,16 @@ from .pyopenapi.specification import Info, Server # noqa: E402
from .pyopenapi.utility import Specification # noqa: E402
def str_presenter(dumper, data):
    if data.startswith(f"/{LLAMA_STACK_API_VERSION}") or data.startswith(
        "#/components/schemas/"
    ):
        style = None
    else:
        style = ">" if "\n" in data or len(data) > 40 else None
    return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=style)


def main(output_dir: str):
    output_dir = Path(output_dir)
    if not output_dir.exists():
@ -69,7 +79,8 @@ def main(output_dir: str):
        y.sequence_dash_offset = 2
        y.width = 80
        y.allow_unicode = True
        y.explicit_start = True
        y.representer.add_representer(str, str_presenter)
        y.dump(
            spec.get_json(),
            fp,

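As a quick illustration of the presenter's effect, here is a minimal standalone sketch; it assumes only ruamel.yaml (already a generator dependency), and the sample mapping is made up:

import io

from ruamel.yaml import YAML

def str_presenter(dumper, data):
    # Fold long or multi-line strings into block scalars; short strings stay inline.
    style = ">" if "\n" in data or len(data) > 40 else None
    return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=style)

y = YAML()
y.width = 80
y.representer.add_representer(str, str_presenter)

long_text = (
    "Generate a chat completion for the given messages using the specified model."
)
buf = io.StringIO()
y.dump({"summary": long_text}, buf)
print(buf.getvalue())
# summary: >-
#   Generate a chat completion for the given messages using the specified model.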
View file

@ -8,6 +8,7 @@ import collections
import hashlib
import ipaddress
import typing
from dataclasses import field, make_dataclass
from typing import Any, Dict, Set, Union
from ..strong_typing.core import JsonType
@ -276,6 +277,20 @@ class StatusResponse:
    examples: List[Any] = dataclasses.field(default_factory=list)


def create_docstring_for_request(
    request_name: str, fields: List[Tuple[str, type]], doc_params: Dict[str, str]
) -> str:
    """Creates a ReST-style docstring for a dynamically generated request dataclass."""
    lines = ["\n"]  # Short description
    # Add parameter documentation in ReST format
    for name, type_ in fields:
        desc = doc_params.get(name, "")
        lines.append(f":param {name}: {desc}")
    return "\n".join(lines)
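Here `doc_params` is expected to map parameter names to their descriptions, parsed out of the endpoint's ReST docstring. A hand-rolled sketch of building such a mapping (illustrative only; the generator relies on its strong_typing docstring parser, not this regex):

import re
from typing import Dict

def parse_param_docs(docstring: str) -> Dict[str, str]:
    # Collect ":param name: description" pairs from a ReST docstring.
    return {
        m.group(1): m.group(2).strip()
        for m in re.finditer(r":param (\w+): (.*)", docstring or "")
    }

doc = """Generate a chat completion for the given messages using the specified model.

:param model_id: The identifier of the model to use
:param messages: List of messages in the conversation
"""
print(parse_param_docs(doc))
# {'model_id': 'The identifier of the model to use',
#  'messages': 'List of messages in the conversation'}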
class ResponseBuilder:
    content_builder: ContentBuilder
@ -493,11 +508,24 @@ class Generator:
first = next(iter(op.request_params))
request_name, request_type = first
from dataclasses import make_dataclass
op_name = "".join(word.capitalize() for word in op.name.split("_"))
request_name = f"{op_name}Request"
request_type = make_dataclass(request_name, op.request_params)
fields = [(name, type_) for name, type_ in op.request_params]
request_type = make_dataclass(
    request_name,
    fields,
    namespace={
        "__doc__": create_docstring_for_request(
            request_name, fields, doc_params
        )
    },
)
requestBody = RequestBody(
content={

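Putting the two pieces together, a self-contained sketch (field names and descriptions are illustrative, and the helper is inlined) of how the generator now builds a documented request dataclass:

from dataclasses import make_dataclass
from typing import Dict, List, Tuple

def create_docstring_for_request(
    request_name: str, fields: List[Tuple[str, type]], doc_params: Dict[str, str]
) -> str:
    # Same shape as the generator's helper: one ":param" line per field.
    lines = ["\n"]
    for name, _type in fields:
        lines.append(f":param {name}: {doc_params.get(name, '')}")
    return "\n".join(lines)

fields = [("model_id", str), ("stream", bool)]
doc_params = {"model_id": "The identifier of the model to use"}

ChatCompletionRequest = make_dataclass(
    "ChatCompletionRequest",
    fields,
    namespace={
        "__doc__": create_docstring_for_request(
            "ChatCompletionRequest", fields, doc_params
        )
    },
)
print(ChatCompletionRequest.__doc__)
# :param model_id: The identifier of the model to use
# :param stream: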
View file

@ -531,6 +531,7 @@ class JsonSchemaGenerator:
# add property docstring if available
property_doc = property_docstrings.get(property_name)
if property_doc:
    # print(output_name, property_doc)
    property_def.pop("title", None)
    property_def["description"] = property_doc

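An illustrative reduction of what this branch does to a single property definition (the dicts are made up; this is not the JsonSchemaGenerator internals):

property_def = {"type": "string", "title": "ModelId"}
property_doc = "The identifier of the model to use"

if property_doc:
    # Prefer the human-written docstring over the auto-generated title.
    property_def.pop("title", None)
    property_def["description"] = property_doc

print(property_def)
# {'type': 'string', 'description': 'The identifier of the model to use'}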
View file

@ -190,7 +190,7 @@
"post": {
"responses": {
"200": {
"description": "Chat completion response. **OR** SSE-stream of these events.",
"description": "If stream=False, returns a ChatCompletionResponse with the full completion. If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk",
"content": {
"text/event-stream": {
"schema": {
@ -210,6 +210,7 @@
"tags": [
"Inference"
],
"summary": "Generate a chat completion for the given messages using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@ -227,7 +228,7 @@
"post": {
"responses": {
"200": {
"description": "Completion response. **OR** streamed completion response.",
"description": "If stream=False, returns a CompletionResponse with the full completion. If stream=True, returns an SSE event stream of CompletionResponseStreamChunk",
"content": {
"text/event-stream": {
"schema": {
@ -247,6 +248,7 @@
"tags": [
"Inference"
],
"summary": "Generate a completion for the given content using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@ -485,7 +487,7 @@
"post": {
"responses": {
"200": {
"description": "OK",
"description": "An array of embeddings, one for each content. Each embedding is a list of floats.",
"content": {
"application/json": {
"schema": {
@ -498,6 +500,7 @@
"tags": [
"Inference"
],
"summary": "Generate embeddings for content pieces using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@ -2372,6 +2375,46 @@
"tool_calls"
]
},
"GrammarResponseFormat": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "grammar",
"default": "grammar"
},
"bnf": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
}
},
"additionalProperties": false,
"required": [
"type",
"bnf"
]
},
"GreedySamplingStrategy": {
"type": "object",
"properties": {
@ -2447,6 +2490,46 @@
}
}
},
"JsonSchemaResponseFormat": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "json_schema",
"default": "json_schema"
},
"json_schema": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
}
},
"additionalProperties": false,
"required": [
"type",
"json_schema"
]
},
"Message": {
"oneOf": [
{
@ -2472,6 +2555,23 @@
}
}
},
"ResponseFormat": {
"oneOf": [
{
"$ref": "#/components/schemas/JsonSchemaResponseFormat"
},
{
"$ref": "#/components/schemas/GrammarResponseFormat"
}
],
"discriminator": {
"propertyName": "type",
"mapping": {
"json_schema": "#/components/schemas/JsonSchemaResponseFormat",
"grammar": "#/components/schemas/GrammarResponseFormat"
}
}
},
"SamplingParams": {
"type": "object",
"properties": {
@ -2865,6 +2965,9 @@
"tool_prompt_format": {
"$ref": "#/components/schemas/ToolPromptFormat"
},
"response_format": {
"$ref": "#/components/schemas/ResponseFormat"
},
"logprobs": {
"type": "object",
"properties": {
@ -2885,16 +2988,49 @@
"BatchChatCompletionResponse": {
"type": "object",
"properties": {
"completion_message_batch": {
"batch": {
"type": "array",
"items": {
"$ref": "#/components/schemas/CompletionMessage"
"$ref": "#/components/schemas/ChatCompletionResponse"
}
}
},
"additionalProperties": false,
"required": [
"completion_message_batch"
"batch"
]
},
"ChatCompletionResponse": {
"type": "object",
"properties": {
"completion_message": {
"$ref": "#/components/schemas/CompletionMessage"
},
"logprobs": {
"type": "array",
"items": {
"$ref": "#/components/schemas/TokenLogProbs"
}
}
},
"additionalProperties": false,
"required": [
"completion_message"
]
},
"TokenLogProbs": {
"type": "object",
"properties": {
"logprobs_by_token": {
"type": "object",
"additionalProperties": {
"type": "number"
}
}
},
"additionalProperties": false,
"required": [
"logprobs_by_token"
]
},
"BatchCompletionRequest": {
@ -2912,6 +3048,9 @@
"sampling_params": {
"$ref": "#/components/schemas/SamplingParams"
},
"response_format": {
"$ref": "#/components/schemas/ResponseFormat"
},
"logprobs": {
"type": "object",
"properties": {
@ -2932,18 +3071,41 @@
"BatchCompletionResponse": {
"type": "object",
"properties": {
"completion_message_batch": {
"batch": {
"type": "array",
"items": {
"$ref": "#/components/schemas/CompletionMessage"
"$ref": "#/components/schemas/CompletionResponse"
}
}
},
"additionalProperties": false,
"required": [
"completion_message_batch"
"batch"
]
},
"CompletionResponse": {
"type": "object",
"properties": {
"content": {
"type": "string"
},
"stop_reason": {
"$ref": "#/components/schemas/StopReason"
},
"logprobs": {
"type": "array",
"items": {
"$ref": "#/components/schemas/TokenLogProbs"
}
}
},
"additionalProperties": false,
"required": [
"content",
"stop_reason"
],
"title": "Completion response."
},
"CancelTrainingJobRequest": {
"type": "object",
"properties": {
@ -2956,135 +3118,46 @@
"job_uuid"
]
},
"GrammarResponseFormat": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "grammar",
"default": "grammar"
},
"bnf": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
}
},
"additionalProperties": false,
"required": [
"type",
"bnf"
]
},
"JsonSchemaResponseFormat": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "json_schema",
"default": "json_schema"
},
"json_schema": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
}
},
"additionalProperties": false,
"required": [
"type",
"json_schema"
]
},
"ResponseFormat": {
"oneOf": [
{
"$ref": "#/components/schemas/JsonSchemaResponseFormat"
},
{
"$ref": "#/components/schemas/GrammarResponseFormat"
}
],
"discriminator": {
"propertyName": "type",
"mapping": {
"json_schema": "#/components/schemas/JsonSchemaResponseFormat",
"grammar": "#/components/schemas/GrammarResponseFormat"
}
}
},
"ChatCompletionRequest": {
"type": "object",
"properties": {
"model_id": {
"type": "string"
"type": "string",
"description": "The identifier of the model to use"
},
"messages": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Message"
}
},
"description": "List of messages in the conversation"
},
"sampling_params": {
"$ref": "#/components/schemas/SamplingParams"
"$ref": "#/components/schemas/SamplingParams",
"description": "Parameters to control the sampling strategy"
},
"tools": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ToolDefinition"
}
},
"description": "(Optional) List of tool definitions available to the model"
},
"tool_choice": {
"$ref": "#/components/schemas/ToolChoice"
"$ref": "#/components/schemas/ToolChoice",
"description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto."
},
"tool_prompt_format": {
"$ref": "#/components/schemas/ToolPromptFormat"
"$ref": "#/components/schemas/ToolPromptFormat",
"description": "(Optional) Specifies how tool definitions are formatted when presenting to the model"
},
"response_format": {
"$ref": "#/components/schemas/ResponseFormat"
"$ref": "#/components/schemas/ResponseFormat",
"description": "(Optional) Grammar specification for guided (structured) decoding"
},
"stream": {
"type": "boolean"
"type": "boolean",
"description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
},
"logprobs": {
"type": "object",
@ -3094,7 +3167,8 @@
"default": 0
}
},
"additionalProperties": false
"additionalProperties": false,
"description": "(Optional) If specified, log probabilities for each token position will be returned."
}
},
"additionalProperties": false,
@ -3103,25 +3177,6 @@
"messages"
]
},
"ChatCompletionResponse": {
"type": "object",
"properties": {
"completion_message": {
"$ref": "#/components/schemas/CompletionMessage"
},
"logprobs": {
"type": "array",
"items": {
"$ref": "#/components/schemas/TokenLogProbs"
}
}
},
"additionalProperties": false,
"required": [
"completion_message"
],
"title": "Chat completion response."
},
"ChatCompletionResponseEvent": {
"type": "object",
"properties": {
@ -3166,8 +3221,7 @@
"additionalProperties": false,
"required": [
"event"
],
"title": "SSE-stream of these events."
]
},
"ContentDelta": {
"oneOf": [
@ -3227,21 +3281,6 @@
"text"
]
},
"TokenLogProbs": {
"type": "object",
"properties": {
"logprobs_by_token": {
"type": "object",
"additionalProperties": {
"type": "number"
}
}
},
"additionalProperties": false,
"required": [
"logprobs_by_token"
]
},
"ToolCallDelta": {
"type": "object",
"properties": {
@ -3284,19 +3323,24 @@
"type": "object",
"properties": {
"model_id": {
"type": "string"
"type": "string",
"description": "The identifier of the model to use"
},
"content": {
"$ref": "#/components/schemas/InterleavedContent"
"$ref": "#/components/schemas/InterleavedContent",
"description": "The content to generate a completion for"
},
"sampling_params": {
"$ref": "#/components/schemas/SamplingParams"
"$ref": "#/components/schemas/SamplingParams",
"description": "(Optional) Parameters to control the sampling strategy"
},
"response_format": {
"$ref": "#/components/schemas/ResponseFormat"
"$ref": "#/components/schemas/ResponseFormat",
"description": "(Optional) Grammar specification for guided (structured) decoding"
},
"stream": {
"type": "boolean"
"type": "boolean",
"description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
},
"logprobs": {
"type": "object",
@ -3306,7 +3350,8 @@
"default": 0
}
},
"additionalProperties": false
"additionalProperties": false,
"description": "(Optional) If specified, log probabilities for each token position will be returned."
}
},
"additionalProperties": false,
@ -3315,29 +3360,6 @@
"content"
]
},
"CompletionResponse": {
"type": "object",
"properties": {
"content": {
"type": "string"
},
"stop_reason": {
"$ref": "#/components/schemas/StopReason"
},
"logprobs": {
"type": "array",
"items": {
"$ref": "#/components/schemas/TokenLogProbs"
}
}
},
"additionalProperties": false,
"required": [
"content",
"stop_reason"
],
"title": "Completion response."
},
"CompletionResponseStreamChunk": {
"type": "object",
"properties": {
@ -4241,13 +4263,15 @@
"type": "object",
"properties": {
"model_id": {
"type": "string"
"type": "string",
"description": "The identifier of the model to use"
},
"contents": {
"type": "array",
"items": {
"$ref": "#/components/schemas/InterleavedContent"
}
},
"description": "List of contents to generate embeddings for. Note that content can be multimodal."
}
},
"additionalProperties": false,
@ -7863,7 +7887,7 @@
},
{
"name": "ChatCompletionResponse",
"description": "Chat completion response."
"description": ""
},
{
"name": "ChatCompletionResponseEvent",
@ -7875,7 +7899,7 @@
},
{
"name": "ChatCompletionResponseStreamChunk",
"description": "SSE-stream of these events."
"description": ""
},
{
"name": "Checkpoint",

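For reference, here are hedged example payloads (the schema and BNF contents are made up) matching the two ResponseFormat variants added above; either dict can be supplied as a request's response_format property:

# The "type" field is the discriminator the spec declares above.
json_schema_format = {
    "type": "json_schema",  # -> JsonSchemaResponseFormat
    "json_schema": {
        "type": "object",
        "properties": {"answer": {"type": "string"}},
        "required": ["answer"],
    },
}

grammar_format = {
    "type": "grammar",  # -> GrammarResponseFormat
    "bnf": {"root": '"yes" | "no"'},
}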
View file

@ -1,11 +1,12 @@
---
openapi: 3.1.0
info:
title: Llama Stack Specification
version: v1
description: "This is the specification of the Llama Stack that provides\n \
\ a set of endpoints and their corresponding interfaces that are tailored
to\n best leverage Llama Models."
description: >-
This is the specification of the Llama Stack that provides
a set of endpoints and their corresponding interfaces that are
tailored to
best leverage Llama Models.
servers:
- url: http://any-hosted-llama-stack.com
paths:
@ -108,7 +109,9 @@ paths:
post:
responses:
'200':
description: Chat completion response. **OR** SSE-stream of these events.
description: >-
If stream=False, returns a ChatCompletionResponse with the full completion.
If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk
content:
text/event-stream:
schema:
@ -117,6 +120,8 @@ paths:
- $ref: '#/components/schemas/ChatCompletionResponseStreamChunk'
tags:
- Inference
summary: >-
Generate a chat completion for the given messages using the specified model.
parameters: []
requestBody:
content:
@ -128,7 +133,9 @@ paths:
post:
responses:
'200':
description: Completion response. **OR** streamed completion response.
description: >-
If stream=False, returns a CompletionResponse with the full completion.
If stream=True, returns an SSE event stream of CompletionResponseStreamChunk
content:
text/event-stream:
schema:
@ -137,6 +144,8 @@ paths:
- $ref: '#/components/schemas/CompletionResponseStreamChunk'
tags:
- Inference
summary: >-
Generate a completion for the given content using the specified model.
parameters: []
requestBody:
content:
@ -189,8 +198,9 @@ paths:
post:
responses:
'200':
description: A single turn in an interaction with an Agentic System. **OR**
streamed agent turn completion response.
description: >-
A single turn in an interaction with an Agentic System. **OR** streamed
agent turn completion response.
content:
text/event-stream:
schema:
@ -279,13 +289,17 @@ paths:
post:
responses:
'200':
description: OK
description: >-
An array of embeddings, one for each content. Each embedding is a list
of floats.
content:
application/json:
schema:
$ref: '#/components/schemas/EmbeddingsResponse'
tags:
- Inference
summary: >-
Generate embeddings for content pieces using the specified model.
parameters: []
requestBody:
content:
@ -709,7 +723,8 @@ paths:
description: OK
tags:
- ToolRuntime
summary: Index documents so they can be used by the RAG system
summary: >-
Index documents so they can be used by the RAG system
parameters: []
requestBody:
content:
@ -1109,7 +1124,8 @@ paths:
$ref: '#/components/schemas/RAGQueryResult'
tags:
- ToolRuntime
summary: Query the RAG system for context; typically invoked by the agent
summary: >-
Query the RAG system for context; typically invoked by the agent
parameters: []
requestBody:
content:
@ -1341,7 +1357,8 @@ paths:
tags:
- Inspect
parameters: []
jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema
jsonSchemaDialect: >-
https://json-schema.org/draft/2020-12/schema
components:
schemas:
AppendRowsRequest:
@ -1393,6 +1410,27 @@ components:
- content
- stop_reason
- tool_calls
GrammarResponseFormat:
type: object
properties:
type:
type: string
const: grammar
default: grammar
bnf:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
additionalProperties: false
required:
- type
- bnf
GreedySamplingStrategy:
type: object
properties:
@ -1439,6 +1477,27 @@ components:
mapping:
image: '#/components/schemas/ImageContentItem'
text: '#/components/schemas/TextContentItem'
JsonSchemaResponseFormat:
type: object
properties:
type:
type: string
const: json_schema
default: json_schema
json_schema:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
additionalProperties: false
required:
- type
- json_schema
Message:
oneOf:
- $ref: '#/components/schemas/UserMessage'
@ -1452,6 +1511,15 @@ components:
system: '#/components/schemas/SystemMessage'
tool: '#/components/schemas/ToolResponseMessage'
assistant: '#/components/schemas/CompletionMessage'
ResponseFormat:
oneOf:
- $ref: '#/components/schemas/JsonSchemaResponseFormat'
- $ref: '#/components/schemas/GrammarResponseFormat'
discriminator:
propertyName: type
mapping:
json_schema: '#/components/schemas/JsonSchemaResponseFormat'
grammar: '#/components/schemas/GrammarResponseFormat'
SamplingParams:
type: object
properties:
@ -1594,16 +1662,28 @@ components:
- json
- function_tag
- python_list
title: This Enum refers to the prompt format for calling custom / zero shot
tools
description: "`json` --\n Refers to the json format for calling tools.\n\
\ The json format takes the form like\n {\n \"type\": \"function\"\
,\n \"function\" : {\n \"name\": \"function_name\",\n \
\ \"description\": \"function_description\",\n \"parameters\"\
: {...}\n }\n }\n\n`function_tag` --\n This is an example of
how you could define\n your own user defined format for making tool calls.\n\
\ The function_tag format looks like this,\n <function=function_name>(parameters)</function>\n
\nThe detailed prompts for each of these formats are added to llama cli"
title: >-
This Enum refers to the prompt format for calling custom / zero shot tools
description: >-
`json` --
Refers to the json format for calling tools.
The json format takes the form like
{
"type": "function",
"function" : {
"name": "function_name",
"description": "function_description",
"parameters": {...}
}
}
`function_tag` --
This is an example of how you could define
your own user defined format for making tool calls.
The function_tag format looks like this,
<function=function_name>(parameters)</function>
The detailed prompts for each of these formats are added to llama cli
ToolResponseMessage:
type: object
properties:
@ -1697,6 +1777,8 @@ components:
$ref: '#/components/schemas/ToolChoice'
tool_prompt_format:
$ref: '#/components/schemas/ToolPromptFormat'
response_format:
$ref: '#/components/schemas/ResponseFormat'
logprobs:
type: object
properties:
@ -1711,13 +1793,35 @@ components:
BatchChatCompletionResponse:
type: object
properties:
completion_message_batch:
batch:
type: array
items:
$ref: '#/components/schemas/CompletionMessage'
$ref: '#/components/schemas/ChatCompletionResponse'
additionalProperties: false
required:
- completion_message_batch
- batch
ChatCompletionResponse:
type: object
properties:
completion_message:
$ref: '#/components/schemas/CompletionMessage'
logprobs:
type: array
items:
$ref: '#/components/schemas/TokenLogProbs'
additionalProperties: false
required:
- completion_message
TokenLogProbs:
type: object
properties:
logprobs_by_token:
type: object
additionalProperties:
type: number
additionalProperties: false
required:
- logprobs_by_token
BatchCompletionRequest:
type: object
properties:
@ -1729,6 +1833,8 @@ components:
$ref: '#/components/schemas/InterleavedContent'
sampling_params:
$ref: '#/components/schemas/SamplingParams'
response_format:
$ref: '#/components/schemas/ResponseFormat'
logprobs:
type: object
properties:
@ -1743,13 +1849,29 @@ components:
BatchCompletionResponse:
type: object
properties:
completion_message_batch:
batch:
type: array
items:
$ref: '#/components/schemas/CompletionMessage'
$ref: '#/components/schemas/CompletionResponse'
additionalProperties: false
required:
- completion_message_batch
- batch
CompletionResponse:
type: object
properties:
content:
type: string
stop_reason:
$ref: '#/components/schemas/StopReason'
logprobs:
type: array
items:
$ref: '#/components/schemas/TokenLogProbs'
additionalProperties: false
required:
- content
- stop_reason
title: Completion response.
CancelTrainingJobRequest:
type: object
properties:
@ -1758,80 +1880,45 @@ components:
additionalProperties: false
required:
- job_uuid
GrammarResponseFormat:
type: object
properties:
type:
type: string
const: grammar
default: grammar
bnf:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
additionalProperties: false
required:
- type
- bnf
JsonSchemaResponseFormat:
type: object
properties:
type:
type: string
const: json_schema
default: json_schema
json_schema:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
additionalProperties: false
required:
- type
- json_schema
ResponseFormat:
oneOf:
- $ref: '#/components/schemas/JsonSchemaResponseFormat'
- $ref: '#/components/schemas/GrammarResponseFormat'
discriminator:
propertyName: type
mapping:
json_schema: '#/components/schemas/JsonSchemaResponseFormat'
grammar: '#/components/schemas/GrammarResponseFormat'
ChatCompletionRequest:
type: object
properties:
model_id:
type: string
description: The identifier of the model to use
messages:
type: array
items:
$ref: '#/components/schemas/Message'
description: List of messages in the conversation
sampling_params:
$ref: '#/components/schemas/SamplingParams'
description: >-
Parameters to control the sampling strategy
tools:
type: array
items:
$ref: '#/components/schemas/ToolDefinition'
description: >-
(Optional) List of tool definitions available to the model
tool_choice:
$ref: '#/components/schemas/ToolChoice'
description: >-
(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
tool_prompt_format:
$ref: '#/components/schemas/ToolPromptFormat'
description: >-
(Optional) Specifies how tool definitions are formatted when presenting
to the model
response_format:
$ref: '#/components/schemas/ResponseFormat'
description: >-
(Optional) Grammar specification for guided (structured) decoding
stream:
type: boolean
description: >-
(Optional) If True, generate an SSE event stream of the response. Defaults
to False.
logprobs:
type: object
properties:
@ -1839,23 +1926,13 @@ components:
type: integer
default: 0
additionalProperties: false
description: >-
(Optional) If specified, log probabilities for each token position will
be returned.
additionalProperties: false
required:
- model_id
- messages
ChatCompletionResponse:
type: object
properties:
completion_message:
$ref: '#/components/schemas/CompletionMessage'
logprobs:
type: array
items:
$ref: '#/components/schemas/TokenLogProbs'
additionalProperties: false
required:
- completion_message
title: Chat completion response.
ChatCompletionResponseEvent:
type: object
properties:
@ -1888,7 +1965,6 @@ components:
additionalProperties: false
required:
- event
title: SSE-stream of these events.
ContentDelta:
oneOf:
- $ref: '#/components/schemas/TextDelta'
@ -1927,16 +2003,6 @@ components:
required:
- type
- text
TokenLogProbs:
type: object
properties:
logprobs_by_token:
type: object
additionalProperties:
type: number
additionalProperties: false
required:
- logprobs_by_token
ToolCallDelta:
type: object
properties:
@ -1967,14 +2033,23 @@ components:
properties:
model_id:
type: string
description: The identifier of the model to use
content:
$ref: '#/components/schemas/InterleavedContent'
description: The content to generate a completion for
sampling_params:
$ref: '#/components/schemas/SamplingParams'
description: >-
(Optional) Parameters to control the sampling strategy
response_format:
$ref: '#/components/schemas/ResponseFormat'
description: >-
(Optional) Grammar specification for guided (structured) decoding
stream:
type: boolean
description: >-
(Optional) If True, generate an SSE event stream of the response. Defaults
to False.
logprobs:
type: object
properties:
@ -1982,26 +2057,13 @@ components:
type: integer
default: 0
additionalProperties: false
description: >-
(Optional) If specified, log probabilities for each token position will
be returned.
additionalProperties: false
required:
- model_id
- content
CompletionResponse:
type: object
properties:
content:
type: string
stop_reason:
$ref: '#/components/schemas/StopReason'
logprobs:
type: array
items:
$ref: '#/components/schemas/TokenLogProbs'
additionalProperties: false
required:
- content
- stop_reason
title: Completion response.
CompletionResponseStreamChunk:
type: object
properties:
@ -2558,7 +2620,8 @@ components:
- output_message
- output_attachments
- started_at
title: A single turn in an interaction with an Agentic System.
title: >-
A single turn in an interaction with an Agentic System.
ViolationLevel:
type: string
enum:
@ -2570,10 +2633,14 @@ components:
properties:
model_id:
type: string
description: The identifier of the model to use
contents:
type: array
items:
$ref: '#/components/schemas/InterleavedContent'
description: >-
List of contents to generate embeddings for. Note that content can be
multimodal.
additionalProperties: false
required:
- model_id
@ -2845,7 +2912,8 @@ components:
- session_name
- turns
- started_at
title: A single session of an interaction with an Agentic System.
title: >-
A single session of an interaction with an Agentic System.
AgentStepResponse:
type: object
properties:
@ -3194,7 +3262,8 @@ components:
- provider_resource_id
- provider_id
- type
title: A safety shield resource that can be used to check content
title: >-
A safety shield resource that can be used to check content
Span:
type: object
properties:
@ -4684,8 +4753,9 @@ components:
additionalProperties: false
required:
- synthetic_data
title: Response from the synthetic data generation. Batch of (prompt, response,
score) tuples that pass the threshold.
title: >-
Response from the synthetic data generation. Batch of (prompt, response, score)
tuples that pass the threshold.
VersionInfo:
type: object
properties:
@ -4763,13 +4833,13 @@ tags:
- name: ChatCompletionRequest
description: ''
- name: ChatCompletionResponse
description: Chat completion response.
description: ''
- name: ChatCompletionResponseEvent
description: Chat completion response event.
- name: ChatCompletionResponseEventType
description: ''
- name: ChatCompletionResponseStreamChunk
description: SSE-stream of these events.
description: ''
- name: Checkpoint
description: Checkpoint created during training runs
- name: CompletionInputType
@ -4998,9 +5068,11 @@ tags:
- name: ScoringResult
description: ''
- name: Session
description: A single session of an interaction with an Agentic System.
description: >-
A single session of an interaction with an Agentic System.
- name: Shield
description: A safety shield resource that can be used to check content
description: >-
A safety shield resource that can be used to check content
- name: ShieldCallStep
description: ''
- name: Shields
@ -5028,8 +5100,9 @@ tags:
description: ''
- name: SyntheticDataGeneration (Coming Soon)
- name: SyntheticDataGenerationResponse
description: Response from the synthetic data generation. Batch of (prompt, response,
score) tuples that pass the threshold.
description: >-
Response from the synthetic data generation. Batch of (prompt, response, score)
tuples that pass the threshold.
- name: SystemMessage
description: ''
- name: Telemetry
@ -5067,15 +5140,29 @@ tags:
- name: ToolParameter
description: ''
- name: ToolPromptFormat
description: "This Enum refers to the prompt format for calling custom / zero
shot tools\n\n`json` --\n Refers to the json format for calling tools.\n\
\ The json format takes the form like\n {\n \"type\": \"function\"\
,\n \"function\" : {\n \"name\": \"function_name\",\n \
\ \"description\": \"function_description\",\n \"parameters\"\
: {...}\n }\n }\n\n`function_tag` --\n This is an example of how
you could define\n your own user defined format for making tool calls.\n\
\ The function_tag format looks like this,\n <function=function_name>(parameters)</function>\n
\nThe detailed prompts for each of these formats are added to llama cli"
description: >-
This Enum refers to the prompt format for calling custom / zero shot tools
`json` --
Refers to the json format for calling tools.
The json format takes the form like
{
"type": "function",
"function" : {
"name": "function_name",
"description": "function_description",
"parameters": {...}
}
}
`function_tag` --
This is an example of how you could define
your own user defined format for making tool calls.
The function_tag format looks like this,
<function=function_name>(parameters)</function>
The detailed prompts for each of these formats are added to llama cli
- name: ToolResponse
description: ''
- name: ToolResponseMessage
@ -5090,7 +5177,8 @@ tags:
- name: TrainingConfig
description: ''
- name: Turn
description: A single turn in an interaction with an Agentic System.
description: >-
A single turn in an interaction with an Agentic System.
- name: URL
description: ''
- name: UnionType

View file

@ -7,13 +7,15 @@
from typing import List, Optional, Protocol, runtime_checkable
from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel, Field
from pydantic import BaseModel
from llama_stack.apis.inference import (
CompletionMessage,
ChatCompletionResponse,
CompletionResponse,
InterleavedContent,
LogProbConfig,
Message,
ResponseFormat,
SamplingParams,
ToolChoice,
ToolDefinition,
@ -21,35 +23,14 @@ from llama_stack.apis.inference import (
)
@json_schema_type
class BatchCompletionRequest(BaseModel):
model: str
content_batch: List[InterleavedContent]
sampling_params: Optional[SamplingParams] = SamplingParams()
logprobs: Optional[LogProbConfig] = None
@json_schema_type
class BatchCompletionResponse(BaseModel):
completion_message_batch: List[CompletionMessage]
@json_schema_type
class BatchChatCompletionRequest(BaseModel):
model: str
messages_batch: List[List[Message]]
sampling_params: Optional[SamplingParams] = SamplingParams()
# zero-shot tool definitions as input to the model
tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
logprobs: Optional[LogProbConfig] = None
batch: List[CompletionResponse]
@json_schema_type
class BatchChatCompletionResponse(BaseModel):
completion_message_batch: List[CompletionMessage]
batch: List[ChatCompletionResponse]
@runtime_checkable
@ -60,6 +41,7 @@ class BatchInference(Protocol):
model: str,
content_batch: List[InterleavedContent],
sampling_params: Optional[SamplingParams] = SamplingParams(),
response_format: Optional[ResponseFormat] = None,
logprobs: Optional[LogProbConfig] = None,
) -> BatchCompletionResponse: ...
@ -73,5 +55,6 @@ class BatchInference(Protocol):
tools: Optional[List[ToolDefinition]] = list,
tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = None,
response_format: Optional[ResponseFormat] = None,
logprobs: Optional[LogProbConfig] = None,
) -> BatchChatCompletionResponse: ...

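A minimal runnable sketch of the renamed `batch` fields, using simplified stand-ins for the imported response models (the real ones carry more fields):

from typing import List

from pydantic import BaseModel

class CompletionResponse(BaseModel):
    # Simplified stand-in for llama_stack.apis.inference.CompletionResponse.
    content: str
    stop_reason: str

class BatchCompletionResponse(BaseModel):
    batch: List[CompletionResponse]  # was: completion_message_batch

resp = BatchCompletionResponse(
    batch=[CompletionResponse(content="Hello!", stop_reason="end_of_turn")]
)
print([r.content for r in resp.batch])  # ['Hello!']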
View file

@ -186,7 +186,6 @@ ResponseFormat = register_schema(
)
@json_schema_type
class CompletionRequest(BaseModel):
model: str
content: InterleavedContent
@ -215,23 +214,6 @@ class CompletionResponseStreamChunk(BaseModel):
logprobs: Optional[List[TokenLogProbs]] = None
@json_schema_type
class BatchCompletionRequest(BaseModel):
model: str
content_batch: List[InterleavedContent]
sampling_params: Optional[SamplingParams] = SamplingParams()
response_format: Optional[ResponseFormat] = None
logprobs: Optional[LogProbConfig] = None
@json_schema_type
class BatchCompletionResponse(BaseModel):
"""Batch completion response."""
batch: List[CompletionResponse]
@json_schema_type
class ChatCompletionRequest(BaseModel):
model: str
messages: List[Message]
@ -249,37 +231,15 @@ class ChatCompletionRequest(BaseModel):
@json_schema_type
class ChatCompletionResponseStreamChunk(BaseModel):
"""SSE-stream of these events."""
event: ChatCompletionResponseEvent
@json_schema_type
class ChatCompletionResponse(BaseModel):
"""Chat completion response."""
completion_message: CompletionMessage
logprobs: Optional[List[TokenLogProbs]] = None
@json_schema_type
class BatchChatCompletionRequest(BaseModel):
model: str
messages_batch: List[List[Message]]
sampling_params: Optional[SamplingParams] = SamplingParams()
# zero-shot tool definitions as input to the model
tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
logprobs: Optional[LogProbConfig] = None
@json_schema_type
class BatchChatCompletionResponse(BaseModel):
batch: List[ChatCompletionResponse]
@json_schema_type
class EmbeddingsResponse(BaseModel):
embeddings: List[List[float]]
@ -303,7 +263,19 @@ class Inference(Protocol):
response_format: Optional[ResponseFormat] = None,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]: ...
) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]:
    """Generate a completion for the given content using the specified model.

    :param model_id: The identifier of the model to use
    :param content: The content to generate a completion for
    :param sampling_params: (Optional) Parameters to control the sampling strategy
    :param response_format: (Optional) Grammar specification for guided (structured) decoding
    :param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
    :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
    :returns: If stream=False, returns a CompletionResponse with the full completion.
        If stream=True, returns an SSE event stream of CompletionResponseStreamChunk
    """
    ...
@webmethod(route="/inference/chat-completion", method="POST")
async def chat_completion(
@ -311,7 +283,6 @@ class Inference(Protocol):
model_id: str,
messages: List[Message],
sampling_params: Optional[SamplingParams] = SamplingParams(),
# zero-shot tool definitions as input to the model
tools: Optional[List[ToolDefinition]] = None,
tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = None,
@ -320,11 +291,33 @@ class Inference(Protocol):
logprobs: Optional[LogProbConfig] = None,
) -> Union[
    ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]
]: ...
]:
    """Generate a chat completion for the given messages using the specified model.

    :param model_id: The identifier of the model to use
    :param messages: List of messages in the conversation
    :param sampling_params: Parameters to control the sampling strategy
    :param tools: (Optional) List of tool definitions available to the model
    :param tool_choice: (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
    :param tool_prompt_format: (Optional) Specifies how tool definitions are formatted when presenting to the model
    :param response_format: (Optional) Grammar specification for guided (structured) decoding
    :param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
    :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
    :returns: If stream=False, returns a ChatCompletionResponse with the full completion.
        If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk
    """
    ...
@webmethod(route="/inference/embeddings", method="POST")
async def embeddings(
self,
model_id: str,
contents: List[InterleavedContent],
) -> EmbeddingsResponse: ...
) -> EmbeddingsResponse:
    """Generate embeddings for content pieces using the specified model.

    :param model_id: The identifier of the model to use
    :param contents: List of contents to generate embeddings for. Note that content can be multimodal.
    :returns: An array of embeddings, one for each content. Each embedding is a list of floats.
    """
    ...
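Finally, a hedged sketch of consuming the two documented return shapes; `client` stands for any hypothetical implementation of the Inference protocol above, and the model identifier and messages are made up:

import asyncio

async def demo(client) -> None:
    messages = [{"role": "user", "content": "Hello"}]

    # stream=False (the default): a single ChatCompletionResponse
    response = await client.chat_completion(
        model_id="example-model", messages=messages
    )
    print(response.completion_message)

    # stream=True: an async iterator of ChatCompletionResponseStreamChunk
    stream = await client.chat_completion(
        model_id="example-model", messages=messages, stream=True
    )
    async for chunk in stream:
        print(chunk.event)

# asyncio.run(demo(client)) once a concrete client is constructed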