From ebfa8ad4fbdcdee4c452667f92bb723d8e151324 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Tue, 28 Jan 2025 12:27:21 -0800 Subject: [PATCH] Update OpenAPI generator to add param and field documentation --- docs/openapi_generator/generate.py | 13 +- docs/openapi_generator/pyopenapi/generator.py | 34 +- .../openapi_generator/strong_typing/schema.py | 1 + docs/resources/llama-stack-spec.html | 392 ++++++++++-------- docs/resources/llama-stack-spec.yaml | 368 +++++++++------- .../apis/batch_inference/batch_inference.py | 33 +- llama_stack/apis/inference/inference.py | 81 ++-- 7 files changed, 525 insertions(+), 397 deletions(-) diff --git a/docs/openapi_generator/generate.py b/docs/openapi_generator/generate.py index 1a59369cb..48109e5d8 100644 --- a/docs/openapi_generator/generate.py +++ b/docs/openapi_generator/generate.py @@ -36,6 +36,16 @@ from .pyopenapi.specification import Info, Server # noqa: E402 from .pyopenapi.utility import Specification # noqa: E402 +def str_presenter(dumper, data): + if data.startswith(f"/{LLAMA_STACK_API_VERSION}") or data.startswith( + "#/components/schemas/" + ): + style = None + else: + style = ">" if "\n" in data or len(data) > 40 else None + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=style) + + def main(output_dir: str): output_dir = Path(output_dir) if not output_dir.exists(): @@ -69,7 +79,8 @@ def main(output_dir: str): y.sequence_dash_offset = 2 y.width = 80 y.allow_unicode = True - y.explicit_start = True + y.representer.add_representer(str, str_presenter) + y.dump( spec.get_json(), fp, diff --git a/docs/openapi_generator/pyopenapi/generator.py b/docs/openapi_generator/pyopenapi/generator.py index 317b895b5..d8e0d81ed 100644 --- a/docs/openapi_generator/pyopenapi/generator.py +++ b/docs/openapi_generator/pyopenapi/generator.py @@ -8,6 +8,7 @@ import collections import hashlib import ipaddress import typing +from dataclasses import field, make_dataclass from typing import Any, Dict, Set, Union 
from ..strong_typing.core import JsonType @@ -276,6 +277,20 @@ class StatusResponse: examples: List[Any] = dataclasses.field(default_factory=list) +def create_docstring_for_request( + request_name: str, fields: List[Tuple[str, type, Any]], doc_params: Dict[str, str] +) -> str: + """Creates a ReST-style docstring for a dynamically generated request dataclass.""" + lines = ["\n"] # Short description + + # Add parameter documentation in ReST format + for name, type_ in fields: + desc = doc_params.get(name, "") + lines.append(f":param {name}: {desc}") + + return "\n".join(lines) + + class ResponseBuilder: content_builder: ContentBuilder @@ -493,11 +508,24 @@ class Generator: first = next(iter(op.request_params)) request_name, request_type = first - from dataclasses import make_dataclass - op_name = "".join(word.capitalize() for word in op.name.split("_")) request_name = f"{op_name}Request" - request_type = make_dataclass(request_name, op.request_params) + fields = [ + ( + name, + type_, + ) + for name, type_ in op.request_params + ] + request_type = make_dataclass( + request_name, + fields, + namespace={ + "__doc__": create_docstring_for_request( + request_name, fields, doc_params + ) + }, + ) requestBody = RequestBody( content={ diff --git a/docs/openapi_generator/strong_typing/schema.py b/docs/openapi_generator/strong_typing/schema.py index 826efdb4a..f4393041f 100644 --- a/docs/openapi_generator/strong_typing/schema.py +++ b/docs/openapi_generator/strong_typing/schema.py @@ -531,6 +531,7 @@ class JsonSchemaGenerator: # add property docstring if available property_doc = property_docstrings.get(property_name) if property_doc: + # print(output_name, property_doc) property_def.pop("title", None) property_def["description"] = property_doc diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html index 5998963d2..b720bef21 100644 --- a/docs/resources/llama-stack-spec.html +++ b/docs/resources/llama-stack-spec.html @@ -190,7 +190,7 @@ "post": 
{ "responses": { "200": { - "description": "Chat completion response. **OR** SSE-stream of these events.", + "description": "If stream=False, returns a ChatCompletionResponse with the full completion. If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk", "content": { "text/event-stream": { "schema": { @@ -210,6 +210,7 @@ "tags": [ "Inference" ], + "summary": "Generate a chat completion for the given messages using the specified model.", "parameters": [], "requestBody": { "content": { @@ -227,7 +228,7 @@ "post": { "responses": { "200": { - "description": "Completion response. **OR** streamed completion response.", + "description": "If stream=False, returns a CompletionResponse with the full completion. If stream=True, returns an SSE event stream of CompletionResponseStreamChunk", "content": { "text/event-stream": { "schema": { @@ -247,6 +248,7 @@ "tags": [ "Inference" ], + "summary": "Generate a completion for the given content using the specified model.", "parameters": [], "requestBody": { "content": { @@ -485,7 +487,7 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "An array of embeddings, one for each content. 
Each embedding is a list of floats.", "content": { "application/json": { "schema": { @@ -498,6 +500,7 @@ "tags": [ "Inference" ], + "summary": "Generate embeddings for content pieces using the specified model.", "parameters": [], "requestBody": { "content": { @@ -2372,6 +2375,46 @@ "tool_calls" ] }, + "GrammarResponseFormat": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "grammar", + "default": "grammar" + }, + "bnf": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "type", + "bnf" + ] + }, "GreedySamplingStrategy": { "type": "object", "properties": { @@ -2447,6 +2490,46 @@ } } }, + "JsonSchemaResponseFormat": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "json_schema", + "default": "json_schema" + }, + "json_schema": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "type", + "json_schema" + ] + }, "Message": { "oneOf": [ { @@ -2472,6 +2555,23 @@ } } }, + "ResponseFormat": { + "oneOf": [ + { + "$ref": "#/components/schemas/JsonSchemaResponseFormat" + }, + { + "$ref": "#/components/schemas/GrammarResponseFormat" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "json_schema": "#/components/schemas/JsonSchemaResponseFormat", + "grammar": "#/components/schemas/GrammarResponseFormat" + } + } + }, "SamplingParams": { "type": "object", "properties": { @@ -2865,6 +2965,9 @@ "tool_prompt_format": { "$ref": "#/components/schemas/ToolPromptFormat" }, + "response_format": { + "$ref": 
"#/components/schemas/ResponseFormat" + }, "logprobs": { "type": "object", "properties": { @@ -2885,16 +2988,49 @@ "BatchChatCompletionResponse": { "type": "object", "properties": { - "completion_message_batch": { + "batch": { "type": "array", "items": { - "$ref": "#/components/schemas/CompletionMessage" + "$ref": "#/components/schemas/ChatCompletionResponse" } } }, "additionalProperties": false, "required": [ - "completion_message_batch" + "batch" + ] + }, + "ChatCompletionResponse": { + "type": "object", + "properties": { + "completion_message": { + "$ref": "#/components/schemas/CompletionMessage" + }, + "logprobs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/TokenLogProbs" + } + } + }, + "additionalProperties": false, + "required": [ + "completion_message" + ] + }, + "TokenLogProbs": { + "type": "object", + "properties": { + "logprobs_by_token": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "additionalProperties": false, + "required": [ + "logprobs_by_token" ] }, "BatchCompletionRequest": { @@ -2912,6 +3048,9 @@ "sampling_params": { "$ref": "#/components/schemas/SamplingParams" }, + "response_format": { + "$ref": "#/components/schemas/ResponseFormat" + }, "logprobs": { "type": "object", "properties": { @@ -2932,18 +3071,41 @@ "BatchCompletionResponse": { "type": "object", "properties": { - "completion_message_batch": { + "batch": { "type": "array", "items": { - "$ref": "#/components/schemas/CompletionMessage" + "$ref": "#/components/schemas/CompletionResponse" } } }, "additionalProperties": false, "required": [ - "completion_message_batch" + "batch" ] }, + "CompletionResponse": { + "type": "object", + "properties": { + "content": { + "type": "string" + }, + "stop_reason": { + "$ref": "#/components/schemas/StopReason" + }, + "logprobs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/TokenLogProbs" + } + } + }, + "additionalProperties": false, + "required": [ + "content", + 
"stop_reason" + ], + "title": "Completion response." + }, "CancelTrainingJobRequest": { "type": "object", "properties": { @@ -2956,135 +3118,46 @@ "job_uuid" ] }, - "GrammarResponseFormat": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "grammar", - "default": "grammar" - }, - "bnf": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } - }, - "additionalProperties": false, - "required": [ - "type", - "bnf" - ] - }, - "JsonSchemaResponseFormat": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "json_schema", - "default": "json_schema" - }, - "json_schema": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } - }, - "additionalProperties": false, - "required": [ - "type", - "json_schema" - ] - }, - "ResponseFormat": { - "oneOf": [ - { - "$ref": "#/components/schemas/JsonSchemaResponseFormat" - }, - { - "$ref": "#/components/schemas/GrammarResponseFormat" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "json_schema": "#/components/schemas/JsonSchemaResponseFormat", - "grammar": "#/components/schemas/GrammarResponseFormat" - } - } - }, "ChatCompletionRequest": { "type": "object", "properties": { "model_id": { - "type": "string" + "type": "string", + "description": "The identifier of the model to use" }, "messages": { "type": "array", "items": { "$ref": "#/components/schemas/Message" - } + }, + "description": "List of messages in the conversation" }, "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" + "$ref": "#/components/schemas/SamplingParams", + "description": "Parameters to control the 
sampling strategy" }, "tools": { "type": "array", "items": { "$ref": "#/components/schemas/ToolDefinition" - } + }, + "description": "(Optional) List of tool definitions available to the model" }, "tool_choice": { - "$ref": "#/components/schemas/ToolChoice" + "$ref": "#/components/schemas/ToolChoice", + "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto." }, "tool_prompt_format": { - "$ref": "#/components/schemas/ToolPromptFormat" + "$ref": "#/components/schemas/ToolPromptFormat", + "description": "(Optional) Specifies how tool definitions are formatted when presenting to the model" }, "response_format": { - "$ref": "#/components/schemas/ResponseFormat" + "$ref": "#/components/schemas/ResponseFormat", + "description": "(Optional) Grammar specification for guided (structured) decoding" }, "stream": { - "type": "boolean" + "type": "boolean", + "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False." }, "logprobs": { "type": "object", @@ -3094,7 +3167,8 @@ "default": 0 } }, - "additionalProperties": false + "additionalProperties": false, + "description": "(Optional) If specified, log probabilities for each token position will be returned." } }, "additionalProperties": false, @@ -3103,25 +3177,6 @@ "messages" ] }, - "ChatCompletionResponse": { - "type": "object", - "properties": { - "completion_message": { - "$ref": "#/components/schemas/CompletionMessage" - }, - "logprobs": { - "type": "array", - "items": { - "$ref": "#/components/schemas/TokenLogProbs" - } - } - }, - "additionalProperties": false, - "required": [ - "completion_message" - ], - "title": "Chat completion response." - }, "ChatCompletionResponseEvent": { "type": "object", "properties": { @@ -3166,8 +3221,7 @@ "additionalProperties": false, "required": [ "event" - ], - "title": "SSE-stream of these events." 
+ ] }, "ContentDelta": { "oneOf": [ @@ -3227,21 +3281,6 @@ "text" ] }, - "TokenLogProbs": { - "type": "object", - "properties": { - "logprobs_by_token": { - "type": "object", - "additionalProperties": { - "type": "number" - } - } - }, - "additionalProperties": false, - "required": [ - "logprobs_by_token" - ] - }, "ToolCallDelta": { "type": "object", "properties": { @@ -3284,19 +3323,24 @@ "type": "object", "properties": { "model_id": { - "type": "string" + "type": "string", + "description": "The identifier of the model to use" }, "content": { - "$ref": "#/components/schemas/InterleavedContent" + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content to generate a completion for" }, "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" + "$ref": "#/components/schemas/SamplingParams", + "description": "(Optional) Parameters to control the sampling strategy" }, "response_format": { - "$ref": "#/components/schemas/ResponseFormat" + "$ref": "#/components/schemas/ResponseFormat", + "description": "(Optional) Grammar specification for guided (structured) decoding" }, "stream": { - "type": "boolean" + "type": "boolean", + "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False." }, "logprobs": { "type": "object", @@ -3306,7 +3350,8 @@ "default": 0 } }, - "additionalProperties": false + "additionalProperties": false, + "description": "(Optional) If specified, log probabilities for each token position will be returned." } }, "additionalProperties": false, @@ -3315,29 +3360,6 @@ "content" ] }, - "CompletionResponse": { - "type": "object", - "properties": { - "content": { - "type": "string" - }, - "stop_reason": { - "$ref": "#/components/schemas/StopReason" - }, - "logprobs": { - "type": "array", - "items": { - "$ref": "#/components/schemas/TokenLogProbs" - } - } - }, - "additionalProperties": false, - "required": [ - "content", - "stop_reason" - ], - "title": "Completion response." 
- }, "CompletionResponseStreamChunk": { "type": "object", "properties": { @@ -4241,13 +4263,15 @@ "type": "object", "properties": { "model_id": { - "type": "string" + "type": "string", + "description": "The identifier of the model to use" }, "contents": { "type": "array", "items": { "$ref": "#/components/schemas/InterleavedContent" - } + }, + "description": "List of contents to generate embeddings for. Note that content can be multimodal." } }, "additionalProperties": false, @@ -7863,7 +7887,7 @@ }, { "name": "ChatCompletionResponse", - "description": "Chat completion response." + "description": "" }, { "name": "ChatCompletionResponseEvent", @@ -7875,7 +7899,7 @@ }, { "name": "ChatCompletionResponseStreamChunk", - "description": "SSE-stream of these events." + "description": "" }, { "name": "Checkpoint", diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml index 1d7c4f113..353d99d00 100644 --- a/docs/resources/llama-stack-spec.yaml +++ b/docs/resources/llama-stack-spec.yaml @@ -1,11 +1,12 @@ ---- openapi: 3.1.0 info: title: Llama Stack Specification version: v1 - description: "This is the specification of the Llama Stack that provides\n \ - \ a set of endpoints and their corresponding interfaces that are tailored - to\n best leverage Llama Models." + description: >- + This is the specification of the Llama Stack that provides + a set of endpoints and their corresponding interfaces that are + tailored to + best leverage Llama Models. servers: - url: http://any-hosted-llama-stack.com paths: @@ -108,7 +109,9 @@ paths: post: responses: '200': - description: Chat completion response. **OR** SSE-stream of these events. + description: >- + If stream=False, returns a ChatCompletionResponse with the full completion. 
+ If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk content: text/event-stream: schema: @@ -117,6 +120,8 @@ paths: - $ref: '#/components/schemas/ChatCompletionResponseStreamChunk' tags: - Inference + summary: >- + Generate a chat completion for the given messages using the specified model. parameters: [] requestBody: content: @@ -128,7 +133,9 @@ paths: post: responses: '200': - description: Completion response. **OR** streamed completion response. + description: >- + If stream=False, returns a CompletionResponse with the full completion. + If stream=True, returns an SSE event stream of CompletionResponseStreamChunk content: text/event-stream: schema: @@ -137,6 +144,8 @@ paths: - $ref: '#/components/schemas/CompletionResponseStreamChunk' tags: - Inference + summary: >- + Generate a completion for the given content using the specified model. parameters: [] requestBody: content: @@ -189,8 +198,9 @@ paths: post: responses: '200': - description: A single turn in an interaction with an Agentic System. **OR** - streamed agent turn completion response. + description: >- + A single turn in an interaction with an Agentic System. **OR** streamed + agent turn completion response. content: text/event-stream: schema: @@ -279,13 +289,17 @@ paths: post: responses: '200': - description: OK + description: >- + An array of embeddings, one for each content. Each embedding is a list + of floats. content: application/json: schema: $ref: '#/components/schemas/EmbeddingsResponse' tags: - Inference + summary: >- + Generate embeddings for content pieces using the specified model. 
parameters: [] requestBody: content: @@ -709,7 +723,8 @@ paths: description: OK tags: - ToolRuntime - summary: Index documents so they can be used by the RAG system + summary: >- + Index documents so they can be used by the RAG system parameters: [] requestBody: content: @@ -1109,7 +1124,8 @@ paths: $ref: '#/components/schemas/RAGQueryResult' tags: - ToolRuntime - summary: Query the RAG system for context; typically invoked by the agent + summary: >- + Query the RAG system for context; typically invoked by the agent parameters: [] requestBody: content: @@ -1341,7 +1357,8 @@ paths: tags: - Inspect parameters: [] -jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema +jsonSchemaDialect: >- + https://json-schema.org/draft/2020-12/schema components: schemas: AppendRowsRequest: @@ -1393,6 +1410,27 @@ components: - content - stop_reason - tool_calls + GrammarResponseFormat: + type: object + properties: + type: + type: string + const: grammar + default: grammar + bnf: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - type + - bnf GreedySamplingStrategy: type: object properties: @@ -1439,6 +1477,27 @@ components: mapping: image: '#/components/schemas/ImageContentItem' text: '#/components/schemas/TextContentItem' + JsonSchemaResponseFormat: + type: object + properties: + type: + type: string + const: json_schema + default: json_schema + json_schema: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - type + - json_schema Message: oneOf: - $ref: '#/components/schemas/UserMessage' @@ -1452,6 +1511,15 @@ components: system: '#/components/schemas/SystemMessage' tool: '#/components/schemas/ToolResponseMessage' assistant: '#/components/schemas/CompletionMessage' + ResponseFormat: + 
oneOf: + - $ref: '#/components/schemas/JsonSchemaResponseFormat' + - $ref: '#/components/schemas/GrammarResponseFormat' + discriminator: + propertyName: type + mapping: + json_schema: '#/components/schemas/JsonSchemaResponseFormat' + grammar: '#/components/schemas/GrammarResponseFormat' SamplingParams: type: object properties: @@ -1594,16 +1662,28 @@ components: - json - function_tag - python_list - title: This Enum refers to the prompt format for calling custom / zero shot - tools - description: "`json` --\n Refers to the json format for calling tools.\n\ - \ The json format takes the form like\n {\n \"type\": \"function\"\ - ,\n \"function\" : {\n \"name\": \"function_name\",\n \ - \ \"description\": \"function_description\",\n \"parameters\"\ - : {...}\n }\n }\n\n`function_tag` --\n This is an example of - how you could define\n your own user defined format for making tool calls.\n\ - \ The function_tag format looks like this,\n (parameters)\n - \nThe detailed prompts for each of these formats are added to llama cli" + title: >- + This Enum refers to the prompt format for calling custom / zero shot tools + description: >- + `json` -- + Refers to the json format for calling tools. + The json format takes the form like + { + "type": "function", + "function" : { + "name": "function_name", + "description": "function_description", + "parameters": {...} + } + } + + `function_tag` -- + This is an example of how you could define + your own user defined format for making tool calls. 
+ The function_tag format looks like this, + (parameters) + + The detailed prompts for each of these formats are added to llama cli ToolResponseMessage: type: object properties: @@ -1697,6 +1777,8 @@ components: $ref: '#/components/schemas/ToolChoice' tool_prompt_format: $ref: '#/components/schemas/ToolPromptFormat' + response_format: + $ref: '#/components/schemas/ResponseFormat' logprobs: type: object properties: @@ -1711,13 +1793,35 @@ components: BatchChatCompletionResponse: type: object properties: - completion_message_batch: + batch: type: array items: - $ref: '#/components/schemas/CompletionMessage' + $ref: '#/components/schemas/ChatCompletionResponse' additionalProperties: false required: - - completion_message_batch + - batch + ChatCompletionResponse: + type: object + properties: + completion_message: + $ref: '#/components/schemas/CompletionMessage' + logprobs: + type: array + items: + $ref: '#/components/schemas/TokenLogProbs' + additionalProperties: false + required: + - completion_message + TokenLogProbs: + type: object + properties: + logprobs_by_token: + type: object + additionalProperties: + type: number + additionalProperties: false + required: + - logprobs_by_token BatchCompletionRequest: type: object properties: @@ -1729,6 +1833,8 @@ components: $ref: '#/components/schemas/InterleavedContent' sampling_params: $ref: '#/components/schemas/SamplingParams' + response_format: + $ref: '#/components/schemas/ResponseFormat' logprobs: type: object properties: @@ -1743,13 +1849,29 @@ components: BatchCompletionResponse: type: object properties: - completion_message_batch: + batch: type: array items: - $ref: '#/components/schemas/CompletionMessage' + $ref: '#/components/schemas/CompletionResponse' additionalProperties: false required: - - completion_message_batch + - batch + CompletionResponse: + type: object + properties: + content: + type: string + stop_reason: + $ref: '#/components/schemas/StopReason' + logprobs: + type: array + items: + $ref: 
'#/components/schemas/TokenLogProbs' + additionalProperties: false + required: + - content + - stop_reason + title: Completion response. CancelTrainingJobRequest: type: object properties: @@ -1758,80 +1880,45 @@ components: additionalProperties: false required: - job_uuid - GrammarResponseFormat: - type: object - properties: - type: - type: string - const: grammar - default: grammar - bnf: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - type - - bnf - JsonSchemaResponseFormat: - type: object - properties: - type: - type: string - const: json_schema - default: json_schema - json_schema: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - type - - json_schema - ResponseFormat: - oneOf: - - $ref: '#/components/schemas/JsonSchemaResponseFormat' - - $ref: '#/components/schemas/GrammarResponseFormat' - discriminator: - propertyName: type - mapping: - json_schema: '#/components/schemas/JsonSchemaResponseFormat' - grammar: '#/components/schemas/GrammarResponseFormat' ChatCompletionRequest: type: object properties: model_id: type: string + description: The identifier of the model to use messages: type: array items: $ref: '#/components/schemas/Message' + description: List of messages in the conversation sampling_params: $ref: '#/components/schemas/SamplingParams' + description: >- + Parameters to control the sampling strategy tools: type: array items: $ref: '#/components/schemas/ToolDefinition' + description: >- + (Optional) List of tool definitions available to the model tool_choice: $ref: '#/components/schemas/ToolChoice' + description: >- + (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. 
tool_prompt_format: $ref: '#/components/schemas/ToolPromptFormat' + description: >- + (Optional) Specifies how tool definitions are formatted when presenting + to the model response_format: $ref: '#/components/schemas/ResponseFormat' + description: >- + (Optional) Grammar specification for guided (structured) decoding stream: type: boolean + description: >- + (Optional) If True, generate an SSE event stream of the response. Defaults + to False. logprobs: type: object properties: @@ -1839,23 +1926,13 @@ components: type: integer default: 0 additionalProperties: false + description: >- + (Optional) If specified, log probabilities for each token position will + be returned. additionalProperties: false required: - model_id - messages - ChatCompletionResponse: - type: object - properties: - completion_message: - $ref: '#/components/schemas/CompletionMessage' - logprobs: - type: array - items: - $ref: '#/components/schemas/TokenLogProbs' - additionalProperties: false - required: - - completion_message - title: Chat completion response. ChatCompletionResponseEvent: type: object properties: @@ -1888,7 +1965,6 @@ components: additionalProperties: false required: - event - title: SSE-stream of these events. 
ContentDelta: oneOf: - $ref: '#/components/schemas/TextDelta' @@ -1927,16 +2003,6 @@ components: required: - type - text - TokenLogProbs: - type: object - properties: - logprobs_by_token: - type: object - additionalProperties: - type: number - additionalProperties: false - required: - - logprobs_by_token ToolCallDelta: type: object properties: @@ -1967,14 +2033,23 @@ components: properties: model_id: type: string + description: The identifier of the model to use content: $ref: '#/components/schemas/InterleavedContent' + description: The content to generate a completion for sampling_params: $ref: '#/components/schemas/SamplingParams' + description: >- + (Optional) Parameters to control the sampling strategy response_format: $ref: '#/components/schemas/ResponseFormat' + description: >- + (Optional) Grammar specification for guided (structured) decoding stream: type: boolean + description: >- + (Optional) If True, generate an SSE event stream of the response. Defaults + to False. logprobs: type: object properties: @@ -1982,26 +2057,13 @@ components: type: integer default: 0 additionalProperties: false + description: >- + (Optional) If specified, log probabilities for each token position will + be returned. additionalProperties: false required: - model_id - content - CompletionResponse: - type: object - properties: - content: - type: string - stop_reason: - $ref: '#/components/schemas/StopReason' - logprobs: - type: array - items: - $ref: '#/components/schemas/TokenLogProbs' - additionalProperties: false - required: - - content - - stop_reason - title: Completion response. CompletionResponseStreamChunk: type: object properties: @@ -2558,7 +2620,8 @@ components: - output_message - output_attachments - started_at - title: A single turn in an interaction with an Agentic System. + title: >- + A single turn in an interaction with an Agentic System. 
ViolationLevel: type: string enum: @@ -2570,10 +2633,14 @@ components: properties: model_id: type: string + description: The identifier of the model to use contents: type: array items: $ref: '#/components/schemas/InterleavedContent' + description: >- + List of contents to generate embeddings for. Note that content can be + multimodal. additionalProperties: false required: - model_id @@ -2845,7 +2912,8 @@ components: - session_name - turns - started_at - title: A single session of an interaction with an Agentic System. + title: >- + A single session of an interaction with an Agentic System. AgentStepResponse: type: object properties: @@ -3194,7 +3262,8 @@ components: - provider_resource_id - provider_id - type - title: A safety shield resource that can be used to check content + title: >- + A safety shield resource that can be used to check content Span: type: object properties: @@ -4684,8 +4753,9 @@ components: additionalProperties: false required: - synthetic_data - title: Response from the synthetic data generation. Batch of (prompt, response, - score) tuples that pass the threshold. + title: >- + Response from the synthetic data generation. Batch of (prompt, response, score) + tuples that pass the threshold. VersionInfo: type: object properties: @@ -4763,13 +4833,13 @@ tags: - name: ChatCompletionRequest description: '' - name: ChatCompletionResponse - description: Chat completion response. + description: '' - name: ChatCompletionResponseEvent description: Chat completion response event. - name: ChatCompletionResponseEventType description: '' - name: ChatCompletionResponseStreamChunk - description: SSE-stream of these events. + description: '' - name: Checkpoint description: Checkpoint created during training runs - name: CompletionInputType @@ -4998,9 +5068,11 @@ tags: - name: ScoringResult description: '' - name: Session - description: A single session of an interaction with an Agentic System. 
+ description: >- + A single session of an interaction with an Agentic System. - name: Shield - description: A safety shield resource that can be used to check content + description: >- + A safety shield resource that can be used to check content - name: ShieldCallStep description: '' - name: Shields @@ -5028,8 +5100,9 @@ tags: description: '' - name: SyntheticDataGeneration (Coming Soon) - name: SyntheticDataGenerationResponse - description: Response from the synthetic data generation. Batch of (prompt, response, - score) tuples that pass the threshold. + description: >- + Response from the synthetic data generation. Batch of (prompt, response, score) + tuples that pass the threshold. - name: SystemMessage description: '' - name: Telemetry @@ -5067,15 +5140,29 @@ tags: - name: ToolParameter description: '' - name: ToolPromptFormat - description: "This Enum refers to the prompt format for calling custom / zero - shot tools\n\n`json` --\n Refers to the json format for calling tools.\n\ - \ The json format takes the form like\n {\n \"type\": \"function\"\ - ,\n \"function\" : {\n \"name\": \"function_name\",\n \ - \ \"description\": \"function_description\",\n \"parameters\"\ - : {...}\n }\n }\n\n`function_tag` --\n This is an example of how - you could define\n your own user defined format for making tool calls.\n\ - \ The function_tag format looks like this,\n (parameters)\n - \nThe detailed prompts for each of these formats are added to llama cli" + description: >- + This Enum refers to the prompt format for calling custom / zero shot tools + + + `json` -- + Refers to the json format for calling tools. + The json format takes the form like + { + "type": "function", + "function" : { + "name": "function_name", + "description": "function_description", + "parameters": {...} + } + } + + `function_tag` -- + This is an example of how you could define + your own user defined format for making tool calls. 
+ The function_tag format looks like this, + (parameters) + + The detailed prompts for each of these formats are added to llama cli - name: ToolResponse description: '' - name: ToolResponseMessage @@ -5090,7 +5177,8 @@ tags: - name: TrainingConfig description: '' - name: Turn - description: A single turn in an interaction with an Agentic System. + description: >- + A single turn in an interaction with an Agentic System. - name: URL description: '' - name: UnionType diff --git a/llama_stack/apis/batch_inference/batch_inference.py b/llama_stack/apis/batch_inference/batch_inference.py index ca5ba059f..413c81c5a 100644 --- a/llama_stack/apis/batch_inference/batch_inference.py +++ b/llama_stack/apis/batch_inference/batch_inference.py @@ -7,13 +7,15 @@ from typing import List, Optional, Protocol, runtime_checkable from llama_models.schema_utils import json_schema_type, webmethod -from pydantic import BaseModel, Field +from pydantic import BaseModel from llama_stack.apis.inference import ( - CompletionMessage, + ChatCompletionResponse, + CompletionResponse, InterleavedContent, LogProbConfig, Message, + ResponseFormat, SamplingParams, ToolChoice, ToolDefinition, @@ -21,35 +23,14 @@ from llama_stack.apis.inference import ( ) -@json_schema_type -class BatchCompletionRequest(BaseModel): - model: str - content_batch: List[InterleavedContent] - sampling_params: Optional[SamplingParams] = SamplingParams() - logprobs: Optional[LogProbConfig] = None - - @json_schema_type class BatchCompletionResponse(BaseModel): - completion_message_batch: List[CompletionMessage] - - -@json_schema_type -class BatchChatCompletionRequest(BaseModel): - model: str - messages_batch: List[List[Message]] - sampling_params: Optional[SamplingParams] = SamplingParams() - - # zero-shot tool definitions as input to the model - tools: Optional[List[ToolDefinition]] = Field(default_factory=list) - tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto) - tool_prompt_format: Optional[ToolPromptFormat] 
= Field(default=None) - logprobs: Optional[LogProbConfig] = None + batch: List[CompletionResponse] @json_schema_type class BatchChatCompletionResponse(BaseModel): - completion_message_batch: List[CompletionMessage] + batch: List[ChatCompletionResponse] @runtime_checkable @@ -60,6 +41,7 @@ class BatchInference(Protocol): model: str, content_batch: List[InterleavedContent], sampling_params: Optional[SamplingParams] = SamplingParams(), + response_format: Optional[ResponseFormat] = None, logprobs: Optional[LogProbConfig] = None, ) -> BatchCompletionResponse: ... @@ -73,5 +55,6 @@ class BatchInference(Protocol): tools: Optional[List[ToolDefinition]] = list, tool_choice: Optional[ToolChoice] = ToolChoice.auto, tool_prompt_format: Optional[ToolPromptFormat] = None, + response_format: Optional[ResponseFormat] = None, logprobs: Optional[LogProbConfig] = None, ) -> BatchChatCompletionResponse: ... diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py index 871f1f633..36f385eb2 100644 --- a/llama_stack/apis/inference/inference.py +++ b/llama_stack/apis/inference/inference.py @@ -186,7 +186,6 @@ ResponseFormat = register_schema( ) -@json_schema_type class CompletionRequest(BaseModel): model: str content: InterleavedContent @@ -215,23 +214,6 @@ class CompletionResponseStreamChunk(BaseModel): logprobs: Optional[List[TokenLogProbs]] = None -@json_schema_type -class BatchCompletionRequest(BaseModel): - model: str - content_batch: List[InterleavedContent] - sampling_params: Optional[SamplingParams] = SamplingParams() - response_format: Optional[ResponseFormat] = None - logprobs: Optional[LogProbConfig] = None - - -@json_schema_type -class BatchCompletionResponse(BaseModel): - """Batch completion response.""" - - batch: List[CompletionResponse] - - -@json_schema_type class ChatCompletionRequest(BaseModel): model: str messages: List[Message] @@ -249,37 +231,15 @@ class ChatCompletionRequest(BaseModel): @json_schema_type class 
ChatCompletionResponseStreamChunk(BaseModel): - """SSE-stream of these events.""" - event: ChatCompletionResponseEvent @json_schema_type class ChatCompletionResponse(BaseModel): - """Chat completion response.""" - completion_message: CompletionMessage logprobs: Optional[List[TokenLogProbs]] = None -@json_schema_type -class BatchChatCompletionRequest(BaseModel): - model: str - messages_batch: List[List[Message]] - sampling_params: Optional[SamplingParams] = SamplingParams() - - # zero-shot tool definitions as input to the model - tools: Optional[List[ToolDefinition]] = Field(default_factory=list) - tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto) - tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None) - logprobs: Optional[LogProbConfig] = None - - -@json_schema_type -class BatchChatCompletionResponse(BaseModel): - batch: List[ChatCompletionResponse] - - @json_schema_type class EmbeddingsResponse(BaseModel): embeddings: List[List[float]] @@ -303,7 +263,19 @@ class Inference(Protocol): response_format: Optional[ResponseFormat] = None, stream: Optional[bool] = False, logprobs: Optional[LogProbConfig] = None, - ) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]: ... + ) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]: + """Generate a completion for the given content using the specified model. + + :param model_id: The identifier of the model to use + :param content: The content to generate a completion for + :param sampling_params: (Optional) Parameters to control the sampling strategy + :param response_format: (Optional) Grammar specification for guided (structured) decoding + :param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False. + :param logprobs: (Optional) If specified, log probabilities for each token position will be returned. + :returns: If stream=False, returns a CompletionResponse with the full completion. 
+ If stream=True, returns an SSE event stream of CompletionResponseStreamChunk + """ + ... @webmethod(route="/inference/chat-completion", method="POST") async def chat_completion( @@ -311,7 +283,6 @@ class Inference(Protocol): model_id: str, messages: List[Message], sampling_params: Optional[SamplingParams] = SamplingParams(), - # zero-shot tool definitions as input to the model tools: Optional[List[ToolDefinition]] = None, tool_choice: Optional[ToolChoice] = ToolChoice.auto, tool_prompt_format: Optional[ToolPromptFormat] = None, @@ -320,11 +291,33 @@ class Inference(Protocol): logprobs: Optional[LogProbConfig] = None, ) -> Union[ ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk] - ]: ... + ]: + """Generate a chat completion for the given messages using the specified model. + + :param model_id: The identifier of the model to use + :param messages: List of messages in the conversation + :param sampling_params: Parameters to control the sampling strategy + :param tools: (Optional) List of tool definitions available to the model + :param tool_choice: (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. + :param tool_prompt_format: (Optional) Specifies how tool definitions are formatted when presenting to the model + :param response_format: (Optional) Grammar specification for guided (structured) decoding + :param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False. + :param logprobs: (Optional) If specified, log probabilities for each token position will be returned. + :returns: If stream=False, returns a ChatCompletionResponse with the full completion. + If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk + """ + ... @webmethod(route="/inference/embeddings", method="POST") async def embeddings( self, model_id: str, contents: List[InterleavedContent], - ) -> EmbeddingsResponse: ... 
+    ) -> EmbeddingsResponse:
+        """Generate embeddings for content pieces using the specified model.
+
+        :param model_id: The identifier of the model to use
+        :param contents: List of contents to generate embeddings for. Note that content can be multimodal.
+        :returns: An array of embeddings, one for each content. Each embedding is a list of floats.
+        """
+        ...