chore(api): remove batch inference (#3261)

# What does this PR do?

APIs removed:
- POST /v1/batch-inference/completion
- POST /v1/batch-inference/chat-completion
- POST /v1/inference/batch-completion
- POST /v1/inference/batch-chat-completion

Notes:
- batch-completion and batch-chat-completion were only implemented for inference=inline::meta-reference
- the batch-inference endpoints were never implemented
2025-12-08 03:00:56 +00:00 · 2025-09-26 17:35:34 -04:00 · 2025-09-26 17:35:34 -04:00 · 60484c5c4e
commit 60484c5c4e
parent b48d5cfed7
12 changed files with 190 additions and 979 deletions
--- a/docs/docs/references/python_sdk_reference/index.md
+++ b/docs/docs/references/python_sdk_reference/index.md
@ -139,18 +139,7 @@ Methods:
 - <code title="post /v1/agents/{agent_id}/session/{session_id}/turn">client.agents.turn.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/agents/turn.py">create</a>(session_id, \*, agent_id, \*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/agents/turn_create_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/agents/turn_create_response.py">TurnCreateResponse</a></code>
 - <code title="get /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}">client.agents.turn.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/agents/turn.py">retrieve</a>(turn_id, \*, agent_id, session_id) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/agents/turn.py">Turn</a></code>

-## BatchInference

-Types:
-
-```python
-from llama_stack_client.types import BatchInferenceChatCompletionResponse
-```
-
-Methods:
-
- <code title="post /v1/batch-inference/chat-completion">client.batch_inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/batch_inference.py">chat_completion</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/batch_inference_chat_completion_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/batch_inference_chat_completion_response.py">BatchInferenceChatCompletionResponse</a></code>
- <code title="post /v1/batch-inference/completion">client.batch_inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/batch_inference.py">completion</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/batch_inference_completion_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/shared/batch_completion.py">BatchCompletion</a></code>

 ## Datasets

--- a/docs/openapi_generator/pyopenapi/generator.py
+++ b/docs/openapi_generator/pyopenapi/generator.py
@ -548,7 +548,6 @@ class Generator:
        if op.defining_class.__name__ in [
            "SyntheticDataGeneration",
            "PostTraining",
-            "BatchInference",
        ]:
            op.defining_class.__name__ = f"{op.defining_class.__name__} (Coming Soon)"
            print(op.defining_class.__name__)
--- a/docs/static/llama-stack-spec.html
+++ b/docs/static/llama-stack-spec.html
@ -87,94 +87,6 @@
                }
            }
        },
-        "/v1/inference/batch-chat-completion": {
-            "post": {
-                "responses": {
-                    "200": {
-                        "description": "A BatchChatCompletionResponse with the full completions.",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/BatchChatCompletionResponse"
-                                }
-                            }
-                        }
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "Inference"
-                ],
-                "summary": "Generate chat completions for a batch of messages using the specified model.",
-                "description": "Generate chat completions for a batch of messages using the specified model.",
-                "parameters": [],
-                "requestBody": {
-                    "content": {
-                        "application/json": {
-                            "schema": {
-                                "$ref": "#/components/schemas/BatchChatCompletionRequest"
-                            }
-                        }
-                    },
-                    "required": true
-                }
-            }
-        },
-        "/v1/inference/batch-completion": {
-            "post": {
-                "responses": {
-                    "200": {
-                        "description": "A BatchCompletionResponse with the full completions.",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/BatchCompletionResponse"
-                                }
-                            }
-                        }
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "Inference"
-                ],
-                "summary": "Generate completions for a batch of content using the specified model.",
-                "description": "Generate completions for a batch of content using the specified model.",
-                "parameters": [],
-                "requestBody": {
-                    "content": {
-                        "application/json": {
-                            "schema": {
-                                "$ref": "#/components/schemas/BatchCompletionRequest"
-                            }
-                        }
-                    },
-                    "required": true
-                }
-            }
-        },
        "/v1alpha/post-training/job/cancel": {
            "post": {
                "responses": {
@ -281,7 +193,7 @@
                    }
                },
                "tags": [
-                    "BatchInference (Coming Soon)"
+                    "Inference"
                ],
                "summary": "Generate a chat completion for the given messages using the specified model.",
                "description": "Generate a chat completion for the given messages using the specified model.",
@ -330,7 +242,7 @@
                    }
                },
                "tags": [
-                    "BatchInference (Coming Soon)"
+                    "Inference"
                ],
                "summary": "Generate a completion for the given content using the specified model.",
                "description": "Generate a completion for the given content using the specified model.",
@ -6346,6 +6258,20 @@
                ],
                "title": "AppendRowsRequest"
            },
+            "CancelTrainingJobRequest": {
+                "type": "object",
+                "properties": {
+                    "job_uuid": {
+                        "type": "string",
+                        "description": "The UUID of the job to cancel."
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "job_uuid"
+                ],
+                "title": "CancelTrainingJobRequest"
+            },
            "CompletionMessage": {
                "type": "object",
                "properties": {
@ -7051,26 +6977,23 @@
                "title": "UserMessage",
                "description": "A message from the user in a chat conversation."
            },
-            "BatchChatCompletionRequest": {
+            "ChatCompletionRequest": {
                "type": "object",
                "properties": {
                    "model_id": {
                        "type": "string",
                        "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
                    },
-                    "messages_batch": {
+                    "messages": {
                        "type": "array",
                        "items": {
-                            "type": "array",
-                            "items": {
-                                "$ref": "#/components/schemas/Message"
-                            }
+                            "$ref": "#/components/schemas/Message"
                        },
-                        "description": "The messages to generate completions for."
+                        "description": "List of messages in the conversation."
                    },
                    "sampling_params": {
                        "$ref": "#/components/schemas/SamplingParams",
-                        "description": "(Optional) Parameters to control the sampling strategy."
+                        "description": "Parameters to control the sampling strategy."
                    },
                    "tools": {
                        "type": "array",
@ -7079,13 +7002,31 @@
                        },
                        "description": "(Optional) List of tool definitions available to the model."
                    },
-                    "tool_config": {
-                        "$ref": "#/components/schemas/ToolConfig",
-                        "description": "(Optional) Configuration for tool use."
+                    "tool_choice": {
+                        "type": "string",
+                        "enum": [
+                            "auto",
+                            "required",
+                            "none"
+                        ],
+                        "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead."
+                    },
+                    "tool_prompt_format": {
+                        "type": "string",
+                        "enum": [
+                            "json",
+                            "function_tag",
+                            "python_list"
+                        ],
+                        "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls. .. deprecated:: Use tool_config instead."
                    },
                    "response_format": {
                        "$ref": "#/components/schemas/ResponseFormat",
-                        "description": "(Optional) Grammar specification for guided (structured) decoding."
+                        "description": "(Optional) Grammar specification for guided (structured) decoding. There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it."
+                    },
+                    "stream": {
+                        "type": "boolean",
+                        "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
                    },
                    "logprobs": {
                        "type": "object",
@ -7098,32 +7039,18 @@
                        },
                        "additionalProperties": false,
                        "description": "(Optional) If specified, log probabilities for each token position will be returned."
+                    },
+                    "tool_config": {
+                        "$ref": "#/components/schemas/ToolConfig",
+                        "description": "(Optional) Configuration for tool use."
                    }
                },
                "additionalProperties": false,
                "required": [
                    "model_id",
-                    "messages_batch"
+                    "messages"
                ],
-                "title": "BatchChatCompletionRequest"
-            },
-            "BatchChatCompletionResponse": {
-                "type": "object",
-                "properties": {
-                    "batch": {
-                        "type": "array",
-                        "items": {
-                            "$ref": "#/components/schemas/ChatCompletionResponse"
-                        },
-                        "description": "List of chat completion responses, one for each conversation in the batch"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "batch"
-                ],
-                "title": "BatchChatCompletionResponse",
-                "description": "Response from a batch chat completion request."
+                "title": "ChatCompletionRequest"
            },
            "ChatCompletionResponse": {
                "type": "object",
@ -7203,194 +7130,6 @@
                "title": "TokenLogProbs",
                "description": "Log probabilities for generated tokens."
            },
-            "BatchCompletionRequest": {
-                "type": "object",
-                "properties": {
-                    "model_id": {
-                        "type": "string",
-                        "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
-                    },
-                    "content_batch": {
-                        "type": "array",
-                        "items": {
-                            "$ref": "#/components/schemas/InterleavedContent"
-                        },
-                        "description": "The content to generate completions for."
-                    },
-                    "sampling_params": {
-                        "$ref": "#/components/schemas/SamplingParams",
-                        "description": "(Optional) Parameters to control the sampling strategy."
-                    },
-                    "response_format": {
-                        "$ref": "#/components/schemas/ResponseFormat",
-                        "description": "(Optional) Grammar specification for guided (structured) decoding."
-                    },
-                    "logprobs": {
-                        "type": "object",
-                        "properties": {
-                            "top_k": {
-                                "type": "integer",
-                                "default": 0,
-                                "description": "How many tokens (for each position) to return log probabilities for."
-                            }
-                        },
-                        "additionalProperties": false,
-                        "description": "(Optional) If specified, log probabilities for each token position will be returned."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "model_id",
-                    "content_batch"
-                ],
-                "title": "BatchCompletionRequest"
-            },
-            "BatchCompletionResponse": {
-                "type": "object",
-                "properties": {
-                    "batch": {
-                        "type": "array",
-                        "items": {
-                            "$ref": "#/components/schemas/CompletionResponse"
-                        },
-                        "description": "List of completion responses, one for each input in the batch"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "batch"
-                ],
-                "title": "BatchCompletionResponse",
-                "description": "Response from a batch completion request."
-            },
-            "CompletionResponse": {
-                "type": "object",
-                "properties": {
-                    "metrics": {
-                        "type": "array",
-                        "items": {
-                            "$ref": "#/components/schemas/MetricInResponse"
-                        },
-                        "description": "(Optional) List of metrics associated with the API response"
-                    },
-                    "content": {
-                        "type": "string",
-                        "description": "The generated completion text"
-                    },
-                    "stop_reason": {
-                        "type": "string",
-                        "enum": [
-                            "end_of_turn",
-                            "end_of_message",
-                            "out_of_tokens"
-                        ],
-                        "description": "Reason why generation stopped"
-                    },
-                    "logprobs": {
-                        "type": "array",
-                        "items": {
-                            "$ref": "#/components/schemas/TokenLogProbs"
-                        },
-                        "description": "Optional log probabilities for generated tokens"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "content",
-                    "stop_reason"
-                ],
-                "title": "CompletionResponse",
-                "description": "Response from a completion request."
-            },
-            "CancelTrainingJobRequest": {
-                "type": "object",
-                "properties": {
-                    "job_uuid": {
-                        "type": "string",
-                        "description": "The UUID of the job to cancel."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "job_uuid"
-                ],
-                "title": "CancelTrainingJobRequest"
-            },
-            "ChatCompletionRequest": {
-                "type": "object",
-                "properties": {
-                    "model_id": {
-                        "type": "string",
-                        "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
-                    },
-                    "messages": {
-                        "type": "array",
-                        "items": {
-                            "$ref": "#/components/schemas/Message"
-                        },
-                        "description": "List of messages in the conversation."
-                    },
-                    "sampling_params": {
-                        "$ref": "#/components/schemas/SamplingParams",
-                        "description": "Parameters to control the sampling strategy."
-                    },
-                    "tools": {
-                        "type": "array",
-                        "items": {
-                            "$ref": "#/components/schemas/ToolDefinition"
-                        },
-                        "description": "(Optional) List of tool definitions available to the model."
-                    },
-                    "tool_choice": {
-                        "type": "string",
-                        "enum": [
-                            "auto",
-                            "required",
-                            "none"
-                        ],
-                        "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead."
-                    },
-                    "tool_prompt_format": {
-                        "type": "string",
-                        "enum": [
-                            "json",
-                            "function_tag",
-                            "python_list"
-                        ],
-                        "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls. .. deprecated:: Use tool_config instead."
-                    },
-                    "response_format": {
-                        "$ref": "#/components/schemas/ResponseFormat",
-                        "description": "(Optional) Grammar specification for guided (structured) decoding. There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it."
-                    },
-                    "stream": {
-                        "type": "boolean",
-                        "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
-                    },
-                    "logprobs": {
-                        "type": "object",
-                        "properties": {
-                            "top_k": {
-                                "type": "integer",
-                                "default": 0,
-                                "description": "How many tokens (for each position) to return log probabilities for."
-                            }
-                        },
-                        "additionalProperties": false,
-                        "description": "(Optional) If specified, log probabilities for each token position will be returned."
-                    },
-                    "tool_config": {
-                        "$ref": "#/components/schemas/ToolConfig",
-                        "description": "(Optional) Configuration for tool use."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "model_id",
-                    "messages"
-                ],
-                "title": "ChatCompletionRequest"
-            },
            "ChatCompletionResponseEvent": {
                "type": "object",
                "properties": {
@ -7603,6 +7342,45 @@
                ],
                "title": "CompletionRequest"
            },
+            "CompletionResponse": {
+                "type": "object",
+                "properties": {
+                    "metrics": {
+                        "type": "array",
+                        "items": {
+                            "$ref": "#/components/schemas/MetricInResponse"
+                        },
+                        "description": "(Optional) List of metrics associated with the API response"
+                    },
+                    "content": {
+                        "type": "string",
+                        "description": "The generated completion text"
+                    },
+                    "stop_reason": {
+                        "type": "string",
+                        "enum": [
+                            "end_of_turn",
+                            "end_of_message",
+                            "out_of_tokens"
+                        ],
+                        "description": "Reason why generation stopped"
+                    },
+                    "logprobs": {
+                        "type": "array",
+                        "items": {
+                            "$ref": "#/components/schemas/TokenLogProbs"
+                        },
+                        "description": "Optional log probabilities for generated tokens"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "content",
+                    "stop_reason"
+                ],
+                "title": "CompletionResponse",
+                "description": "Response from a completion request."
+            },
            "CompletionResponseStreamChunk": {
                "type": "object",
                "properties": {
@ -18779,11 +18557,6 @@
            "description": "Main functionalities provided by this API:\n- Create agents with specific instructions and ability to use tools.\n- Interactions with agents are grouped into sessions (\"threads\"), and each interaction is called a \"turn\".\n- Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).\n- Agents can be provided with various shields (see the Safety API for more details).\n- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.",
            "x-displayName": "Agents API for creating and interacting with agentic systems."
        },
-        {
-            "name": "BatchInference (Coming Soon)",
-            "description": "This is an asynchronous API. If the request is successful, the response will be a job which can be polled for completion.\n\nNOTE: This API is not yet implemented and is subject to change in concert with other asynchronous APIs\nincluding (post-training, evals, etc).",
-            "x-displayName": "Batch inference API for generating completions and chat completions."
-        },
        {
            "name": "Benchmarks"
        },
@ -18858,7 +18631,6 @@
            "name": "Operations",
            "tags": [
                "Agents",
-                "BatchInference (Coming Soon)",
                "Benchmarks",
                "DatasetIO",
                "Datasets",
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@ -43,72 +43,6 @@ paths:
            schema:
              $ref: '#/components/schemas/AppendRowsRequest'
        required: true
-  /v1/inference/batch-chat-completion:
-    post:
-      responses:
-        '200':
-          description: >-
-            A BatchChatCompletionResponse with the full completions.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/BatchChatCompletionResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Inference
-      summary: >-
-        Generate chat completions for a batch of messages using the specified model.
-      description: >-
-        Generate chat completions for a batch of messages using the specified model.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/BatchChatCompletionRequest'
-        required: true
-  /v1/inference/batch-completion:
-    post:
-      responses:
-        '200':
-          description: >-
-            A BatchCompletionResponse with the full completions.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/BatchCompletionResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Inference
-      summary: >-
-        Generate completions for a batch of content using the specified model.
-      description: >-
-        Generate completions for a batch of content using the specified model.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/BatchCompletionRequest'
-        required: true
  /v1alpha/post-training/job/cancel:
    post:
      responses:
@ -186,7 +120,7 @@ paths:
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
-        - BatchInference (Coming Soon)
+        - Inference
      summary: >-
        Generate a chat completion for the given messages using the specified model.
      description: >-
@ -223,7 +157,7 @@ paths:
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
-        - BatchInference (Coming Soon)
+        - Inference
      summary: >-
        Generate a completion for the given content using the specified model.
      description: >-
@ -4559,6 +4493,16 @@ components:
      required:
        - rows
      title: AppendRowsRequest
+    CancelTrainingJobRequest:
+      type: object
+      properties:
+        job_uuid:
+          type: string
+          description: The UUID of the job to cancel.
+      additionalProperties: false
+      required:
+        - job_uuid
+      title: CancelTrainingJobRequest
    CompletionMessage:
      type: object
      properties:
@ -5076,224 +5020,6 @@ components:
      title: UserMessage
      description: >-
        A message from the user in a chat conversation.
-    BatchChatCompletionRequest:
-      type: object
-      properties:
-        model_id:
-          type: string
-          description: >-
-            The identifier of the model to use. The model must be registered with
-            Llama Stack and available via the /models endpoint.
-        messages_batch:
-          type: array
-          items:
-            type: array
-            items:
-              $ref: '#/components/schemas/Message'
-          description: >-
-            The messages to generate completions for.
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-          description: >-
-            (Optional) Parameters to control the sampling strategy.
-        tools:
-          type: array
-          items:
-            $ref: '#/components/schemas/ToolDefinition'
-          description: >-
-            (Optional) List of tool definitions available to the model.
-        tool_config:
-          $ref: '#/components/schemas/ToolConfig'
-          description: (Optional) Configuration for tool use.
-        response_format:
-          $ref: '#/components/schemas/ResponseFormat'
-          description: >-
-            (Optional) Grammar specification for guided (structured) decoding.
-        logprobs:
-          type: object
-          properties:
-            top_k:
-              type: integer
-              default: 0
-              description: >-
-                How many tokens (for each position) to return log probabilities for.
-          additionalProperties: false
-          description: >-
-            (Optional) If specified, log probabilities for each token position will
-            be returned.
-      additionalProperties: false
-      required:
-        - model_id
-        - messages_batch
-      title: BatchChatCompletionRequest
-    BatchChatCompletionResponse:
-      type: object
-      properties:
-        batch:
-          type: array
-          items:
-            $ref: '#/components/schemas/ChatCompletionResponse'
-          description: >-
-            List of chat completion responses, one for each conversation in the batch
-      additionalProperties: false
-      required:
-        - batch
-      title: BatchChatCompletionResponse
-      description: >-
-        Response from a batch chat completion request.
-    ChatCompletionResponse:
-      type: object
-      properties:
-        metrics:
-          type: array
-          items:
-            $ref: '#/components/schemas/MetricInResponse'
-          description: >-
-            (Optional) List of metrics associated with the API response
-        completion_message:
-          $ref: '#/components/schemas/CompletionMessage'
-          description: The complete response message
-        logprobs:
-          type: array
-          items:
-            $ref: '#/components/schemas/TokenLogProbs'
-          description: >-
-            Optional log probabilities for generated tokens
-      additionalProperties: false
-      required:
-        - completion_message
-      title: ChatCompletionResponse
-      description: Response from a chat completion request.
-    MetricInResponse:
-      type: object
-      properties:
-        metric:
-          type: string
-          description: The name of the metric
-        value:
-          oneOf:
-            - type: integer
-            - type: number
-          description: The numeric value of the metric
-        unit:
-          type: string
-          description: >-
-            (Optional) The unit of measurement for the metric value
-      additionalProperties: false
-      required:
-        - metric
-        - value
-      title: MetricInResponse
-      description: >-
-        A metric value included in API responses.
-    TokenLogProbs:
-      type: object
-      properties:
-        logprobs_by_token:
-          type: object
-          additionalProperties:
-            type: number
-          description: >-
-            Dictionary mapping tokens to their log probabilities
-      additionalProperties: false
-      required:
-        - logprobs_by_token
-      title: TokenLogProbs
-      description: Log probabilities for generated tokens.
-    BatchCompletionRequest:
-      type: object
-      properties:
-        model_id:
-          type: string
-          description: >-
-            The identifier of the model to use. The model must be registered with
-            Llama Stack and available via the /models endpoint.
-        content_batch:
-          type: array
-          items:
-            $ref: '#/components/schemas/InterleavedContent'
-          description: The content to generate completions for.
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-          description: >-
-            (Optional) Parameters to control the sampling strategy.
-        response_format:
-          $ref: '#/components/schemas/ResponseFormat'
-          description: >-
-            (Optional) Grammar specification for guided (structured) decoding.
-        logprobs:
-          type: object
-          properties:
-            top_k:
-              type: integer
-              default: 0
-              description: >-
-                How many tokens (for each position) to return log probabilities for.
-          additionalProperties: false
-          description: >-
-            (Optional) If specified, log probabilities for each token position will
-            be returned.
-      additionalProperties: false
-      required:
-        - model_id
-        - content_batch
-      title: BatchCompletionRequest
-    BatchCompletionResponse:
-      type: object
-      properties:
-        batch:
-          type: array
-          items:
-            $ref: '#/components/schemas/CompletionResponse'
-          description: >-
-            List of completion responses, one for each input in the batch
-      additionalProperties: false
-      required:
-        - batch
-      title: BatchCompletionResponse
-      description: >-
-        Response from a batch completion request.
-    CompletionResponse:
-      type: object
-      properties:
-        metrics:
-          type: array
-          items:
-            $ref: '#/components/schemas/MetricInResponse'
-          description: >-
-            (Optional) List of metrics associated with the API response
-        content:
-          type: string
-          description: The generated completion text
-        stop_reason:
-          type: string
-          enum:
-            - end_of_turn
-            - end_of_message
-            - out_of_tokens
-          description: Reason why generation stopped
-        logprobs:
-          type: array
-          items:
-            $ref: '#/components/schemas/TokenLogProbs'
-          description: >-
-            Optional log probabilities for generated tokens
-      additionalProperties: false
-      required:
-        - content
-        - stop_reason
-      title: CompletionResponse
-      description: Response from a completion request.
-    CancelTrainingJobRequest:
-      type: object
-      properties:
-        job_uuid:
-          type: string
-          description: The UUID of the job to cancel.
-      additionalProperties: false
-      required:
-        - job_uuid
-      title: CancelTrainingJobRequest
    ChatCompletionRequest:
      type: object
      properties:
@ -5372,6 +5098,65 @@ components:
        - model_id
        - messages
      title: ChatCompletionRequest
+    ChatCompletionResponse:
+      type: object
+      properties:
+        metrics:
+          type: array
+          items:
+            $ref: '#/components/schemas/MetricInResponse'
+          description: >-
+            (Optional) List of metrics associated with the API response
+        completion_message:
+          $ref: '#/components/schemas/CompletionMessage'
+          description: The complete response message
+        logprobs:
+          type: array
+          items:
+            $ref: '#/components/schemas/TokenLogProbs'
+          description: >-
+            Optional log probabilities for generated tokens
+      additionalProperties: false
+      required:
+        - completion_message
+      title: ChatCompletionResponse
+      description: Response from a chat completion request.
+    MetricInResponse:
+      type: object
+      properties:
+        metric:
+          type: string
+          description: The name of the metric
+        value:
+          oneOf:
+            - type: integer
+            - type: number
+          description: The numeric value of the metric
+        unit:
+          type: string
+          description: >-
+            (Optional) The unit of measurement for the metric value
+      additionalProperties: false
+      required:
+        - metric
+        - value
+      title: MetricInResponse
+      description: >-
+        A metric value included in API responses.
+    TokenLogProbs:
+      type: object
+      properties:
+        logprobs_by_token:
+          type: object
+          additionalProperties:
+            type: number
+          description: >-
+            Dictionary mapping tokens to their log probabilities
+      additionalProperties: false
+      required:
+        - logprobs_by_token
+      title: TokenLogProbs
+      description: Log probabilities for generated tokens.
    ChatCompletionResponseEvent:
      type: object
      properties:
@ -5549,6 +5334,37 @@ components:
        - model_id
        - content
      title: CompletionRequest
+    CompletionResponse:
+      type: object
+      properties:
+        metrics:
+          type: array
+          items:
+            $ref: '#/components/schemas/MetricInResponse'
+          description: >-
+            (Optional) List of metrics associated with the API response
+        content:
+          type: string
+          description: The generated completion text
+        stop_reason:
+          type: string
+          enum:
+            - end_of_turn
+            - end_of_message
+            - out_of_tokens
+          description: Reason why generation stopped
+        logprobs:
+          type: array
+          items:
+            $ref: '#/components/schemas/TokenLogProbs'
+          description: >-
+            Optional log probabilities for generated tokens
+      additionalProperties: false
+      required:
+        - content
+        - stop_reason
+      title: CompletionResponse
+      description: Response from a completion request.
    CompletionResponseStreamChunk:
      type: object
      properties:
@ -13983,18 +13799,6 @@ tags:
      the RAG Tool and Vector IO APIs for more details.
    x-displayName: >-
      Agents API for creating and interacting with agentic systems.
-  - name: BatchInference (Coming Soon)
-    description: >-
-      This is an asynchronous API. If the request is successful, the response will
-      be a job which can be polled for completion.
-
-
-      NOTE: This API is not yet implemented and is subject to change in concert with
-      other asynchronous APIs
-
-      including (post-training, evals, etc).
-    x-displayName: >-
-      Batch inference API for generating completions and chat completions.
  - name: Benchmarks
  - name: DatasetIO
  - name: Datasets
@ -14037,7 +13841,6 @@ x-tagGroups:
  - name: Operations
    tags:
      - Agents
-      - BatchInference (Coming Soon)
      - Benchmarks
      - DatasetIO
      - Datasets