chore(api): remove batch inference (#3261)
Some checks failed
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 1s
SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 0s
Vector IO Integration Tests / test-matrix (push) Failing after 4s
Test Llama Stack Build / build-custom-container-distribution (push) Failing after 4s
Test Llama Stack Build / build-ubi9-container-distribution (push) Failing after 3s
Unit Tests / unit-tests (3.12) (push) Failing after 3s
Unit Tests / unit-tests (3.13) (push) Failing after 3s
Test Llama Stack Build / build (push) Failing after 3s
SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 1s
Integration Tests (Replay) / Integration Tests (, , , client=, ) (push) Failing after 3s
Test Llama Stack Build / generate-matrix (push) Successful in 3s
Test External Providers Installed via Module / test-external-providers-from-module (venv) (push) Has been skipped
Test Llama Stack Build / build-single-provider (push) Failing after 4s
Python Package Build Test / build (3.12) (push) Failing after 1s
API Conformance Tests / check-schema-compatibility (push) Successful in 7s
Python Package Build Test / build (3.13) (push) Failing after 1s
Test External API and Providers / test-external (venv) (push) Failing after 4s
UI Tests / ui-tests (22) (push) Successful in 39s
Pre-commit / pre-commit (push) Successful in 1m18s

# What does this PR do?

APIs removed:
 - POST /v1/batch-inference/completion
 - POST /v1/batch-inference/chat-completion
 - POST /v1/inference/batch-completion
 - POST /v1/inference/batch-chat-completion

note -
- batch-completion & batch-chat-completion were only implemented for
inference=inline::meta-reference
 - batch-inference were not implemented
This commit is contained in:
Matthew Farrellee 2025-09-26 17:35:34 -04:00 committed by GitHub
parent b48d5cfed7
commit 60484c5c4e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 190 additions and 979 deletions

View file

@ -139,18 +139,7 @@ Methods:
- <code title="post /v1/agents/{agent_id}/session/{session_id}/turn">client.agents.turn.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/agents/turn.py">create</a>(session_id, \*, agent_id, \*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/agents/turn_create_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/agents/turn_create_response.py">TurnCreateResponse</a></code>
- <code title="get /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}">client.agents.turn.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/agents/turn.py">retrieve</a>(turn_id, \*, agent_id, session_id) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/agents/turn.py">Turn</a></code>
## BatchInference
Types:
```python
from llama_stack_client.types import BatchInferenceChatCompletionResponse
```
Methods:
- <code title="post /v1/batch-inference/chat-completion">client.batch_inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/batch_inference.py">chat_completion</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/batch_inference_chat_completion_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/batch_inference_chat_completion_response.py">BatchInferenceChatCompletionResponse</a></code>
- <code title="post /v1/batch-inference/completion">client.batch_inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/batch_inference.py">completion</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/batch_inference_completion_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/shared/batch_completion.py">BatchCompletion</a></code>
## Datasets

View file

@ -548,7 +548,6 @@ class Generator:
if op.defining_class.__name__ in [
"SyntheticDataGeneration",
"PostTraining",
"BatchInference",
]:
op.defining_class.__name__ = f"{op.defining_class.__name__} (Coming Soon)"
print(op.defining_class.__name__)

View file

@ -87,94 +87,6 @@
}
}
},
"/v1/inference/batch-chat-completion": {
"post": {
"responses": {
"200": {
"description": "A BatchChatCompletionResponse with the full completions.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/BatchChatCompletionResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Inference"
],
"summary": "Generate chat completions for a batch of messages using the specified model.",
"description": "Generate chat completions for a batch of messages using the specified model.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/BatchChatCompletionRequest"
}
}
},
"required": true
}
}
},
"/v1/inference/batch-completion": {
"post": {
"responses": {
"200": {
"description": "A BatchCompletionResponse with the full completions.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/BatchCompletionResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Inference"
],
"summary": "Generate completions for a batch of content using the specified model.",
"description": "Generate completions for a batch of content using the specified model.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/BatchCompletionRequest"
}
}
},
"required": true
}
}
},
"/v1alpha/post-training/job/cancel": {
"post": {
"responses": {
@ -281,7 +193,7 @@
}
},
"tags": [
"BatchInference (Coming Soon)"
"Inference"
],
"summary": "Generate a chat completion for the given messages using the specified model.",
"description": "Generate a chat completion for the given messages using the specified model.",
@ -330,7 +242,7 @@
}
},
"tags": [
"BatchInference (Coming Soon)"
"Inference"
],
"summary": "Generate a completion for the given content using the specified model.",
"description": "Generate a completion for the given content using the specified model.",
@ -6346,6 +6258,20 @@
],
"title": "AppendRowsRequest"
},
"CancelTrainingJobRequest": {
"type": "object",
"properties": {
"job_uuid": {
"type": "string",
"description": "The UUID of the job to cancel."
}
},
"additionalProperties": false,
"required": [
"job_uuid"
],
"title": "CancelTrainingJobRequest"
},
"CompletionMessage": {
"type": "object",
"properties": {
@ -7051,26 +6977,23 @@
"title": "UserMessage",
"description": "A message from the user in a chat conversation."
},
"BatchChatCompletionRequest": {
"ChatCompletionRequest": {
"type": "object",
"properties": {
"model_id": {
"type": "string",
"description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
},
"messages_batch": {
"messages": {
"type": "array",
"items": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Message"
}
"$ref": "#/components/schemas/Message"
},
"description": "The messages to generate completions for."
"description": "List of messages in the conversation."
},
"sampling_params": {
"$ref": "#/components/schemas/SamplingParams",
"description": "(Optional) Parameters to control the sampling strategy."
"description": "Parameters to control the sampling strategy."
},
"tools": {
"type": "array",
@ -7079,13 +7002,31 @@
},
"description": "(Optional) List of tool definitions available to the model."
},
"tool_config": {
"$ref": "#/components/schemas/ToolConfig",
"description": "(Optional) Configuration for tool use."
"tool_choice": {
"type": "string",
"enum": [
"auto",
"required",
"none"
],
"description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead."
},
"tool_prompt_format": {
"type": "string",
"enum": [
"json",
"function_tag",
"python_list"
],
"description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls. .. deprecated:: Use tool_config instead."
},
"response_format": {
"$ref": "#/components/schemas/ResponseFormat",
"description": "(Optional) Grammar specification for guided (structured) decoding."
"description": "(Optional) Grammar specification for guided (structured) decoding. There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it."
},
"stream": {
"type": "boolean",
"description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
},
"logprobs": {
"type": "object",
@ -7098,32 +7039,18 @@
},
"additionalProperties": false,
"description": "(Optional) If specified, log probabilities for each token position will be returned."
},
"tool_config": {
"$ref": "#/components/schemas/ToolConfig",
"description": "(Optional) Configuration for tool use."
}
},
"additionalProperties": false,
"required": [
"model_id",
"messages_batch"
"messages"
],
"title": "BatchChatCompletionRequest"
},
"BatchChatCompletionResponse": {
"type": "object",
"properties": {
"batch": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ChatCompletionResponse"
},
"description": "List of chat completion responses, one for each conversation in the batch"
}
},
"additionalProperties": false,
"required": [
"batch"
],
"title": "BatchChatCompletionResponse",
"description": "Response from a batch chat completion request."
"title": "ChatCompletionRequest"
},
"ChatCompletionResponse": {
"type": "object",
@ -7203,194 +7130,6 @@
"title": "TokenLogProbs",
"description": "Log probabilities for generated tokens."
},
"BatchCompletionRequest": {
"type": "object",
"properties": {
"model_id": {
"type": "string",
"description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
},
"content_batch": {
"type": "array",
"items": {
"$ref": "#/components/schemas/InterleavedContent"
},
"description": "The content to generate completions for."
},
"sampling_params": {
"$ref": "#/components/schemas/SamplingParams",
"description": "(Optional) Parameters to control the sampling strategy."
},
"response_format": {
"$ref": "#/components/schemas/ResponseFormat",
"description": "(Optional) Grammar specification for guided (structured) decoding."
},
"logprobs": {
"type": "object",
"properties": {
"top_k": {
"type": "integer",
"default": 0,
"description": "How many tokens (for each position) to return log probabilities for."
}
},
"additionalProperties": false,
"description": "(Optional) If specified, log probabilities for each token position will be returned."
}
},
"additionalProperties": false,
"required": [
"model_id",
"content_batch"
],
"title": "BatchCompletionRequest"
},
"BatchCompletionResponse": {
"type": "object",
"properties": {
"batch": {
"type": "array",
"items": {
"$ref": "#/components/schemas/CompletionResponse"
},
"description": "List of completion responses, one for each input in the batch"
}
},
"additionalProperties": false,
"required": [
"batch"
],
"title": "BatchCompletionResponse",
"description": "Response from a batch completion request."
},
"CompletionResponse": {
"type": "object",
"properties": {
"metrics": {
"type": "array",
"items": {
"$ref": "#/components/schemas/MetricInResponse"
},
"description": "(Optional) List of metrics associated with the API response"
},
"content": {
"type": "string",
"description": "The generated completion text"
},
"stop_reason": {
"type": "string",
"enum": [
"end_of_turn",
"end_of_message",
"out_of_tokens"
],
"description": "Reason why generation stopped"
},
"logprobs": {
"type": "array",
"items": {
"$ref": "#/components/schemas/TokenLogProbs"
},
"description": "Optional log probabilities for generated tokens"
}
},
"additionalProperties": false,
"required": [
"content",
"stop_reason"
],
"title": "CompletionResponse",
"description": "Response from a completion request."
},
"CancelTrainingJobRequest": {
"type": "object",
"properties": {
"job_uuid": {
"type": "string",
"description": "The UUID of the job to cancel."
}
},
"additionalProperties": false,
"required": [
"job_uuid"
],
"title": "CancelTrainingJobRequest"
},
"ChatCompletionRequest": {
"type": "object",
"properties": {
"model_id": {
"type": "string",
"description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
},
"messages": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Message"
},
"description": "List of messages in the conversation."
},
"sampling_params": {
"$ref": "#/components/schemas/SamplingParams",
"description": "Parameters to control the sampling strategy."
},
"tools": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ToolDefinition"
},
"description": "(Optional) List of tool definitions available to the model."
},
"tool_choice": {
"type": "string",
"enum": [
"auto",
"required",
"none"
],
"description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead."
},
"tool_prompt_format": {
"type": "string",
"enum": [
"json",
"function_tag",
"python_list"
],
"description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls. .. deprecated:: Use tool_config instead."
},
"response_format": {
"$ref": "#/components/schemas/ResponseFormat",
"description": "(Optional) Grammar specification for guided (structured) decoding. There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it."
},
"stream": {
"type": "boolean",
"description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
},
"logprobs": {
"type": "object",
"properties": {
"top_k": {
"type": "integer",
"default": 0,
"description": "How many tokens (for each position) to return log probabilities for."
}
},
"additionalProperties": false,
"description": "(Optional) If specified, log probabilities for each token position will be returned."
},
"tool_config": {
"$ref": "#/components/schemas/ToolConfig",
"description": "(Optional) Configuration for tool use."
}
},
"additionalProperties": false,
"required": [
"model_id",
"messages"
],
"title": "ChatCompletionRequest"
},
"ChatCompletionResponseEvent": {
"type": "object",
"properties": {
@ -7603,6 +7342,45 @@
],
"title": "CompletionRequest"
},
"CompletionResponse": {
"type": "object",
"properties": {
"metrics": {
"type": "array",
"items": {
"$ref": "#/components/schemas/MetricInResponse"
},
"description": "(Optional) List of metrics associated with the API response"
},
"content": {
"type": "string",
"description": "The generated completion text"
},
"stop_reason": {
"type": "string",
"enum": [
"end_of_turn",
"end_of_message",
"out_of_tokens"
],
"description": "Reason why generation stopped"
},
"logprobs": {
"type": "array",
"items": {
"$ref": "#/components/schemas/TokenLogProbs"
},
"description": "Optional log probabilities for generated tokens"
}
},
"additionalProperties": false,
"required": [
"content",
"stop_reason"
],
"title": "CompletionResponse",
"description": "Response from a completion request."
},
"CompletionResponseStreamChunk": {
"type": "object",
"properties": {
@ -18779,11 +18557,6 @@
"description": "Main functionalities provided by this API:\n- Create agents with specific instructions and ability to use tools.\n- Interactions with agents are grouped into sessions (\"threads\"), and each interaction is called a \"turn\".\n- Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).\n- Agents can be provided with various shields (see the Safety API for more details).\n- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.",
"x-displayName": "Agents API for creating and interacting with agentic systems."
},
{
"name": "BatchInference (Coming Soon)",
"description": "This is an asynchronous API. If the request is successful, the response will be a job which can be polled for completion.\n\nNOTE: This API is not yet implemented and is subject to change in concert with other asynchronous APIs\nincluding (post-training, evals, etc).",
"x-displayName": "Batch inference API for generating completions and chat completions."
},
{
"name": "Benchmarks"
},
@ -18858,7 +18631,6 @@
"name": "Operations",
"tags": [
"Agents",
"BatchInference (Coming Soon)",
"Benchmarks",
"DatasetIO",
"Datasets",

View file

@ -43,72 +43,6 @@ paths:
schema:
$ref: '#/components/schemas/AppendRowsRequest'
required: true
/v1/inference/batch-chat-completion:
post:
responses:
'200':
description: >-
A BatchChatCompletionResponse with the full completions.
content:
application/json:
schema:
$ref: '#/components/schemas/BatchChatCompletionResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: >-
Generate chat completions for a batch of messages using the specified model.
description: >-
Generate chat completions for a batch of messages using the specified model.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/BatchChatCompletionRequest'
required: true
/v1/inference/batch-completion:
post:
responses:
'200':
description: >-
A BatchCompletionResponse with the full completions.
content:
application/json:
schema:
$ref: '#/components/schemas/BatchCompletionResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: >-
Generate completions for a batch of content using the specified model.
description: >-
Generate completions for a batch of content using the specified model.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/BatchCompletionRequest'
required: true
/v1alpha/post-training/job/cancel:
post:
responses:
@ -186,7 +120,7 @@ paths:
default:
$ref: '#/components/responses/DefaultError'
tags:
- BatchInference (Coming Soon)
- Inference
summary: >-
Generate a chat completion for the given messages using the specified model.
description: >-
@ -223,7 +157,7 @@ paths:
default:
$ref: '#/components/responses/DefaultError'
tags:
- BatchInference (Coming Soon)
- Inference
summary: >-
Generate a completion for the given content using the specified model.
description: >-
@ -4559,6 +4493,16 @@ components:
required:
- rows
title: AppendRowsRequest
CancelTrainingJobRequest:
type: object
properties:
job_uuid:
type: string
description: The UUID of the job to cancel.
additionalProperties: false
required:
- job_uuid
title: CancelTrainingJobRequest
CompletionMessage:
type: object
properties:
@ -5076,224 +5020,6 @@ components:
title: UserMessage
description: >-
A message from the user in a chat conversation.
BatchChatCompletionRequest:
type: object
properties:
model_id:
type: string
description: >-
The identifier of the model to use. The model must be registered with
Llama Stack and available via the /models endpoint.
messages_batch:
type: array
items:
type: array
items:
$ref: '#/components/schemas/Message'
description: >-
The messages to generate completions for.
sampling_params:
$ref: '#/components/schemas/SamplingParams'
description: >-
(Optional) Parameters to control the sampling strategy.
tools:
type: array
items:
$ref: '#/components/schemas/ToolDefinition'
description: >-
(Optional) List of tool definitions available to the model.
tool_config:
$ref: '#/components/schemas/ToolConfig'
description: (Optional) Configuration for tool use.
response_format:
$ref: '#/components/schemas/ResponseFormat'
description: >-
(Optional) Grammar specification for guided (structured) decoding.
logprobs:
type: object
properties:
top_k:
type: integer
default: 0
description: >-
How many tokens (for each position) to return log probabilities for.
additionalProperties: false
description: >-
(Optional) If specified, log probabilities for each token position will
be returned.
additionalProperties: false
required:
- model_id
- messages_batch
title: BatchChatCompletionRequest
BatchChatCompletionResponse:
type: object
properties:
batch:
type: array
items:
$ref: '#/components/schemas/ChatCompletionResponse'
description: >-
List of chat completion responses, one for each conversation in the batch
additionalProperties: false
required:
- batch
title: BatchChatCompletionResponse
description: >-
Response from a batch chat completion request.
ChatCompletionResponse:
type: object
properties:
metrics:
type: array
items:
$ref: '#/components/schemas/MetricInResponse'
description: >-
(Optional) List of metrics associated with the API response
completion_message:
$ref: '#/components/schemas/CompletionMessage'
description: The complete response message
logprobs:
type: array
items:
$ref: '#/components/schemas/TokenLogProbs'
description: >-
Optional log probabilities for generated tokens
additionalProperties: false
required:
- completion_message
title: ChatCompletionResponse
description: Response from a chat completion request.
MetricInResponse:
type: object
properties:
metric:
type: string
description: The name of the metric
value:
oneOf:
- type: integer
- type: number
description: The numeric value of the metric
unit:
type: string
description: >-
(Optional) The unit of measurement for the metric value
additionalProperties: false
required:
- metric
- value
title: MetricInResponse
description: >-
A metric value included in API responses.
TokenLogProbs:
type: object
properties:
logprobs_by_token:
type: object
additionalProperties:
type: number
description: >-
Dictionary mapping tokens to their log probabilities
additionalProperties: false
required:
- logprobs_by_token
title: TokenLogProbs
description: Log probabilities for generated tokens.
BatchCompletionRequest:
type: object
properties:
model_id:
type: string
description: >-
The identifier of the model to use. The model must be registered with
Llama Stack and available via the /models endpoint.
content_batch:
type: array
items:
$ref: '#/components/schemas/InterleavedContent'
description: The content to generate completions for.
sampling_params:
$ref: '#/components/schemas/SamplingParams'
description: >-
(Optional) Parameters to control the sampling strategy.
response_format:
$ref: '#/components/schemas/ResponseFormat'
description: >-
(Optional) Grammar specification for guided (structured) decoding.
logprobs:
type: object
properties:
top_k:
type: integer
default: 0
description: >-
How many tokens (for each position) to return log probabilities for.
additionalProperties: false
description: >-
(Optional) If specified, log probabilities for each token position will
be returned.
additionalProperties: false
required:
- model_id
- content_batch
title: BatchCompletionRequest
BatchCompletionResponse:
type: object
properties:
batch:
type: array
items:
$ref: '#/components/schemas/CompletionResponse'
description: >-
List of completion responses, one for each input in the batch
additionalProperties: false
required:
- batch
title: BatchCompletionResponse
description: >-
Response from a batch completion request.
CompletionResponse:
type: object
properties:
metrics:
type: array
items:
$ref: '#/components/schemas/MetricInResponse'
description: >-
(Optional) List of metrics associated with the API response
content:
type: string
description: The generated completion text
stop_reason:
type: string
enum:
- end_of_turn
- end_of_message
- out_of_tokens
description: Reason why generation stopped
logprobs:
type: array
items:
$ref: '#/components/schemas/TokenLogProbs'
description: >-
Optional log probabilities for generated tokens
additionalProperties: false
required:
- content
- stop_reason
title: CompletionResponse
description: Response from a completion request.
CancelTrainingJobRequest:
type: object
properties:
job_uuid:
type: string
description: The UUID of the job to cancel.
additionalProperties: false
required:
- job_uuid
title: CancelTrainingJobRequest
ChatCompletionRequest:
type: object
properties:
@ -5372,6 +5098,65 @@ components:
- model_id
- messages
title: ChatCompletionRequest
ChatCompletionResponse:
type: object
properties:
metrics:
type: array
items:
$ref: '#/components/schemas/MetricInResponse'
description: >-
(Optional) List of metrics associated with the API response
completion_message:
$ref: '#/components/schemas/CompletionMessage'
description: The complete response message
logprobs:
type: array
items:
$ref: '#/components/schemas/TokenLogProbs'
description: >-
Optional log probabilities for generated tokens
additionalProperties: false
required:
- completion_message
title: ChatCompletionResponse
description: Response from a chat completion request.
MetricInResponse:
type: object
properties:
metric:
type: string
description: The name of the metric
value:
oneOf:
- type: integer
- type: number
description: The numeric value of the metric
unit:
type: string
description: >-
(Optional) The unit of measurement for the metric value
additionalProperties: false
required:
- metric
- value
title: MetricInResponse
description: >-
A metric value included in API responses.
TokenLogProbs:
type: object
properties:
logprobs_by_token:
type: object
additionalProperties:
type: number
description: >-
Dictionary mapping tokens to their log probabilities
additionalProperties: false
required:
- logprobs_by_token
title: TokenLogProbs
description: Log probabilities for generated tokens.
ChatCompletionResponseEvent:
type: object
properties:
@ -5549,6 +5334,37 @@ components:
- model_id
- content
title: CompletionRequest
CompletionResponse:
type: object
properties:
metrics:
type: array
items:
$ref: '#/components/schemas/MetricInResponse'
description: >-
(Optional) List of metrics associated with the API response
content:
type: string
description: The generated completion text
stop_reason:
type: string
enum:
- end_of_turn
- end_of_message
- out_of_tokens
description: Reason why generation stopped
logprobs:
type: array
items:
$ref: '#/components/schemas/TokenLogProbs'
description: >-
Optional log probabilities for generated tokens
additionalProperties: false
required:
- content
- stop_reason
title: CompletionResponse
description: Response from a completion request.
CompletionResponseStreamChunk:
type: object
properties:
@ -13983,18 +13799,6 @@ tags:
the RAG Tool and Vector IO APIs for more details.
x-displayName: >-
Agents API for creating and interacting with agentic systems.
- name: BatchInference (Coming Soon)
description: >-
This is an asynchronous API. If the request is successful, the response will
be a job which can be polled for completion.
NOTE: This API is not yet implemented and is subject to change in concert with
other asynchronous APIs
including (post-training, evals, etc).
x-displayName: >-
Batch inference API for generating completions and chat completions.
- name: Benchmarks
- name: DatasetIO
- name: Datasets
@ -14037,7 +13841,6 @@ x-tagGroups:
- name: Operations
tags:
- Agents
- BatchInference (Coming Soon)
- Benchmarks
- DatasetIO
- Datasets