mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-10-04 04:04:14 +00:00
chore(api): remove batch inference (#3261)
Some checks failed
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 1s
SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 0s
Vector IO Integration Tests / test-matrix (push) Failing after 4s
Test Llama Stack Build / build-custom-container-distribution (push) Failing after 4s
Test Llama Stack Build / build-ubi9-container-distribution (push) Failing after 3s
Unit Tests / unit-tests (3.12) (push) Failing after 3s
Unit Tests / unit-tests (3.13) (push) Failing after 3s
Test Llama Stack Build / build (push) Failing after 3s
SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 1s
Integration Tests (Replay) / Integration Tests (, , , client=, ) (push) Failing after 3s
Test Llama Stack Build / generate-matrix (push) Successful in 3s
Test External Providers Installed via Module / test-external-providers-from-module (venv) (push) Has been skipped
Test Llama Stack Build / build-single-provider (push) Failing after 4s
Python Package Build Test / build (3.12) (push) Failing after 1s
API Conformance Tests / check-schema-compatibility (push) Successful in 7s
Python Package Build Test / build (3.13) (push) Failing after 1s
Test External API and Providers / test-external (venv) (push) Failing after 4s
UI Tests / ui-tests (22) (push) Successful in 39s
Pre-commit / pre-commit (push) Successful in 1m18s
Some checks failed
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 1s
SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 0s
Vector IO Integration Tests / test-matrix (push) Failing after 4s
Test Llama Stack Build / build-custom-container-distribution (push) Failing after 4s
Test Llama Stack Build / build-ubi9-container-distribution (push) Failing after 3s
Unit Tests / unit-tests (3.12) (push) Failing after 3s
Unit Tests / unit-tests (3.13) (push) Failing after 3s
Test Llama Stack Build / build (push) Failing after 3s
SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 1s
Integration Tests (Replay) / Integration Tests (, , , client=, ) (push) Failing after 3s
Test Llama Stack Build / generate-matrix (push) Successful in 3s
Test External Providers Installed via Module / test-external-providers-from-module (venv) (push) Has been skipped
Test Llama Stack Build / build-single-provider (push) Failing after 4s
Python Package Build Test / build (3.12) (push) Failing after 1s
API Conformance Tests / check-schema-compatibility (push) Successful in 7s
Python Package Build Test / build (3.13) (push) Failing after 1s
Test External API and Providers / test-external (venv) (push) Failing after 4s
UI Tests / ui-tests (22) (push) Successful in 39s
Pre-commit / pre-commit (push) Successful in 1m18s
# What does this PR do?

APIs removed:
- POST /v1/batch-inference/completion
- POST /v1/batch-inference/chat-completion
- POST /v1/inference/batch-completion
- POST /v1/inference/batch-chat-completion

Note:
- batch-completion & batch-chat-completion were only implemented for inference=inline::meta-reference
- the batch-inference endpoints were not implemented
This commit is contained in:
parent
b48d5cfed7
commit
60484c5c4e
12 changed files with 190 additions and 979 deletions
404
docs/static/llama-stack-spec.html
vendored
404
docs/static/llama-stack-spec.html
vendored
|
@ -87,94 +87,6 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"/v1/inference/batch-chat-completion": {
|
||||
"post": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "A BatchChatCompletionResponse with the full completions.",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/BatchChatCompletionResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
"$ref": "#/components/responses/BadRequest400"
|
||||
},
|
||||
"429": {
|
||||
"$ref": "#/components/responses/TooManyRequests429"
|
||||
},
|
||||
"500": {
|
||||
"$ref": "#/components/responses/InternalServerError500"
|
||||
},
|
||||
"default": {
|
||||
"$ref": "#/components/responses/DefaultError"
|
||||
}
|
||||
},
|
||||
"tags": [
|
||||
"Inference"
|
||||
],
|
||||
"summary": "Generate chat completions for a batch of messages using the specified model.",
|
||||
"description": "Generate chat completions for a batch of messages using the specified model.",
|
||||
"parameters": [],
|
||||
"requestBody": {
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/BatchChatCompletionRequest"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"/v1/inference/batch-completion": {
|
||||
"post": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "A BatchCompletionResponse with the full completions.",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/BatchCompletionResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
"$ref": "#/components/responses/BadRequest400"
|
||||
},
|
||||
"429": {
|
||||
"$ref": "#/components/responses/TooManyRequests429"
|
||||
},
|
||||
"500": {
|
||||
"$ref": "#/components/responses/InternalServerError500"
|
||||
},
|
||||
"default": {
|
||||
"$ref": "#/components/responses/DefaultError"
|
||||
}
|
||||
},
|
||||
"tags": [
|
||||
"Inference"
|
||||
],
|
||||
"summary": "Generate completions for a batch of content using the specified model.",
|
||||
"description": "Generate completions for a batch of content using the specified model.",
|
||||
"parameters": [],
|
||||
"requestBody": {
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/BatchCompletionRequest"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"/v1alpha/post-training/job/cancel": {
|
||||
"post": {
|
||||
"responses": {
|
||||
|
@ -281,7 +193,7 @@
|
|||
}
|
||||
},
|
||||
"tags": [
|
||||
"BatchInference (Coming Soon)"
|
||||
"Inference"
|
||||
],
|
||||
"summary": "Generate a chat completion for the given messages using the specified model.",
|
||||
"description": "Generate a chat completion for the given messages using the specified model.",
|
||||
|
@ -330,7 +242,7 @@
|
|||
}
|
||||
},
|
||||
"tags": [
|
||||
"BatchInference (Coming Soon)"
|
||||
"Inference"
|
||||
],
|
||||
"summary": "Generate a completion for the given content using the specified model.",
|
||||
"description": "Generate a completion for the given content using the specified model.",
|
||||
|
@ -6346,6 +6258,20 @@
|
|||
],
|
||||
"title": "AppendRowsRequest"
|
||||
},
|
||||
"CancelTrainingJobRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"job_uuid": {
|
||||
"type": "string",
|
||||
"description": "The UUID of the job to cancel."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"job_uuid"
|
||||
],
|
||||
"title": "CancelTrainingJobRequest"
|
||||
},
|
||||
"CompletionMessage": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
@ -7051,26 +6977,23 @@
|
|||
"title": "UserMessage",
|
||||
"description": "A message from the user in a chat conversation."
|
||||
},
|
||||
"BatchChatCompletionRequest": {
|
||||
"ChatCompletionRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"model_id": {
|
||||
"type": "string",
|
||||
"description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
|
||||
},
|
||||
"messages_batch": {
|
||||
"messages": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/Message"
|
||||
}
|
||||
"$ref": "#/components/schemas/Message"
|
||||
},
|
||||
"description": "The messages to generate completions for."
|
||||
"description": "List of messages in the conversation."
|
||||
},
|
||||
"sampling_params": {
|
||||
"$ref": "#/components/schemas/SamplingParams",
|
||||
"description": "(Optional) Parameters to control the sampling strategy."
|
||||
"description": "Parameters to control the sampling strategy."
|
||||
},
|
||||
"tools": {
|
||||
"type": "array",
|
||||
|
@ -7079,13 +7002,31 @@
|
|||
},
|
||||
"description": "(Optional) List of tool definitions available to the model."
|
||||
},
|
||||
"tool_config": {
|
||||
"$ref": "#/components/schemas/ToolConfig",
|
||||
"description": "(Optional) Configuration for tool use."
|
||||
"tool_choice": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"auto",
|
||||
"required",
|
||||
"none"
|
||||
],
|
||||
"description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead."
|
||||
},
|
||||
"tool_prompt_format": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"json",
|
||||
"function_tag",
|
||||
"python_list"
|
||||
],
|
||||
"description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls. .. deprecated:: Use tool_config instead."
|
||||
},
|
||||
"response_format": {
|
||||
"$ref": "#/components/schemas/ResponseFormat",
|
||||
"description": "(Optional) Grammar specification for guided (structured) decoding."
|
||||
"description": "(Optional) Grammar specification for guided (structured) decoding. There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it."
|
||||
},
|
||||
"stream": {
|
||||
"type": "boolean",
|
||||
"description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
|
||||
},
|
||||
"logprobs": {
|
||||
"type": "object",
|
||||
|
@ -7098,32 +7039,18 @@
|
|||
},
|
||||
"additionalProperties": false,
|
||||
"description": "(Optional) If specified, log probabilities for each token position will be returned."
|
||||
},
|
||||
"tool_config": {
|
||||
"$ref": "#/components/schemas/ToolConfig",
|
||||
"description": "(Optional) Configuration for tool use."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"model_id",
|
||||
"messages_batch"
|
||||
"messages"
|
||||
],
|
||||
"title": "BatchChatCompletionRequest"
|
||||
},
|
||||
"BatchChatCompletionResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"batch": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/ChatCompletionResponse"
|
||||
},
|
||||
"description": "List of chat completion responses, one for each conversation in the batch"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"batch"
|
||||
],
|
||||
"title": "BatchChatCompletionResponse",
|
||||
"description": "Response from a batch chat completion request."
|
||||
"title": "ChatCompletionRequest"
|
||||
},
|
||||
"ChatCompletionResponse": {
|
||||
"type": "object",
|
||||
|
@ -7203,194 +7130,6 @@
|
|||
"title": "TokenLogProbs",
|
||||
"description": "Log probabilities for generated tokens."
|
||||
},
|
||||
"BatchCompletionRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"model_id": {
|
||||
"type": "string",
|
||||
"description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
|
||||
},
|
||||
"content_batch": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/InterleavedContent"
|
||||
},
|
||||
"description": "The content to generate completions for."
|
||||
},
|
||||
"sampling_params": {
|
||||
"$ref": "#/components/schemas/SamplingParams",
|
||||
"description": "(Optional) Parameters to control the sampling strategy."
|
||||
},
|
||||
"response_format": {
|
||||
"$ref": "#/components/schemas/ResponseFormat",
|
||||
"description": "(Optional) Grammar specification for guided (structured) decoding."
|
||||
},
|
||||
"logprobs": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"top_k": {
|
||||
"type": "integer",
|
||||
"default": 0,
|
||||
"description": "How many tokens (for each position) to return log probabilities for."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"description": "(Optional) If specified, log probabilities for each token position will be returned."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"model_id",
|
||||
"content_batch"
|
||||
],
|
||||
"title": "BatchCompletionRequest"
|
||||
},
|
||||
"BatchCompletionResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"batch": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/CompletionResponse"
|
||||
},
|
||||
"description": "List of completion responses, one for each input in the batch"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"batch"
|
||||
],
|
||||
"title": "BatchCompletionResponse",
|
||||
"description": "Response from a batch completion request."
|
||||
},
|
||||
"CompletionResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"metrics": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/MetricInResponse"
|
||||
},
|
||||
"description": "(Optional) List of metrics associated with the API response"
|
||||
},
|
||||
"content": {
|
||||
"type": "string",
|
||||
"description": "The generated completion text"
|
||||
},
|
||||
"stop_reason": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"end_of_turn",
|
||||
"end_of_message",
|
||||
"out_of_tokens"
|
||||
],
|
||||
"description": "Reason why generation stopped"
|
||||
},
|
||||
"logprobs": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/TokenLogProbs"
|
||||
},
|
||||
"description": "Optional log probabilities for generated tokens"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"content",
|
||||
"stop_reason"
|
||||
],
|
||||
"title": "CompletionResponse",
|
||||
"description": "Response from a completion request."
|
||||
},
|
||||
"CancelTrainingJobRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"job_uuid": {
|
||||
"type": "string",
|
||||
"description": "The UUID of the job to cancel."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"job_uuid"
|
||||
],
|
||||
"title": "CancelTrainingJobRequest"
|
||||
},
|
||||
"ChatCompletionRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"model_id": {
|
||||
"type": "string",
|
||||
"description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
|
||||
},
|
||||
"messages": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/Message"
|
||||
},
|
||||
"description": "List of messages in the conversation."
|
||||
},
|
||||
"sampling_params": {
|
||||
"$ref": "#/components/schemas/SamplingParams",
|
||||
"description": "Parameters to control the sampling strategy."
|
||||
},
|
||||
"tools": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/ToolDefinition"
|
||||
},
|
||||
"description": "(Optional) List of tool definitions available to the model."
|
||||
},
|
||||
"tool_choice": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"auto",
|
||||
"required",
|
||||
"none"
|
||||
],
|
||||
"description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead."
|
||||
},
|
||||
"tool_prompt_format": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"json",
|
||||
"function_tag",
|
||||
"python_list"
|
||||
],
|
||||
"description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls. .. deprecated:: Use tool_config instead."
|
||||
},
|
||||
"response_format": {
|
||||
"$ref": "#/components/schemas/ResponseFormat",
|
||||
"description": "(Optional) Grammar specification for guided (structured) decoding. There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it."
|
||||
},
|
||||
"stream": {
|
||||
"type": "boolean",
|
||||
"description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
|
||||
},
|
||||
"logprobs": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"top_k": {
|
||||
"type": "integer",
|
||||
"default": 0,
|
||||
"description": "How many tokens (for each position) to return log probabilities for."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"description": "(Optional) If specified, log probabilities for each token position will be returned."
|
||||
},
|
||||
"tool_config": {
|
||||
"$ref": "#/components/schemas/ToolConfig",
|
||||
"description": "(Optional) Configuration for tool use."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"model_id",
|
||||
"messages"
|
||||
],
|
||||
"title": "ChatCompletionRequest"
|
||||
},
|
||||
"ChatCompletionResponseEvent": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
@ -7603,6 +7342,45 @@
|
|||
],
|
||||
"title": "CompletionRequest"
|
||||
},
|
||||
"CompletionResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"metrics": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/MetricInResponse"
|
||||
},
|
||||
"description": "(Optional) List of metrics associated with the API response"
|
||||
},
|
||||
"content": {
|
||||
"type": "string",
|
||||
"description": "The generated completion text"
|
||||
},
|
||||
"stop_reason": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"end_of_turn",
|
||||
"end_of_message",
|
||||
"out_of_tokens"
|
||||
],
|
||||
"description": "Reason why generation stopped"
|
||||
},
|
||||
"logprobs": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/TokenLogProbs"
|
||||
},
|
||||
"description": "Optional log probabilities for generated tokens"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"content",
|
||||
"stop_reason"
|
||||
],
|
||||
"title": "CompletionResponse",
|
||||
"description": "Response from a completion request."
|
||||
},
|
||||
"CompletionResponseStreamChunk": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
@ -18779,11 +18557,6 @@
|
|||
"description": "Main functionalities provided by this API:\n- Create agents with specific instructions and ability to use tools.\n- Interactions with agents are grouped into sessions (\"threads\"), and each interaction is called a \"turn\".\n- Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).\n- Agents can be provided with various shields (see the Safety API for more details).\n- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.",
|
||||
"x-displayName": "Agents API for creating and interacting with agentic systems."
|
||||
},
|
||||
{
|
||||
"name": "BatchInference (Coming Soon)",
|
||||
"description": "This is an asynchronous API. If the request is successful, the response will be a job which can be polled for completion.\n\nNOTE: This API is not yet implemented and is subject to change in concert with other asynchronous APIs\nincluding (post-training, evals, etc).",
|
||||
"x-displayName": "Batch inference API for generating completions and chat completions."
|
||||
},
|
||||
{
|
||||
"name": "Benchmarks"
|
||||
},
|
||||
|
@ -18858,7 +18631,6 @@
|
|||
"name": "Operations",
|
||||
"tags": [
|
||||
"Agents",
|
||||
"BatchInference (Coming Soon)",
|
||||
"Benchmarks",
|
||||
"DatasetIO",
|
||||
"Datasets",
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue