mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-10-04 04:04:14 +00:00
chore(api): remove batch inference (#3261)
Some checks failed
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 1s
SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 0s
Vector IO Integration Tests / test-matrix (push) Failing after 4s
Test Llama Stack Build / build-custom-container-distribution (push) Failing after 4s
Test Llama Stack Build / build-ubi9-container-distribution (push) Failing after 3s
Unit Tests / unit-tests (3.12) (push) Failing after 3s
Unit Tests / unit-tests (3.13) (push) Failing after 3s
Test Llama Stack Build / build (push) Failing after 3s
SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 1s
Integration Tests (Replay) / Integration Tests (, , , client=, ) (push) Failing after 3s
Test Llama Stack Build / generate-matrix (push) Successful in 3s
Test External Providers Installed via Module / test-external-providers-from-module (venv) (push) Has been skipped
Test Llama Stack Build / build-single-provider (push) Failing after 4s
Python Package Build Test / build (3.12) (push) Failing after 1s
API Conformance Tests / check-schema-compatibility (push) Successful in 7s
Python Package Build Test / build (3.13) (push) Failing after 1s
Test External API and Providers / test-external (venv) (push) Failing after 4s
UI Tests / ui-tests (22) (push) Successful in 39s
Pre-commit / pre-commit (push) Successful in 1m18s
Some checks failed
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 1s
SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 0s
Vector IO Integration Tests / test-matrix (push) Failing after 4s
Test Llama Stack Build / build-custom-container-distribution (push) Failing after 4s
Test Llama Stack Build / build-ubi9-container-distribution (push) Failing after 3s
Unit Tests / unit-tests (3.12) (push) Failing after 3s
Unit Tests / unit-tests (3.13) (push) Failing after 3s
Test Llama Stack Build / build (push) Failing after 3s
SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 1s
Integration Tests (Replay) / Integration Tests (, , , client=, ) (push) Failing after 3s
Test Llama Stack Build / generate-matrix (push) Successful in 3s
Test External Providers Installed via Module / test-external-providers-from-module (venv) (push) Has been skipped
Test Llama Stack Build / build-single-provider (push) Failing after 4s
Python Package Build Test / build (3.12) (push) Failing after 1s
API Conformance Tests / check-schema-compatibility (push) Successful in 7s
Python Package Build Test / build (3.13) (push) Failing after 1s
Test External API and Providers / test-external (venv) (push) Failing after 4s
UI Tests / ui-tests (22) (push) Successful in 39s
Pre-commit / pre-commit (push) Successful in 1m18s
# What does this PR do?

APIs removed:
- POST /v1/batch-inference/completion
- POST /v1/batch-inference/chat-completion
- POST /v1/inference/batch-completion
- POST /v1/inference/batch-chat-completion

Notes:
- batch-completion and batch-chat-completion were only implemented for inference=inline::meta-reference
- batch-inference was not implemented
This commit is contained in:
parent
b48d5cfed7
commit
60484c5c4e
12 changed files with 190 additions and 979 deletions
401
docs/static/llama-stack-spec.yaml
vendored
401
docs/static/llama-stack-spec.yaml
vendored
|
@@ -43,72 +43,6 @@ paths:
|
|||
schema:
|
||||
$ref: '#/components/schemas/AppendRowsRequest'
|
||||
required: true
|
||||
/v1/inference/batch-chat-completion:
|
||||
post:
|
||||
responses:
|
||||
'200':
|
||||
description: >-
|
||||
A BatchChatCompletionResponse with the full completions.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/BatchChatCompletionResponse'
|
||||
'400':
|
||||
$ref: '#/components/responses/BadRequest400'
|
||||
'429':
|
||||
$ref: >-
|
||||
#/components/responses/TooManyRequests429
|
||||
'500':
|
||||
$ref: >-
|
||||
#/components/responses/InternalServerError500
|
||||
default:
|
||||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Inference
|
||||
summary: >-
|
||||
Generate chat completions for a batch of messages using the specified model.
|
||||
description: >-
|
||||
Generate chat completions for a batch of messages using the specified model.
|
||||
parameters: []
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/BatchChatCompletionRequest'
|
||||
required: true
|
||||
/v1/inference/batch-completion:
|
||||
post:
|
||||
responses:
|
||||
'200':
|
||||
description: >-
|
||||
A BatchCompletionResponse with the full completions.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/BatchCompletionResponse'
|
||||
'400':
|
||||
$ref: '#/components/responses/BadRequest400'
|
||||
'429':
|
||||
$ref: >-
|
||||
#/components/responses/TooManyRequests429
|
||||
'500':
|
||||
$ref: >-
|
||||
#/components/responses/InternalServerError500
|
||||
default:
|
||||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Inference
|
||||
summary: >-
|
||||
Generate completions for a batch of content using the specified model.
|
||||
description: >-
|
||||
Generate completions for a batch of content using the specified model.
|
||||
parameters: []
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/BatchCompletionRequest'
|
||||
required: true
|
||||
/v1alpha/post-training/job/cancel:
|
||||
post:
|
||||
responses:
|
||||
|
@@ -186,7 +120,7 @@ paths:
|
|||
default:
|
||||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- BatchInference (Coming Soon)
|
||||
- Inference
|
||||
summary: >-
|
||||
Generate a chat completion for the given messages using the specified model.
|
||||
description: >-
|
||||
|
@@ -223,7 +157,7 @@ paths:
|
|||
default:
|
||||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- BatchInference (Coming Soon)
|
||||
- Inference
|
||||
summary: >-
|
||||
Generate a completion for the given content using the specified model.
|
||||
description: >-
|
||||
|
@@ -4559,6 +4493,16 @@ components:
|
|||
required:
|
||||
- rows
|
||||
title: AppendRowsRequest
|
||||
CancelTrainingJobRequest:
|
||||
type: object
|
||||
properties:
|
||||
job_uuid:
|
||||
type: string
|
||||
description: The UUID of the job to cancel.
|
||||
additionalProperties: false
|
||||
required:
|
||||
- job_uuid
|
||||
title: CancelTrainingJobRequest
|
||||
CompletionMessage:
|
||||
type: object
|
||||
properties:
|
||||
|
@@ -5076,224 +5020,6 @@ components:
|
|||
title: UserMessage
|
||||
description: >-
|
||||
A message from the user in a chat conversation.
|
||||
BatchChatCompletionRequest:
|
||||
type: object
|
||||
properties:
|
||||
model_id:
|
||||
type: string
|
||||
description: >-
|
||||
The identifier of the model to use. The model must be registered with
|
||||
Llama Stack and available via the /models endpoint.
|
||||
messages_batch:
|
||||
type: array
|
||||
items:
|
||||
type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/Message'
|
||||
description: >-
|
||||
The messages to generate completions for.
|
||||
sampling_params:
|
||||
$ref: '#/components/schemas/SamplingParams'
|
||||
description: >-
|
||||
(Optional) Parameters to control the sampling strategy.
|
||||
tools:
|
||||
type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/ToolDefinition'
|
||||
description: >-
|
||||
(Optional) List of tool definitions available to the model.
|
||||
tool_config:
|
||||
$ref: '#/components/schemas/ToolConfig'
|
||||
description: (Optional) Configuration for tool use.
|
||||
response_format:
|
||||
$ref: '#/components/schemas/ResponseFormat'
|
||||
description: >-
|
||||
(Optional) Grammar specification for guided (structured) decoding.
|
||||
logprobs:
|
||||
type: object
|
||||
properties:
|
||||
top_k:
|
||||
type: integer
|
||||
default: 0
|
||||
description: >-
|
||||
How many tokens (for each position) to return log probabilities for.
|
||||
additionalProperties: false
|
||||
description: >-
|
||||
(Optional) If specified, log probabilities for each token position will
|
||||
be returned.
|
||||
additionalProperties: false
|
||||
required:
|
||||
- model_id
|
||||
- messages_batch
|
||||
title: BatchChatCompletionRequest
|
||||
BatchChatCompletionResponse:
|
||||
type: object
|
||||
properties:
|
||||
batch:
|
||||
type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/ChatCompletionResponse'
|
||||
description: >-
|
||||
List of chat completion responses, one for each conversation in the batch
|
||||
additionalProperties: false
|
||||
required:
|
||||
- batch
|
||||
title: BatchChatCompletionResponse
|
||||
description: >-
|
||||
Response from a batch chat completion request.
|
||||
ChatCompletionResponse:
|
||||
type: object
|
||||
properties:
|
||||
metrics:
|
||||
type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/MetricInResponse'
|
||||
description: >-
|
||||
(Optional) List of metrics associated with the API response
|
||||
completion_message:
|
||||
$ref: '#/components/schemas/CompletionMessage'
|
||||
description: The complete response message
|
||||
logprobs:
|
||||
type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/TokenLogProbs'
|
||||
description: >-
|
||||
Optional log probabilities for generated tokens
|
||||
additionalProperties: false
|
||||
required:
|
||||
- completion_message
|
||||
title: ChatCompletionResponse
|
||||
description: Response from a chat completion request.
|
||||
MetricInResponse:
|
||||
type: object
|
||||
properties:
|
||||
metric:
|
||||
type: string
|
||||
description: The name of the metric
|
||||
value:
|
||||
oneOf:
|
||||
- type: integer
|
||||
- type: number
|
||||
description: The numeric value of the metric
|
||||
unit:
|
||||
type: string
|
||||
description: >-
|
||||
(Optional) The unit of measurement for the metric value
|
||||
additionalProperties: false
|
||||
required:
|
||||
- metric
|
||||
- value
|
||||
title: MetricInResponse
|
||||
description: >-
|
||||
A metric value included in API responses.
|
||||
TokenLogProbs:
|
||||
type: object
|
||||
properties:
|
||||
logprobs_by_token:
|
||||
type: object
|
||||
additionalProperties:
|
||||
type: number
|
||||
description: >-
|
||||
Dictionary mapping tokens to their log probabilities
|
||||
additionalProperties: false
|
||||
required:
|
||||
- logprobs_by_token
|
||||
title: TokenLogProbs
|
||||
description: Log probabilities for generated tokens.
|
||||
BatchCompletionRequest:
|
||||
type: object
|
||||
properties:
|
||||
model_id:
|
||||
type: string
|
||||
description: >-
|
||||
The identifier of the model to use. The model must be registered with
|
||||
Llama Stack and available via the /models endpoint.
|
||||
content_batch:
|
||||
type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/InterleavedContent'
|
||||
description: The content to generate completions for.
|
||||
sampling_params:
|
||||
$ref: '#/components/schemas/SamplingParams'
|
||||
description: >-
|
||||
(Optional) Parameters to control the sampling strategy.
|
||||
response_format:
|
||||
$ref: '#/components/schemas/ResponseFormat'
|
||||
description: >-
|
||||
(Optional) Grammar specification for guided (structured) decoding.
|
||||
logprobs:
|
||||
type: object
|
||||
properties:
|
||||
top_k:
|
||||
type: integer
|
||||
default: 0
|
||||
description: >-
|
||||
How many tokens (for each position) to return log probabilities for.
|
||||
additionalProperties: false
|
||||
description: >-
|
||||
(Optional) If specified, log probabilities for each token position will
|
||||
be returned.
|
||||
additionalProperties: false
|
||||
required:
|
||||
- model_id
|
||||
- content_batch
|
||||
title: BatchCompletionRequest
|
||||
BatchCompletionResponse:
|
||||
type: object
|
||||
properties:
|
||||
batch:
|
||||
type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/CompletionResponse'
|
||||
description: >-
|
||||
List of completion responses, one for each input in the batch
|
||||
additionalProperties: false
|
||||
required:
|
||||
- batch
|
||||
title: BatchCompletionResponse
|
||||
description: >-
|
||||
Response from a batch completion request.
|
||||
CompletionResponse:
|
||||
type: object
|
||||
properties:
|
||||
metrics:
|
||||
type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/MetricInResponse'
|
||||
description: >-
|
||||
(Optional) List of metrics associated with the API response
|
||||
content:
|
||||
type: string
|
||||
description: The generated completion text
|
||||
stop_reason:
|
||||
type: string
|
||||
enum:
|
||||
- end_of_turn
|
||||
- end_of_message
|
||||
- out_of_tokens
|
||||
description: Reason why generation stopped
|
||||
logprobs:
|
||||
type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/TokenLogProbs'
|
||||
description: >-
|
||||
Optional log probabilities for generated tokens
|
||||
additionalProperties: false
|
||||
required:
|
||||
- content
|
||||
- stop_reason
|
||||
title: CompletionResponse
|
||||
description: Response from a completion request.
|
||||
CancelTrainingJobRequest:
|
||||
type: object
|
||||
properties:
|
||||
job_uuid:
|
||||
type: string
|
||||
description: The UUID of the job to cancel.
|
||||
additionalProperties: false
|
||||
required:
|
||||
- job_uuid
|
||||
title: CancelTrainingJobRequest
|
||||
ChatCompletionRequest:
|
||||
type: object
|
||||
properties:
|
||||
|
@@ -5372,6 +5098,65 @@ components:
|
|||
- model_id
|
||||
- messages
|
||||
title: ChatCompletionRequest
|
||||
ChatCompletionResponse:
|
||||
type: object
|
||||
properties:
|
||||
metrics:
|
||||
type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/MetricInResponse'
|
||||
description: >-
|
||||
(Optional) List of metrics associated with the API response
|
||||
completion_message:
|
||||
$ref: '#/components/schemas/CompletionMessage'
|
||||
description: The complete response message
|
||||
logprobs:
|
||||
type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/TokenLogProbs'
|
||||
description: >-
|
||||
Optional log probabilities for generated tokens
|
||||
additionalProperties: false
|
||||
required:
|
||||
- completion_message
|
||||
title: ChatCompletionResponse
|
||||
description: Response from a chat completion request.
|
||||
MetricInResponse:
|
||||
type: object
|
||||
properties:
|
||||
metric:
|
||||
type: string
|
||||
description: The name of the metric
|
||||
value:
|
||||
oneOf:
|
||||
- type: integer
|
||||
- type: number
|
||||
description: The numeric value of the metric
|
||||
unit:
|
||||
type: string
|
||||
description: >-
|
||||
(Optional) The unit of measurement for the metric value
|
||||
additionalProperties: false
|
||||
required:
|
||||
- metric
|
||||
- value
|
||||
title: MetricInResponse
|
||||
description: >-
|
||||
A metric value included in API responses.
|
||||
TokenLogProbs:
|
||||
type: object
|
||||
properties:
|
||||
logprobs_by_token:
|
||||
type: object
|
||||
additionalProperties:
|
||||
type: number
|
||||
description: >-
|
||||
Dictionary mapping tokens to their log probabilities
|
||||
additionalProperties: false
|
||||
required:
|
||||
- logprobs_by_token
|
||||
title: TokenLogProbs
|
||||
description: Log probabilities for generated tokens.
|
||||
ChatCompletionResponseEvent:
|
||||
type: object
|
||||
properties:
|
||||
|
@@ -5549,6 +5334,37 @@ components:
|
|||
- model_id
|
||||
- content
|
||||
title: CompletionRequest
|
||||
CompletionResponse:
|
||||
type: object
|
||||
properties:
|
||||
metrics:
|
||||
type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/MetricInResponse'
|
||||
description: >-
|
||||
(Optional) List of metrics associated with the API response
|
||||
content:
|
||||
type: string
|
||||
description: The generated completion text
|
||||
stop_reason:
|
||||
type: string
|
||||
enum:
|
||||
- end_of_turn
|
||||
- end_of_message
|
||||
- out_of_tokens
|
||||
description: Reason why generation stopped
|
||||
logprobs:
|
||||
type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/TokenLogProbs'
|
||||
description: >-
|
||||
Optional log probabilities for generated tokens
|
||||
additionalProperties: false
|
||||
required:
|
||||
- content
|
||||
- stop_reason
|
||||
title: CompletionResponse
|
||||
description: Response from a completion request.
|
||||
CompletionResponseStreamChunk:
|
||||
type: object
|
||||
properties:
|
||||
|
@@ -13983,18 +13799,6 @@ tags:
|
|||
the RAG Tool and Vector IO APIs for more details.
|
||||
x-displayName: >-
|
||||
Agents API for creating and interacting with agentic systems.
|
||||
- name: BatchInference (Coming Soon)
|
||||
description: >-
|
||||
This is an asynchronous API. If the request is successful, the response will
|
||||
be a job which can be polled for completion.
|
||||
|
||||
|
||||
NOTE: This API is not yet implemented and is subject to change in concert with
|
||||
other asynchronous APIs
|
||||
|
||||
including (post-training, evals, etc).
|
||||
x-displayName: >-
|
||||
Batch inference API for generating completions and chat completions.
|
||||
- name: Benchmarks
|
||||
- name: DatasetIO
|
||||
- name: Datasets
|
||||
|
@@ -14037,7 +13841,6 @@ x-tagGroups:
|
|||
- name: Operations
|
||||
tags:
|
||||
- Agents
|
||||
- BatchInference (Coming Soon)
|
||||
- Benchmarks
|
||||
- DatasetIO
|
||||
- Datasets
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue