mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-12-31 09:23:54 +00:00
feat: add batch inference API to llama stack inference
This commit is contained in:
parent
ed58a94b30
commit
0cfb2e2473
24 changed files with 1041 additions and 377 deletions
256
docs/_static/llama-stack-spec.yaml
vendored
256
docs/_static/llama-stack-spec.yaml
vendored
|
|
@ -40,7 +40,36 @@ paths:
|
|||
schema:
|
||||
$ref: '#/components/schemas/AppendRowsRequest'
|
||||
required: true
|
||||
/v1/batch-inference/chat-completion:
|
||||
/v1/inference/batch-chat-completion:
|
||||
post:
|
||||
responses:
|
||||
'200':
|
||||
description: OK
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/BatchChatCompletionResponse'
|
||||
'400':
|
||||
$ref: '#/components/responses/BadRequest400'
|
||||
'429':
|
||||
$ref: >-
|
||||
#/components/responses/TooManyRequests429
|
||||
'500':
|
||||
$ref: >-
|
||||
#/components/responses/InternalServerError500
|
||||
default:
|
||||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Inference
|
||||
description: ''
|
||||
parameters: []
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/BatchChatCompletionRequest'
|
||||
required: true
|
||||
/v1/batch-inference/chat-completion-inline:
|
||||
post:
|
||||
responses:
|
||||
'200':
|
||||
|
|
@ -67,9 +96,38 @@ paths:
|
|||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/BatchChatCompletionRequest'
|
||||
$ref: '#/components/schemas/BatchChatCompletionInlineRequest'
|
||||
required: true
|
||||
/v1/batch-inference/completion:
|
||||
/v1/inference/batch-completion:
|
||||
post:
|
||||
responses:
|
||||
'200':
|
||||
description: OK
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/BatchCompletionResponse'
|
||||
'400':
|
||||
$ref: '#/components/responses/BadRequest400'
|
||||
'429':
|
||||
$ref: >-
|
||||
#/components/responses/TooManyRequests429
|
||||
'500':
|
||||
$ref: >-
|
||||
#/components/responses/InternalServerError500
|
||||
default:
|
||||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Inference
|
||||
description: ''
|
||||
parameters: []
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/BatchCompletionRequest'
|
||||
required: true
|
||||
/v1/batch-inference/completion-inline:
|
||||
post:
|
||||
responses:
|
||||
'200':
|
||||
|
|
@ -96,7 +154,7 @@ paths:
|
|||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/BatchCompletionRequest'
|
||||
$ref: '#/components/schemas/BatchCompletionInlineRequest'
|
||||
required: true
|
||||
/v1/post-training/job/cancel:
|
||||
post:
|
||||
|
|
@ -3009,6 +3067,54 @@ components:
|
|||
- tool_name
|
||||
- arguments
|
||||
title: ToolCall
|
||||
ToolConfig:
|
||||
type: object
|
||||
properties:
|
||||
tool_choice:
|
||||
oneOf:
|
||||
- type: string
|
||||
enum:
|
||||
- auto
|
||||
- required
|
||||
- none
|
||||
title: ToolChoice
|
||||
description: >-
|
||||
Whether tool use is required or automatic. This is a hint to the model
|
||||
which may not be followed. It depends on the Instruction Following
|
||||
capabilities of the model.
|
||||
- type: string
|
||||
default: auto
|
||||
description: >-
|
||||
(Optional) Whether tool use is automatic, required, or none. Can also
|
||||
specify a tool name to use a specific tool. Defaults to ToolChoice.auto.
|
||||
tool_prompt_format:
|
||||
type: string
|
||||
enum:
|
||||
- json
|
||||
- function_tag
|
||||
- python_list
|
||||
description: >-
|
||||
(Optional) Instructs the model how to format tool calls. By default, Llama
|
||||
Stack will attempt to use a format that is best adapted to the model.
|
||||
- `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
|
||||
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name>
|
||||
tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python
|
||||
syntax -- a list of function calls.
|
||||
system_message_behavior:
|
||||
type: string
|
||||
enum:
|
||||
- append
|
||||
- replace
|
||||
description: >-
|
||||
(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`:
|
||||
Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`:
|
||||
Replaces the default system prompt with the provided system message. The
|
||||
system message can include the string '{{function_definitions}}' to indicate
|
||||
where the function definitions should be inserted.
|
||||
default: append
|
||||
additionalProperties: false
|
||||
title: ToolConfig
|
||||
description: Configuration for tool use.
|
||||
ToolDefinition:
|
||||
type: object
|
||||
properties:
|
||||
|
|
@ -3145,7 +3251,7 @@ components:
|
|||
BatchChatCompletionRequest:
|
||||
type: object
|
||||
properties:
|
||||
model:
|
||||
model_id:
|
||||
type: string
|
||||
messages_batch:
|
||||
type: array
|
||||
|
|
@ -3159,26 +3265,8 @@ components:
|
|||
type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/ToolDefinition'
|
||||
tool_choice:
|
||||
type: string
|
||||
enum:
|
||||
- auto
|
||||
- required
|
||||
- none
|
||||
title: ToolChoice
|
||||
description: >-
|
||||
Whether tool use is required or automatic. This is a hint to the model
|
||||
which may not be followed. It depends on the Instruction Following capabilities
|
||||
of the model.
|
||||
tool_prompt_format:
|
||||
type: string
|
||||
enum:
|
||||
- json
|
||||
- function_tag
|
||||
- python_list
|
||||
title: ToolPromptFormat
|
||||
description: >-
|
||||
Prompt format for calling custom / zero shot tools.
|
||||
tool_config:
|
||||
$ref: '#/components/schemas/ToolConfig'
|
||||
response_format:
|
||||
$ref: '#/components/schemas/ResponseFormat'
|
||||
logprobs:
|
||||
|
|
@ -3193,7 +3281,7 @@ components:
|
|||
title: LogProbConfig
|
||||
additionalProperties: false
|
||||
required:
|
||||
- model
|
||||
- model_id
|
||||
- messages_batch
|
||||
title: BatchChatCompletionRequest
|
||||
BatchChatCompletionResponse:
|
||||
|
|
@ -3258,11 +3346,47 @@ components:
|
|||
- logprobs_by_token
|
||||
title: TokenLogProbs
|
||||
description: Log probabilities for generated tokens.
|
||||
BatchCompletionRequest:
|
||||
BatchChatCompletionInlineRequest:
|
||||
type: object
|
||||
properties:
|
||||
model:
|
||||
type: string
|
||||
messages_batch:
|
||||
type: array
|
||||
items:
|
||||
type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/Message'
|
||||
sampling_params:
|
||||
$ref: '#/components/schemas/SamplingParams'
|
||||
tools:
|
||||
type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/ToolDefinition'
|
||||
tool_config:
|
||||
$ref: '#/components/schemas/ToolConfig'
|
||||
response_format:
|
||||
$ref: '#/components/schemas/ResponseFormat'
|
||||
logprobs:
|
||||
type: object
|
||||
properties:
|
||||
top_k:
|
||||
type: integer
|
||||
default: 0
|
||||
description: >-
|
||||
How many tokens (for each position) to return log probabilities for.
|
||||
additionalProperties: false
|
||||
title: LogProbConfig
|
||||
additionalProperties: false
|
||||
required:
|
||||
- model
|
||||
- messages_batch
|
||||
title: BatchChatCompletionInlineRequest
|
||||
BatchCompletionRequest:
|
||||
type: object
|
||||
properties:
|
||||
model_id:
|
||||
type: string
|
||||
content_batch:
|
||||
type: array
|
||||
items:
|
||||
|
|
@ -3283,7 +3407,7 @@ components:
|
|||
title: LogProbConfig
|
||||
additionalProperties: false
|
||||
required:
|
||||
- model
|
||||
- model_id
|
||||
- content_batch
|
||||
title: BatchCompletionRequest
|
||||
BatchCompletionResponse:
|
||||
|
|
@ -3326,6 +3450,34 @@ components:
|
|||
- stop_reason
|
||||
title: CompletionResponse
|
||||
description: Response from a completion request.
|
||||
BatchCompletionInlineRequest:
|
||||
type: object
|
||||
properties:
|
||||
model:
|
||||
type: string
|
||||
content_batch:
|
||||
type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/InterleavedContent'
|
||||
sampling_params:
|
||||
$ref: '#/components/schemas/SamplingParams'
|
||||
response_format:
|
||||
$ref: '#/components/schemas/ResponseFormat'
|
||||
logprobs:
|
||||
type: object
|
||||
properties:
|
||||
top_k:
|
||||
type: integer
|
||||
default: 0
|
||||
description: >-
|
||||
How many tokens (for each position) to return log probabilities for.
|
||||
additionalProperties: false
|
||||
title: LogProbConfig
|
||||
additionalProperties: false
|
||||
required:
|
||||
- model
|
||||
- content_batch
|
||||
title: BatchCompletionInlineRequest
|
||||
CancelTrainingJobRequest:
|
||||
type: object
|
||||
properties:
|
||||
|
|
@ -3335,54 +3487,6 @@ components:
|
|||
required:
|
||||
- job_uuid
|
||||
title: CancelTrainingJobRequest
|
||||
ToolConfig:
|
||||
type: object
|
||||
properties:
|
||||
tool_choice:
|
||||
oneOf:
|
||||
- type: string
|
||||
enum:
|
||||
- auto
|
||||
- required
|
||||
- none
|
||||
title: ToolChoice
|
||||
description: >-
|
||||
Whether tool use is required or automatic. This is a hint to the model
|
||||
which may not be followed. It depends on the Instruction Following
|
||||
capabilities of the model.
|
||||
- type: string
|
||||
default: auto
|
||||
description: >-
|
||||
(Optional) Whether tool use is automatic, required, or none. Can also
|
||||
specify a tool name to use a specific tool. Defaults to ToolChoice.auto.
|
||||
tool_prompt_format:
|
||||
type: string
|
||||
enum:
|
||||
- json
|
||||
- function_tag
|
||||
- python_list
|
||||
description: >-
|
||||
(Optional) Instructs the model how to format tool calls. By default, Llama
|
||||
Stack will attempt to use a format that is best adapted to the model.
|
||||
- `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
|
||||
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name>
|
||||
tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python
|
||||
syntax -- a list of function calls.
|
||||
system_message_behavior:
|
||||
type: string
|
||||
enum:
|
||||
- append
|
||||
- replace
|
||||
description: >-
|
||||
(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`:
|
||||
Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`:
|
||||
Replaces the default system prompt with the provided system message. The
|
||||
system message can include the string '{{function_definitions}}' to indicate
|
||||
where the function definitions should be inserted.
|
||||
default: append
|
||||
additionalProperties: false
|
||||
title: ToolConfig
|
||||
description: Configuration for tool use.
|
||||
ChatCompletionRequest:
|
||||
type: object
|
||||
properties:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue