feat: add batch inference API to llama stack inference (#1945)

# What does this PR do?

This PR adds two methods to the Inference API:
- `batch_completion`
- `batch_chat_completion`

The motivation is evaluations that target a local inference engine
(like meta-reference or vllm), where batch APIs can provide a
substantial speedup.
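
For a concrete picture, here is a minimal sketch of calling the new endpoint over HTTP. The path and request shape come from the spec diff below; the server URL (Llama Stack's default port 8321) and the `batch` field in the response body are assumptions:

```python
# Minimal sketch: POST to the new batch chat completion endpoint.
# Assumes a Llama Stack server running locally; adjust host/port as needed.
import requests

resp = requests.post(
    "http://localhost:8321/v1/inference/batch-chat-completion",
    json={
        "model_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
        "messages_batch": [
            [{"role": "user", "content": "What is the capital of France?"}],
            [{"role": "user", "content": "Write a haiku about GPUs."}],
        ],
    },
)
resp.raise_for_status()
# One chat completion per input conversation (field name "batch" is an
# assumption about the response schema).
for completion in resp.json()["batch"]:
    print(completion)
```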

Why did I not add this to `Api.batch_inference`? That would have meant a
_lot_ more bookkeeping given the structure of Llama Stack: I would have
needed to create a notion of a "batch model" resource, set up routing
based on it, and so on. That does not sound ideal.

So what's the future of the batch inference API? I am not sure. Maybe we
keep it for true _asynchronous_ execution: you submit requests, it
returns a Job instance, and you poll the job for completion.

## Test Plan

Run meta-reference-gpu using:
```bash
export INFERENCE_MODEL=meta-llama/Llama-4-Scout-17B-16E-Instruct
export INFERENCE_CHECKPOINT_DIR=../checkpoints/Llama-4-Scout-17B-16E-Instruct-20250331210000
export MODEL_PARALLEL_SIZE=4
export MAX_BATCH_SIZE=32
export MAX_SEQ_LEN=6144

LLAMA_MODELS_DEBUG=1 llama stack run meta-reference-gpu
```

Then run the batch inference test case.
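The test itself is not reproduced here; as a rough sketch of what it exercises (the endpoint and request shape come from the spec diff below, everything else is hypothetical):

```python
# Hypothetical smoke test for the new batch completion endpoint; the real
# test case lives in the repo's test suite and may look different.
import requests

resp = requests.post(
    "http://localhost:8321/v1/inference/batch-completion",
    json={
        "model_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
        "content_batch": ["The capital of France is", "1 + 1 equals"],
    },
)
resp.raise_for_status()
batch = resp.json()["batch"]  # response field name is an assumption
assert len(batch) == 2  # one completion per input
```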
---

Commit `f34f22f8c7` (parent `854c2ad264`) by Ashwin Bharambe, 2025-04-12. 23 changed files with 698 additions and 389 deletions.

Excerpt from the diff to the generated OpenAPI spec:

```diff
@@ -85,7 +85,7 @@
         }
       }
     },
-    "/v1/batch-inference/chat-completion": {
+    "/v1/inference/batch-chat-completion": {
       "post": {
         "responses": {
           "200": {
@@ -112,7 +112,7 @@
           }
         },
         "tags": [
-          "BatchInference (Coming Soon)"
+          "Inference"
         ],
         "description": "",
         "parameters": [],
@@ -128,7 +128,7 @@
         }
       }
     },
-    "/v1/batch-inference/completion": {
+    "/v1/inference/batch-completion": {
       "post": {
         "responses": {
           "200": {
@@ -155,7 +155,7 @@
           }
         },
         "tags": [
-          "BatchInference (Coming Soon)"
+          "Inference"
         ],
         "description": "",
         "parameters": [],
@@ -239,7 +239,7 @@
           }
         },
         "tags": [
-          "Inference"
+          "BatchInference (Coming Soon)"
         ],
         "description": "Generate a chat completion for the given messages using the specified model.",
         "parameters": [],
@@ -287,7 +287,7 @@
           }
         },
         "tags": [
-          "Inference"
+          "BatchInference (Coming Soon)"
         ],
         "description": "Generate a completion for the given content using the specified model.",
         "parameters": [],
@@ -4366,6 +4366,51 @@
         ],
         "title": "ToolCall"
       },
+      "ToolConfig": {
+        "type": "object",
+        "properties": {
+          "tool_choice": {
+            "oneOf": [
+              {
+                "type": "string",
+                "enum": [
+                  "auto",
+                  "required",
+                  "none"
+                ],
+                "title": "ToolChoice",
+                "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model."
+              },
+              {
+                "type": "string"
+              }
+            ],
+            "default": "auto",
+            "description": "(Optional) Whether tool use is automatic, required, or none. Can also specify a tool name to use a specific tool. Defaults to ToolChoice.auto."
+          },
+          "tool_prompt_format": {
+            "type": "string",
+            "enum": [
+              "json",
+              "function_tag",
+              "python_list"
+            ],
+            "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls."
+          },
+          "system_message_behavior": {
+            "type": "string",
+            "enum": [
+              "append",
+              "replace"
+            ],
+            "description": "(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string '{{function_definitions}}' to indicate where the function definitions should be inserted.",
+            "default": "append"
+          }
+        },
+        "additionalProperties": false,
+        "title": "ToolConfig",
+        "description": "Configuration for tool use."
+      },
       "ToolDefinition": {
         "type": "object",
         "properties": {
@@ -4554,7 +4599,7 @@
       "BatchChatCompletionRequest": {
         "type": "object",
         "properties": {
-          "model": {
+          "model_id": {
             "type": "string"
           },
           "messages_batch": {
@@ -4575,25 +4620,8 @@
               "$ref": "#/components/schemas/ToolDefinition"
             }
           },
-          "tool_choice": {
-            "type": "string",
-            "enum": [
-              "auto",
-              "required",
-              "none"
-            ],
-            "title": "ToolChoice",
-            "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model."
-          },
-          "tool_prompt_format": {
-            "type": "string",
-            "enum": [
-              "json",
-              "function_tag",
-              "python_list"
-            ],
-            "title": "ToolPromptFormat",
-            "description": "Prompt format for calling custom / zero shot tools."
+          "tool_config": {
+            "$ref": "#/components/schemas/ToolConfig"
           },
           "response_format": {
             "$ref": "#/components/schemas/ResponseFormat"
@@ -4613,7 +4641,7 @@
         },
         "additionalProperties": false,
         "required": [
-          "model",
+          "model_id",
           "messages_batch"
         ],
         "title": "BatchChatCompletionRequest"
@@ -4710,7 +4738,7 @@
       "BatchCompletionRequest": {
         "type": "object",
         "properties": {
-          "model": {
+          "model_id": {
             "type": "string"
           },
           "content_batch": {
@@ -4740,7 +4768,7 @@
         },
         "additionalProperties": false,
         "required": [
-          "model",
+          "model_id",
           "content_batch"
         ],
         "title": "BatchCompletionRequest"
@@ -4812,51 +4840,6 @@
         ],
         "title": "CancelTrainingJobRequest"
       },
-      "ToolConfig": {
-        "type": "object",
-        "properties": {
-          "tool_choice": {
-            "oneOf": [
-              {
-                "type": "string",
-                "enum": [
-                  "auto",
-                  "required",
-                  "none"
-                ],
-                "title": "ToolChoice",
-                "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model."
-              },
-              {
-                "type": "string"
-              }
-            ],
-            "default": "auto",
-            "description": "(Optional) Whether tool use is automatic, required, or none. Can also specify a tool name to use a specific tool. Defaults to ToolChoice.auto."
-          },
-          "tool_prompt_format": {
-            "type": "string",
-            "enum": [
-              "json",
-              "function_tag",
-              "python_list"
-            ],
-            "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls."
-          },
-          "system_message_behavior": {
-            "type": "string",
-            "enum": [
-              "append",
-              "replace"
-            ],
-            "description": "(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string '{{function_definitions}}' to indicate where the function definitions should be inserted.",
-            "default": "append"
-          }
-        },
-        "additionalProperties": false,
-        "title": "ToolConfig",
-        "description": "Configuration for tool use."
-      },
       "ChatCompletionRequest": {
         "type": "object",
         "properties": {
@@ -11173,7 +11156,9 @@
       "x-displayName": "Agents API for creating and interacting with agentic systems."
     },
     {
-      "name": "BatchInference (Coming Soon)"
+      "name": "BatchInference (Coming Soon)",
+      "description": "This is an asynchronous API. If the request is successful, the response will be a job which can be polled for completion.\n\nNOTE: This API is not yet implemented and is subject to change in concert with other asynchronous APIs\nincluding (post-training, evals, etc).",
+      "x-displayName": "Batch inference API for generating completions and chat completions."
     },
     {
       "name": "Benchmarks"
```