feat: add batch inference API to llama stack inference

2025-12-31 04:43:52 +00:00 · 2025-04-08 13:50:52 -07:00 · 2025-04-08 13:50:52 -07:00 · 0cfb2e2473
commit 0cfb2e2473
parent ed58a94b30
24 changed files with 1041 additions and 377 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -85,7 +85,50 @@
                }
            }
        },
-        "/v1/batch-inference/chat-completion": {
+        "/v1/inference/batch-chat-completion": {
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/BatchChatCompletionResponse"
+                                }
+                            }
+                        }
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Inference"
+                ],
+                "description": "",
+                "parameters": [],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/BatchChatCompletionRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                }
+            }
+        },
+        "/v1/batch-inference/chat-completion-inline": {
            "post": {
                "responses": {
                    "200": {
@ -120,7 +163,7 @@
                    "content": {
                        "application/json": {
                            "schema": {
-                                "$ref": "#/components/schemas/BatchChatCompletionRequest"
+                                "$ref": "#/components/schemas/BatchChatCompletionInlineRequest"
                            }
                        }
                    },
@ -128,7 +171,50 @@
                }
            }
        },
-        "/v1/batch-inference/completion": {
+        "/v1/inference/batch-completion": {
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/BatchCompletionResponse"
+                                }
+                            }
+                        }
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Inference"
+                ],
+                "description": "",
+                "parameters": [],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/BatchCompletionRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                }
+            }
+        },
+        "/v1/batch-inference/completion-inline": {
            "post": {
                "responses": {
                    "200": {
@ -163,7 +249,7 @@
                    "content": {
                        "application/json": {
                            "schema": {
-                                "$ref": "#/components/schemas/BatchCompletionRequest"
+                                "$ref": "#/components/schemas/BatchCompletionInlineRequest"
                            }
                        }
                    },
@ -4366,6 +4452,51 @@
                ],
                "title": "ToolCall"
            },
+            "ToolConfig": {
+                "type": "object",
+                "properties": {
+                    "tool_choice": {
+                        "oneOf": [
+                            {
+                                "type": "string",
+                                "enum": [
+                                    "auto",
+                                    "required",
+                                    "none"
+                                ],
+                                "title": "ToolChoice",
+                                "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model."
+                            },
+                            {
+                                "type": "string"
+                            }
+                        ],
+                        "default": "auto",
+                        "description": "(Optional) Whether tool use is automatic, required, or none. Can also specify a tool name to use a specific tool. Defaults to ToolChoice.auto."
+                    },
+                    "tool_prompt_format": {
+                        "type": "string",
+                        "enum": [
+                            "json",
+                            "function_tag",
+                            "python_list"
+                        ],
+                        "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls."
+                    },
+                    "system_message_behavior": {
+                        "type": "string",
+                        "enum": [
+                            "append",
+                            "replace"
+                        ],
+                        "description": "(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string '{{function_definitions}}' to indicate where the function definitions should be inserted.",
+                        "default": "append"
+                    }
+                },
+                "additionalProperties": false,
+                "title": "ToolConfig",
+                "description": "Configuration for tool use."
+            },
            "ToolDefinition": {
                "type": "object",
                "properties": {
@ -4554,7 +4685,7 @@
            "BatchChatCompletionRequest": {
                "type": "object",
                "properties": {
-                    "model": {
+                    "model_id": {
                        "type": "string"
                    },
                    "messages_batch": {
@ -4575,25 +4706,8 @@
                            "$ref": "#/components/schemas/ToolDefinition"
                        }
                    },
-                    "tool_choice": {
-                        "type": "string",
-                        "enum": [
-                            "auto",
-                            "required",
-                            "none"
-                        ],
-                        "title": "ToolChoice",
-                        "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model."
-                    },
-                    "tool_prompt_format": {
-                        "type": "string",
-                        "enum": [
-                            "json",
-                            "function_tag",
-                            "python_list"
-                        ],
-                        "title": "ToolPromptFormat",
-                        "description": "Prompt format for calling custom / zero shot tools."
+                    "tool_config": {
+                        "$ref": "#/components/schemas/ToolConfig"
                    },
                    "response_format": {
                        "$ref": "#/components/schemas/ResponseFormat"
@ -4613,7 +4727,7 @@
                },
                "additionalProperties": false,
                "required": [
-                    "model",
+                    "model_id",
                    "messages_batch"
                ],
                "title": "BatchChatCompletionRequest"
@ -4707,12 +4821,62 @@
                "title": "TokenLogProbs",
                "description": "Log probabilities for generated tokens."
            },
-            "BatchCompletionRequest": {
+            "BatchChatCompletionInlineRequest": {
                "type": "object",
                "properties": {
                    "model": {
                        "type": "string"
                    },
+                    "messages_batch": {
+                        "type": "array",
+                        "items": {
+                            "type": "array",
+                            "items": {
+                                "$ref": "#/components/schemas/Message"
+                            }
+                        }
+                    },
+                    "sampling_params": {
+                        "$ref": "#/components/schemas/SamplingParams"
+                    },
+                    "tools": {
+                        "type": "array",
+                        "items": {
+                            "$ref": "#/components/schemas/ToolDefinition"
+                        }
+                    },
+                    "tool_config": {
+                        "$ref": "#/components/schemas/ToolConfig"
+                    },
+                    "response_format": {
+                        "$ref": "#/components/schemas/ResponseFormat"
+                    },
+                    "logprobs": {
+                        "type": "object",
+                        "properties": {
+                            "top_k": {
+                                "type": "integer",
+                                "default": 0,
+                                "description": "How many tokens (for each position) to return log probabilities for."
+                            }
+                        },
+                        "additionalProperties": false,
+                        "title": "LogProbConfig"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "model",
+                    "messages_batch"
+                ],
+                "title": "BatchChatCompletionInlineRequest"
+            },
+            "BatchCompletionRequest": {
+                "type": "object",
+                "properties": {
+                    "model_id": {
+                        "type": "string"
+                    },
                    "content_batch": {
                        "type": "array",
                        "items": {
@ -4740,7 +4904,7 @@
                },
                "additionalProperties": false,
                "required": [
-                    "model",
+                    "model_id",
                    "content_batch"
                ],
                "title": "BatchCompletionRequest"
@ -4799,6 +4963,44 @@
                "title": "CompletionResponse",
                "description": "Response from a completion request."
            },
+            "BatchCompletionInlineRequest": {
+                "type": "object",
+                "properties": {
+                    "model": {
+                        "type": "string"
+                    },
+                    "content_batch": {
+                        "type": "array",
+                        "items": {
+                            "$ref": "#/components/schemas/InterleavedContent"
+                        }
+                    },
+                    "sampling_params": {
+                        "$ref": "#/components/schemas/SamplingParams"
+                    },
+                    "response_format": {
+                        "$ref": "#/components/schemas/ResponseFormat"
+                    },
+                    "logprobs": {
+                        "type": "object",
+                        "properties": {
+                            "top_k": {
+                                "type": "integer",
+                                "default": 0,
+                                "description": "How many tokens (for each position) to return log probabilities for."
+                            }
+                        },
+                        "additionalProperties": false,
+                        "title": "LogProbConfig"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "model",
+                    "content_batch"
+                ],
+                "title": "BatchCompletionInlineRequest"
+            },
            "CancelTrainingJobRequest": {
                "type": "object",
                "properties": {
@ -4812,51 +5014,6 @@
                ],
                "title": "CancelTrainingJobRequest"
            },
-            "ToolConfig": {
-                "type": "object",
-                "properties": {
-                    "tool_choice": {
-                        "oneOf": [
-                            {
-                                "type": "string",
-                                "enum": [
-                                    "auto",
-                                    "required",
-                                    "none"
-                                ],
-                                "title": "ToolChoice",
-                                "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model."
-                            },
-                            {
-                                "type": "string"
-                            }
-                        ],
-                        "default": "auto",
-                        "description": "(Optional) Whether tool use is automatic, required, or none. Can also specify a tool name to use a specific tool. Defaults to ToolChoice.auto."
-                    },
-                    "tool_prompt_format": {
-                        "type": "string",
-                        "enum": [
-                            "json",
-                            "function_tag",
-                            "python_list"
-                        ],
-                        "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls."
-                    },
-                    "system_message_behavior": {
-                        "type": "string",
-                        "enum": [
-                            "append",
-                            "replace"
-                        ],
-                        "description": "(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string '{{function_definitions}}' to indicate where the function definitions should be inserted.",
-                        "default": "append"
-                    }
-                },
-                "additionalProperties": false,
-                "title": "ToolConfig",
-                "description": "Configuration for tool use."
-            },
            "ChatCompletionRequest": {
                "type": "object",
                "properties": {
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -40,7 +40,36 @@ paths:
            schema:
              $ref: '#/components/schemas/AppendRowsRequest'
        required: true
-  /v1/batch-inference/chat-completion:
+  /v1/inference/batch-chat-completion:
+    post:
+      responses:
+        '200':
+          description: OK
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/BatchChatCompletionResponse'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Inference
+      description: ''
+      parameters: []
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/BatchChatCompletionRequest'
+        required: true
+  /v1/batch-inference/chat-completion-inline:
    post:
      responses:
        '200':
@ -67,9 +96,38 @@ paths:
        content:
          application/json:
            schema:
-              $ref: '#/components/schemas/BatchChatCompletionRequest'
+              $ref: '#/components/schemas/BatchChatCompletionInlineRequest'
        required: true
-  /v1/batch-inference/completion:
+  /v1/inference/batch-completion:
+    post:
+      responses:
+        '200':
+          description: OK
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/BatchCompletionResponse'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Inference
+      description: ''
+      parameters: []
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/BatchCompletionRequest'
+        required: true
+  /v1/batch-inference/completion-inline:
    post:
      responses:
        '200':
@ -96,7 +154,7 @@ paths:
        content:
          application/json:
            schema:
-              $ref: '#/components/schemas/BatchCompletionRequest'
+              $ref: '#/components/schemas/BatchCompletionInlineRequest'
        required: true
  /v1/post-training/job/cancel:
    post:
@ -3009,6 +3067,54 @@ components:
        - tool_name
        - arguments
      title: ToolCall
+    ToolConfig:
+      type: object
+      properties:
+        tool_choice:
+          oneOf:
+            - type: string
+              enum:
+                - auto
+                - required
+                - none
+              title: ToolChoice
+              description: >-
+                Whether tool use is required or automatic. This is a hint to the model
+                which may not be followed. It depends on the Instruction Following
+                capabilities of the model.
+            - type: string
+          default: auto
+          description: >-
+            (Optional) Whether tool use is automatic, required, or none. Can also
+            specify a tool name to use a specific tool. Defaults to ToolChoice.auto.
+        tool_prompt_format:
+          type: string
+          enum:
+            - json
+            - function_tag
+            - python_list
+          description: >-
+            (Optional) Instructs the model how to format tool calls. By default, Llama
+            Stack will attempt to use a format that is best adapted to the model.
+            - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
+            - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name>
+            tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python
+            syntax -- a list of function calls.
+        system_message_behavior:
+          type: string
+          enum:
+            - append
+            - replace
+          description: >-
+            (Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`:
+            Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`:
+            Replaces the default system prompt with the provided system message. The
+            system message can include the string '{{function_definitions}}' to indicate
+            where the function definitions should be inserted.
+          default: append
+      additionalProperties: false
+      title: ToolConfig
+      description: Configuration for tool use.
    ToolDefinition:
      type: object
      properties:
@ -3145,7 +3251,7 @@ components:
    BatchChatCompletionRequest:
      type: object
      properties:
-        model:
+        model_id:
          type: string
        messages_batch:
          type: array
@ -3159,26 +3265,8 @@ components:
          type: array
          items:
            $ref: '#/components/schemas/ToolDefinition'
-        tool_choice:
-          type: string
-          enum:
-            - auto
-            - required
-            - none
-          title: ToolChoice
-          description: >-
-            Whether tool use is required or automatic. This is a hint to the model
-            which may not be followed. It depends on the Instruction Following capabilities
-            of the model.
-        tool_prompt_format:
-          type: string
-          enum:
-            - json
-            - function_tag
-            - python_list
-          title: ToolPromptFormat
-          description: >-
-            Prompt format for calling custom / zero shot tools.
+        tool_config:
+          $ref: '#/components/schemas/ToolConfig'
        response_format:
          $ref: '#/components/schemas/ResponseFormat'
        logprobs:
@ -3193,7 +3281,7 @@ components:
          title: LogProbConfig
      additionalProperties: false
      required:
-        - model
+        - model_id
        - messages_batch
      title: BatchChatCompletionRequest
    BatchChatCompletionResponse:
@ -3258,11 +3346,47 @@ components:
        - logprobs_by_token
      title: TokenLogProbs
      description: Log probabilities for generated tokens.
-    BatchCompletionRequest:
+    BatchChatCompletionInlineRequest:
      type: object
      properties:
        model:
          type: string
+        messages_batch:
+          type: array
+          items:
+            type: array
+            items:
+              $ref: '#/components/schemas/Message'
+        sampling_params:
+          $ref: '#/components/schemas/SamplingParams'
+        tools:
+          type: array
+          items:
+            $ref: '#/components/schemas/ToolDefinition'
+        tool_config:
+          $ref: '#/components/schemas/ToolConfig'
+        response_format:
+          $ref: '#/components/schemas/ResponseFormat'
+        logprobs:
+          type: object
+          properties:
+            top_k:
+              type: integer
+              default: 0
+              description: >-
+                How many tokens (for each position) to return log probabilities for.
+          additionalProperties: false
+          title: LogProbConfig
+      additionalProperties: false
+      required:
+        - model
+        - messages_batch
+      title: BatchChatCompletionInlineRequest
+    BatchCompletionRequest:
+      type: object
+      properties:
+        model_id:
+          type: string
        content_batch:
          type: array
          items:
@ -3283,7 +3407,7 @@ components:
          title: LogProbConfig
      additionalProperties: false
      required:
-        - model
+        - model_id
        - content_batch
      title: BatchCompletionRequest
    BatchCompletionResponse:
@ -3326,6 +3450,34 @@ components:
        - stop_reason
      title: CompletionResponse
      description: Response from a completion request.
+    BatchCompletionInlineRequest:
+      type: object
+      properties:
+        model:
+          type: string
+        content_batch:
+          type: array
+          items:
+            $ref: '#/components/schemas/InterleavedContent'
+        sampling_params:
+          $ref: '#/components/schemas/SamplingParams'
+        response_format:
+          $ref: '#/components/schemas/ResponseFormat'
+        logprobs:
+          type: object
+          properties:
+            top_k:
+              type: integer
+              default: 0
+              description: >-
+                How many tokens (for each position) to return log probabilities for.
+          additionalProperties: false
+          title: LogProbConfig
+      additionalProperties: false
+      required:
+        - model
+        - content_batch
+      title: BatchCompletionInlineRequest
    CancelTrainingJobRequest:
      type: object
      properties:
@ -3335,54 +3487,6 @@ components:
      required:
        - job_uuid
      title: CancelTrainingJobRequest
-    ToolConfig:
-      type: object
-      properties:
-        tool_choice:
-          oneOf:
-            - type: string
-              enum:
-                - auto
-                - required
-                - none
-              title: ToolChoice
-              description: >-
-                Whether tool use is required or automatic. This is a hint to the model
-                which may not be followed. It depends on the Instruction Following
-                capabilities of the model.
-            - type: string
-          default: auto
-          description: >-
-            (Optional) Whether tool use is automatic, required, or none. Can also
-            specify a tool name to use a specific tool. Defaults to ToolChoice.auto.
-        tool_prompt_format:
-          type: string
-          enum:
-            - json
-            - function_tag
-            - python_list
-          description: >-
-            (Optional) Instructs the model how to format tool calls. By default, Llama
-            Stack will attempt to use a format that is best adapted to the model.
-            - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
-            - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name>
-            tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python
-            syntax -- a list of function calls.
-        system_message_behavior:
-          type: string
-          enum:
-            - append
-            - replace
-          description: >-
-            (Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`:
-            Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`:
-            Replaces the default system prompt with the provided system message. The
-            system message can include the string '{{function_definitions}}' to indicate
-            where the function definitions should be inserted.
-          default: append
-      additionalProperties: false
-      title: ToolConfig
-      description: Configuration for tool use.
    ChatCompletionRequest:
      type: object
      properties: