feat: add batch inference API to llama stack inference

Ashwin Bharambe, 2025-04-08 13:50:52 -07:00
commit 0cfb2e2473 (parent ed58a94b30)
24 changed files with 1041 additions and 377 deletions


@@ -40,7 +40,36 @@ paths:
schema:
$ref: '#/components/schemas/AppendRowsRequest'
required: true
/v1/batch-inference/chat-completion:
/v1/inference/batch-chat-completion:
post:
responses:
'200':
description: OK
content:
application/json:
schema:
$ref: '#/components/schemas/BatchChatCompletionResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
description: ''
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/BatchChatCompletionRequest'
required: true
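For orientation, here is a minimal sketch of calling the renamed endpoint with plain `requests`. The base URL, port, model identifier, and the exact shape of the `Message` objects are assumptions; the path and the `model_id`/`messages_batch` field names come from the schema changes in this diff.

```python
import requests

BASE_URL = "http://localhost:8321"  # assumed llama-stack server address

# Minimal BatchChatCompletionRequest body. The endpoint moved from
# /v1/batch-inference/chat-completion to /v1/inference/batch-chat-completion,
# and the request now identifies the model via `model_id`.
payload = {
    "model_id": "meta-llama/Llama-3.1-8B-Instruct",  # placeholder model id
    "messages_batch": [
        [{"role": "user", "content": "What is the capital of France?"}],   # assumed Message shape
        [{"role": "user", "content": "Summarize the plot of Hamlet."}],
    ],
}

resp = requests.post(f"{BASE_URL}/v1/inference/batch-chat-completion", json=payload)
resp.raise_for_status()
print(resp.json())  # BatchChatCompletionResponse
```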
/v1/batch-inference/chat-completion-inline:
post:
responses:
'200':
@@ -67,9 +96,38 @@ paths:
content:
application/json:
schema:
$ref: '#/components/schemas/BatchChatCompletionRequest'
$ref: '#/components/schemas/BatchChatCompletionInlineRequest'
required: true
/v1/batch-inference/completion:
/v1/inference/batch-completion:
post:
responses:
'200':
description: OK
content:
application/json:
schema:
$ref: '#/components/schemas/BatchCompletionResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
description: ''
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/BatchCompletionRequest'
required: true
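Similarly, a hedged sketch of hitting the renamed raw-completion endpoint. `content_batch` is a list of `InterleavedContent` items per the schema; passing plain strings here is an assumption, as are the server address and model id.

```python
import requests

BASE_URL = "http://localhost:8321"  # assumed llama-stack server address

# Minimal BatchCompletionRequest body: `model_id` plus a batch of prompts.
payload = {
    "model_id": "meta-llama/Llama-3.1-8B-Instruct",  # placeholder model id
    "content_batch": [
        "Complete this sentence: The quick brown fox",   # assumes plain strings are valid InterleavedContent
        "Write a haiku about the ocean.",
    ],
}

resp = requests.post(f"{BASE_URL}/v1/inference/batch-completion", json=payload)
resp.raise_for_status()
print(resp.json())  # BatchCompletionResponse
```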
/v1/batch-inference/completion-inline:
post:
responses:
'200':
@@ -96,7 +154,7 @@ paths:
content:
application/json:
schema:
$ref: '#/components/schemas/BatchCompletionRequest'
$ref: '#/components/schemas/BatchCompletionInlineRequest'
required: true
/v1/post-training/job/cancel:
post:
@@ -3009,6 +3067,54 @@ components:
- tool_name
- arguments
title: ToolCall
ToolConfig:
type: object
properties:
tool_choice:
oneOf:
- type: string
enum:
- auto
- required
- none
title: ToolChoice
description: >-
Whether tool use is required or automatic. This is a hint to the model
which may not be followed. It depends on the Instruction Following
capabilities of the model.
- type: string
default: auto
description: >-
(Optional) Whether tool use is automatic, required, or none. Can also
specify a tool name to use a specific tool. Defaults to ToolChoice.auto.
tool_prompt_format:
type: string
enum:
- json
- function_tag
- python_list
description: >-
(Optional) Instructs the model how to format tool calls. By default, Llama
Stack will attempt to use a format that is best adapted to the model.
- `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name>
tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python
syntax -- a list of function calls.
system_message_behavior:
type: string
enum:
- append
- replace
description: >-
(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`:
Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`:
Replaces the default system prompt with the provided system message. The
system message can include the string '{{function_definitions}}' to indicate
where the function definitions should be inserted.
default: append
additionalProperties: false
title: ToolConfig
description: Configuration for tool use.
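As a quick illustration of the `ToolConfig` schema above, a conforming value might look like the following Python dict; the field names and enum values come from the schema, while the particular choices are arbitrary.

```python
# Illustrative ToolConfig value; all fields are optional per the schema above.
tool_config = {
    "tool_choice": "auto",                # "auto" | "required" | "none" | a specific tool name
    "tool_prompt_format": "json",         # "json" | "function_tag" | "python_list"
    "system_message_behavior": "append",  # "append" | "replace"
}
```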
ToolDefinition:
type: object
properties:
@@ -3145,7 +3251,7 @@ components:
BatchChatCompletionRequest:
type: object
properties:
model:
model_id:
type: string
messages_batch:
type: array
@@ -3159,26 +3265,8 @@ components:
type: array
items:
$ref: '#/components/schemas/ToolDefinition'
tool_choice:
type: string
enum:
- auto
- required
- none
title: ToolChoice
description: >-
Whether tool use is required or automatic. This is a hint to the model
which may not be followed. It depends on the Instruction Following capabilities
of the model.
tool_prompt_format:
type: string
enum:
- json
- function_tag
- python_list
title: ToolPromptFormat
description: >-
Prompt format for calling custom / zero shot tools.
tool_config:
$ref: '#/components/schemas/ToolConfig'
response_format:
$ref: '#/components/schemas/ResponseFormat'
logprobs:
@@ -3193,7 +3281,7 @@ components:
title: LogProbConfig
additionalProperties: false
required:
- model
- model_id
- messages_batch
title: BatchChatCompletionRequest
BatchChatCompletionResponse:
@@ -3258,11 +3346,47 @@ components:
- logprobs_by_token
title: TokenLogProbs
description: Log probabilities for generated tokens.
BatchCompletionRequest:
BatchChatCompletionInlineRequest:
type: object
properties:
model:
type: string
messages_batch:
type: array
items:
type: array
items:
$ref: '#/components/schemas/Message'
sampling_params:
$ref: '#/components/schemas/SamplingParams'
tools:
type: array
items:
$ref: '#/components/schemas/ToolDefinition'
tool_config:
$ref: '#/components/schemas/ToolConfig'
response_format:
$ref: '#/components/schemas/ResponseFormat'
logprobs:
type: object
properties:
top_k:
type: integer
default: 0
description: >-
How many tokens (for each position) to return log probabilities for.
additionalProperties: false
title: LogProbConfig
additionalProperties: false
required:
- model
- messages_batch
title: BatchChatCompletionInlineRequest
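One detail worth highlighting in the schemas above: the inference-level batch request identifies the model via `model_id`, while the inline batch-inference variant keeps the older `model` field. A small illustrative contrast, with placeholder values and an assumed `Message` shape:

```python
# BatchChatCompletionRequest (new /v1/inference/batch-chat-completion endpoint) uses `model_id`.
inference_request = {
    "model_id": "meta-llama/Llama-3.1-8B-Instruct",  # placeholder
    "messages_batch": [[{"role": "user", "content": "Hello"}]],
}

# BatchChatCompletionInlineRequest keeps `model` instead of `model_id`.
inline_request = {
    "model": "meta-llama/Llama-3.1-8B-Instruct",  # placeholder
    "messages_batch": [[{"role": "user", "content": "Hello"}]],
}
```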
BatchCompletionRequest:
type: object
properties:
model_id:
type: string
content_batch:
type: array
items:
@@ -3283,7 +3407,7 @@ components:
title: LogProbConfig
additionalProperties: false
required:
- model
- model_id
- content_batch
title: BatchCompletionRequest
BatchCompletionResponse:
@@ -3326,6 +3450,34 @@ components:
- stop_reason
title: CompletionResponse
description: Response from a completion request.
BatchCompletionInlineRequest:
type: object
properties:
model:
type: string
content_batch:
type: array
items:
$ref: '#/components/schemas/InterleavedContent'
sampling_params:
$ref: '#/components/schemas/SamplingParams'
response_format:
$ref: '#/components/schemas/ResponseFormat'
logprobs:
type: object
properties:
top_k:
type: integer
default: 0
description: >-
How many tokens (for each position) to return log probabilities for.
additionalProperties: false
title: LogProbConfig
additionalProperties: false
required:
- model
- content_batch
title: BatchCompletionInlineRequest
CancelTrainingJobRequest:
type: object
properties:
@@ -3335,54 +3487,6 @@ components:
required:
- job_uuid
title: CancelTrainingJobRequest
ToolConfig:
type: object
properties:
tool_choice:
oneOf:
- type: string
enum:
- auto
- required
- none
title: ToolChoice
description: >-
Whether tool use is required or automatic. This is a hint to the model
which may not be followed. It depends on the Instruction Following
capabilities of the model.
- type: string
default: auto
description: >-
(Optional) Whether tool use is automatic, required, or none. Can also
specify a tool name to use a specific tool. Defaults to ToolChoice.auto.
tool_prompt_format:
type: string
enum:
- json
- function_tag
- python_list
description: >-
(Optional) Instructs the model how to format tool calls. By default, Llama
Stack will attempt to use a format that is best adapted to the model.
- `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name>
tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python
syntax -- a list of function calls.
system_message_behavior:
type: string
enum:
- append
- replace
description: >-
(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`:
Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`:
Replaces the default system prompt with the provided system message. The
system message can include the string '{{function_definitions}}' to indicate
where the function definitions should be inserted.
default: append
additionalProperties: false
title: ToolConfig
description: Configuration for tool use.
ChatCompletionRequest:
type: object
properties: