diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index ff7f492e7..3e9539f41 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -128,49 +128,6 @@
}
}
},
- "/v1/batch-inference/chat-completion-inline": {
- "post": {
- "responses": {
- "200": {
- "description": "OK",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/BatchChatCompletionResponse"
- }
- }
- }
- },
- "400": {
- "$ref": "#/components/responses/BadRequest400"
- },
- "429": {
- "$ref": "#/components/responses/TooManyRequests429"
- },
- "500": {
- "$ref": "#/components/responses/InternalServerError500"
- },
- "default": {
- "$ref": "#/components/responses/DefaultError"
- }
- },
- "tags": [
- "BatchInference (Coming Soon)"
- ],
- "description": "",
- "parameters": [],
- "requestBody": {
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/BatchChatCompletionInlineRequest"
- }
- }
- },
- "required": true
- }
- }
- },
"/v1/inference/batch-completion": {
"post": {
"responses": {
@@ -214,49 +171,6 @@
}
}
},
- "/v1/batch-inference/completion-inline": {
- "post": {
- "responses": {
- "200": {
- "description": "OK",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/BatchCompletionResponse"
- }
- }
- }
- },
- "400": {
- "$ref": "#/components/responses/BadRequest400"
- },
- "429": {
- "$ref": "#/components/responses/TooManyRequests429"
- },
- "500": {
- "$ref": "#/components/responses/InternalServerError500"
- },
- "default": {
- "$ref": "#/components/responses/DefaultError"
- }
- },
- "tags": [
- "BatchInference (Coming Soon)"
- ],
- "description": "",
- "parameters": [],
- "requestBody": {
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/BatchCompletionInlineRequest"
- }
- }
- },
- "required": true
- }
- }
- },
"/v1/post-training/job/cancel": {
"post": {
"responses": {
@@ -325,7 +239,7 @@
}
},
"tags": [
- "Inference"
+ "BatchInference (Coming Soon)"
],
"description": "Generate a chat completion for the given messages using the specified model.",
"parameters": [],
@@ -373,7 +287,7 @@
}
},
"tags": [
- "Inference"
+ "BatchInference (Coming Soon)"
],
"description": "Generate a completion for the given content using the specified model.",
"parameters": [],
@@ -4821,56 +4735,6 @@
"title": "TokenLogProbs",
"description": "Log probabilities for generated tokens."
},
- "BatchChatCompletionInlineRequest": {
- "type": "object",
- "properties": {
- "model": {
- "type": "string"
- },
- "messages_batch": {
- "type": "array",
- "items": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/Message"
- }
- }
- },
- "sampling_params": {
- "$ref": "#/components/schemas/SamplingParams"
- },
- "tools": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/ToolDefinition"
- }
- },
- "tool_config": {
- "$ref": "#/components/schemas/ToolConfig"
- },
- "response_format": {
- "$ref": "#/components/schemas/ResponseFormat"
- },
- "logprobs": {
- "type": "object",
- "properties": {
- "top_k": {
- "type": "integer",
- "default": 0,
- "description": "How many tokens (for each position) to return log probabilities for."
- }
- },
- "additionalProperties": false,
- "title": "LogProbConfig"
- }
- },
- "additionalProperties": false,
- "required": [
- "model",
- "messages_batch"
- ],
- "title": "BatchChatCompletionInlineRequest"
- },
"BatchCompletionRequest": {
"type": "object",
"properties": {
@@ -4963,44 +4827,6 @@
"title": "CompletionResponse",
"description": "Response from a completion request."
},
- "BatchCompletionInlineRequest": {
- "type": "object",
- "properties": {
- "model": {
- "type": "string"
- },
- "content_batch": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/InterleavedContent"
- }
- },
- "sampling_params": {
- "$ref": "#/components/schemas/SamplingParams"
- },
- "response_format": {
- "$ref": "#/components/schemas/ResponseFormat"
- },
- "logprobs": {
- "type": "object",
- "properties": {
- "top_k": {
- "type": "integer",
- "default": 0,
- "description": "How many tokens (for each position) to return log probabilities for."
- }
- },
- "additionalProperties": false,
- "title": "LogProbConfig"
- }
- },
- "additionalProperties": false,
- "required": [
- "model",
- "content_batch"
- ],
- "title": "BatchCompletionInlineRequest"
- },
"CancelTrainingJobRequest": {
"type": "object",
"properties": {
@@ -11331,7 +11157,9 @@
"x-displayName": "Agents API for creating and interacting with agentic systems."
},
{
- "name": "BatchInference (Coming Soon)"
+ "name": "BatchInference (Coming Soon)",
+ "description": "This is an asynchronous API. If the request is successful, the response will be a job which can be polled for completion.\n\nNOTE: This API is not yet implemented and is subject to change in concert with other asynchronous APIs\nincluding (post-training, evals, etc).",
+ "x-displayName": "Batch inference API for generating completions and chat completions."
},
{
"name": "Benchmarks"
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 279e240ee..0e632fcde 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -69,35 +69,6 @@ paths:
schema:
$ref: '#/components/schemas/BatchChatCompletionRequest'
required: true
- /v1/batch-inference/chat-completion-inline:
- post:
- responses:
- '200':
- description: OK
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/BatchChatCompletionResponse'
- '400':
- $ref: '#/components/responses/BadRequest400'
- '429':
- $ref: >-
- #/components/responses/TooManyRequests429
- '500':
- $ref: >-
- #/components/responses/InternalServerError500
- default:
- $ref: '#/components/responses/DefaultError'
- tags:
- - BatchInference (Coming Soon)
- description: ''
- parameters: []
- requestBody:
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/BatchChatCompletionInlineRequest'
- required: true
/v1/inference/batch-completion:
post:
responses:
@@ -127,35 +98,6 @@ paths:
schema:
$ref: '#/components/schemas/BatchCompletionRequest'
required: true
- /v1/batch-inference/completion-inline:
- post:
- responses:
- '200':
- description: OK
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/BatchCompletionResponse'
- '400':
- $ref: '#/components/responses/BadRequest400'
- '429':
- $ref: >-
- #/components/responses/TooManyRequests429
- '500':
- $ref: >-
- #/components/responses/InternalServerError500
- default:
- $ref: '#/components/responses/DefaultError'
- tags:
- - BatchInference (Coming Soon)
- description: ''
- parameters: []
- requestBody:
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/BatchCompletionInlineRequest'
- required: true
/v1/post-training/job/cancel:
post:
responses:
@@ -206,7 +148,7 @@ paths:
default:
$ref: '#/components/responses/DefaultError'
tags:
- - Inference
+ - BatchInference (Coming Soon)
description: >-
Generate a chat completion for the given messages using the specified model.
parameters: []
@@ -241,7 +183,7 @@ paths:
default:
$ref: '#/components/responses/DefaultError'
tags:
- - Inference
+ - BatchInference (Coming Soon)
description: >-
Generate a completion for the given content using the specified model.
parameters: []
@@ -3346,42 +3288,6 @@ components:
- logprobs_by_token
title: TokenLogProbs
description: Log probabilities for generated tokens.
- BatchChatCompletionInlineRequest:
- type: object
- properties:
- model:
- type: string
- messages_batch:
- type: array
- items:
- type: array
- items:
- $ref: '#/components/schemas/Message'
- sampling_params:
- $ref: '#/components/schemas/SamplingParams'
- tools:
- type: array
- items:
- $ref: '#/components/schemas/ToolDefinition'
- tool_config:
- $ref: '#/components/schemas/ToolConfig'
- response_format:
- $ref: '#/components/schemas/ResponseFormat'
- logprobs:
- type: object
- properties:
- top_k:
- type: integer
- default: 0
- description: >-
- How many tokens (for each position) to return log probabilities for.
- additionalProperties: false
- title: LogProbConfig
- additionalProperties: false
- required:
- - model
- - messages_batch
- title: BatchChatCompletionInlineRequest
BatchCompletionRequest:
type: object
properties:
@@ -3450,34 +3356,6 @@ components:
- stop_reason
title: CompletionResponse
description: Response from a completion request.
- BatchCompletionInlineRequest:
- type: object
- properties:
- model:
- type: string
- content_batch:
- type: array
- items:
- $ref: '#/components/schemas/InterleavedContent'
- sampling_params:
- $ref: '#/components/schemas/SamplingParams'
- response_format:
- $ref: '#/components/schemas/ResponseFormat'
- logprobs:
- type: object
- properties:
- top_k:
- type: integer
- default: 0
- description: >-
- How many tokens (for each position) to return log probabilities for.
- additionalProperties: false
- title: LogProbConfig
- additionalProperties: false
- required:
- - model
- - content_batch
- title: BatchCompletionInlineRequest
CancelTrainingJobRequest:
type: object
properties:
@@ -7737,6 +7615,17 @@ tags:
x-displayName: >-
Agents API for creating and interacting with agentic systems.
- name: BatchInference (Coming Soon)
+ description: >-
+ This is an asynchronous API. If the request is successful, the response will
+ be a job which can be polled for completion.
+
+
+ NOTE: This API is not yet implemented and is subject to change in concert with
+    other asynchronous APIs,
+
+    including post-training, evals, etc.
+ x-displayName: >-
+ Batch inference API for generating completions and chat completions.
- name: Benchmarks
- name: DatasetIO
- name: Datasets
diff --git a/llama_stack/apis/batch_inference/batch_inference.py b/llama_stack/apis/batch_inference/batch_inference.py
index 57fcd7ebb..7a324128d 100644
--- a/llama_stack/apis/batch_inference/batch_inference.py
+++ b/llama_stack/apis/batch_inference/batch_inference.py
@@ -6,40 +6,50 @@
from typing import List, Optional, Protocol, runtime_checkable
+from llama_stack.apis.common.job_types import Job
from llama_stack.apis.inference import (
- BatchChatCompletionResponse,
- BatchCompletionResponse,
InterleavedContent,
LogProbConfig,
Message,
ResponseFormat,
SamplingParams,
- ToolConfig,
+ ToolChoice,
ToolDefinition,
+ ToolPromptFormat,
)
from llama_stack.schema_utils import webmethod
@runtime_checkable
class BatchInference(Protocol):
- @webmethod(route="/batch-inference/completion-inline", method="POST")
- async def batch_completion_inline(
+ """Batch inference API for generating completions and chat completions.
+
+ This is an asynchronous API. If the request is successful, the response will be a job which can be polled for completion.
+
+    NOTE: This API is not yet implemented and is subject to change in concert with other asynchronous APIs,
+    including post-training, evals, etc.
+ """
+
+ @webmethod(route="/batch-inference/completion", method="POST")
+ async def completion(
self,
model: str,
content_batch: List[InterleavedContent],
sampling_params: Optional[SamplingParams] = None,
response_format: Optional[ResponseFormat] = None,
logprobs: Optional[LogProbConfig] = None,
- ) -> BatchCompletionResponse: ...
+ ) -> Job: ...
- @webmethod(route="/batch-inference/chat-completion-inline", method="POST")
- async def batch_chat_completion_inline(
+ @webmethod(route="/batch-inference/chat-completion", method="POST")
+ async def chat_completion(
self,
model: str,
messages_batch: List[List[Message]],
sampling_params: Optional[SamplingParams] = None,
- tools: Optional[List[ToolDefinition]] = list,
- tool_config: Optional[ToolConfig] = None,
+ # zero-shot tool definitions as input to the model
+ tools: Optional[List[ToolDefinition]] = None,
+ tool_choice: Optional[ToolChoice] = ToolChoice.auto,
+ tool_prompt_format: Optional[ToolPromptFormat] = None,
response_format: Optional[ResponseFormat] = None,
logprobs: Optional[LogProbConfig] = None,
- ) -> BatchChatCompletionResponse: ...
+ ) -> Job: ...
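
With this change the protocol returns a `Job` handle instead of inline batch responses, so callers submit a batch and poll for completion separately. Below is a minimal usage sketch against the signatures shown above; the model identifier, the example messages, the top-level import path, and how the returned `Job` is eventually polled are illustrative assumptions, since the docstring notes the API is not yet implemented.

```python
# Illustrative only: BatchInference is not yet implemented, and the mechanism
# for polling the returned Job is an assumption pending the shared jobs API.
from llama_stack.apis.batch_inference import BatchInference
from llama_stack.apis.inference import SamplingParams, UserMessage


async def submit_batch(batch_inference: BatchInference) -> None:
    # chat_completion() now returns immediately with a Job handle rather than
    # blocking on a BatchChatCompletionResponse.
    job = await batch_inference.chat_completion(
        model="example-model",  # hypothetical model identifier
        messages_batch=[
            [UserMessage(content="Summarize the release notes.")],
            [UserMessage(content="Translate 'hello' to French.")],
        ],
        sampling_params=SamplingParams(max_tokens=128),
    )
    # The Job is expected to be polled for completion elsewhere; here we only
    # report the handle that was returned.
    print(f"submitted batch chat completion job: {job}")
```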
diff --git a/llama_stack/providers/inline/inference/meta_reference/inference.py b/llama_stack/providers/inline/inference/meta_reference/inference.py
index faf19a9c6..da5ded0f3 100644
--- a/llama_stack/providers/inline/inference/meta_reference/inference.py
+++ b/llama_stack/providers/inline/inference/meta_reference/inference.py
@@ -5,7 +5,6 @@
# the root directory of this source tree.
import asyncio
-import logging
import os
from typing import AsyncGenerator, List, Optional, Union
@@ -44,6 +43,7 @@ from llama_stack.apis.inference import (
UserMessage,
)
from llama_stack.apis.models import Model, ModelType
+from llama_stack.log import get_logger
from llama_stack.models.llama.llama3.chat_format import ChatFormat as Llama3ChatFormat
from llama_stack.models.llama.llama3.tokenizer import Tokenizer as Llama3Tokenizer
from llama_stack.models.llama.llama4.chat_format import ChatFormat as Llama4ChatFormat
@@ -72,7 +72,7 @@ from .config import MetaReferenceInferenceConfig
from .generators import LlamaGenerator
from .model_parallel import LlamaModelParallelGenerator
-log = logging.getLogger(__name__)
+log = get_logger(__name__, category="inference")
# there's a single model parallel process running serving the model. for now,
# we don't support multiple concurrent requests to this process.
SEMAPHORE = asyncio.Semaphore(1)
@@ -159,7 +159,7 @@ class MetaReferenceInferenceImpl(
self.model_id = model_id
self.llama_model = llama_model
- print("Warming up...")
+ log.info("Warming up...")
await self.completion(
model_id=model_id,
content="Hello, world!",
@@ -170,7 +170,7 @@ class MetaReferenceInferenceImpl(
messages=[UserMessage(content="Hi how are you?")],
sampling_params=SamplingParams(max_tokens=20),
)
- print("Warmed up!")
+ log.info("Warmed up!")
def check_model(self, request) -> None:
if self.model_id is None or self.llama_model is None:
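
For reference, a minimal sketch of the logging convention this hunk adopts, mirroring the `get_logger(__name__, category="inference")` call shown above; the helper function and message text are illustrative, not part of the diff.

```python
# Sketch of the category-scoped logger pattern replacing print()/stdlib logging.
from llama_stack.log import get_logger

# The category mirrors the value used in meta_reference/inference.py above.
log = get_logger(__name__, category="inference")


def announce_warmup(model_id: str) -> None:
    # Messages that previously went to stdout via print() now go through the
    # shared llama_stack logger, so they follow the stack-wide log settings.
    log.info(f"Warming up {model_id}...")
```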