Ashwin Bharambe 2025-04-11 16:15:59 -07:00
parent 0cfb2e2473
commit 73d927850e
4 changed files with 43 additions and 316 deletions


@@ -128,49 +128,6 @@
         }
       }
     },
-    "/v1/batch-inference/chat-completion-inline": {
-      "post": {
-        "responses": {
-          "200": {
-            "description": "OK",
-            "content": {
-              "application/json": {
-                "schema": {
-                  "$ref": "#/components/schemas/BatchChatCompletionResponse"
-                }
-              }
-            }
-          },
-          "400": {
-            "$ref": "#/components/responses/BadRequest400"
-          },
-          "429": {
-            "$ref": "#/components/responses/TooManyRequests429"
-          },
-          "500": {
-            "$ref": "#/components/responses/InternalServerError500"
-          },
-          "default": {
-            "$ref": "#/components/responses/DefaultError"
-          }
-        },
-        "tags": [
-          "BatchInference (Coming Soon)"
-        ],
-        "description": "",
-        "parameters": [],
-        "requestBody": {
-          "content": {
-            "application/json": {
-              "schema": {
-                "$ref": "#/components/schemas/BatchChatCompletionInlineRequest"
-              }
-            }
-          },
-          "required": true
-        }
-      }
-    },
     "/v1/inference/batch-completion": {
       "post": {
         "responses": {
@@ -214,49 +171,6 @@
         }
       }
     },
-    "/v1/batch-inference/completion-inline": {
-      "post": {
-        "responses": {
-          "200": {
-            "description": "OK",
-            "content": {
-              "application/json": {
-                "schema": {
-                  "$ref": "#/components/schemas/BatchCompletionResponse"
-                }
-              }
-            }
-          },
-          "400": {
-            "$ref": "#/components/responses/BadRequest400"
-          },
-          "429": {
-            "$ref": "#/components/responses/TooManyRequests429"
-          },
-          "500": {
-            "$ref": "#/components/responses/InternalServerError500"
-          },
-          "default": {
-            "$ref": "#/components/responses/DefaultError"
-          }
-        },
-        "tags": [
-          "BatchInference (Coming Soon)"
-        ],
-        "description": "",
-        "parameters": [],
-        "requestBody": {
-          "content": {
-            "application/json": {
-              "schema": {
-                "$ref": "#/components/schemas/BatchCompletionInlineRequest"
-              }
-            }
-          },
-          "required": true
-        }
-      }
-    },
     "/v1/post-training/job/cancel": {
       "post": {
         "responses": {
@@ -325,7 +239,7 @@
           }
         },
         "tags": [
-          "Inference"
+          "BatchInference (Coming Soon)"
         ],
         "description": "Generate a chat completion for the given messages using the specified model.",
         "parameters": [],
@@ -373,7 +287,7 @@
           }
         },
         "tags": [
-          "Inference"
+          "BatchInference (Coming Soon)"
        ],
         "description": "Generate a completion for the given content using the specified model.",
         "parameters": [],
@@ -4821,56 +4735,6 @@
         "title": "TokenLogProbs",
         "description": "Log probabilities for generated tokens."
       },
-      "BatchChatCompletionInlineRequest": {
-        "type": "object",
-        "properties": {
-          "model": {
-            "type": "string"
-          },
-          "messages_batch": {
-            "type": "array",
-            "items": {
-              "type": "array",
-              "items": {
-                "$ref": "#/components/schemas/Message"
-              }
-            }
-          },
-          "sampling_params": {
-            "$ref": "#/components/schemas/SamplingParams"
-          },
-          "tools": {
-            "type": "array",
-            "items": {
-              "$ref": "#/components/schemas/ToolDefinition"
-            }
-          },
-          "tool_config": {
-            "$ref": "#/components/schemas/ToolConfig"
-          },
-          "response_format": {
-            "$ref": "#/components/schemas/ResponseFormat"
-          },
-          "logprobs": {
-            "type": "object",
-            "properties": {
-              "top_k": {
-                "type": "integer",
-                "default": 0,
-                "description": "How many tokens (for each position) to return log probabilities for."
-              }
-            },
-            "additionalProperties": false,
-            "title": "LogProbConfig"
-          }
-        },
-        "additionalProperties": false,
-        "required": [
-          "model",
-          "messages_batch"
-        ],
-        "title": "BatchChatCompletionInlineRequest"
-      },
       "BatchCompletionRequest": {
         "type": "object",
         "properties": {
@@ -4963,44 +4827,6 @@
         "title": "CompletionResponse",
         "description": "Response from a completion request."
       },
-      "BatchCompletionInlineRequest": {
-        "type": "object",
-        "properties": {
-          "model": {
-            "type": "string"
-          },
-          "content_batch": {
-            "type": "array",
-            "items": {
-              "$ref": "#/components/schemas/InterleavedContent"
-            }
-          },
-          "sampling_params": {
-            "$ref": "#/components/schemas/SamplingParams"
-          },
-          "response_format": {
-            "$ref": "#/components/schemas/ResponseFormat"
-          },
-          "logprobs": {
-            "type": "object",
-            "properties": {
-              "top_k": {
-                "type": "integer",
-                "default": 0,
-                "description": "How many tokens (for each position) to return log probabilities for."
-              }
-            },
-            "additionalProperties": false,
-            "title": "LogProbConfig"
-          }
-        },
-        "additionalProperties": false,
-        "required": [
-          "model",
-          "content_batch"
-        ],
-        "title": "BatchCompletionInlineRequest"
-      },
       "CancelTrainingJobRequest": {
         "type": "object",
         "properties": {
@@ -11331,7 +11157,9 @@
       "x-displayName": "Agents API for creating and interacting with agentic systems."
     },
     {
-      "name": "BatchInference (Coming Soon)"
+      "name": "BatchInference (Coming Soon)",
+      "description": "This is an asynchronous API. If the request is successful, the response will be a job which can be polled for completion.\n\nNOTE: This API is not yet implemented and is subject to change in concert with other asynchronous APIs\nincluding (post-training, evals, etc).",
+      "x-displayName": "Batch inference API for generating completions and chat completions."
     },
     {
       "name": "Benchmarks"


@@ -69,35 +69,6 @@ paths:
             schema:
               $ref: '#/components/schemas/BatchChatCompletionRequest'
         required: true
-  /v1/batch-inference/chat-completion-inline:
-    post:
-      responses:
-        '200':
-          description: OK
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/BatchChatCompletionResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - BatchInference (Coming Soon)
-      description: ''
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/BatchChatCompletionInlineRequest'
-        required: true
   /v1/inference/batch-completion:
     post:
       responses:
@@ -127,35 +98,6 @@ paths:
             schema:
               $ref: '#/components/schemas/BatchCompletionRequest'
         required: true
-  /v1/batch-inference/completion-inline:
-    post:
-      responses:
-        '200':
-          description: OK
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/BatchCompletionResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - BatchInference (Coming Soon)
-      description: ''
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/BatchCompletionInlineRequest'
-        required: true
   /v1/post-training/job/cancel:
     post:
       responses:
@@ -206,7 +148,7 @@ paths:
         default:
           $ref: '#/components/responses/DefaultError'
       tags:
-        - Inference
+        - BatchInference (Coming Soon)
       description: >-
         Generate a chat completion for the given messages using the specified model.
       parameters: []
@@ -241,7 +183,7 @@ paths:
         default:
           $ref: '#/components/responses/DefaultError'
       tags:
-        - Inference
+        - BatchInference (Coming Soon)
       description: >-
         Generate a completion for the given content using the specified model.
       parameters: []
@@ -3346,42 +3288,6 @@ components:
         - logprobs_by_token
       title: TokenLogProbs
       description: Log probabilities for generated tokens.
-    BatchChatCompletionInlineRequest:
-      type: object
-      properties:
-        model:
-          type: string
-        messages_batch:
-          type: array
-          items:
-            type: array
-            items:
-              $ref: '#/components/schemas/Message'
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-        tools:
-          type: array
-          items:
-            $ref: '#/components/schemas/ToolDefinition'
-        tool_config:
-          $ref: '#/components/schemas/ToolConfig'
-        response_format:
-          $ref: '#/components/schemas/ResponseFormat'
-        logprobs:
-          type: object
-          properties:
-            top_k:
-              type: integer
-              default: 0
-              description: >-
-                How many tokens (for each position) to return log probabilities for.
-          additionalProperties: false
-          title: LogProbConfig
-      additionalProperties: false
-      required:
-        - model
-        - messages_batch
-      title: BatchChatCompletionInlineRequest
     BatchCompletionRequest:
       type: object
       properties:
@@ -3450,34 +3356,6 @@ components:
         - stop_reason
       title: CompletionResponse
       description: Response from a completion request.
-    BatchCompletionInlineRequest:
-      type: object
-      properties:
-        model:
-          type: string
-        content_batch:
-          type: array
-          items:
-            $ref: '#/components/schemas/InterleavedContent'
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-        response_format:
-          $ref: '#/components/schemas/ResponseFormat'
-        logprobs:
-          type: object
-          properties:
-            top_k:
-              type: integer
-              default: 0
-              description: >-
-                How many tokens (for each position) to return log probabilities for.
-          additionalProperties: false
-          title: LogProbConfig
-      additionalProperties: false
-      required:
-        - model
-        - content_batch
-      title: BatchCompletionInlineRequest
     CancelTrainingJobRequest:
       type: object
       properties:
@@ -7737,6 +7615,17 @@ tags:
   x-displayName: >-
     Agents API for creating and interacting with agentic systems.
 - name: BatchInference (Coming Soon)
+  description: >-
+    This is an asynchronous API. If the request is successful, the response will
+    be a job which can be polled for completion.
+
+
+    NOTE: This API is not yet implemented and is subject to change in concert with
+    other asynchronous APIs
+
+    including (post-training, evals, etc).
+  x-displayName: >-
+    Batch inference API for generating completions and chat completions.
 - name: Benchmarks
 - name: DatasetIO
 - name: Datasets


@@ -6,40 +6,50 @@
 from typing import List, Optional, Protocol, runtime_checkable

+from llama_stack.apis.common.job_types import Job
 from llama_stack.apis.inference import (
-    BatchChatCompletionResponse,
-    BatchCompletionResponse,
     InterleavedContent,
     LogProbConfig,
     Message,
     ResponseFormat,
     SamplingParams,
-    ToolConfig,
+    ToolChoice,
     ToolDefinition,
+    ToolPromptFormat,
 )
 from llama_stack.schema_utils import webmethod


 @runtime_checkable
 class BatchInference(Protocol):
-    @webmethod(route="/batch-inference/completion-inline", method="POST")
-    async def batch_completion_inline(
+    """Batch inference API for generating completions and chat completions.
+
+    This is an asynchronous API. If the request is successful, the response will be a job which can be polled for completion.
+
+    NOTE: This API is not yet implemented and is subject to change in concert with other asynchronous APIs
+    including (post-training, evals, etc).
+    """
+
+    @webmethod(route="/batch-inference/completion", method="POST")
+    async def completion(
         self,
         model: str,
         content_batch: List[InterleavedContent],
         sampling_params: Optional[SamplingParams] = None,
         response_format: Optional[ResponseFormat] = None,
         logprobs: Optional[LogProbConfig] = None,
-    ) -> BatchCompletionResponse: ...
+    ) -> Job: ...

-    @webmethod(route="/batch-inference/chat-completion-inline", method="POST")
-    async def batch_chat_completion_inline(
+    @webmethod(route="/batch-inference/chat-completion", method="POST")
+    async def chat_completion(
         self,
         model: str,
         messages_batch: List[List[Message]],
         sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = list,
-        tool_config: Optional[ToolConfig] = None,
+        # zero-shot tool definitions as input to the model
+        tools: Optional[List[ToolDefinition]] = None,
+        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
+        tool_prompt_format: Optional[ToolPromptFormat] = None,
         response_format: Optional[ResponseFormat] = None,
         logprobs: Optional[LogProbConfig] = None,
-    ) -> BatchChatCompletionResponse: ...
+    ) -> Job: ...
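For illustration, here is a minimal usage sketch of the protocol as reshaped above: both batch methods now return a Job to be polled for completion rather than an inline response object. Only the chat_completion signature, the Job return type, and the imported names come from the hunks above; the run_batch helper, the model id, and the prompts are hypothetical placeholders.

# Hypothetical sketch (not part of this commit): driving the new BatchInference
# protocol, whose batch methods now return a Job instead of inline responses.
from typing import List

from llama_stack.apis.common.job_types import Job
from llama_stack.apis.inference import Message, SamplingParams, UserMessage


async def run_batch(batch_inference) -> Job:
    # `batch_inference` is assumed to be any object implementing the
    # BatchInference protocol defined above.
    messages_batch: List[List[Message]] = [
        [UserMessage(content="Summarize the theory of relativity in one sentence.")],
        [UserMessage(content="Write a haiku about batch processing.")],
    ]
    job = await batch_inference.chat_completion(
        model="example-model-id",  # placeholder model identifier
        messages_batch=messages_batch,
        sampling_params=SamplingParams(max_tokens=64),
    )
    # The API is asynchronous: the returned Job is meant to be polled for
    # completion elsewhere rather than carrying the generations inline.
    return job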


@@ -5,7 +5,6 @@
 # the root directory of this source tree.

 import asyncio
-import logging
 import os

 from typing import AsyncGenerator, List, Optional, Union
@@ -44,6 +43,7 @@ from llama_stack.apis.inference import (
     UserMessage,
 )
 from llama_stack.apis.models import Model, ModelType
+from llama_stack.log import get_logger
 from llama_stack.models.llama.llama3.chat_format import ChatFormat as Llama3ChatFormat
 from llama_stack.models.llama.llama3.tokenizer import Tokenizer as Llama3Tokenizer
 from llama_stack.models.llama.llama4.chat_format import ChatFormat as Llama4ChatFormat
@@ -72,7 +72,7 @@ from .config import MetaReferenceInferenceConfig
 from .generators import LlamaGenerator
 from .model_parallel import LlamaModelParallelGenerator

-log = logging.getLogger(__name__)
+log = get_logger(__name__, category="inference")
 # there's a single model parallel process running serving the model. for now,
 # we don't support multiple concurrent requests to this process.
 SEMAPHORE = asyncio.Semaphore(1)
@@ -159,7 +159,7 @@ class MetaReferenceInferenceImpl(
         self.model_id = model_id
         self.llama_model = llama_model

-        print("Warming up...")
+        log.info("Warming up...")
         await self.completion(
             model_id=model_id,
             content="Hello, world!",
@@ -170,7 +170,7 @@
             messages=[UserMessage(content="Hi how are you?")],
             sampling_params=SamplingParams(max_tokens=20),
         )
-        print("Warmed up!")
+        log.info("Warmed up!")

     def check_model(self, request) -> None:
         if self.model_id is None or self.llama_model is None: