diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index ff7f492e7..3e9539f41 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -128,49 +128,6 @@
         }
       }
     },
-    "/v1/batch-inference/chat-completion-inline": {
-      "post": {
-        "responses": {
-          "200": {
-            "description": "OK",
-            "content": {
-              "application/json": {
-                "schema": {
-                  "$ref": "#/components/schemas/BatchChatCompletionResponse"
-                }
-              }
-            }
-          },
-          "400": {
-            "$ref": "#/components/responses/BadRequest400"
-          },
-          "429": {
-            "$ref": "#/components/responses/TooManyRequests429"
-          },
-          "500": {
-            "$ref": "#/components/responses/InternalServerError500"
-          },
-          "default": {
-            "$ref": "#/components/responses/DefaultError"
-          }
-        },
-        "tags": [
-          "BatchInference (Coming Soon)"
-        ],
-        "description": "",
-        "parameters": [],
-        "requestBody": {
-          "content": {
-            "application/json": {
-              "schema": {
-                "$ref": "#/components/schemas/BatchChatCompletionInlineRequest"
-              }
-            }
-          },
-          "required": true
-        }
-      }
-    },
     "/v1/inference/batch-completion": {
       "post": {
         "responses": {
@@ -214,49 +171,6 @@
         }
       }
     },
-    "/v1/batch-inference/completion-inline": {
-      "post": {
-        "responses": {
-          "200": {
-            "description": "OK",
-            "content": {
-              "application/json": {
-                "schema": {
-                  "$ref": "#/components/schemas/BatchCompletionResponse"
-                }
-              }
-            }
-          },
-          "400": {
-            "$ref": "#/components/responses/BadRequest400"
-          },
-          "429": {
-            "$ref": "#/components/responses/TooManyRequests429"
-          },
-          "500": {
-            "$ref": "#/components/responses/InternalServerError500"
-          },
-          "default": {
-            "$ref": "#/components/responses/DefaultError"
-          }
-        },
-        "tags": [
-          "BatchInference (Coming Soon)"
-        ],
-        "description": "",
-        "parameters": [],
-        "requestBody": {
-          "content": {
-            "application/json": {
-              "schema": {
-                "$ref": "#/components/schemas/BatchCompletionInlineRequest"
-              }
-            }
-          },
-          "required": true
-        }
-      }
-    },
     "/v1/post-training/job/cancel": {
       "post": {
         "responses": {
@@ -325,7 +239,7 @@
           }
         },
         "tags": [
-          "Inference"
+          "BatchInference (Coming Soon)"
         ],
         "description": "Generate a chat completion for the given messages using the specified model.",
         "parameters": [],
@@ -373,7 +287,7 @@
           }
         },
         "tags": [
-          "Inference"
+          "BatchInference (Coming Soon)"
         ],
         "description": "Generate a completion for the given content using the specified model.",
         "parameters": [],
@@ -4821,56 +4735,6 @@
       "title": "TokenLogProbs",
       "description": "Log probabilities for generated tokens."
     },
-    "BatchChatCompletionInlineRequest": {
-      "type": "object",
-      "properties": {
-        "model": {
-          "type": "string"
-        },
-        "messages_batch": {
-          "type": "array",
-          "items": {
-            "type": "array",
-            "items": {
-              "$ref": "#/components/schemas/Message"
-            }
-          }
-        },
-        "sampling_params": {
-          "$ref": "#/components/schemas/SamplingParams"
-        },
-        "tools": {
-          "type": "array",
-          "items": {
-            "$ref": "#/components/schemas/ToolDefinition"
-          }
-        },
-        "tool_config": {
-          "$ref": "#/components/schemas/ToolConfig"
-        },
-        "response_format": {
-          "$ref": "#/components/schemas/ResponseFormat"
-        },
-        "logprobs": {
-          "type": "object",
-          "properties": {
-            "top_k": {
-              "type": "integer",
-              "default": 0,
-              "description": "How many tokens (for each position) to return log probabilities for."
-            }
-          },
-          "additionalProperties": false,
-          "title": "LogProbConfig"
-        }
-      },
-      "additionalProperties": false,
-      "required": [
-        "model",
-        "messages_batch"
-      ],
-      "title": "BatchChatCompletionInlineRequest"
-    },
     "BatchCompletionRequest": {
       "type": "object",
       "properties": {
@@ -4963,44 +4827,6 @@
       "title": "CompletionResponse",
       "description": "Response from a completion request."
     },
-    "BatchCompletionInlineRequest": {
-      "type": "object",
-      "properties": {
-        "model": {
-          "type": "string"
-        },
-        "content_batch": {
-          "type": "array",
-          "items": {
-            "$ref": "#/components/schemas/InterleavedContent"
-          }
-        },
-        "sampling_params": {
-          "$ref": "#/components/schemas/SamplingParams"
-        },
-        "response_format": {
-          "$ref": "#/components/schemas/ResponseFormat"
-        },
-        "logprobs": {
-          "type": "object",
-          "properties": {
-            "top_k": {
-              "type": "integer",
-              "default": 0,
-              "description": "How many tokens (for each position) to return log probabilities for."
-            }
-          },
-          "additionalProperties": false,
-          "title": "LogProbConfig"
-        }
-      },
-      "additionalProperties": false,
-      "required": [
-        "model",
-        "content_batch"
-      ],
-      "title": "BatchCompletionInlineRequest"
-    },
     "CancelTrainingJobRequest": {
       "type": "object",
       "properties": {
@@ -11331,7 +11157,9 @@
       "x-displayName": "Agents API for creating and interacting with agentic systems."
     },
     {
-      "name": "BatchInference (Coming Soon)"
+      "name": "BatchInference (Coming Soon)",
+      "description": "This is an asynchronous API. If the request is successful, the response will be a job which can be polled for completion.\n\nNOTE: This API is not yet implemented and is subject to change in concert with other asynchronous APIs\nincluding (post-training, evals, etc).",
+      "x-displayName": "Batch inference API for generating completions and chat completions."
     },
     {
       "name": "Benchmarks"
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 279e240ee..0e632fcde 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -69,35 +69,6 @@ paths:
             schema:
               $ref: '#/components/schemas/BatchChatCompletionRequest'
         required: true
-  /v1/batch-inference/chat-completion-inline:
-    post:
-      responses:
-        '200':
-          description: OK
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/BatchChatCompletionResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - BatchInference (Coming Soon)
-      description: ''
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/BatchChatCompletionInlineRequest'
-        required: true
   /v1/inference/batch-completion:
     post:
       responses:
@@ -127,35 +98,6 @@ paths:
             schema:
               $ref: '#/components/schemas/BatchCompletionRequest'
         required: true
-  /v1/batch-inference/completion-inline:
-    post:
-      responses:
-        '200':
-          description: OK
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/BatchCompletionResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - BatchInference (Coming Soon)
-      description: ''
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/BatchCompletionInlineRequest'
-        required: true
   /v1/post-training/job/cancel:
     post:
       responses:
@@ -206,7 +148,7 @@
         default:
           $ref: '#/components/responses/DefaultError'
       tags:
-        - Inference
+        - BatchInference (Coming Soon)
       description: >-
         Generate a chat completion for the given messages using the specified model.
       parameters: []
@@ -241,7 +183,7 @@
         default:
           $ref: '#/components/responses/DefaultError'
       tags:
-        - Inference
+        - BatchInference (Coming Soon)
       description: >-
         Generate a completion for the given content using the specified model.
       parameters: []
@@ -3346,42 +3288,6 @@ components:
         - logprobs_by_token
       title: TokenLogProbs
       description: Log probabilities for generated tokens.
-    BatchChatCompletionInlineRequest:
-      type: object
-      properties:
-        model:
-          type: string
-        messages_batch:
-          type: array
-          items:
-            type: array
-            items:
-              $ref: '#/components/schemas/Message'
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-        tools:
-          type: array
-          items:
-            $ref: '#/components/schemas/ToolDefinition'
-        tool_config:
-          $ref: '#/components/schemas/ToolConfig'
-        response_format:
-          $ref: '#/components/schemas/ResponseFormat'
-        logprobs:
-          type: object
-          properties:
-            top_k:
-              type: integer
-              default: 0
-              description: >-
-                How many tokens (for each position) to return log probabilities for.
-          additionalProperties: false
-          title: LogProbConfig
-      additionalProperties: false
-      required:
-        - model
-        - messages_batch
-      title: BatchChatCompletionInlineRequest
     BatchCompletionRequest:
       type: object
       properties:
@@ -3450,34 +3356,6 @@ components:
         - stop_reason
       title: CompletionResponse
       description: Response from a completion request.
-    BatchCompletionInlineRequest:
-      type: object
-      properties:
-        model:
-          type: string
-        content_batch:
-          type: array
-          items:
-            $ref: '#/components/schemas/InterleavedContent'
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-        response_format:
-          $ref: '#/components/schemas/ResponseFormat'
-        logprobs:
-          type: object
-          properties:
-            top_k:
-              type: integer
-              default: 0
-              description: >-
-                How many tokens (for each position) to return log probabilities for.
-          additionalProperties: false
-          title: LogProbConfig
-      additionalProperties: false
-      required:
-        - model
-        - content_batch
-      title: BatchCompletionInlineRequest
     CancelTrainingJobRequest:
       type: object
       properties:
@@ -7737,6 +7615,17 @@ tags:
   x-displayName: >-
     Agents API for creating and interacting with agentic systems.
 - name: BatchInference (Coming Soon)
+  description: >-
+    This is an asynchronous API. If the request is successful, the response will
+    be a job which can be polled for completion.
+
+
+    NOTE: This API is not yet implemented and is subject to change in concert with
+    other asynchronous APIs
+
+    including (post-training, evals, etc).
+  x-displayName: >-
+    Batch inference API for generating completions and chat completions.
 - name: Benchmarks
 - name: DatasetIO
 - name: Datasets
diff --git a/llama_stack/apis/batch_inference/batch_inference.py b/llama_stack/apis/batch_inference/batch_inference.py
index 57fcd7ebb..7a324128d 100644
--- a/llama_stack/apis/batch_inference/batch_inference.py
+++ b/llama_stack/apis/batch_inference/batch_inference.py
@@ -6,40 +6,50 @@
 
 from typing import List, Optional, Protocol, runtime_checkable
 
+from llama_stack.apis.common.job_types import Job
 from llama_stack.apis.inference import (
-    BatchChatCompletionResponse,
-    BatchCompletionResponse,
     InterleavedContent,
     LogProbConfig,
     Message,
     ResponseFormat,
     SamplingParams,
-    ToolConfig,
+    ToolChoice,
     ToolDefinition,
+    ToolPromptFormat,
 )
 from llama_stack.schema_utils import webmethod
 
 
 @runtime_checkable
 class BatchInference(Protocol):
-    @webmethod(route="/batch-inference/completion-inline", method="POST")
-    async def batch_completion_inline(
+    """Batch inference API for generating completions and chat completions.
+
+    This is an asynchronous API. If the request is successful, the response will be a job which can be polled for completion.
+
+    NOTE: This API is not yet implemented and is subject to change in concert with other asynchronous APIs
+    including (post-training, evals, etc).
+    """
+
+    @webmethod(route="/batch-inference/completion", method="POST")
+    async def completion(
         self,
         model: str,
        content_batch: List[InterleavedContent],
         sampling_params: Optional[SamplingParams] = None,
         response_format: Optional[ResponseFormat] = None,
         logprobs: Optional[LogProbConfig] = None,
-    ) -> BatchCompletionResponse: ...
+    ) -> Job: ...
 
-    @webmethod(route="/batch-inference/chat-completion-inline", method="POST")
-    async def batch_chat_completion_inline(
+    @webmethod(route="/batch-inference/chat-completion", method="POST")
+    async def chat_completion(
         self,
         model: str,
         messages_batch: List[List[Message]],
         sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = list,
-        tool_config: Optional[ToolConfig] = None,
+        # zero-shot tool definitions as input to the model
+        tools: Optional[List[ToolDefinition]] = None,
+        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
+        tool_prompt_format: Optional[ToolPromptFormat] = None,
         response_format: Optional[ResponseFormat] = None,
         logprobs: Optional[LogProbConfig] = None,
-    ) -> BatchChatCompletionResponse: ...
+    ) -> Job: ...
diff --git a/llama_stack/providers/inline/inference/meta_reference/inference.py b/llama_stack/providers/inline/inference/meta_reference/inference.py
index faf19a9c6..da5ded0f3 100644
--- a/llama_stack/providers/inline/inference/meta_reference/inference.py
+++ b/llama_stack/providers/inline/inference/meta_reference/inference.py
@@ -5,7 +5,6 @@
 # the root directory of this source tree.
 
 import asyncio
-import logging
 import os
 from typing import AsyncGenerator, List, Optional, Union
 
@@ -44,6 +43,7 @@ from llama_stack.apis.inference import (
     UserMessage,
 )
 from llama_stack.apis.models import Model, ModelType
+from llama_stack.log import get_logger
 from llama_stack.models.llama.llama3.chat_format import ChatFormat as Llama3ChatFormat
 from llama_stack.models.llama.llama3.tokenizer import Tokenizer as Llama3Tokenizer
 from llama_stack.models.llama.llama4.chat_format import ChatFormat as Llama4ChatFormat
@@ -72,7 +72,7 @@ from .config import MetaReferenceInferenceConfig
 from .generators import LlamaGenerator
 from .model_parallel import LlamaModelParallelGenerator
 
-log = logging.getLogger(__name__)
+log = get_logger(__name__, category="inference")
 # there's a single model parallel process running serving the model. for now,
 # we don't support multiple concurrent requests to this process.
 SEMAPHORE = asyncio.Semaphore(1)
@@ -159,7 +159,7 @@ class MetaReferenceInferenceImpl(
         self.model_id = model_id
         self.llama_model = llama_model
 
-        print("Warming up...")
+        log.info("Warming up...")
         await self.completion(
             model_id=model_id,
             content="Hello, world!",
@@ -170,7 +170,7 @@
             messages=[UserMessage(content="Hi how are you?")],
             sampling_params=SamplingParams(max_tokens=20),
         )
-        print("Warmed up!")
+        log.info("Warmed up!")
 
     def check_model(self, request) -> None:
         if self.model_id is None or self.llama_model is None:
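
As a rough usage sketch of the redesigned asynchronous protocol: assuming a provider object implementing BatchInference is already in hand, a caller might submit a batch and poll the returned Job roughly as below. The model identifier and the job_is_done helper are hypothetical; this change only reshapes the protocol surface and does not yet define a status or result-retrieval endpoint.

import asyncio

from llama_stack.apis.batch_inference.batch_inference import BatchInference
from llama_stack.apis.common.job_types import Job
from llama_stack.apis.inference import SamplingParams, UserMessage


async def job_is_done(job: Job) -> bool:
    # Placeholder: the docstring says the job "can be polled for completion",
    # but no status endpoint exists yet, so a real implementation would query
    # whatever job-status API eventually lands alongside this protocol.
    raise NotImplementedError


async def run_batch(batch_inference: BatchInference) -> Job:
    # chat_completion now returns a Job handle immediately instead of a
    # BatchChatCompletionResponse, mirroring other async APIs (post-training, evals).
    job = await batch_inference.chat_completion(
        model="example-model-id",  # hypothetical model identifier
        messages_batch=[
            [UserMessage(content="Summarize the release notes.")],
            [UserMessage(content="Translate 'hello' to French.")],
        ],
        sampling_params=SamplingParams(max_tokens=64),
    )
    # Poll until the job reports completion (see placeholder above).
    while not await job_is_done(job):
        await asyncio.sleep(5)
    return job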