Merge branch 'main' into hide-non-openai-inference-apis

2025-12-20 13:28:41 +00:00 · 2025-09-26 17:48:30 -04:00 · 2025-09-26 17:48:30 -04:00 · 0e78cd5383
commit 0e78cd5383
parent 2657566c1b 60484c5c4e
33 changed files with 2394 additions and 1723 deletions
--- a/llama_stack/apis/batch_inference/__init__.py
+++ b/llama_stack/apis/batch_inference/__init__.py
@@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .batch_inference import *
--- a/llama_stack/apis/batch_inference/batch_inference.py
+++ b/llama_stack/apis/batch_inference/batch_inference.py
@@ -1,79 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Protocol, runtime_checkable
-
-from llama_stack.apis.common.job_types import Job
-from llama_stack.apis.inference import (
-    InterleavedContent,
-    LogProbConfig,
-    Message,
-    ResponseFormat,
-    SamplingParams,
-    ToolChoice,
-    ToolDefinition,
-    ToolPromptFormat,
-)
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.schema_utils import webmethod
-
-
-@runtime_checkable
-class BatchInference(Protocol):
-    """Batch inference API for generating completions and chat completions.
-
-    This is an asynchronous API. If the request is successful, the response will be a job which can be polled for completion.
-
-    NOTE: This API is not yet implemented and is subject to change in concert with other asynchronous APIs
-    including (post-training, evals, etc).
-    """
-
-    @webmethod(route="/batch-inference/completion", method="POST", level=LLAMA_STACK_API_V1)
-    async def completion(
-        self,
-        model: str,
-        content_batch: list[InterleavedContent],
-        sampling_params: SamplingParams | None = None,
-        response_format: ResponseFormat | None = None,
-        logprobs: LogProbConfig | None = None,
-    ) -> Job:
-        """Generate completions for a batch of content.
-
-        :param model: The model to use for the completion.
-        :param content_batch: The content to complete.
-        :param sampling_params: The sampling parameters to use for the completion.
-        :param response_format: The response format to use for the completion.
-        :param logprobs: The logprobs to use for the completion.
-        :returns: A job for the completion.
-        """
-        ...
-
-    @webmethod(route="/batch-inference/chat-completion", method="POST", level=LLAMA_STACK_API_V1)
-    async def chat_completion(
-        self,
-        model: str,
-        messages_batch: list[list[Message]],
-        sampling_params: SamplingParams | None = None,
-        # zero-shot tool definitions as input to the model
-        tools: list[ToolDefinition] | None = None,
-        tool_choice: ToolChoice | None = ToolChoice.auto,
-        tool_prompt_format: ToolPromptFormat | None = None,
-        response_format: ResponseFormat | None = None,
-        logprobs: LogProbConfig | None = None,
-    ) -> Job:
-        """Generate chat completions for a batch of messages.
-
-        :param model: The model to use for the chat completion.
-        :param messages_batch: The messages to complete.
-        :param sampling_params: The sampling parameters to use for the completion.
-        :param tools: The tools to use for the chat completion.
-        :param tool_choice: The tool choice to use for the chat completion.
-        :param tool_prompt_format: The tool prompt format to use for the chat completion.
-        :param response_format: The response format to use for the chat completion.
-        :param logprobs: The logprobs to use for the chat completion.
-        :returns: A job for the chat completion.
-        """
-        ...
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -914,6 +914,7 @@ class OpenAIEmbeddingData(BaseModel):
    """

    object: Literal["embedding"] = "embedding"
+    # TODO: consider dropping str and using openai.types.embeddings.Embedding instead of OpenAIEmbeddingData
    embedding: list[float] | str
    index: int

@@ -974,26 +975,6 @@ class EmbeddingTaskType(Enum):
    document = "document"


-@json_schema_type
-class BatchCompletionResponse(BaseModel):
-    """Response from a batch completion request.
-
-    :param batch: List of completion responses, one for each input in the batch
-    """
-
-    batch: list[CompletionResponse]
-
-
-@json_schema_type
-class BatchChatCompletionResponse(BaseModel):
-    """Response from a batch chat completion request.
-
-    :param batch: List of chat completion responses, one for each conversation in the batch
-    """
-
-    batch: list[ChatCompletionResponse]
-
-
 class OpenAICompletionWithInputMessages(OpenAIChatCompletion):
    input_messages: list[OpenAIMessageParam]

@@ -1049,26 +1030,7 @@ class InferenceProvider(Protocol):
        """
        ...

-    async def batch_completion(
-        self,
-        model_id: str,
-        content_batch: list[InterleavedContent],
-        sampling_params: SamplingParams | None = None,
-        response_format: ResponseFormat | None = None,
-        logprobs: LogProbConfig | None = None,
-    ) -> BatchCompletionResponse:
-        """Generate completions for a batch of content using the specified model.
-
-        :param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
-        :param content_batch: The content to generate completions for.
-        :param sampling_params: (Optional) Parameters to control the sampling strategy.
-        :param response_format: (Optional) Grammar specification for guided (structured) decoding.
-        :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
-        :returns: A BatchCompletionResponse with the full completions.
-        """
-        raise NotImplementedError("Batch completion is not implemented")
-        return  # this is so mypy's safe-super rule will consider the method concrete
-
+    @webmethod(route="/inference/chat-completion", method="POST", level=LLAMA_STACK_API_V1)
    async def chat_completion(
        self,
        model_id: str,
@@ -1108,30 +1070,7 @@ class InferenceProvider(Protocol):
        """
        ...

-    async def batch_chat_completion(
-        self,
-        model_id: str,
-        messages_batch: list[list[Message]],
-        sampling_params: SamplingParams | None = None,
-        tools: list[ToolDefinition] | None = None,
-        tool_config: ToolConfig | None = None,
-        response_format: ResponseFormat | None = None,
-        logprobs: LogProbConfig | None = None,
-    ) -> BatchChatCompletionResponse:
-        """Generate chat completions for a batch of messages using the specified model.
-
-        :param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
-        :param messages_batch: The messages to generate completions for.
-        :param sampling_params: (Optional) Parameters to control the sampling strategy.
-        :param tools: (Optional) List of tool definitions available to the model.
-        :param tool_config: (Optional) Configuration for tool use.
-        :param response_format: (Optional) Grammar specification for guided (structured) decoding.
-        :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
-        :returns: A BatchChatCompletionResponse with the full completions.
-        """
-        raise NotImplementedError("Batch chat completion is not implemented")
-        return  # this is so mypy's safe-super rule will consider the method concrete
-
+    @webmethod(route="/inference/embeddings", method="POST", level=LLAMA_STACK_API_V1)
    async def embeddings(
        self,
        model_id: str,