Merge branch 'main' into dead_code_removal

Omar Abdelwahab 2025-10-06 13:21:36 -07:00 committed by GitHub
commit 9886520b40
927 changed files with 171924 additions and 102933 deletions


@@ -982,45 +982,6 @@ class InferenceProvider(Protocol):
     model_store: ModelStore | None = None
 
-    async def chat_completion(
-        self,
-        model_id: str,
-        messages: list[Message],
-        sampling_params: SamplingParams | None = None,
-        tools: list[ToolDefinition] | None = None,
-        tool_choice: ToolChoice | None = ToolChoice.auto,
-        tool_prompt_format: ToolPromptFormat | None = None,
-        response_format: ResponseFormat | None = None,
-        stream: bool | None = False,
-        logprobs: LogProbConfig | None = None,
-        tool_config: ToolConfig | None = None,
-    ) -> ChatCompletionResponse | AsyncIterator[ChatCompletionResponseStreamChunk]:
-        """Generate a chat completion for the given messages using the specified model.
-
-        :param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
-        :param messages: List of messages in the conversation.
-        :param sampling_params: Parameters to control the sampling strategy.
-        :param tools: (Optional) List of tool definitions available to the model.
-        :param tool_choice: (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
-            .. deprecated::
-               Use tool_config instead.
-        :param tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model.
-            - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
-            - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag.
-            - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls.
-            .. deprecated::
-               Use tool_config instead.
-        :param response_format: (Optional) Grammar specification for guided (structured) decoding. There are two options:
-            - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format.
-            - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it.
-        :param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
-        :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
-        :param tool_config: (Optional) Configuration for tool use.
-        :returns: If stream=False, returns a ChatCompletionResponse with the full completion.
-            If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk.
-        """
-        ...
-
     @webmethod(route="/inference/rerank", method="POST", level=LLAMA_STACK_API_V1ALPHA)
     async def rerank(
         self,
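For orientation, here is a rough sketch of how the removed chat_completion surface was typically reached, next to the OpenAI-compatible call that the surviving endpoints serve. This is a sketch only: it assumes the llama-stack-client Python SDK and a local server, the model id is illustrative, and exact method paths can differ by SDK version.

# Sketch only -- assumes the llama-stack-client Python SDK; the model id is
# illustrative, and method paths can differ by SDK version.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# The deprecated surface removed by this commit looked roughly like:
#   client.inference.chat_completion(
#       model_id="meta-llama/Llama-3.2-3B-Instruct",
#       messages=[{"role": "user", "content": "Hello!"}],
#   )

# OpenAI-compatible replacement (the chat-completions endpoint shown in a later hunk):
response = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)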
@@ -1081,7 +1042,9 @@ class InferenceProvider(Protocol):
         # for fill-in-the-middle type completion
         suffix: str | None = None,
     ) -> OpenAICompletion:
-        """Generate an OpenAI-compatible completion for the given prompt using the specified model.
+        """Create completion.
+
+        Generate an OpenAI-compatible completion for the given prompt using the specified model.
 
         :param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
         :param prompt: The prompt to generate a completion for.
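A minimal sketch of exercising this completion endpoint through the plain openai SDK pointed at a Llama Stack server; the base URL, OpenAI-compat path, and model id are assumptions that vary by deployment.

# Sketch -- base path and model id are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")
completion = client.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    prompt="def fibonacci(n):",
    suffix="\n    return result",  # fill-in-the-middle, per the suffix param above
    max_tokens=64,
)
print(completion.choices[0].text)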
@@ -1138,7 +1101,9 @@ class InferenceProvider(Protocol):
         top_p: float | None = None,
         user: str | None = None,
     ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
-        """Generate an OpenAI-compatible chat completion for the given messages using the specified model.
+        """Create chat completions.
+
+        Generate an OpenAI-compatible chat completion for the given messages using the specified model.
 
         :param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
         :param messages: List of messages in the conversation.
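Because the return type is OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk], the caller picks the shape with the stream flag. A sketch using the openai SDK; base URL and model id are assumptions.

# Sketch -- stream=True yields chunks instead of one completion object.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")
stream = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Write a haiku about GPUs."}],
    stream=True,
)
for chunk in stream:
    # some chunks carry no content delta (e.g. role or finish markers)
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)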
@@ -1182,7 +1147,9 @@ class InferenceProvider(Protocol):
         dimensions: int | None = None,
         user: str | None = None,
     ) -> OpenAIEmbeddingsResponse:
-        """Generate OpenAI-compatible embeddings for the given input using the specified model.
+        """Create embeddings.
+
+        Generate OpenAI-compatible embeddings for the given input using the specified model.
 
         :param model: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint.
         :param input: Input text to embed, encoded as a string or array of strings. To embed multiple inputs in a single request, pass an array of strings.
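Batching per the input param (string or array of strings), sketched with the openai SDK; the embedding model name is an assumption and must be registered with the stack.

# Sketch -- model name and base path are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")
resp = client.embeddings.create(
    model="all-MiniLM-L6-v2",
    input=["first text to embed", "second text to embed"],  # batched in one request
)
vectors = [item.embedding for item in resp.data]
print(len(vectors), len(vectors[0]))  # 2 vectors, each of the model's dimension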
@@ -1195,7 +1162,9 @@
 class Inference(InferenceProvider):
-    """Llama Stack Inference API for generating completions, chat completions, and embeddings.
+    """Inference
+
+    Llama Stack Inference API for generating completions, chat completions, and embeddings.
 
     This API provides the raw interface to the underlying models. Two kinds of models are supported:
     - LLM models: these models generate "raw" and "chat" (conversational) completions.
@@ -1216,7 +1185,7 @@ class Inference(InferenceProvider):
         model: str | None = None,
         order: Order | None = Order.desc,
     ) -> ListOpenAIChatCompletionResponse:
-        """List all chat completions.
+        """List chat completions.
 
         :param after: The ID of the last chat completion to return.
         :param limit: The maximum number of chat completions to return.
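The after/limit/order parameters describe cursor pagination. A sketch of paging through stored chat completions over plain HTTP; the base URL is an assumption, and the response is assumed to follow the OpenAI-style list shape with data/has_more fields.

# Sketch -- base URL is an assumption; response shape assumed OpenAI-style.
import requests

base = "http://localhost:8321/v1"
params = {"limit": 20, "order": "desc"}
while True:
    page = requests.get(f"{base}/chat/completions", params=params).json()
    for item in page.get("data", []):
        print(item["id"])
    if not page.get("has_more"):
        break
    # cursor pagination: resume after the last ID of this page
    params["after"] = page["data"][-1]["id"]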
@@ -1237,10 +1206,11 @@ class Inference(InferenceProvider):
-        method="GET",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def get_chat_completion(
-        self, completion_id: str
-    ) -> OpenAICompletionWithInputMessages:
-        """Describe a chat completion by its ID.
+    @webmethod(route="/chat/completions/{completion_id}", method="GET", level=LLAMA_STACK_API_V1)
+    async def get_chat_completion(self, completion_id: str) -> OpenAICompletionWithInputMessages:
+        """Get chat completion.
+
+        Describe a chat completion by its ID.
 
         :param completion_id: ID of the chat completion.
         :returns: A OpenAICompletionWithInputMessages.
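And retrieving a single stored completion by ID over the route declared above; the base URL and example ID are placeholders, and the input_messages field name is only inferred from the OpenAICompletionWithInputMessages return type.

# Sketch -- ID and base URL are placeholders; field names are assumptions.
import requests

base = "http://localhost:8321/v1"
completion_id = "chatcmpl-abc123"  # hypothetical ID, e.g. from the list call above
detail = requests.get(f"{base}/chat/completions/{completion_id}").json()
print(detail["id"], len(detail.get("input_messages", [])))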