[1/n] migrate inference/chat_completion

This commit is contained in:
Xi Yan 2024-09-11 12:21:19 -07:00
parent 1433aaf9f7
commit 0c7c6b7e02
3 changed files with 35 additions and 7 deletions

View file

@@ -176,7 +176,15 @@ class Inference(Protocol):
@webmethod(route="/inference/chat_completion") @webmethod(route="/inference/chat_completion")
async def chat_completion( async def chat_completion(
self, self,
request: ChatCompletionRequest, model: str,
messages: List[Message],
sampling_params: Optional[SamplingParams] = SamplingParams(),
# zero-shot tool definitions as input to the model
tools: Optional[List[ToolDefinition]] = list,
tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
) -> Union[ChatCompletionResponse, ChatCompletionResponseStreamChunk]: ... ) -> Union[ChatCompletionResponse, ChatCompletionResponseStreamChunk]: ...
@webmethod(route="/inference/embeddings") @webmethod(route="/inference/embeddings")

View file

@@ -10,10 +10,10 @@ from typing import Any, AsyncGenerator
import fire import fire
import httpx import httpx
from pydantic import BaseModel
from termcolor import cprint
from llama_toolchain.core.datatypes import RemoteProviderConfig from llama_toolchain.core.datatypes import RemoteProviderConfig
from pydantic import BaseModel
from termcolor import cprint
from .api import ( from .api import (
ChatCompletionRequest, ChatCompletionRequest,
@@ -52,9 +52,7 @@ class InferenceClient(Inference):
async with client.stream( async with client.stream(
"POST", "POST",
f"{self.base_url}/inference/chat_completion", f"{self.base_url}/inference/chat_completion",
json={ json=encodable_dict(request),
"request": encodable_dict(request),
},
headers={"Content-Type": "application/json"}, headers={"Content-Type": "application/json"},
timeout=20, timeout=20,
) as response: ) as response:

View file

@@ -22,9 +22,12 @@ from llama_toolchain.inference.api import (
ToolCallParseStatus, ToolCallParseStatus,
) )
from llama_toolchain.inference.prepare_messages import prepare_messages from llama_toolchain.inference.prepare_messages import prepare_messages
from .config import MetaReferenceImplConfig from .config import MetaReferenceImplConfig
from .model_parallel import LlamaModelParallelGenerator from .model_parallel import LlamaModelParallelGenerator
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_toolchain.inference.api import * # noqa: F403
# there's a single model parallel process running serving the model. for now, # there's a single model parallel process running serving the model. for now,
# we don't support multiple concurrent requests to this process. # we don't support multiple concurrent requests to this process.
@@ -50,10 +53,29 @@ class MetaReferenceInferenceImpl(Inference):
# hm, when stream=False, we should not be doing SSE :/ which is what the # hm, when stream=False, we should not be doing SSE :/ which is what the
# top-level server is going to do. make the typing more specific here # top-level server is going to do. make the typing more specific here
async def chat_completion( async def chat_completion(
self, request: ChatCompletionRequest self,
model: str,
messages: List[Message],
sampling_params: Optional[SamplingParams] = SamplingParams(),
tools: Optional[List[ToolDefinition]] = list,
tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
) -> AsyncIterator[ ) -> AsyncIterator[
Union[ChatCompletionResponseStreamChunk, ChatCompletionResponse] Union[ChatCompletionResponseStreamChunk, ChatCompletionResponse]
]: ]:
request = ChatCompletionRequest(
model=model,
messages=messages,
sampling_params=sampling_params,
tools=tools,
tool_choice=tool_choice,
tool_prompt_format=tool_prompt_format,
stream=stream,
logprobs=logprobs,
)
messages = prepare_messages(request) messages = prepare_messages(request)
model = resolve_model(request.model) model = resolve_model(request.model)
if model is None: if model is None: