diff --git a/llama_toolchain/inference/api/api.py b/llama_toolchain/inference/api/api.py
index 7298cb27b..419e2dafb 100644
--- a/llama_toolchain/inference/api/api.py
+++ b/llama_toolchain/inference/api/api.py
@@ -176,7 +176,15 @@ class Inference(Protocol):
     @webmethod(route="/inference/chat_completion")
     async def chat_completion(
         self,
-        request: ChatCompletionRequest,
+        model: str,
+        messages: List[Message],
+        sampling_params: Optional[SamplingParams] = SamplingParams(),
+        # zero-shot tool definitions as input to the model (None when no tools are supplied)
+        tools: Optional[List[ToolDefinition]] = None,
+        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
+        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
+        stream: Optional[bool] = False,
+        logprobs: Optional[LogProbConfig] = None,
     ) -> Union[ChatCompletionResponse, ChatCompletionResponseStreamChunk]: ...
 
     @webmethod(route="/inference/embeddings")
diff --git a/llama_toolchain/inference/client.py b/llama_toolchain/inference/client.py
index 5ba9314bc..c57433a8f 100644
--- a/llama_toolchain/inference/client.py
+++ b/llama_toolchain/inference/client.py
@@ -10,10 +10,10 @@ from typing import Any, AsyncGenerator
 
 import fire
 import httpx
-from pydantic import BaseModel
-from termcolor import cprint
 
 from llama_toolchain.core.datatypes import RemoteProviderConfig
+from pydantic import BaseModel
+from termcolor import cprint
 
 from .api import (
     ChatCompletionRequest,
@@ -52,9 +52,7 @@ class InferenceClient(Inference):
             async with client.stream(
                 "POST",
                 f"{self.base_url}/inference/chat_completion",
-                json={
-                    "request": encodable_dict(request),
-                },
+                json=encodable_dict(request),
                 headers={"Content-Type": "application/json"},
                 timeout=20,
             ) as response:
diff --git a/llama_toolchain/inference/meta_reference/inference.py b/llama_toolchain/inference/meta_reference/inference.py
index 187d5baae..2cc7ecfa6 100644
--- a/llama_toolchain/inference/meta_reference/inference.py
+++ b/llama_toolchain/inference/meta_reference/inference.py
@@ -22,9 +22,12 @@ from llama_toolchain.inference.api import (
     ToolCallParseStatus,
 )
 from llama_toolchain.inference.prepare_messages import prepare_messages
+
 from .config import MetaReferenceImplConfig
 from .model_parallel import LlamaModelParallelGenerator
 
+from llama_models.llama3.api.datatypes import *  # noqa: F403
+from llama_toolchain.inference.api import *  # noqa: F403
 
 # there's a single model parallel process running serving the model. for now,
 # we don't support multiple concurrent requests to this process.
@@ -50,10 +53,29 @@ class MetaReferenceInferenceImpl(Inference):
     # hm, when stream=False, we should not be doing SSE :/ which is what the
     # top-level server is going to do. make the typing more specific here
     async def chat_completion(
-        self, request: ChatCompletionRequest
+        self,
+        model: str,
+        messages: List[Message],
+        sampling_params: Optional[SamplingParams] = SamplingParams(),
+        tools: Optional[List[ToolDefinition]] = list,
+        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
+        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
+        stream: Optional[bool] = False,
+        logprobs: Optional[LogProbConfig] = None,
     ) -> AsyncIterator[
         Union[ChatCompletionResponseStreamChunk, ChatCompletionResponse]
     ]:
+        request = ChatCompletionRequest(
+            model=model,
+            messages=messages,
+            sampling_params=sampling_params,
+            tools=tools,
+            tool_choice=tool_choice,
+            tool_prompt_format=tool_prompt_format,
+            stream=stream,
+            logprobs=logprobs,
+        )
+
         messages = prepare_messages(request)
         model = resolve_model(request.model)
         if model is None: