Remove request wrapper migration (#64)

* [1/n] migrate inference/chat_completion
* migrate inference/completion
* inference/completion
* inference regenerate openapi spec
* safety api
* migrate agentic system
* migrate apis without implementations
* re-generate openapi spec
* remove hack from openapi generator
* fix inference
* fix inference
* openapi generator rerun
* Simplified Telemetry API and tying it to logger (#57)
* Simplified Telemetry API and tying it to logger
* small update which adds a METRIC type
* move span events one level down into structured log events
---------
Co-authored-by: Ashwin Bharambe <ashwin@meta.com>
* fix api to work with openapi generator
* fix agentic calling inference
* together adapter inference
* update inference adapters
---------
Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com>
Co-authored-by: Ashwin Bharambe <ashwin@meta.com>
2024-09-12 15:03:49 -07:00 · 2024-09-12 15:03:49 -07:00 · 5712566061
commit 5712566061
parent 1d0e91d802
26 changed files with 1211 additions and 3031 deletions
--- a/llama_toolchain/inference/meta_reference/inference.py
+++ b/llama_toolchain/inference/meta_reference/inference.py
@@ -22,9 +22,12 @@ from llama_toolchain.inference.api import (
    ToolCallParseStatus,
 )
 from llama_toolchain.inference.prepare_messages import prepare_messages
+
 from .config import MetaReferenceImplConfig
 from .model_parallel import LlamaModelParallelGenerator

+from llama_models.llama3.api.datatypes import *  # noqa: F403
+from llama_toolchain.inference.api import *  # noqa: F403

 # there's a single model parallel process running serving the model. for now,
 # we don't support multiple concurrent requests to this process.
@@ -50,10 +53,30 @@ class MetaReferenceInferenceImpl(Inference):
    # hm, when stream=False, we should not be doing SSE :/ which is what the
    # top-level server is going to do. make the typing more specific here
    async def chat_completion(
-        self, request: ChatCompletionRequest
+        self,
+        model: str,
+        messages: List[Message],
+        sampling_params: Optional[SamplingParams] = SamplingParams(),
+        tools: Optional[List[ToolDefinition]] = list(),
+        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
+        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
+        stream: Optional[bool] = False,
+        logprobs: Optional[LogProbConfig] = None,
    ) -> AsyncIterator[
        Union[ChatCompletionResponseStreamChunk, ChatCompletionResponse]
    ]:
+        # wrapper request to make it easier to pass around (internal only, not exposed to API)
+        request = ChatCompletionRequest(
+            model=model,
+            messages=messages,
+            sampling_params=sampling_params,
+            tools=tools,
+            tool_choice=tool_choice,
+            tool_prompt_format=tool_prompt_format,
+            stream=stream,
+            logprobs=logprobs,
+        )
+
        messages = prepare_messages(request)
        model = resolve_model(request.model)
        if model is None: