fp8 inference

2025-10-04 04:04:14 +00:00 · 2024-07-20 23:13:47 -07:00 · 2024-07-20 23:13:47 -07:00 · 0746a0f62b
commit 0746a0f62b
parent ad62e2e1f3
2 changed files with 23 additions and 9 deletions
--- a/toolchain/inference/client.py
+++ b/toolchain/inference/client.py
@@ -5,12 +5,13 @@ from typing import AsyncGenerator
 import fire
 import httpx

-from .api.endpoints import (
+from .api import (
    ChatCompletionRequest,
    ChatCompletionResponseStreamChunk,
    CompletionRequest,
    InstructModel,
    ModelInference,
+    UserMessage,
 )


@@ -57,7 +58,7 @@ async def run_main(host: str, port: int):
    )
    async for event in client.chat_completion(
        ChatCompletionRequest(
-            model=InstructModel.llama3_70b_chat,
+            model=InstructModel.llama3_8b_chat,
            messages=[message],
            stream=True,
        )