fp8 inference

This commit is contained in:
Ashwin Bharambe 2024-07-20 23:13:47 -07:00
parent ad62e2e1f3
commit 0746a0f62b
2 changed files with 23 additions and 9 deletions

View file

@@ -5,12 +5,13 @@ from typing import AsyncGenerator
import fire
import httpx
-from .api.endpoints import (
+from .api import (
ChatCompletionRequest,
ChatCompletionResponseStreamChunk,
CompletionRequest,
InstructModel,
ModelInference,
UserMessage,
)
@@ -57,7 +58,7 @@ async def run_main(host: str, port: int):
)
async for event in client.chat_completion(
ChatCompletionRequest(
-model=InstructModel.llama3_70b_chat,
+model=InstructModel.llama3_8b_chat,
messages=[message],
stream=True,
)