Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-04 04:04:14 +00:00)
commit 0746a0f62b: fp8 inference
parent: ad62e2e1f3
2 changed files with 23 additions and 9 deletions
@@ -5,12 +5,13 @@ from typing import AsyncGenerator

 import fire
 import httpx

-from .api.endpoints import (
+from .api import (
     ChatCompletionRequest,
     ChatCompletionResponseStreamChunk,
     CompletionRequest,
     InstructModel,
     ModelInference,
     UserMessage,
 )
@@ -57,7 +58,7 @@ async def run_main(host: str, port: int):
     )
     async for event in client.chat_completion(
         ChatCompletionRequest(
-            model=InstructModel.llama3_70b_chat,
+            model=InstructModel.llama3_8b_chat,
             messages=[message],
             stream=True,
         )
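For context, the second hunk sits inside run_main, which streams chat-completion chunks from a server and switches the default model from llama3_70b_chat to llama3_8b_chat. A minimal sketch of how that driver plausibly fits together follows; the InferenceClient name, the prompt text, and the printed event shape are assumptions inferred from the visible diff context, not part of this commit.

import asyncio

import fire

from .api import (
    ChatCompletionRequest,
    InstructModel,
    UserMessage,
)


async def run_main(host: str, port: int):
    # InferenceClient is assumed to be defined elsewhere in this file
    # (it is not visible in the hunks above); it wraps the HTTP endpoint
    # exposed at host:port and implements chat_completion.
    client = InferenceClient(f"http://{host}:{port}")

    message = UserMessage(content="hello world, help me out here")
    async for event in client.chat_completion(
        ChatCompletionRequest(
            model=InstructModel.llama3_8b_chat,
            messages=[message],
            stream=True,
        )
    ):
        # Each streamed event is presumably a
        # ChatCompletionResponseStreamChunk, per the import list above.
        print(event)


def main(host: str, port: int):
    asyncio.run(run_main(host, port))


if __name__ == "__main__":
    fire.Fire(main)

The fire and asyncio wiring mirrors the file's own imports (import fire is visible in the first hunk); everything else here is a reconstruction for readability, not the committed code.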