Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-04 12:07:34 +00:00)
rename toolchain/ --> llama_toolchain/

commit f9111652ef (parent d95f5f863d)
73 changed files with 36 additions and 37 deletions
llama_toolchain/inference/client.py (new file, 74 lines)
@@ -0,0 +1,74 @@
import asyncio
import json
from typing import AsyncGenerator

import fire
import httpx

from .api import (
    ChatCompletionRequest,
    ChatCompletionResponseStreamChunk,
    CompletionRequest,
    InstructModel,
    Inference,
    UserMessage,
)


class InferenceClient(Inference):
    def __init__(self, base_url: str):
        self.base_url = base_url

    async def initialize(self) -> None:
        pass

    async def shutdown(self) -> None:
        pass

    async def completion(self, request: CompletionRequest) -> AsyncGenerator:
        raise NotImplementedError()

    async def chat_completion(self, request: ChatCompletionRequest) -> AsyncGenerator:
        async with httpx.AsyncClient() as client:
            async with client.stream(
                "POST",
                f"{self.base_url}/inference/chat_completion",
                data=request.json(),
                headers={"Content-Type": "application/json"},
                timeout=20,
            ) as response:
                async for line in response.aiter_lines():
                    if line.startswith("data:"):
                        data = line[len("data: ") :]
                        try:
                            yield ChatCompletionResponseStreamChunk(**json.loads(data))
                        except Exception as e:
                            print(data)
                            print(f"Error with parsing or validation: {e}")


async def run_main(host: str, port: int):
    client = InferenceClient(f"http://{host}:{port}")

    message = UserMessage(content="hello world, help me out here")
    req = ChatCompletionRequest(
        model=InstructModel.llama3_70b_chat,
        messages=[message],
        stream=True,
    )
    async for event in client.chat_completion(
        ChatCompletionRequest(
            model=InstructModel.llama3_8b_chat,
            messages=[message],
            stream=True,
        )
    ):
        print(event)


def main(host: str, port: int):
    asyncio.run(run_main(host, port))


if __name__ == "__main__":
    fire.Fire(main)
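The chat_completion loop above treats the response body as a server-sent-events style stream and only acts on lines prefixed with "data:". For illustration only, a streamed line might look like the example below; the payload fields are assumptions made for this sketch, and the real schema is whatever ChatCompletionResponseStreamChunk defines in .api:

data: {"event": {"event_type": "progress", "delta": "Hello", "stop_reason": null}}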
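Because main is wrapped with fire.Fire, the module also works as a small CLI once the llama_toolchain package is importable; the host and port below are placeholders for wherever an inference server is assumed to be listening:

python -m llama_toolchain.inference.client localhost 5000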