rename toolchain/ --> llama_toolchain/

2025-10-04 12:07:34 +00:00 · 2024-07-21 23:48:38 -07:00 · 2024-07-21 23:48:38 -07:00 · f9111652ef
commit f9111652ef
parent d95f5f863d
73 changed files with 36 additions and 37 deletions
--- a/toolchain/inference/client.py
+++ b/toolchain/inference/client.py
@@ -1,74 +0,0 @@
-import asyncio
-import json
-from typing import AsyncGenerator
-
-import fire
-import httpx
-
-from .api import (
-    ChatCompletionRequest,
-    ChatCompletionResponseStreamChunk,
-    CompletionRequest,
-    InstructModel,
-    Inference,
-    UserMessage,
-)
-
-
-class InferenceClient(Inference):
-    def __init__(self, base_url: str):
-        self.base_url = base_url
-
-    async def initialize(self) -> None:
-        pass
-
-    async def shutdown(self) -> None:
-        pass
-
-    async def completion(self, request: CompletionRequest) -> AsyncGenerator:
-        raise NotImplementedError()
-
-    async def chat_completion(self, request: ChatCompletionRequest) -> AsyncGenerator:
-        async with httpx.AsyncClient() as client:
-            async with client.stream(
-                "POST",
-                f"{self.base_url}/inference/chat_completion",
-                data=request.json(),
-                headers={"Content-Type": "application/json"},
-                timeout=20,
-            ) as response:
-                async for line in response.aiter_lines():
-                    if line.startswith("data:"):
-                        data = line[len("data: ") :]
-                        try:
-                            yield ChatCompletionResponseStreamChunk(**json.loads(data))
-                        except Exception as e:
-                            print(data)
-                            print(f"Error with parsing or validation: {e}")
-
-
-async def run_main(host: str, port: int):
-    client = InferenceClient(f"http://{host}:{port}")
-
-    message = UserMessage(content="hello world, help me out here")
-    req = ChatCompletionRequest(
-        model=InstructModel.llama3_70b_chat,
-        messages=[message],
-        stream=True,
-    )
-    async for event in client.chat_completion(
-        ChatCompletionRequest(
-            model=InstructModel.llama3_8b_chat,
-            messages=[message],
-            stream=True,
-        )
-    ):
-        print(event)
-
-
-def main(host: str, port: int):
-    asyncio.run(run_main(host, port))
-
-
-if __name__ == "__main__":
-    fire.Fire(main)