Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-07-29 07:14:20 +00:00)
unit test for inline inference
This commit is contained in:
parent cc98fbb058
commit 5b9c05c5dd
1 changed file with 114 additions and 0 deletions
tests/test_inference.py (new file, 114 lines)
@@ -0,0 +1,114 @@
# Run this test using the following command:
# python -m unittest tests/test_inference.py

import os
import unittest

from llama_models.llama3_1.api.datatypes import (
    InstructModel,
    UserMessage,
)

from llama_toolchain.inference.api.config import (
    ImplType,
    InferenceConfig,
    InlineImplConfig,
    ModelCheckpointConfig,
    PytorchCheckpoint,
    CheckpointQuantizationFormat,
)
from llama_toolchain.inference.api.datatypes import (
    ChatCompletionResponseEventType,
)
from llama_toolchain.inference.api.endpoints import (
    ChatCompletionRequest,
)
from llama_toolchain.inference.inference import InferenceImpl
from llama_toolchain.inference.event_logger import EventLogger


HELPER_MSG = """
This test needs the llama-3.1-8b-instruct model.
Please download it using the llama CLI:

llama download --source huggingface --model-id llama3_1_8b_instruct --hf-token <HF_TOKEN>
"""


class InferenceTests(unittest.IsolatedAsyncioTestCase):

    async def asyncSetUp(self):
        # assert the model checkpoint exists locally
        model_dir = os.path.expanduser(
            "~/.llama/checkpoints/Meta-Llama-3.1-8B-Instruct/original/"
        )
        assert os.path.isdir(model_dir), HELPER_MSG

        tokenizer_path = os.path.join(model_dir, "tokenizer.model")
        assert os.path.exists(tokenizer_path), HELPER_MSG
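
        # Inline (in-process) implementation config: one model-parallel rank
        # for the 8B checkpoint, bf16 weights, and a 2048-token context window.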
        inference_config = InlineImplConfig(
            checkpoint_config=ModelCheckpointConfig(
                checkpoint=PytorchCheckpoint(
                    checkpoint_dir=model_dir,
                    tokenizer_path=tokenizer_path,
                    model_parallel_size=1,
                    quantization_format=CheckpointQuantizationFormat.bf16,
                )
            ),
            max_seq_len=2048,
        )

        self.inference = InferenceImpl(inference_config)
        await self.inference.initialize()

    async def asyncTearDown(self):
        await self.inference.shutdown()

    async def test_inline_inference_no_streaming(self):
        request = ChatCompletionRequest(
            model=InstructModel.llama3_8b_chat,
            messages=[
                UserMessage(
                    content="What is the capital of France?",
                ),
            ],
            stream=False,
        )
        iterator = self.inference.chat_completion(request)
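
        # stream=False still returns an async iterator; keep the last chunk,
        # which carries the full completion_message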
        async for chunk in iterator:
            response = chunk

        result = response.completion_message.content
        self.assertTrue("Paris" in result, result)

    async def test_inline_inference_streaming(self):
        request = ChatCompletionRequest(
            model=InstructModel.llama3_8b_chat,
            messages=[
                UserMessage(
                    content="What is the capital of France?",
                ),
            ],
            stream=True,
        )
        iterator = self.inference.chat_completion(request)

        events = []
        async for chunk in iterator:
            events.append(chunk.event)
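
        # the first event marks start and the last marks complete; the
        # events in between carry the incremental text deltas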
        self.assertEqual(
            events[0].event_type,
            ChatCompletionResponseEventType.start
        )
        self.assertEqual(
            events[-1].event_type,
            ChatCompletionResponseEventType.complete
        )

        response = ""
        for e in events[1:-1]:
            response += e.delta

        self.assertTrue("Paris" in response, response)
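
For quick manual checks outside of unittest, the same inline implementation can be driven from a small script. The sketch below is an untested illustration assembled only from the names the test already imports; the asyncio entry point and the delta-printing loop are additions for illustration, not part of this commit.

# Illustrative sketch, not part of the commit: stream a completion using
# the same inline config the test builds in asyncSetUp.
import asyncio
import os

from llama_models.llama3_1.api.datatypes import InstructModel, UserMessage
from llama_toolchain.inference.api.config import (
    CheckpointQuantizationFormat,
    InlineImplConfig,
    ModelCheckpointConfig,
    PytorchCheckpoint,
)
from llama_toolchain.inference.api.datatypes import ChatCompletionResponseEventType
from llama_toolchain.inference.api.endpoints import ChatCompletionRequest
from llama_toolchain.inference.inference import InferenceImpl


async def main() -> None:
    model_dir = os.path.expanduser(
        "~/.llama/checkpoints/Meta-Llama-3.1-8B-Instruct/original/"
    )
    config = InlineImplConfig(
        checkpoint_config=ModelCheckpointConfig(
            checkpoint=PytorchCheckpoint(
                checkpoint_dir=model_dir,
                tokenizer_path=os.path.join(model_dir, "tokenizer.model"),
                model_parallel_size=1,
                quantization_format=CheckpointQuantizationFormat.bf16,
            )
        ),
        max_seq_len=2048,
    )
    inference = InferenceImpl(config)
    await inference.initialize()
    try:
        request = ChatCompletionRequest(
            model=InstructModel.llama3_8b_chat,
            messages=[UserMessage(content="What is the capital of France?")],
            stream=True,
        )
        # mirror the test: events between start and complete carry the deltas
        async for chunk in inference.chat_completion(request):
            event = chunk.event
            if event.event_type not in (
                ChatCompletionResponseEventType.start,
                ChatCompletionResponseEventType.complete,
            ):
                print(event.delta, end="", flush=True)
        print()
    finally:
        await inference.shutdown()


if __name__ == "__main__":
    asyncio.run(main())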