test: adding an e2e test for measuring TTFT (#1568)

# What does this PR do?

The TTFT (time-to-first-token) number largely depends on input length. Ideally we would have a "standard" test that we can use to measure TTFT against any Llama Stack serving deployment.
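
For context, TTFT can be approximated from the client side by timing how long the first chunk of a streaming chat completion takes to arrive. Below is a minimal sketch, not part of this PR; the `measure_ttft` helper is hypothetical, and it assumes the streaming variant of the same `chat_completion` API the test uses:
```
import time


def measure_ttft(client, model_id, messages):
    """Approximate time-to-first-token by treating the arrival of the
    first streamed chunk as a proxy for the first generated token."""
    start = time.perf_counter()
    stream = client.inference.chat_completion(
        model_id=model_id,
        messages=messages,
        stream=True,
    )
    for _ in stream:
        return time.perf_counter() - start
    return None  # no chunks received
```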

TODO: Once the test-case files are migrated from JSON to YAML, I will add a "notes" field to each test case explaining its purpose in place.

## Test plan

Please refer to the e2e test doc for setup.
```
LLAMA_STACK_PORT=8322 pytest -v -s --stack-config="http://localhost:8322" \
--text-model="meta-llama/Llama-3.2-3B-Instruct" \
tests/integration/inference/test_text_inference.py::test_text_chat_completion_first_token_profiling
```
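
The new test also honors a `DEBUG_TTFT` environment variable (see the diff below). When it is set (e.g. `DEBUG_TTFT=1`), the test deliberately raises a `ValueError` carrying the input token count (ideally around 800 tokens) so the number appears directly in the pytest output; an analogous check after the call reports the response token count (ideally around 150 tokens). The test fails by design in this mode. For example:
```
DEBUG_TTFT=1 LLAMA_STACK_PORT=8322 pytest -v -s --stack-config="http://localhost:8322" \
--text-model="meta-llama/Llama-3.2-3B-Instruct" \
tests/integration/inference/test_text_inference.py::test_text_chat_completion_first_token_profiling
```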
Commit 2370e826bc (parent 5f90be5388), authored by LESSuseLESS on 2025-03-11 14:41:55 -07:00, committed by GitHub.
2 changed files with 57 additions and 0 deletions

tests/integration/inference/test_text_inference.py:

```
@@ -5,6 +5,8 @@
 # the root directory of this source tree.
+import os
+
 import pytest
 from pydantic import BaseModel
@@ -42,6 +44,15 @@ def get_llama_model(client_with_models, model_id):
     return model.metadata.get("llama_model", None)
 
 
+def get_llama_tokenizer():
+    from llama_models.llama3.api.chat_format import ChatFormat
+    from llama_models.llama3.api.tokenizer import Tokenizer
+
+    tokenizer = Tokenizer.get_instance()
+    formatter = ChatFormat(tokenizer)
+    return tokenizer, formatter
+
+
 @pytest.mark.parametrize(
     "test_case",
     [
@@ -213,6 +224,40 @@ def test_text_chat_completion_non_streaming(client_with_models, text_model_id, test_case):
     assert expected.lower() in message_content
 
 
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        "inference:chat_completion:ttft",
+    ],
+)
+def test_text_chat_completion_first_token_profiling(client_with_models, text_model_id, test_case):
+    tc = TestCase(test_case)
+
+    messages = tc["messages"]
+    if os.environ.get("DEBUG_TTFT"):  # debugging print number of tokens in input, ideally around 800
+        from pydantic import TypeAdapter
+
+        from llama_stack.apis.inference import Message
+
+        tokenizer, formatter = get_llama_tokenizer()
+        typed_messages = [TypeAdapter(Message).validate_python(m) for m in messages]
+        encoded = formatter.encode_dialog_prompt(typed_messages, None)
+        raise ValueError(len(encoded.tokens) if encoded and encoded.tokens else 0)
+
+    response = client_with_models.inference.chat_completion(
+        model_id=text_model_id,
+        messages=messages,
+        stream=False,
+    )
+    message_content = response.completion_message.content.lower().strip()
+    assert len(message_content) > 0
+
+    if os.environ.get("DEBUG_TTFT"):  # debugging print number of tokens in response, ideally around 150
+        tokenizer, formatter = get_llama_tokenizer()
+        encoded = formatter.encode_content(message_content)
+        raise ValueError(len(encoded.tokens) if encoded and encoded.tokens else 0)
+
+
 @pytest.mark.parametrize(
     "test_case",
     [
```
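
As an aside, the `get_llama_tokenizer()` helper added above can be exercised on its own to count tokens in arbitrary text, mirroring the response-side `DEBUG_TTFT` path. A minimal standalone sketch, assuming the `llama_models` package is installed:
```
from llama_models.llama3.api.chat_format import ChatFormat
from llama_models.llama3.api.tokenizer import Tokenizer

tokenizer = Tokenizer.get_instance()
formatter = ChatFormat(tokenizer)

# Token count for a plain-text completion, as in the DEBUG_TTFT response path.
encoded = formatter.encode_content("the capital of france is paris.")
print(len(encoded.tokens))
```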