test: adding an e2e test for measuring TTFT (#1568)

# What does this PR do?

The TTFT (time-to-first-token) number largely depends on input length. Ideally we would have a "standard" test that we can use to measure TTFT against any Llama Stack serving deployment.
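
For context, TTFT can be approximated from the client side by timing how long the first chunk of a streaming chat completion takes to arrive. Below is a minimal sketch, not part of this PR; the `measure_ttft` helper is hypothetical, and it assumes the streaming variant of the same `chat_completion` API the test uses:
```
import time


def measure_ttft(client, model_id, messages):
    """Approximate time-to-first-token by treating the arrival of the
    first streamed chunk as a proxy for the first generated token."""
    start = time.perf_counter()
    stream = client.inference.chat_completion(
        model_id=model_id,
        messages=messages,
        stream=True,
    )
    for _ in stream:
        return time.perf_counter() - start
    return None  # no chunks received
```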

TODO: Once the test-case files are migrated from JSON to YAML, I will add a "notes" field to each test case explaining its purpose in place.

## Test plan

Please refer to the e2e test doc for setup.
```
LLAMA_STACK_PORT=8322 pytest -v -s --stack-config="http://localhost:8322" \
--text-model="meta-llama/Llama-3.2-3B-Instruct" \
tests/integration/inference/test_text_inference.py::test_text_chat_completion_first_token_profiling
```
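
The new test also honors a `DEBUG_TTFT` environment variable (see the diff below). When it is set (e.g. `DEBUG_TTFT=1`), the test deliberately raises a `ValueError` carrying the input token count (ideally around 800 tokens) so the number appears directly in the pytest output; an analogous check after the call reports the response token count (ideally around 150 tokens). The test fails by design in this mode. For example:
```
DEBUG_TTFT=1 LLAMA_STACK_PORT=8322 pytest -v -s --stack-config="http://localhost:8322" \
--text-model="meta-llama/Llama-3.2-3B-Instruct" \
tests/integration/inference/test_text_inference.py::test_text_chat_completion_first_token_profiling
```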
Commit 2370e826bc (parent 5f90be5388), authored by LESSuseLESS on 2025-03-11 14:41:55 -07:00, committed by GitHub.
2 changed files with 57 additions and 0 deletions

tests/integration/inference/test_text_inference.py:

```
@@ -5,6 +5,8 @@
 # the root directory of this source tree.
+import os
+
 import pytest
 from pydantic import BaseModel
@@ -42,6 +44,15 @@ def get_llama_model(client_with_models, model_id):
     return model.metadata.get("llama_model", None)
 
 
+def get_llama_tokenizer():
+    from llama_models.llama3.api.chat_format import ChatFormat
+    from llama_models.llama3.api.tokenizer import Tokenizer
+
+    tokenizer = Tokenizer.get_instance()
+    formatter = ChatFormat(tokenizer)
+    return tokenizer, formatter
+
+
 @pytest.mark.parametrize(
     "test_case",
     [
@@ -213,6 +224,40 @@ def test_text_chat_completion_non_streaming(client_with_models, text_model_id, test_case):
     assert expected.lower() in message_content
 
 
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        "inference:chat_completion:ttft",
+    ],
+)
+def test_text_chat_completion_first_token_profiling(client_with_models, text_model_id, test_case):
+    tc = TestCase(test_case)
+
+    messages = tc["messages"]
+    if os.environ.get("DEBUG_TTFT"):  # debugging print number of tokens in input, ideally around 800
+        from pydantic import TypeAdapter
+
+        from llama_stack.apis.inference import Message
+
+        tokenizer, formatter = get_llama_tokenizer()
+        typed_messages = [TypeAdapter(Message).validate_python(m) for m in messages]
+        encoded = formatter.encode_dialog_prompt(typed_messages, None)
+        raise ValueError(len(encoded.tokens) if encoded and encoded.tokens else 0)
+
+    response = client_with_models.inference.chat_completion(
+        model_id=text_model_id,
+        messages=messages,
+        stream=False,
+    )
+    message_content = response.completion_message.content.lower().strip()
+    assert len(message_content) > 0
+
+    if os.environ.get("DEBUG_TTFT"):  # debugging print number of tokens in response, ideally around 150
+        tokenizer, formatter = get_llama_tokenizer()
+        encoded = formatter.encode_content(message_content)
+        raise ValueError(len(encoded.tokens) if encoded and encoded.tokens else 0)
+
+
 @pytest.mark.parametrize(
     "test_case",
     [
```
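
As an aside, the `get_llama_tokenizer()` helper added above can be exercised on its own to count tokens in arbitrary text, mirroring the response-side `DEBUG_TTFT` path. A minimal standalone sketch, assuming the `llama_models` package is installed:
```
from llama_models.llama3.api.chat_format import ChatFormat
from llama_models.llama3.api.tokenizer import Tokenizer

tokenizer = Tokenizer.get_instance()
formatter = ChatFormat(tokenizer)

# Token count for a plain-text completion, as in the DEBUG_TTFT response path.
encoded = formatter.encode_content("the capital of france is paris.")
print(len(encoded.tokens))
```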