mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-06-28 02:53:30 +00:00
test: adding an e2e test for measuring TTFT (#1568)
# What does this PR do?

The TTFT (time to first token) number depends largely on input length. Ideally, we would have a "standard" test that can be used to measure TTFT against any Llama Stack serving instance.

TODO: Once JSON is replaced with YAML, I will add "notes" to each test to explain its purpose in place.

## Test plan

Please refer to the e2e test doc for setup.

```
LLAMA_STACK_PORT=8322 pytest -v -s --stack-config="http://localhost:8322" \
  --text-model="meta-llama/Llama-3.2-3B-Instruct" \
  tests/integration/inference/test_text_inference.py::test_text_chat_completion_first_token_profiling
```
This commit is contained in:
parent
5f90be5388
commit
2370e826bc
2 changed files with 57 additions and 0 deletions
|
@ -5,6 +5,8 @@
|
|||
# the root directory of this source tree.
|
||||
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
@ -42,6 +44,15 @@ def get_llama_model(client_with_models, model_id):
|
|||
return model.metadata.get("llama_model", None)
|
||||
|
||||
|
||||
def get_llama_tokenizer():
    """Build the tokenizer/formatter pair used to count tokens in these tests.

    Returns:
        A ``(tokenizer, formatter)`` tuple: the object returned by
        ``Tokenizer.get_instance()`` and a ``ChatFormat`` constructed from it.
    """
    # Imported inside the function so that ``llama_models`` is only required
    # when this helper is actually called (presumably to keep it an optional
    # dependency of the test module -- confirm).
    from llama_models.llama3.api.chat_format import ChatFormat
    from llama_models.llama3.api.tokenizer import Tokenizer

    tok = Tokenizer.get_instance()
    return tok, ChatFormat(tok)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"test_case",
|
||||
[
|
||||
|
@ -213,6 +224,40 @@ def test_text_chat_completion_non_streaming(client_with_models, text_model_id, t
|
|||
assert expected.lower() in message_content
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "test_case",
    [
        "inference:chat_completion:ttft",
    ],
)
def test_text_chat_completion_first_token_profiling(client_with_models, text_model_id, test_case):
    """Run the TTFT test-case prompt through a non-streaming chat completion.

    The test loads the messages of the ``inference:chat_completion:ttft`` test
    case, sends them to ``client_with_models.inference.chat_completion`` and
    asserts that the returned completion message is non-empty. It does not
    time anything itself; latency is presumably measured by whatever serves or
    wraps the request (confirm against the e2e test doc).

    When the ``DEBUG_TTFT`` environment variable is set, the number of tokens
    in the prompt and in the response is printed (run pytest with ``-s`` to
    see the output). The counts are printed rather than raised as an exception
    so that the request is still sent and the response count is reachable.

    Args:
        client_with_models: Client fixture exposing ``inference.chat_completion``.
        text_model_id: Identifier of the text model to query.
        test_case: Key of the test case to load via ``TestCase``.
    """
    tc = TestCase(test_case)
    messages = tc["messages"]

    # Read the flag once so both debug sections agree with each other.
    debug_ttft = bool(os.environ.get("DEBUG_TTFT"))
    formatter = None

    if debug_ttft:  # debugging: print number of tokens in input, ideally around 800
        from pydantic import TypeAdapter

        from llama_stack.apis.inference import Message

        _, formatter = get_llama_tokenizer()
        # Build the adapter once instead of once per message.
        message_adapter = TypeAdapter(Message)
        typed_messages = [message_adapter.validate_python(m) for m in messages]
        encoded = formatter.encode_dialog_prompt(typed_messages, None)
        num_input_tokens = len(encoded.tokens) if encoded and encoded.tokens else 0
        print(f"DEBUG_TTFT: input tokens = {num_input_tokens}")

    response = client_with_models.inference.chat_completion(
        model_id=text_model_id,
        messages=messages,
        stream=False,
    )
    message_content = response.completion_message.content.lower().strip()
    assert len(message_content) > 0

    if debug_ttft:  # debugging: print number of tokens in response, ideally around 150
        encoded = formatter.encode_content(message_content)
        num_output_tokens = len(encoded.tokens) if encoded and encoded.tokens else 0
        print(f"DEBUG_TTFT: response tokens = {num_output_tokens}")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"test_case",
|
||||
[
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue