feat: D69478008 [llama-stack] turning tests into data-driven tests (#1180)

# What does this PR do?

We run tests in several places, each for a different purpose:
- oss llama stack
  - provider tests
  - e2e tests
- provider llama stack
  - unit tests
  - e2e tests

It would be nice if they could *share the same set of test data*, so that the
spec and the implementation stay consistent. That is what this diff is about:
isolating test data from test code, so the same data can be reused in different
places by writing different test code against it.
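As a rough illustration of the pattern only: the shared data can live in a JSON file keyed by test-case id, and every test harness looks up inputs and expected outputs through a small helper instead of hard-coding them. In the sketch below the file path, JSON layout, and helper internals are assumptions; only the `TestCase` name and the `tc[...]` access style match the diff further down.

```python
import json
from pathlib import Path

# Assumed location and layout of the shared test data (one JSON object per
# test-case id); the actual repository layout may differ.
DATA_FILE = Path("test_cases/inference/test_cases.json")


class TestCase:
    """Dict-style access to one test-case entry, looked up by id."""

    def __init__(self, name: str):
        self._data = json.loads(DATA_FILE.read_text())[name]

    def __getitem__(self, key: str):
        return self._data[key]


# Any runner (provider tests, client-sdk e2e tests, unit tests) can consume
# the same entries:
tc = TestCase("completion-01")
user_input = tc["user_input"]  # prompt fed to the model
expected = tc["expected"]      # values the assertions compare against
```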

## Test Plan

== Set up an Ollama local server

== Run a provider test

conda activate stack

OLLAMA_URL="http://localhost:8321" \
  pytest -v -s -k "ollama" --inference-model="llama3.2:3b-instruct-fp16" \
  llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_completion_structured_output

// test_structured_output should also work

== Run an e2e test

  - Set up and run a llama stack server,

conda activate sherpa
with-proxy pip install llama-stack
export INFERENCE_MODEL=llama3.2:3b-instruct-fp16
export LLAMA_STACK_PORT=8322
with-proxy llama stack build --template ollama
with-proxy llama stack run --env OLLAMA_URL=http://localhost:8321 ollama

  - Run a test client,

LLAMA_STACK_PORT=8322 LLAMA_STACK_BASE_URL="http://localhost:8322" \
  pytest -v -s --inference-model="llama3.2:3b-instruct-fp16" \
  tests/client-sdk/inference/test_text_inference.py::test_text_completion_structured_output

// test_text_chat_completion_structured_output should also work

## Notes

- This PR was automatically generated by oss_sync
- Please refer to D69478008 for more details.
8 changed files with 123 additions and 47 deletions

llama_stack/providers/tests/inference/test_text_inference.py

@@ -6,7 +6,7 @@
 import pytest
-from pydantic import BaseModel, ValidationError
+from pydantic import BaseModel, TypeAdapter, ValidationError
 
 from llama_stack.apis.common.content_types import ToolCallParseStatus
 from llama_stack.apis.inference import (
@@ -17,6 +17,7 @@ from llama_stack.apis.inference import (
     CompletionResponseStreamChunk,
     JsonSchemaResponseFormat,
     LogProbConfig,
+    Message,
     SystemMessage,
     ToolChoice,
     UserMessage,
@@ -30,6 +31,7 @@ from llama_stack.models.llama.datatypes import (
     ToolParamDefinition,
     ToolPromptFormat,
 )
 
+from llama_stack.providers.tests.test_cases.test_case import TestCase
 
 from .utils import group_chunks
@@ -178,8 +180,9 @@ class TestInference:
             else:  # no token, no logprobs
                 assert not chunk.logprobs, "Logprobs should be empty"
 
+    @pytest.mark.parametrize("test_case", ["completion-01"])
     @pytest.mark.asyncio(loop_scope="session")
-    async def test_completion_structured_output(self, inference_model, inference_stack):
+    async def test_completion_structured_output(self, inference_model, inference_stack, test_case):
         inference_impl, _ = inference_stack
 
         class Output(BaseModel):
@@ -187,7 +190,9 @@ class TestInference:
             year_born: str
             year_retired: str
 
-        user_input = "Michael Jordan was born in 1963. He played basketball for the Chicago Bulls. He retired in 2003."
+        tc = TestCase(test_case)
+        user_input = tc["user_input"]
+
         response = await inference_impl.completion(
             model_id=inference_model,
             content=user_input,
@@ -203,9 +208,10 @@ class TestInference:
 
         assert isinstance(response.content, str)
         answer = Output.model_validate_json(response.content)
-        assert answer.name == "Michael Jordan"
-        assert answer.year_born == "1963"
-        assert answer.year_retired == "2003"
+        expected = tc["expected"]
+        assert answer.name == expected["name"]
+        assert answer.year_born == expected["year_born"]
+        assert answer.year_retired == expected["year_retired"]
 
     @pytest.mark.asyncio(loop_scope="session")
     async def test_chat_completion_non_streaming(
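The hard-coded values removed above suggest what the "completion-01" entry carries. Here is a hypothetical sketch of its shape, written as a Python literal; the values come from the assertions this hunk replaces, while the surrounding layout and file name are assumptions since the data file itself is not shown in this PR.

```python
# Hypothetical content of the "completion-01" test case; values taken from the
# removed assertions, dict layout assumed.
completion_01 = {
    "user_input": (
        "Michael Jordan was born in 1963. He played basketball for the Chicago Bulls. "
        "He retired in 2003."
    ),
    "expected": {
        "name": "Michael Jordan",
        "year_born": "1963",
        "year_retired": "2003",
    },
}
```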
@@ -224,8 +230,9 @@ class TestInference:
         assert isinstance(response.completion_message.content, str)
         assert len(response.completion_message.content) > 0
 
+    @pytest.mark.parametrize("test_case", ["chat_completion-01"])
     @pytest.mark.asyncio(loop_scope="session")
-    async def test_structured_output(self, inference_model, inference_stack, common_params):
+    async def test_structured_output(self, inference_model, inference_stack, common_params, test_case):
         inference_impl, _ = inference_stack
 
         class AnswerFormat(BaseModel):
@@ -234,20 +241,12 @@ class TestInference:
             year_of_birth: int
             num_seasons_in_nba: int
 
+        tc = TestCase(test_case)
+        messages = [TypeAdapter(Message).validate_python(m) for m in tc["messages"]]
+
         response = await inference_impl.chat_completion(
             model_id=inference_model,
-            messages=[
-                # we include context about Michael Jordan in the prompt so that the test is
-                # focused on the funtionality of the model and not on the information embedded
-                # in the model. Llama 3.2 3B Instruct tends to think MJ played for 14 seasons.
-                SystemMessage(
-                    content=(
-                        "You are a helpful assistant.\n\n"
-                        "Michael Jordan was born in 1963. He played basketball for the Chicago Bulls for 15 seasons."
-                    )
-                ),
-                UserMessage(content="Please give me information about Michael Jordan."),
-            ],
+            messages=messages,
             stream=False,
             response_format=JsonSchemaResponseFormat(
                 json_schema=AnswerFormat.model_json_schema(),
@@ -260,10 +259,11 @@ class TestInference:
 
         assert isinstance(response.completion_message.content, str)
         answer = AnswerFormat.model_validate_json(response.completion_message.content)
-        assert answer.first_name == "Michael"
-        assert answer.last_name == "Jordan"
-        assert answer.year_of_birth == 1963
-        assert answer.num_seasons_in_nba == 15
+        expected = tc["expected"]
+        assert answer.first_name == expected["first_name"]
+        assert answer.last_name == expected["last_name"]
+        assert answer.year_of_birth == expected["year_of_birth"]
+        assert answer.num_seasons_in_nba == expected["num_seasons_in_nba"]
 
         response = await inference_impl.chat_completion(
             model_id=inference_model,
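Similarly, the removed SystemMessage/UserMessage block and assertions indicate what "chat_completion-01" presumably contains. The sketch below is hypothetical: the message texts and expected values come from the code removed above, while the role/content dict layout (which the new code validates through `TypeAdapter(Message)`) and the surrounding file structure are assumptions.

```python
# Hypothetical content of the "chat_completion-01" test case; message texts and
# expected values taken from the removed code, dict layout assumed.
chat_completion_01 = {
    "messages": [
        {
            "role": "system",
            "content": (
                "You are a helpful assistant.\n\n"
                "Michael Jordan was born in 1963. He played basketball for the Chicago Bulls for 15 seasons."
            ),
        },
        {"role": "user", "content": "Please give me information about Michael Jordan."},
    ],
    "expected": {
        "first_name": "Michael",
        "last_name": "Jordan",
        "year_of_birth": 1963,
        "num_seasons_in_nba": 15,
    },
}
```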