From 2cbe9395b017480422b31a1538eb3f42d66d880d Mon Sep 17 00:00:00 2001
From: LESSuseLESS <hzhao416@gmail.com>
Date: Thu, 20 Feb 2025 14:13:06 -0800
Subject: [PATCH] feat: D69478008 [llama-stack] turning tests into data-driven
 (#1180)

# What does this PR do?

We have several places running tests for different purposes.
- oss llama stack
  - provider tests
  - e2e tests
- provider llama stack
  - unit tests
  - e2e tests

It would be nice if they could *share the same set of test data*, so that we
maintain consistency between spec and implementation. That is what this diff
is about: isolating test data from test code, so that we can reuse the same
data in different places by writing different test code.

## Test Plan

== Set up Ollama local server
==  Run a provider test
conda activate stack

OLLAMA_URL="http://localhost:8321" \
pytest -v -s -k "ollama" --inference-model="llama3.2:3b-instruct-fp16" \

llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_completion_structured_output
// test_structured_output should also work

== Run an e2e test
conda activate sherpa
with-proxy pip install llama-stack
export INFERENCE_MODEL=llama3.2:3b-instruct-fp16
export LLAMA_STACK_PORT=8322
with-proxy llama stack build --template ollama
with-proxy llama stack run --env OLLAMA_URL=http://localhost:8321 ollama
  - Run test client,
LLAMA_STACK_PORT=8322 LLAMA_STACK_BASE_URL="http://localhost:8322" \
pytest -v -s --inference-model="llama3.2:3b-instruct-fp16" \

tests/client-sdk/inference/test_text_inference.py::test_text_completion_structured_output
// test_text_chat_completion_structured_output should also work

## Notes

- This PR was automatically generated by oss_sync
- Please refer to D69478008 for more details.
---
 .../remote/inference/ollama/config.py         |  3 +-
 llama_stack/providers/tests/README.md         |  3 ++
 .../tests/inference/test_text_inference.py    | 46 +++++++++----------
 .../providers/tests/test_cases/__init__.py    |  5 ++
 .../tests/test_cases/chat_completion.json     | 24 ++++++++++
 .../tests/test_cases/completion.json          | 13 ++++++
 .../providers/tests/test_cases/test_case.py   | 32 +++++++++++++
 .../inference/test_text_inference.py          | 44 +++++++++---------
 8 files changed, 123 insertions(+), 47 deletions(-)
 create mode 100644 llama_stack/providers/tests/test_cases/__init__.py
 create mode 100644 llama_stack/providers/tests/test_cases/chat_completion.json
 create mode 100644 llama_stack/providers/tests/test_cases/completion.json
 create mode 100644 llama_stack/providers/tests/test_cases/test_case.py

diff --git a/llama_stack/providers/remote/inference/ollama/config.py b/llama_stack/providers/remote/inference/ollama/config.py
index a5a4d48ab..4fc88df55 100644
--- a/llama_stack/providers/remote/inference/ollama/config.py
+++ b/llama_stack/providers/remote/inference/ollama/config.py
@@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+import os
 from typing import Any, Dict
 
 from pydantic import BaseModel
@@ -12,7 +13,7 @@ DEFAULT_OLLAMA_URL = "http://localhost:11434"
 
 
 class OllamaImplConfig(BaseModel):
-    url: str = DEFAULT_OLLAMA_URL
+    url: str = os.getenv("OLLAMA_URL", DEFAULT_OLLAMA_URL)
 
     @classmethod
     def sample_run_config(cls, url: str = "${env.OLLAMA_URL:http://localhost:11434}", **kwargs) -> Dict[str, Any]:
diff --git a/llama_stack/providers/tests/README.md b/llama_stack/providers/tests/README.md
index f68c988f8..f2c527f6d 100644
--- a/llama_stack/providers/tests/README.md
+++ b/llama_stack/providers/tests/README.md
@@ -104,3 +104,6 @@ pytest llama_stack/providers/tests/ --config=ci_test_config.yaml
 Currently, we support test config on inference, agents and memory api tests.
 
 Example format of test config can be found in ci_test_config.yaml.
+
+## Test Data
+We encourage providers to use our test data for internal development testing, to make it easier and more consistent with the tests we provide. Each test case may define its own data format; please refer to our test source code for details on how these fields are used in the test.
diff --git a/llama_stack/providers/tests/inference/test_text_inference.py b/llama_stack/providers/tests/inference/test_text_inference.py
index f25b95004..1a384cc91 100644
--- a/llama_stack/providers/tests/inference/test_text_inference.py
+++ b/llama_stack/providers/tests/inference/test_text_inference.py
@@ -6,7 +6,7 @@
 
 
 import pytest
-from pydantic import BaseModel, ValidationError
+from pydantic import BaseModel, TypeAdapter, ValidationError
 
 from llama_stack.apis.common.content_types import ToolCallParseStatus
 from llama_stack.apis.inference import (
@@ -17,6 +17,7 @@ from llama_stack.apis.inference import (
     CompletionResponseStreamChunk,
     JsonSchemaResponseFormat,
     LogProbConfig,
+    Message,
     SystemMessage,
     ToolChoice,
     UserMessage,
@@ -30,6 +31,7 @@ from llama_stack.models.llama.datatypes import (
     ToolParamDefinition,
     ToolPromptFormat,
 )
+from llama_stack.providers.tests.test_cases.test_case import TestCase
 
 from .utils import group_chunks
 
@@ -178,8 +180,9 @@ class TestInference:
             else:  # no token, no logprobs
                 assert not chunk.logprobs, "Logprobs should be empty"
 
+    @pytest.mark.parametrize("test_case", ["completion-01"])
     @pytest.mark.asyncio(loop_scope="session")
-    async def test_completion_structured_output(self, inference_model, inference_stack):
+    async def test_completion_structured_output(self, inference_model, inference_stack, test_case):
         inference_impl, _ = inference_stack
 
         class Output(BaseModel):
@@ -187,7 +190,9 @@ class TestInference:
             year_born: str
             year_retired: str
 
-        user_input = "Michael Jordan was born in 1963. He played basketball for the Chicago Bulls. He retired in 2003."
+        tc = TestCase(test_case)
+
+        user_input = tc["user_input"]
         response = await inference_impl.completion(
             model_id=inference_model,
             content=user_input,
@@ -203,9 +208,10 @@ class TestInference:
         assert isinstance(response.content, str)
 
         answer = Output.model_validate_json(response.content)
-        assert answer.name == "Michael Jordan"
-        assert answer.year_born == "1963"
-        assert answer.year_retired == "2003"
+        expected = tc["expected"]
+        assert answer.name == expected["name"]
+        assert answer.year_born == expected["year_born"]
+        assert answer.year_retired == expected["year_retired"]
 
     @pytest.mark.asyncio(loop_scope="session")
     async def test_chat_completion_non_streaming(
@@ -224,8 +230,9 @@ class TestInference:
         assert isinstance(response.completion_message.content, str)
         assert len(response.completion_message.content) > 0
 
+    @pytest.mark.parametrize("test_case", ["chat_completion-01"])
     @pytest.mark.asyncio(loop_scope="session")
-    async def test_structured_output(self, inference_model, inference_stack, common_params):
+    async def test_structured_output(self, inference_model, inference_stack, common_params, test_case):
         inference_impl, _ = inference_stack
 
         class AnswerFormat(BaseModel):
@@ -234,20 +241,12 @@ class TestInference:
             year_of_birth: int
             num_seasons_in_nba: int
 
+        tc = TestCase(test_case)
+        messages = [TypeAdapter(Message).validate_python(m) for m in tc["messages"]]
+
         response = await inference_impl.chat_completion(
             model_id=inference_model,
-            messages=[
-                # we include context about Michael Jordan in the prompt so that the test is
-                # focused on the funtionality of the model and not on the information embedded
-                # in the model. Llama 3.2 3B Instruct tends to think MJ played for 14 seasons.
-                SystemMessage(
-                    content=(
-                        "You are a helpful assistant.\n\n"
-                        "Michael Jordan was born in 1963. He played basketball for the Chicago Bulls for 15 seasons."
-                    )
-                ),
-                UserMessage(content="Please give me information about Michael Jordan."),
-            ],
+            messages=messages,
             stream=False,
             response_format=JsonSchemaResponseFormat(
                 json_schema=AnswerFormat.model_json_schema(),
@@ -260,10 +259,11 @@ class TestInference:
         assert isinstance(response.completion_message.content, str)
 
         answer = AnswerFormat.model_validate_json(response.completion_message.content)
-        assert answer.first_name == "Michael"
-        assert answer.last_name == "Jordan"
-        assert answer.year_of_birth == 1963
-        assert answer.num_seasons_in_nba == 15
+        expected = tc["expected"]
+        assert answer.first_name == expected["first_name"]
+        assert answer.last_name == expected["last_name"]
+        assert answer.year_of_birth == expected["year_of_birth"]
+        assert answer.num_seasons_in_nba == expected["num_seasons_in_nba"]
 
         response = await inference_impl.chat_completion(
             model_id=inference_model,
diff --git a/llama_stack/providers/tests/test_cases/__init__.py b/llama_stack/providers/tests/test_cases/__init__.py
new file mode 100644
index 000000000..756f351d8
--- /dev/null
+++ b/llama_stack/providers/tests/test_cases/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/llama_stack/providers/tests/test_cases/chat_completion.json b/llama_stack/providers/tests/test_cases/chat_completion.json
new file mode 100644
index 000000000..cb8854e15
--- /dev/null
+++ b/llama_stack/providers/tests/test_cases/chat_completion.json
@@ -0,0 +1,24 @@
+{
+    "01": {
+        "name": "structured output",
+        "data": {
+            "notes": "We include context about Michael Jordan in the prompt so that the test is focused on the functionality of the model and not on the information embedded in the model. Llama 3.2 3B Instruct tends to think MJ played for 14 seasons.",
+            "messages": [
+              {
+                "role": "system",
+                "content": "You are a helpful assistant. Michael Jordan was born in 1963. He played basketball for the Chicago Bulls for 15 seasons."
+              },
+              {
+                "role": "user",
+                "content": "Please give me information about Michael Jordan."
+              }
+            ],
+            "expected": {
+                "first_name": "Michael",
+                "last_name": "Jordan",
+                "year_of_birth": 1963,
+                "num_seasons_in_nba": 15
+            }
+        }
+    }
+}
diff --git a/llama_stack/providers/tests/test_cases/completion.json b/llama_stack/providers/tests/test_cases/completion.json
new file mode 100644
index 000000000..1e968e45e
--- /dev/null
+++ b/llama_stack/providers/tests/test_cases/completion.json
@@ -0,0 +1,13 @@
+{
+    "01": {
+        "name": "structured output",
+        "data": {
+            "user_input": "Michael Jordan was born in 1963. He played basketball for the Chicago Bulls. He retired in 2003.",
+            "expected": {
+                "name": "Michael Jordan",
+                "year_born": "1963",
+                "year_retired": "2003"
+            }
+        }
+    }
+}
diff --git a/llama_stack/providers/tests/test_cases/test_case.py b/llama_stack/providers/tests/test_cases/test_case.py
new file mode 100644
index 000000000..7bd9b4d56
--- /dev/null
+++ b/llama_stack/providers/tests/test_cases/test_case.py
@@ -0,0 +1,50 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import json
+import pathlib
+
+
+class TestCase:
+    """Look up one shared test case by its ``<api>-<id>`` name.
+
+    Test data is read from the ``<api>.json`` files next to this module. Each
+    file maps a case id (e.g. ``"01"``) to an object with ``name`` and ``data``
+    fields; entries of ``data`` are reachable through ``tc[key]``.
+    """
+
+    # Not a test class: keeps pytest from trying to collect it (the name matches
+    # the default "Test*" pattern) when it is imported into a test module.
+    __test__ = False
+
+    _apis = ["chat_completion", "completion"]
+    # Class-level cache shared by all instances, filled on first construction.
+    _jsonblob = {}
+
+    def __init__(self, name):
+        """Load the test case called ``name`` (e.g. ``"completion-01"``).
+
+        Raises:
+            ValueError: if no test case with that name exists.
+        """
+        # loading all test cases (once; later instances reuse the class cache)
+        if not TestCase._jsonblob:
+            for api in self._apis:
+                with open(pathlib.Path(__file__).parent / f"{api}.json", "r", encoding="utf-8") as f:
+                    TestCase._jsonblob.update({f"{api}-{k}": v for k, v in json.load(f).items()})
+
+        # loading this test case
+        tc = TestCase._jsonblob.get(name)
+        if tc is None:
+            raise ValueError(f"Test case {name} not found")
+
+        # these are the only fields we need
+        self.name = tc.get("name")
+        self.data = tc.get("data")
+
+    def __getitem__(self, key):
+        """Return the ``key`` entry of this test case's ``data``."""
+        return self.data[key]
diff --git a/tests/client-sdk/inference/test_text_inference.py b/tests/client-sdk/inference/test_text_inference.py
index 6a113c463..1fe53ab86 100644
--- a/tests/client-sdk/inference/test_text_inference.py
+++ b/tests/client-sdk/inference/test_text_inference.py
@@ -7,6 +7,8 @@
 import pytest
 from pydantic import BaseModel
 
+from llama_stack.providers.tests.test_cases.test_case import TestCase
+
 PROVIDER_TOOL_PROMPT_FORMAT = {
     "remote::ollama": "json",
     "remote::together": "json",
@@ -120,16 +122,16 @@ def test_completion_log_probs_streaming(llama_stack_client, text_model_id, infer
             assert not chunk.logprobs, "Logprobs should be empty"
 
 
-def test_text_completion_structured_output(llama_stack_client, text_model_id, inference_provider_type):
-    user_input = """
-    Michael Jordan was born in 1963. He played basketball for the Chicago Bulls. He retired in 2003.
-    """
-
+@pytest.mark.parametrize("test_case", ["completion-01"])
+def test_text_completion_structured_output(llama_stack_client, text_model_id, inference_provider_type, test_case):
     class AnswerFormat(BaseModel):
         name: str
         year_born: str
         year_retired: str
 
+    tc = TestCase(test_case)
+
+    user_input = tc["user_input"]
     response = llama_stack_client.inference.completion(
         model_id=text_model_id,
         content=user_input,
@@ -143,9 +145,10 @@ def test_text_completion_structured_output(llama_stack_client, text_model_id, in
         },
     )
     answer = AnswerFormat.model_validate_json(response.content)
-    assert answer.name == "Michael Jordan"
-    assert answer.year_born == "1963"
-    assert answer.year_retired == "2003"
+    expected = tc["expected"]
+    assert answer.name == expected["name"]
+    assert answer.year_born == expected["year_born"]
+    assert answer.year_retired == expected["year_retired"]
 
 
 @pytest.mark.parametrize(
@@ -281,25 +284,19 @@ def test_text_chat_completion_with_tool_choice_none(
     assert tool_invocation_content == ""
 
 
-def test_text_chat_completion_structured_output(llama_stack_client, text_model_id, inference_provider_type):
+@pytest.mark.parametrize("test_case", ["chat_completion-01"])
+def test_text_chat_completion_structured_output(llama_stack_client, text_model_id, inference_provider_type, test_case):
     class AnswerFormat(BaseModel):
         first_name: str
         last_name: str
         year_of_birth: int
         num_seasons_in_nba: int
 
+    tc = TestCase(test_case)
+
     response = llama_stack_client.inference.chat_completion(
         model_id=text_model_id,
-        messages=[
-            {
-                "role": "system",
-                "content": "You are a helpful assistant. Michael Jordan was born in 1963. He played basketball for the Chicago Bulls for 15 seasons.",
-            },
-            {
-                "role": "user",
-                "content": "Please give me information about Michael Jordan.",
-            },
-        ],
+        messages=tc["messages"],
         response_format={
             "type": "json_schema",
             "json_schema": AnswerFormat.model_json_schema(),
@@ -307,10 +304,11 @@ def test_text_chat_completion_structured_output(llama_stack_client, text_model_i
         stream=False,
     )
     answer = AnswerFormat.model_validate_json(response.completion_message.content)
-    assert answer.first_name == "Michael"
-    assert answer.last_name == "Jordan"
-    assert answer.year_of_birth == 1963
-    assert answer.num_seasons_in_nba == 15
+    expected = tc["expected"]
+    assert answer.first_name == expected["first_name"]
+    assert answer.last_name == expected["last_name"]
+    assert answer.year_of_birth == expected["year_of_birth"]
+    assert answer.num_seasons_in_nba == expected["num_seasons_in_nba"]
 
 
 @pytest.mark.parametrize(