From 435f34b05e84f1747b28570234f25878cf0b31c4 Mon Sep 17 00:00:00 2001
From: Matthew Farrellee
Date: Tue, 3 Dec 2024 05:55:14 -0500
Subject: [PATCH] reduce the accuracy requirements to pass the chat completion
 structured output test (#522)

I find `test_structured_output` to be flaky. It is both a functionality test
and an accuracy test:

```
answer = AnswerFormat.model_validate_json(response.completion_message.content)
assert answer.first_name == "Michael"
assert answer.last_name == "Jordan"
assert answer.year_of_birth == 1963
assert answer.num_seasons_in_nba == 15
```

It is an accuracy test because it checks the values of the first/last name,
birth year, and number of seasons (a sketch of the `AnswerFormat` schema these
assertions imply appears after the diff below). I find that:

- llama-3.1-8b-instruct and llama-3.2-3b-instruct pass the functionality portion
- llama-3.2-3b-instruct consistently fails the accuracy portion (thinking MJ was in the NBA for 14 seasons)
- llama-3.1-8b-instruct occasionally fails the accuracy portion

Suggestions (not mutually exclusive):

1. turn the test into a functionality-only test by skipping the value checks
2. split the test into a functionality version and an xfail accuracy version (sketched after the diff below)
3. add context to the prompt so the LLM can answer without relying on knowledge embedded in its weights

# What does this PR do?

Implements option (3) by adding context to the system prompt.

## Test Plan

`pytest -s -v ... llama_stack/providers/tests/inference/ ... -k structured_output`

## Before submitting

- [x] Ran pre-commit to handle lint / formatting issues.
- [x] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section?
- [x] Updated relevant documentation.
- [x] Wrote necessary unit or integration tests.
---
 .../providers/tests/inference/test_text_inference.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/llama_stack/providers/tests/inference/test_text_inference.py b/llama_stack/providers/tests/inference/test_text_inference.py
index f0f1d0eb2..9e5c67375 100644
--- a/llama_stack/providers/tests/inference/test_text_inference.py
+++ b/llama_stack/providers/tests/inference/test_text_inference.py
@@ -211,7 +211,15 @@ class TestInference:
         response = await inference_impl.chat_completion(
             model_id=inference_model,
             messages=[
-                SystemMessage(content="You are a helpful assistant."),
+                # we include context about Michael Jordan in the prompt so that the test is
+                # focused on the functionality of the model and not on the information embedded
+                # in the model. Llama 3.2 3B Instruct tends to think MJ played for 14 seasons.
+                SystemMessage(
+                    content=(
+                        "You are a helpful assistant.\n\n"
+                        "Michael Jordan was born in 1963. He played basketball for the Chicago Bulls for 15 seasons."
+                    )
+                ),
                 UserMessage(content="Please give me information about Michael Jordan."),
             ],
             stream=False,
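
For reference, here is a minimal sketch of the `AnswerFormat` schema the test
parses the completion into. The field names and types are inferred from the
assertions quoted above; the actual definition in `test_text_inference.py` may
differ in detail:

```
# a sketch of the schema implied by the assertions in the commit message,
# not copied from the test file
from pydantic import BaseModel


class AnswerFormat(BaseModel):
    first_name: str
    last_name: str
    year_of_birth: int
    num_seasons_in_nba: int
```

Parsing the completion with `AnswerFormat.model_validate_json(...)` is the
functionality half of the test; the asserts on the parsed field values are the
accuracy half.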
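
And a sketch of what option (2) could look like, reusing the `AnswerFormat`
sketch above. The `fake_completion_json` helper is hypothetical, standing in
for `response.completion_message.content` so the example is self-contained; a
real test would call `inference_impl.chat_completion` as in the diff:

```
import pytest


def fake_completion_json() -> str:
    # hypothetical stand-in for response.completion_message.content,
    # returning the 14-season answer llama-3.2-3b-instruct tends to give
    return (
        '{"first_name": "Michael", "last_name": "Jordan", '
        '"year_of_birth": 1963, "num_seasons_in_nba": 14}'
    )


def test_structured_output_functionality():
    # functionality only: the completion must parse against the schema
    AnswerFormat.model_validate_json(fake_completion_json())


@pytest.mark.xfail(reason="accuracy depends on facts embedded in model weights")
def test_structured_output_accuracy():
    # accuracy: allowed to fail for models that misremember MJ's career
    answer = AnswerFormat.model_validate_json(fake_completion_json())
    assert answer.first_name == "Michael"
    assert answer.last_name == "Jordan"
    assert answer.year_of_birth == 1963
    assert answer.num_seasons_in_nba == 15
```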