# Test-case fixtures for the response tests.
#
# Layout: each top-level key is one test. It carries `test_name` (same as the
# key) and `test_params.case`, a list of cases identified by `case_id`.
# A case supplies either a single `input` or a list of `turns` (each with its
# own `input`/`output`), plus optional `tools` and per-case extras.
#
# NOTE(review): `output` is presumably the text the response is expected to
# contain -- confirm against the test runner.
# NOTE(review): every mcp tool has an empty `server_url`; presumably the test
# runner supplies the real URL at run time -- confirm.

# Single-turn question/answer cases without tools.
test_response_basic:
  test_name: test_response_basic
  test_params:
    case:
      - case_id: "earth"
        input: "Which planet do humans live on?"
        output: "earth"
      - case_id: "saturn"
        input: "Which planet has rings around it with a name starting with letter S?"
        output: "saturn"

# Two-turn conversation; the second turn refers back to the first answer.
test_response_multi_turn:
  test_name: test_response_multi_turn
  test_params:
    case:
      - case_id: "earth"
        turns:
          - input: "Which planet do humans live on?"
            output: "earth"
          - input: "What is the name of the planet from your previous response?"
            output: "earth"

# Single-turn case with the web_search tool attached.
test_response_web_search:
  test_name: test_response_web_search
  test_params:
    case:
      - case_id: "llama_experts"
        input: "How many experts does the Llama 4 Maverick model have?"
        tools:
          - type: web_search
            search_context_size: "low"
        output: "128"

# file_search cases: one supplies inline text (`file_content`), the other a
# path to a PDF (`file_path`).
test_response_file_search:
  test_name: test_response_file_search
  test_params:
    case:
      - case_id: "llama_experts"
        input: "How many experts does the Llama 4 Maverick model have?"
        tools:
          - type: file_search
            # vector_store_ids param for file_search tool gets added by the test runner
        file_content: "Llama 4 Maverick has 128 experts"
        output: "128"
      - case_id: "llama_experts_pdf"
        input: "How many experts does the Llama 4 Maverick model have?"
        tools:
          - type: file_search
            # vector_store_ids param for file_search tool gets added by the test runner
        file_path: "pdfs/llama_stack_and_models.pdf"
        output: "128"

# Single-turn case with an mcp tool attached.
test_response_mcp_tool:
  test_name: test_response_mcp_tool
  test_params:
    case:
      - case_id: "boiling_point_tool"
        input: "What is the boiling point of myawesomeliquid in Celsius?"
        tools:
          - type: mcp
            server_label: "localmcp"
            server_url: ""
        output: "Hello, world!"

# Single-turn case with a client-defined function tool; this case declares no
# `output`.
test_response_custom_tool:
  test_name: test_response_custom_tool
  test_params:
    case:
      - case_id: "sf_weather"
        input: "What's the weather like in San Francisco?"
        tools:
          - type: function
            name: get_weather
            description: Get current temperature for a given location.
            parameters:
              additionalProperties: false
              properties:
                location:
                  # Double quotes are required here: \xE1 is a YAML escape.
                  description: "City and country e.g. Bogot\xE1, Colombia"
                  type: string
              required:
                - location
              type: object

# Single-turn case whose input is a structured message: text plus an image URL.
test_response_image:
  test_name: test_response_image
  test_params:
    case:
      - case_id: "llama_image"
        input:
          - role: user
            content:
              - type: input_text
                text: "Identify the type of animal in this image."
              - type: input_image
                image_url: "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg"
        output: "llama"

# the models are really poor at tool calling after seeing images :/
# First turn sends text plus an image; second turn is a plain-text follow-up.
test_response_multi_turn_image:
  test_name: test_response_multi_turn_image
  test_params:
    case:
      - case_id: "llama_image_understanding"
        turns:
          - input:
              - role: user
                content:
                  - type: input_text
                    text: "What type of animal is in this image? Please respond with a single word that starts with the letter 'L'."
                  - type: input_image
                    image_url: "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg"
            output: "llama"
          - input: "What country do you find this animal primarily in? What continent?"
            output: "peru"

# Prompts that ask for several dependent tool calls against the mcp tool.
test_response_multi_turn_tool_execution:
  test_name: test_response_multi_turn_tool_execution
  test_params:
    case:
      - case_id: "user_file_access_check"
        input: "I need to check if user 'alice' can access the file 'document.txt'. First, get alice's user ID, then check if that user ID can access the file 'document.txt'. Do this as a series of steps, where each step is a separate message. Return only one tool call per step. Summarize the final result with a single 'yes' or 'no' response."
        tools:
          - type: mcp
            server_label: "localmcp"
            server_url: ""
        # Quoted so it stays a string instead of being typed as a boolean.
        output: "yes"
      - case_id: "experiment_results_lookup"
        input: "I need to get the results for the 'boiling_point' experiment. First, get the experiment ID for 'boiling_point', then use that ID to get the experiment results. Tell me what you found."
        tools:
          - type: mcp
            server_label: "localmcp"
            server_url: ""
        output: "100°C"

# Same kind of multi-step mcp prompts, with `stream: true` set on each case.
test_response_multi_turn_tool_execution_streaming:
  test_name: test_response_multi_turn_tool_execution_streaming
  test_params:
    case:
      - case_id: "user_permissions_workflow"
        input: "Help me with this security check: First, get the user ID for 'charlie', then get the permissions for that user ID, and finally check if that user can access 'secret_file.txt'. Stream your progress as you work through each step."
        tools:
          - type: mcp
            server_label: "localmcp"
            server_url: ""
        stream: true
        # Quoted so it stays a string instead of being typed as a boolean.
        output: "no"
      - case_id: "experiment_analysis_streaming"
        input: "I need a complete analysis: First, get the experiment ID for 'chemical_reaction', then get the results for that experiment, and tell me if the yield was above 80%. Please stream your analysis process."
        tools:
          - type: mcp
            server_label: "localmcp"
            server_url: ""
        stream: true
        output: "85%"