---
# Test-case fixtures for the response tests.
#
# Each top-level key is one test; `test_params.case` lists its parametrized
# cases. A case has a `case_id`, plus either a single `input` or a list of
# `turns` (each with its own `input`/`output`). Optional keys seen below are
# `tools` and `stream`.
# NOTE(review): `output` looks like the text expected to appear in the model's
# answer -- confirm the exact matching rule against the test code.

# Single-turn, plain-text questions.
test_response_basic:
  test_name: test_response_basic
  test_params:
    case:
      - case_id: "earth"
        input: "Which planet do humans live on?"
        output: "earth"
      - case_id: "saturn"
        input: "Which planet has rings around it with a name starting with letter S?"
        output: "saturn"

# Two-turn conversation; the second turn refers back to the first answer.
test_response_multi_turn:
  test_name: test_response_multi_turn
  test_params:
    case:
      - case_id: "earth"
        turns:
          - input: "Which planet do humans live on?"
            output: "earth"
          - input: "What is the name of the planet from your previous response?"
            output: "earth"

# Single-turn question with the built-in web_search tool attached.
test_response_web_search:
  test_name: test_response_web_search
  test_params:
    case:
      - case_id: "llama_experts"
        input: "How many experts does the Llama 4 Maverick model have?"
        tools:
          - type: web_search
            search_context_size: "low"
        output: "128"

# Single-turn question with an MCP tool attached.
test_response_mcp_tool:
  test_name: test_response_mcp_tool
  test_params:
    case:
      - case_id: "boiling_point_tool"
        input: "What is the boiling point of polyjuice?"
        tools:
          - type: mcp
            server_label: "localmcp"
            # NOTE(review): empty here; presumably supplied at run time by the
            # test harness -- confirm.
            server_url: ""
        output: "Hello, world!"

# Single-turn question with a user-defined function tool. This case declares
# no `output` key.
test_response_custom_tool:
  test_name: test_response_custom_tool
  test_params:
    case:
      - case_id: "sf_weather"
        input: "What's the weather like in San Francisco?"
        tools:
          - type: function
            name: get_weather
            description: Get current temperature for a given location.
            parameters:
              additionalProperties: false
              properties:
                location:
                  # Double quotes are required: \xE1 is a YAML escape.
                  description: "City and country e.g. Bogot\xE1, Colombia"
                  type: string
              required:
                - location
              type: object

# Single-turn input made of structured content parts (text + image URL).
test_response_image:
  test_name: test_response_image
  test_params:
    case:
      - case_id: "llama_image"
        input:
          - role: user
            content:
              - type: input_text
                text: "Identify the type of animal in this image."
              - type: input_image
                image_url: "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg"
        output: "llama"

# the models are really poor at tool calling after seeing images :/
test_response_multi_turn_image:
  test_name: test_response_multi_turn_image
  test_params:
    case:
      - case_id: "llama_image_understanding"
        turns:
          - input:
              - role: user
                content:
                  - type: input_text
                    text: "What type of animal is in this image? Please respond with a single word that starts with the letter 'L'."
                  - type: input_image
                    image_url: "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg"
            output: "llama"
          - input: "What country do you find this animal primarily in? What continent?"
            output: "peru"

# Prompts that ask for several dependent steps, with an MCP tool attached.
test_response_multi_turn_tool_execution:
  test_name: test_response_multi_turn_tool_execution
  test_params:
    case:
      - case_id: "user_file_access_check"
        input: "I need to check if user 'alice' can access the file 'document.txt'. First, get alice's user ID, then check if that user ID can access the file 'document.txt'. Tell me the final result."
        tools:
          - type: mcp
            server_label: "localmcp"
            # NOTE(review): empty here; presumably supplied at run time by the
            # test harness -- confirm.
            server_url: ""
        # Quoted so it stays a string; a bare yes is a boolean in YAML 1.1.
        output: "yes"
      - case_id: "experiment_results_lookup"
        input: "I need to get the results for the 'boiling_point' experiment. First, get the experiment ID for 'boiling_point', then use that ID to get the experiment results. Tell me what you found."
        tools:
          - type: mcp
            server_label: "localmcp"
            server_url: ""
        output: "100°C"

# Same multi-step MCP pattern as above, with `stream: true` set on each case.
test_response_multi_turn_tool_execution_streaming:
  test_name: test_response_multi_turn_tool_execution_streaming
  test_params:
    case:
      - case_id: "user_permissions_workflow"
        input: "Help me with this security check: First, get the user ID for 'charlie', then get the permissions for that user ID, and finally check if that user can access 'secret_file.txt'. Stream your progress as you work through each step."
        tools:
          - type: mcp
            server_label: "localmcp"
            # NOTE(review): empty here; presumably supplied at run time by the
            # test harness -- confirm.
            server_url: ""
        stream: true
        # Quoted so it stays a string; a bare no is a boolean in YAML 1.1.
        output: "no"
      - case_id: "experiment_analysis_streaming"
        input: "I need a complete analysis: First, get the experiment ID for 'chemical_reaction', then get the results for that experiment, and tell me if the yield was above 80%. Please stream your analysis process."
        tools:
          - type: mcp
            server_label: "localmcp"
            server_url: ""
        stream: true
        output: "85%"