feat(verification): various improvements (#1921)

# What does this PR do?
- provider and their models now live in config.yaml
- better distinguish different cases within a test
- add model key to surface provider's model_id
- include example command to rerun single test case

## Test Plan
<img width="1173" alt="image"
src="https://github.com/user-attachments/assets/b414baf0-c768-451f-8c3b-c2905cf36fac"
/>
This commit is contained in:
ehhuang 2025-04-10 10:26:19 -07:00 committed by GitHub
parent 09a83b1ec1
commit 14146e4b3f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
22 changed files with 4449 additions and 8810 deletions

View file

@ -0,0 +1,133 @@
test_chat_basic:
test_name: test_chat_basic
test_params:
case:
- case_id: "earth"
input:
messages:
- content: Which planet do humans live on?
role: user
output: Earth
- case_id: "saturn"
input:
messages:
- content: Which planet has rings around it with a name starting with letter
S?
role: user
output: Saturn
test_chat_image:
test_name: test_chat_image
test_params:
case:
- input:
messages:
- content:
- text: What is in this image?
type: text
- image_url:
url: https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg
type: image_url
role: user
output: llama
test_chat_structured_output:
test_name: test_chat_structured_output
test_params:
case:
- case_id: "calendar"
input:
messages:
- content: Extract the event information.
role: system
- content: Alice and Bob are going to a science fair on Friday.
role: user
response_format:
json_schema:
name: calendar_event
schema:
properties:
date:
title: Date
type: string
name:
title: Name
type: string
participants:
items:
type: string
title: Participants
type: array
required:
- name
- date
- participants
title: CalendarEvent
type: object
type: json_schema
output: valid_calendar_event
- case_id: "math"
input:
messages:
- content: You are a helpful math tutor. Guide the user through the solution
step by step.
role: system
- content: how can I solve 8x + 7 = -23
role: user
response_format:
json_schema:
name: math_reasoning
schema:
$defs:
Step:
properties:
explanation:
title: Explanation
type: string
output:
title: Output
type: string
required:
- explanation
- output
title: Step
type: object
properties:
final_answer:
title: Final Answer
type: string
steps:
items:
$ref: '#/$defs/Step'
title: Steps
type: array
required:
- steps
- final_answer
title: MathReasoning
type: object
type: json_schema
output: valid_math_reasoning
test_tool_calling:
test_name: test_tool_calling
test_params:
case:
- input:
messages:
- content: You are a helpful assistant that can use tools to get information.
role: system
- content: What's the weather like in San Francisco?
role: user
tools:
- function:
description: Get current temperature for a given location.
name: get_weather
parameters:
additionalProperties: false
properties:
location:
description: "City and country e.g. Bogot\xE1, Colombia"
type: string
required:
- location
type: object
type: function
output: get_weather_tool_call