mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-07-23 21:04:29 +00:00
This gets the file_search verification test working against ollama, fireworks, and api.openai.com. We don't have the entirety of the vector store API implemented in Llama Stack yet, so this still has a bit of a hack to swap between using only OpenAI-compatible APIs versus using the LlamaStackClient to insert content into our vector stores. Outside of actually inserting file contents, the rest of the test works the same and uses only the OpenAI client for all of these providers. How to run the tests: Ollama (sometimes flakes with small model): ``` ollama run llama3.2:3b INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" \ llama stack run ./llama_stack/templates/ollama/run.yaml \ --image-type venv \ --env OLLAMA_URL="http://0.0.0.0:11434" pytest -sv \ 'tests/verifications/openai_api/test_responses.py::test_response_non_streaming_file_search' \ --base-url=http://localhost:8321/v1/openai/v1 \ --model meta-llama/Llama-3.2-3B-Instruct ``` Fireworks via Llama Stack: ``` llama stack run llama_stack/templates/fireworks/run.yaml pytest -sv \ 'tests/verifications/openai_api/test_responses.py::test_response_non_streaming_file_search' \ --base-url=http://localhost:8321/v1/openai/v1 \ --model meta-llama/Llama-3.3-70B-Instruct ``` OpenAI directly: ``` pytest -sv \ 'tests/verifications/openai_api/test_responses.py::test_response_non_streaming_file_search' \ --base-url=https://api.openai.com/v1 \ --model gpt-4o ``` Signed-off-by: Ben Browning <bbrownin@redhat.com>
147 lines
5.1 KiB
YAML
# Verification test cases for the OpenAI Responses API.
#
# Schema: each top-level key is a test suite consumed by
# tests/verifications/openai_api/test_responses.py.
#   test_name:   must match the top-level key (used for pytest parametrization)
#   test_params.case: list of cases; each case has either
#     input/output        — single-turn prompt and expected substring, or
#     turns               — list of input/output pairs for multi-turn tests
#   tools:       optional Responses API tool definitions for the case
#   stream:      optional; true to exercise the streaming code path
# "output" values are matched as substrings of the model response, so keep
# them short and unambiguous (e.g. "earth", "128", "85%").

test_response_basic:
  test_name: test_response_basic
  test_params:
    case:
      - case_id: "earth"
        input: "Which planet do humans live on?"
        output: "earth"
      - case_id: "saturn"
        input: "Which planet has rings around it with a name starting with letter S?"
        output: "saturn"

test_response_multi_turn:
  test_name: test_response_multi_turn
  test_params:
    case:
      - case_id: "earth"
        turns:
          - input: "Which planet do humans live on?"
            output: "earth"
          - input: "What is the name of the planet from your previous response?"
            output: "earth"

test_response_web_search:
  test_name: test_response_web_search
  test_params:
    case:
      - case_id: "llama_experts"
        input: "How many experts does the Llama 4 Maverick model have?"
        tools:
          - type: web_search
            search_context_size: "low"
        output: "128"

test_response_file_search:
  test_name: test_response_file_search
  test_params:
    case:
      - case_id: "llama_experts"
        input: "How many experts does the Llama 4 Maverick model have?"
        tools:
          - type: file_search
            # vector_store_ids gets added by the test runner
        output: "128"

test_response_mcp_tool:
  test_name: test_response_mcp_tool
  test_params:
    case:
      - case_id: "boiling_point_tool"
        input: "What is the boiling point of myawesomeliquid in Celsius?"
        tools:
          - type: mcp
            server_label: "localmcp"
            server_url: "<FILLED_BY_TEST_RUNNER>"
        output: "Hello, world!"

test_response_custom_tool:
  test_name: test_response_custom_tool
  test_params:
    case:
      - case_id: "sf_weather"
        input: "What's the weather like in San Francisco?"
        tools:
          - type: function
            name: get_weather
            description: Get current temperature for a given location.
            parameters:
              additionalProperties: false
              properties:
                location:
                  description: "City and country e.g. Bogot\xE1, Colombia"
                  type: string
              required:
                - location
              type: object

test_response_image:
  test_name: test_response_image
  test_params:
    case:
      - case_id: "llama_image"
        input:
          - role: user
            content:
              - type: input_text
                text: "Identify the type of animal in this image."
              - type: input_image
                image_url: "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg"
        output: "llama"

# the models are really poor at tool calling after seeing images :/
test_response_multi_turn_image:
  test_name: test_response_multi_turn_image
  test_params:
    case:
      - case_id: "llama_image_understanding"
        turns:
          - input:
              - role: user
                content:
                  - type: input_text
                    text: "What type of animal is in this image? Please respond with a single word that starts with the letter 'L'."
                  - type: input_image
                    image_url: "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg"
            output: "llama"
          - input: "What country do you find this animal primarily in? What continent?"
            output: "peru"

test_response_multi_turn_tool_execution:
  test_name: test_response_multi_turn_tool_execution
  test_params:
    case:
      - case_id: "user_file_access_check"
        input: "I need to check if user 'alice' can access the file 'document.txt'. First, get alice's user ID, then check if that user ID can access the file 'document.txt'. Do this as a series of steps, where each step is a separate message. Return only one tool call per step. Summarize the final result with a single 'yes' or 'no' response."
        tools:
          - type: mcp
            server_label: "localmcp"
            server_url: "<FILLED_BY_TEST_RUNNER>"
        output: "yes"
      - case_id: "experiment_results_lookup"
        input: "I need to get the results for the 'boiling_point' experiment. First, get the experiment ID for 'boiling_point', then use that ID to get the experiment results. Tell me what you found."
        tools:
          - type: mcp
            server_label: "localmcp"
            server_url: "<FILLED_BY_TEST_RUNNER>"
        output: "100°C"

test_response_multi_turn_tool_execution_streaming:
  test_name: test_response_multi_turn_tool_execution_streaming
  test_params:
    case:
      - case_id: "user_permissions_workflow"
        input: "Help me with this security check: First, get the user ID for 'charlie', then get the permissions for that user ID, and finally check if that user can access 'secret_file.txt'. Stream your progress as you work through each step."
        tools:
          - type: mcp
            server_label: "localmcp"
            server_url: "<FILLED_BY_TEST_RUNNER>"
        stream: true
        output: "no"
      - case_id: "experiment_analysis_streaming"
        input: "I need a complete analysis: First, get the experiment ID for 'chemical_reaction', then get the results for that experiment, and tell me if the yield was above 80%. Please stream your analysis process."
        tools:
          - type: mcp
            server_label: "localmcp"
            server_url: "<FILLED_BY_TEST_RUNNER>"
        stream: true
        output: "85%"