mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-07-22 20:43:59 +00:00
Some checks failed
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 2s
Integration Tests / test-matrix (http, 3.12, datasets) (push) Failing after 5s
Integration Tests / test-matrix (http, 3.12, scoring) (push) Failing after 5s
Integration Tests / test-matrix (http, 3.13, datasets) (push) Failing after 7s
Integration Tests / test-matrix (http, 3.13, providers) (push) Failing after 7s
Integration Tests / test-matrix (http, 3.12, inspect) (push) Failing after 11s
Integration Tests / test-matrix (http, 3.12, inference) (push) Failing after 14s
Integration Tests / test-matrix (http, 3.12, tool_runtime) (push) Failing after 12s
Integration Tests / test-matrix (http, 3.12, vector_io) (push) Failing after 9s
Integration Tests / test-matrix (http, 3.13, inference) (push) Failing after 12s
Integration Tests / test-matrix (http, 3.12, post_training) (push) Failing after 10s
Integration Tests / test-matrix (library, 3.12, datasets) (push) Failing after 8s
Integration Tests / test-matrix (http, 3.13, inspect) (push) Failing after 9s
Integration Tests / test-matrix (http, 3.13, tool_runtime) (push) Failing after 9s
Integration Tests / test-matrix (http, 3.13, scoring) (push) Failing after 10s
Integration Tests / test-matrix (library, 3.12, agents) (push) Failing after 8s
Integration Tests / test-matrix (library, 3.12, inspect) (push) Failing after 8s
Integration Tests / test-matrix (library, 3.12, post_training) (push) Failing after 6s
Integration Tests / test-matrix (library, 3.12, inference) (push) Failing after 10s
Integration Tests / test-matrix (http, 3.12, agents) (push) Failing after 21s
Integration Tests / test-matrix (http, 3.12, providers) (push) Failing after 19s
Integration Tests / test-matrix (library, 3.12, scoring) (push) Failing after 7s
Integration Tests / test-matrix (http, 3.13, post_training) (push) Failing after 16s
Integration Tests / test-matrix (http, 3.13, agents) (push) Failing after 19s
Integration Tests / test-matrix (http, 3.13, vector_io) (push) Failing after 15s
Vector IO Integration Tests / test-matrix (3.12, inline::sqlite-vec) (push) Failing after 8s
Integration Tests / test-matrix (library, 3.12, providers) (push) Failing after 9s
Vector IO Integration Tests / test-matrix (3.12, remote::chromadb) (push) Failing after 9s
Vector IO Integration Tests / test-matrix (3.12, inline::faiss) (push) Failing after 9s
Integration Tests / test-matrix (library, 3.12, tool_runtime) (push) Failing after 7s
Integration Tests / test-matrix (library, 3.12, vector_io) (push) Failing after 10s
Vector IO Integration Tests / test-matrix (3.12, remote::pgvector) (push) Failing after 10s
Integration Tests / test-matrix (library, 3.13, inference) (push) Failing after 8s
Vector IO Integration Tests / test-matrix (3.13, inline::faiss) (push) Failing after 10s
Integration Tests / test-matrix (library, 3.13, tool_runtime) (push) Failing after 7s
Integration Tests / test-matrix (library, 3.13, agents) (push) Failing after 11s
Integration Tests / test-matrix (library, 3.13, vector_io) (push) Failing after 10s
Integration Tests / test-matrix (library, 3.13, providers) (push) Failing after 8s
Integration Tests / test-matrix (library, 3.13, datasets) (push) Failing after 9s
Integration Tests / test-matrix (library, 3.13, inspect) (push) Failing after 13s
Vector IO Integration Tests / test-matrix (3.13, inline::sqlite-vec) (push) Failing after 10s
Integration Tests / test-matrix (library, 3.13, scoring) (push) Failing after 11s
Vector IO Integration Tests / test-matrix (3.13, remote::chromadb) (push) Failing after 9s
Integration Tests / test-matrix (library, 3.13, post_training) (push) Failing after 8s
Vector IO Integration Tests / test-matrix (3.13, remote::pgvector) (push) Failing after 46s
Python Package Build Test / build (3.12) (push) Failing after 43s
Test External Providers / test-external-providers (venv) (push) Failing after 40s
Python Package Build Test / build (3.13) (push) Failing after 42s
Unit Tests / unit-tests (3.13) (push) Failing after 22s
Unit Tests / unit-tests (3.12) (push) Failing after 25s
Update ReadTheDocs / update-readthedocs (push) Failing after 20s
Pre-commit / pre-commit (push) Successful in 2m13s
## What does this PR do? Ollama does not support remote images. Only local file paths OR base64 inputs are supported. This PR ensures that the Stack downloads remote images and passes the base64 down to the inference engine. ## Test Plan Added a test cases for Responses and ran it for both `fireworks` and `ollama` providers.
166 lines
5.8 KiB
YAML
166 lines
5.8 KiB
YAML
test_response_basic:
|
|
test_name: test_response_basic
|
|
test_params:
|
|
case:
|
|
- case_id: "earth"
|
|
input: "Which planet do humans live on?"
|
|
output: "earth"
|
|
- case_id: "saturn"
|
|
input: "Which planet has rings around it with a name starting with letter S?"
|
|
output: "saturn"
|
|
- case_id: "image_input"
|
|
input:
|
|
- role: user
|
|
content:
|
|
- type: input_text
|
|
text: "what teams are playing in this image?"
|
|
- role: user
|
|
content:
|
|
- type: input_image
|
|
image_url: "https://upload.wikimedia.org/wikipedia/commons/3/3b/LeBron_James_Layup_%28Cleveland_vs_Brooklyn_2018%29.jpg"
|
|
output: "brooklyn nets"
|
|
|
|
test_response_multi_turn:
|
|
test_name: test_response_multi_turn
|
|
test_params:
|
|
case:
|
|
- case_id: "earth"
|
|
turns:
|
|
- input: "Which planet do humans live on?"
|
|
output: "earth"
|
|
- input: "What is the name of the planet from your previous response?"
|
|
output: "earth"
|
|
|
|
test_response_web_search:
|
|
test_name: test_response_web_search
|
|
test_params:
|
|
case:
|
|
- case_id: "llama_experts"
|
|
input: "How many experts does the Llama 4 Maverick model have?"
|
|
tools:
|
|
- type: web_search
|
|
search_context_size: "low"
|
|
output: "128"
|
|
|
|
test_response_file_search:
|
|
test_name: test_response_file_search
|
|
test_params:
|
|
case:
|
|
- case_id: "llama_experts"
|
|
input: "How many experts does the Llama 4 Maverick model have?"
|
|
tools:
|
|
- type: file_search
|
|
# vector_store_ids param for file_search tool gets added by the test runner
|
|
file_content: "Llama 4 Maverick has 128 experts"
|
|
output: "128"
|
|
- case_id: "llama_experts_pdf"
|
|
input: "How many experts does the Llama 4 Maverick model have?"
|
|
tools:
|
|
- type: file_search
|
|
# vector_store_ids param for file_search toolgets added by the test runner
|
|
file_path: "pdfs/llama_stack_and_models.pdf"
|
|
output: "128"
|
|
|
|
test_response_mcp_tool:
|
|
test_name: test_response_mcp_tool
|
|
test_params:
|
|
case:
|
|
- case_id: "boiling_point_tool"
|
|
input: "What is the boiling point of myawesomeliquid in Celsius?"
|
|
tools:
|
|
- type: mcp
|
|
server_label: "localmcp"
|
|
server_url: "<FILLED_BY_TEST_RUNNER>"
|
|
output: "Hello, world!"
|
|
|
|
test_response_custom_tool:
|
|
test_name: test_response_custom_tool
|
|
test_params:
|
|
case:
|
|
- case_id: "sf_weather"
|
|
input: "What's the weather like in San Francisco?"
|
|
tools:
|
|
- type: function
|
|
name: get_weather
|
|
description: Get current temperature for a given location.
|
|
parameters:
|
|
additionalProperties: false
|
|
properties:
|
|
location:
|
|
description: "City and country e.g. Bogot\xE1, Colombia"
|
|
type: string
|
|
required:
|
|
- location
|
|
type: object
|
|
|
|
test_response_image:
|
|
test_name: test_response_image
|
|
test_params:
|
|
case:
|
|
- case_id: "llama_image"
|
|
input:
|
|
- role: user
|
|
content:
|
|
- type: input_text
|
|
text: "Identify the type of animal in this image."
|
|
- type: input_image
|
|
image_url: "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg"
|
|
output: "llama"
|
|
|
|
# the models are really poor at tool calling after seeing images :/
|
|
test_response_multi_turn_image:
|
|
test_name: test_response_multi_turn_image
|
|
test_params:
|
|
case:
|
|
- case_id: "llama_image_understanding"
|
|
turns:
|
|
- input:
|
|
- role: user
|
|
content:
|
|
- type: input_text
|
|
text: "What type of animal is in this image? Please respond with a single word that starts with the letter 'L'."
|
|
- type: input_image
|
|
image_url: "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg"
|
|
output: "llama"
|
|
- input: "What country do you find this animal primarily in? What continent?"
|
|
output: "peru"
|
|
|
|
test_response_multi_turn_tool_execution:
|
|
test_name: test_response_multi_turn_tool_execution
|
|
test_params:
|
|
case:
|
|
- case_id: "user_file_access_check"
|
|
input: "I need to check if user 'alice' can access the file 'document.txt'. First, get alice's user ID, then check if that user ID can access the file 'document.txt'. Do this as a series of steps, where each step is a separate message. Return only one tool call per step. Summarize the final result with a single 'yes' or 'no' response."
|
|
tools:
|
|
- type: mcp
|
|
server_label: "localmcp"
|
|
server_url: "<FILLED_BY_TEST_RUNNER>"
|
|
output: "yes"
|
|
- case_id: "experiment_results_lookup"
|
|
input: "I need to get the results for the 'boiling_point' experiment. First, get the experiment ID for 'boiling_point', then use that ID to get the experiment results. Tell me what you found."
|
|
tools:
|
|
- type: mcp
|
|
server_label: "localmcp"
|
|
server_url: "<FILLED_BY_TEST_RUNNER>"
|
|
output: "100°C"
|
|
|
|
test_response_multi_turn_tool_execution_streaming:
|
|
test_name: test_response_multi_turn_tool_execution_streaming
|
|
test_params:
|
|
case:
|
|
- case_id: "user_permissions_workflow"
|
|
input: "Help me with this security check: First, get the user ID for 'charlie', then get the permissions for that user ID, and finally check if that user can access 'secret_file.txt'. Stream your progress as you work through each step."
|
|
tools:
|
|
- type: mcp
|
|
server_label: "localmcp"
|
|
server_url: "<FILLED_BY_TEST_RUNNER>"
|
|
stream: true
|
|
output: "no"
|
|
- case_id: "experiment_analysis_streaming"
|
|
input: "I need a complete analysis: First, get the experiment ID for 'chemical_reaction', then get the results for that experiment, and tell me if the yield was above 80%. Please stream your analysis process."
|
|
tools:
|
|
- type: mcp
|
|
server_label: "localmcp"
|
|
server_url: "<FILLED_BY_TEST_RUNNER>"
|
|
stream: true
|
|
output: "85%"
|