This adds initial streaming support to the Responses API. This PR makes sure that the _first_ inference call made to chat completions streams out. There's more to be done:

- tool call output tokens need to stream out when possible
- we need to loop through multiple rounds of inference and they all need to stream out

## Test Plan

Added a test. Executed as:

```
FIREWORKS_API_KEY=... \
pytest -s -v 'tests/verifications/openai_api/test_responses.py' \
  --provider=stack:fireworks --model meta-llama/Llama-4-Scout-17B-16E-Instruct
```

Then, started a llama stack fireworks distro and tested against it like this:

```
OPENAI_API_KEY=blah \
pytest -s -v 'tests/verifications/openai_api/test_responses.py' \
  --base-url http://localhost:8321/v1/openai/v1 \
  --model meta-llama/Llama-4-Scout-17B-16E-Instruct
```
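To make the streaming behavior concrete, here is a minimal sketch of what consuming a streamed response looks like from a client's perspective, using the stock `openai` Python SDK pointed at a running stack as in the test plan above. The event names follow the OpenAI Responses streaming protocol; exactly which events the stack emits at this stage is an assumption.

```python
from openai import OpenAI

# Point the stock OpenAI client at a running llama-stack distro,
# reusing the base URL from the test plan above.
client = OpenAI(
    base_url="http://localhost:8321/v1/openai/v1",
    api_key="blah",
)

# With this PR, tokens from the first chat-completions inference call
# arrive incrementally instead of in one final payload.
stream = client.responses.create(
    model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
    input="Which planet do humans live on?",
    stream=True,
)

for event in stream:
    if event.type == "response.output_text.delta":
        print(event.delta, end="", flush=True)  # incremental output text
    elif event.type == "response.completed":
        print()  # response finished; emit a final newline
```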
The test cases are defined in the following YAML fixture (96 lines, 2.8 KiB):
```yaml
test_response_basic:
  test_name: test_response_basic
  test_params:
    case:
      - case_id: "earth"
        input: "Which planet do humans live on?"
        output: "earth"
      - case_id: "saturn"
        input: "Which planet has rings around it with a name starting with letter S?"
        output: "saturn"

test_response_multi_turn:
  test_name: test_response_multi_turn
  test_params:
    case:
      - case_id: "earth"
        turns:
          - input: "Which planet do humans live on?"
            output: "earth"
          - input: "What is the name of the planet from your previous response?"
            output: "earth"

test_response_web_search:
  test_name: test_response_web_search
  test_params:
    case:
      - case_id: "llama_experts"
        input: "How many experts does the Llama 4 Maverick model have?"
        tools:
          - type: web_search
            search_context_size: "low"
        output: "128"

test_response_mcp_tool:
  test_name: test_response_mcp_tool
  test_params:
    case:
      - case_id: "boiling_point_tool"
        input: "What is the boiling point of polyjuice?"
        tools:
          - type: mcp
            server_label: "localmcp"
            server_url: "<FILLED_BY_TEST_RUNNER>"
        output: "Hello, world!"

test_response_custom_tool:
  test_name: test_response_custom_tool
  test_params:
    case:
      - case_id: "sf_weather"
        input: "What's the weather like in San Francisco?"
        tools:
          - type: function
            name: get_weather
            description: Get current temperature for a given location.
            parameters:
              additionalProperties: false
              properties:
                location:
                  description: "City and country e.g. Bogot\xE1, Colombia"
                  type: string
              required:
                - location
              type: object

test_response_image:
  test_name: test_response_image
  test_params:
    case:
      - case_id: "llama_image"
        input:
          - role: user
            content:
              - type: input_text
                text: "Identify the type of animal in this image."
              - type: input_image
                image_url: "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg"
        output: "llama"

# the models are really poor at tool calling after seeing images :/
test_response_multi_turn_image:
  test_name: test_response_multi_turn_image
  test_params:
    case:
      - case_id: "llama_image_understanding"
        turns:
          - input:
              - role: user
                content:
                  - type: input_text
                    text: "What type of animal is in this image? Please respond with a single word that starts with the letter 'L'."
                  - type: input_image
                    image_url: "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg"
            output: "llama"
          - input: "What country do you find this animal primarily in? What continent?"
            output: "peru"
```
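For context, here is a minimal sketch of how a pytest runner might consume a fixture file like the one above. The path `responses.yaml`, the `openai_client` and `model` fixtures, and the loader itself are assumptions for illustration, not the repository's actual harness.

```python
import pytest
import yaml  # PyYAML


def load_response_cases(path="responses.yaml"):
    """Hypothetical loader: flatten each test group's cases for parametrization."""
    with open(path) as f:
        fixtures = yaml.safe_load(f)
    return [
        pytest.param(case, id=case["case_id"])
        for group in fixtures.values()
        for case in group["test_params"]["case"]
    ]


@pytest.mark.parametrize("case", load_response_cases())
def test_response_case(case, openai_client, model):
    # openai_client and model are assumed to be fixtures from conftest.py.
    if "turns" in case:
        pytest.skip("multi-turn cases are exercised by a dedicated test")
    kwargs = {"model": model, "input": case["input"]}
    if "tools" in case:
        kwargs["tools"] = case["tools"]
    response = openai_client.responses.create(**kwargs)
    # Each fixture stores a substring expected in the output text.
    assert case["output"].lower() in response.output_text.lower()
```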