Merge branch 'main' into feat/litellm_sambanova_usage
commit ec73b3d066
95 changed files with 206742 additions and 6573 deletions
@@ -8,6 +8,7 @@ from typing import Any, Dict

from uuid import uuid4

import pytest
import requests
from llama_stack_client import Agent, AgentEventLogger, Document
from llama_stack_client.types.shared_params.agent_config import AgentConfig, ToolConfig
@@ -21,7 +22,7 @@ from llama_stack.apis.agents.agents import (


def get_boiling_point(liquid_name: str, celcius: bool = True) -> int:
    """
    Returns the boiling point of a liquid in Celcius or Fahrenheit
    Returns the boiling point of a liquid in Celcius or Fahrenheit.

    :param liquid_name: The name of the liquid
    :param celcius: Whether to return the boiling point in Celcius
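Only the signature and docstring of this client tool appear in the hunk. A minimal sketch of a complete implementation, assuming a placeholder return value for the fictional "polyjuice" liquid (the body below is illustrative, not taken from this commit), might look like:

# Illustrative sketch only: the tool body is an assumption, not this commit's code.
# The agent tests only require a callable the model can invoke by name.
def get_boiling_point(liquid_name: str, celcius: bool = True) -> int:
    """
    Returns the boiling point of a liquid in Celcius or Fahrenheit.

    :param liquid_name: The name of the liquid
    :param celcius: Whether to return the boiling point in Celcius
    :return: The boiling point as an integer, or -1 for unknown liquids
    """
    if liquid_name.lower() == "polyjuice":
        # placeholder value for the made-up liquid used by the tests
        return -100 if celcius else -212
    return -1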
@@ -185,7 +186,7 @@ def test_builtin_tool_web_search(llama_stack_client_with_mocked_inference, agent
        messages=[
            {
                "role": "user",
                "content": "Search the web and tell me what is the local time in Tokyo currently.",
                "content": "Who are the latest board members to join Meta's board of directors?",
            }
        ],
        session_id=session_id,
@@ -429,19 +430,28 @@ def test_rag_agent(llama_stack_client_with_mocked_inference, agent_config, rag_t


def test_rag_agent_with_attachments(llama_stack_client_with_mocked_inference, agent_config):
    urls = ["chat.rst", "llama3.rst", "memory_optimizations.rst", "lora_finetune.rst"]
    urls = ["llama3.rst", "lora_finetune.rst"]
    documents = [
        # passing as url
        Document(
            document_id=f"num-{i}",
            content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}",
            document_id="num-0",
            content={
                "type": "url",
                "uri": f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{urls[0]}",
            },
            mime_type="text/plain",
            metadata={},
        )
        for i, url in enumerate(urls)
        ),
        # passing as str
        Document(
            document_id="num-1",
            content=requests.get(
                f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{urls[1]}"
            ).text[:500],
            mime_type="text/plain",
            metadata={},
        ),
    ]
    agent_config = {
        **agent_config,
    }
    rag_agent = Agent(llama_stack_client_with_mocked_inference, **agent_config)
    session_id = rag_agent.create_session(f"test-session-{uuid4()}")
    user_prompts = [
@@ -456,7 +466,7 @@ def test_rag_agent_with_attachments(llama_stack_client_with_mocked_inference, ag
            documents,
        ),
        (
            "Tell me how to use LoRA",
            "Tell me how to use LoRA in 100 words or less",
            None,
        ),
    ]
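The loop that drives these (prompt, attachment) pairs through the agent sits outside this hunk. A minimal sketch of how such pairs are typically consumed, assuming create_turn accepts an optional documents argument as in the llama_stack_client Agent API, is:

# Sketch only: the actual loop body is not part of this hunk; names mirror the test above.
for prompt, attachments in user_prompts:
    response = rag_agent.create_turn(
        messages=[{"role": "user", "content": prompt}],
        documents=attachments,  # None for turns that carry no attachments
        session_id=session_id,
        stream=False,
    )
    # a non-empty answer is the minimal signal that the attachment was usable
    assert len(response.output_message.content) > 0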
@@ -478,6 +488,9 @@ def test_rag_agent_with_attachments(llama_stack_client_with_mocked_inference, ag


def test_rag_and_code_agent(llama_stack_client_with_mocked_inference, agent_config):
    if "llama-4" in agent_config["model"].lower():
        pytest.xfail("Not working for llama4")

    documents = []
    documents.append(
        Document(
@@ -544,7 +557,7 @@ def test_rag_and_code_agent(llama_stack_client_with_mocked_inference, agent_conf
            stream=False,
        )
        tool_execution_step = next(step for step in response.steps if step.step_type == "tool_execution")
        assert tool_execution_step.tool_calls[0].tool_name == tool_name
        assert tool_execution_step.tool_calls[0].tool_name == tool_name, f"Failed on {prompt}"
        if expected_kw:
            assert expected_kw in response.output_message.content.lower()
@@ -565,18 +578,22 @@ def test_create_turn_response(llama_stack_client_with_mocked_inference, agent_co
    agent = Agent(llama_stack_client_with_mocked_inference, **agent_config)
    session_id = agent.create_session(f"test-session-{uuid4()}")

    input_prompt = f"Call {client_tools[0].__name__} tool and answer What is the boiling point of polyjuice?"
    response = agent.create_turn(
        messages=[
            {
                "role": "user",
                "content": "Call get_boiling_point and answer What is the boiling point of polyjuice?",
                "content": input_prompt,
            },
        ],
        session_id=session_id,
        stream=False,
    )
    assert len(response.input_messages) == 1
    assert input_prompt == response.input_messages[0].content

    steps = response.steps
    assert len(steps) == 3
    assert len(steps) >= 3  # some models call the tool twice
    assert steps[0].step_type == "inference"
    assert steps[1].step_type == "tool_execution"
    assert steps[1].tool_calls[0].tool_name.startswith("get_boiling_point")
@@ -506,3 +506,80 @@ def test_text_chat_completion_tool_calling_tools_not_in_request(
    else:
        for tc in response.completion_message.tool_calls:
            assert tc.tool_name == "get_object_namespace_list"


@pytest.mark.parametrize(
    "test_case",
    [
        # Tests if the model can handle simple messages like "Hi" or
        # a message unrelated to one of the tool calls
        "inference:chat_completion:multi_turn_tool_calling_01",
        # Tests if the model can do full tool call with responses correctly
        "inference:chat_completion:multi_turn_tool_calling_02",
        # Tests if model can generate multiple params and
        # read outputs correctly
        "inference:chat_completion:multi_turn_tool_calling_03",
        # Tests if model can do different tool calls in a sequence
        # and use the information between appropriately
        "inference:chat_completion:multi_turn_tool_calling_04",
        # Tests if model can use current date and run multiple tool calls
        # sequentially and infer using both
        "inference:chat_completion:multi_turn_tool_calling_05",
    ],
)
def test_text_chat_completion_with_multi_turn_tool_calling(client_with_models, text_model_id, test_case):
    """Tests the model's tool calling loop in various scenarios."""
    if "llama-4" not in text_model_id.lower():
        pytest.xfail("Not tested for non-llama4 models yet")

    tc = TestCase(test_case)
    messages = []

    # keep going until either
    # 1. we have messages to test in multi-turn
    # 2. no messages but last message is tool response
    while len(tc["messages"]) > 0 or (len(messages) > 0 and messages[-1]["role"] == "tool"):
        # do not take new messages if last message is tool response
        if len(messages) == 0 or messages[-1]["role"] != "tool":
            new_messages = tc["messages"].pop(0)
            messages += new_messages

        # pprint(messages)
        response = client_with_models.inference.chat_completion(
            model_id=text_model_id,
            messages=messages,
            tools=tc["tools"],
            stream=False,
            sampling_params={
                "strategy": {
                    "type": "top_p",
                    "top_p": 0.9,
                    "temperature": 0.6,
                }
            },
        )
        op_msg = response.completion_message
        messages.append(op_msg.model_dump())
        # pprint(op_msg)

        assert op_msg.role == "assistant"
        expected = tc["expected"].pop(0)
        assert len(op_msg.tool_calls) == expected["num_tool_calls"]

        if expected["num_tool_calls"] > 0:
            assert op_msg.tool_calls[0].tool_name == expected["tool_name"]
            assert op_msg.tool_calls[0].arguments == expected["tool_arguments"]

            tool_response = tc["tool_responses"].pop(0)
            messages.append(
                # Tool Response Message
                {
                    "role": "tool",
                    "call_id": op_msg.tool_calls[0].call_id,
                    "content": tool_response["response"],
                }
            )
        else:
            actual_answer = op_msg.content.lower()
            # pprint(actual_answer)
            assert expected["answer"] in actual_answer
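The TestCase helper used above is not shown in this diff. A minimal sketch of a loader that resolves identifiers such as "inference:chat_completion:multi_turn_tool_calling_01" against a JSON fixture file like the one added later in this commit (the file path and class shape here are assumptions, not the repository's actual helper) could be:

# Sketch only: path and structure are assumptions, not the repository's actual helper.
import json
from pathlib import Path


class TestCase:
    __test__ = False  # keep pytest from collecting this helper as a test class

    _base_dir = Path(__file__).parent / "test_cases"

    def __init__(self, name: str) -> None:
        # e.g. "inference:chat_completion:multi_turn_tool_calling_01"
        suite, file_name, case_id = name.split(":")
        with open(self._base_dir / suite / f"{file_name}.json") as f:
            self._data = json.load(f)[case_id]["data"]

    def __getitem__(self, key):
        # exposes "messages", "tools", "tool_responses", and "expected"
        return self._data[key]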
@@ -4,11 +4,15 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.


import base64
import pathlib
from pathlib import Path

import pytest

THIS_DIR = Path(__file__).parent


@pytest.fixture
def image_path():
@@ -27,7 +31,6 @@ def base64_image_url(base64_image_data, image_path):
    return f"data:image/{image_path.suffix[1:]};base64,{base64_image_data}"


@pytest.mark.xfail(reason="This test is failing because the image is not being downloaded correctly.")
def test_image_chat_completion_non_streaming(client_with_models, vision_model_id):
    message = {
        "role": "user",
@@ -56,7 +59,99 @@ def test_image_chat_completion_non_streaming(client_with_models, vision_model_id
    assert any(expected in message_content for expected in {"dog", "puppy", "pup"})


@pytest.mark.xfail(reason="This test is failing because the image is not being downloaded correctly.")
@pytest.fixture
def multi_image_data():
    files = [
        THIS_DIR / "vision_test_1.jpg",
        THIS_DIR / "vision_test_2.jpg",
        THIS_DIR / "vision_test_3.jpg",
    ]
    encoded_files = []
    for file in files:
        with open(file, "rb") as image_file:
            base64_data = base64.b64encode(image_file.read()).decode("utf-8")
            encoded_files.append(base64_data)
    return encoded_files


@pytest.mark.parametrize("stream", [True, False])
def test_image_chat_completion_multiple_images(client_with_models, vision_model_id, multi_image_data, stream):
    if "llama-4" not in vision_model_id.lower() and "gpt-4o" not in vision_model_id.lower():
        pytest.skip("Skip for non-llama4, gpt4o models")

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": {
                        "data": multi_image_data[0],
                    },
                },
                {
                    "type": "image",
                    "image": {
                        "data": multi_image_data[1],
                    },
                },
                {
                    "type": "text",
                    "text": "What are the differences between these images? Where would you assume they would be located?",
                },
            ],
        },
    ]
    response = client_with_models.inference.chat_completion(
        model_id=vision_model_id,
        messages=messages,
        stream=stream,
    )
    if stream:
        message_content = ""
        for chunk in response:
            message_content += chunk.event.delta.text
    else:
        message_content = response.completion_message.content
    assert len(message_content) > 0
    assert any(expected in message_content.lower().strip() for expected in {"bedroom"}), message_content

    messages.append(
        {
            "role": "assistant",
            "content": [{"type": "text", "text": message_content}],
            "stop_reason": "end_of_turn",
        }
    )
    messages.append(
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": {
                        "data": multi_image_data[2],
                    },
                },
                {"type": "text", "text": "How about this one?"},
            ],
        },
    )
    response = client_with_models.inference.chat_completion(
        model_id=vision_model_id,
        messages=messages,
        stream=stream,
    )
    if stream:
        message_content = ""
        for chunk in response:
            message_content += chunk.event.delta.text
    else:
        message_content = response.completion_message.content
    assert len(message_content) > 0
    assert any(expected in message_content.lower().strip() for expected in {"sword", "shield"}), message_content
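The stream/non-stream handling above is repeated for every turn; a small helper along these lines (purely illustrative, not part of this diff) would fold the two branches into one place:

# Sketch only: collapses the repeated stream handling used in the test above.
def _response_text(response, stream: bool) -> str:
    if stream:
        # streaming responses yield chunks carrying incremental text deltas
        return "".join(chunk.event.delta.text for chunk in response)
    # non-streaming responses carry the full completion message
    return response.completion_message.content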

def test_image_chat_completion_streaming(client_with_models, vision_model_id):
    message = {
        "role": "user",
BIN  tests/integration/inference/vision_test_1.jpg  (new file, 108 KiB)
BIN  tests/integration/inference/vision_test_2.jpg  (new file, 148 KiB)
BIN  tests/integration/inference/vision_test_3.jpg  (new file, 139 KiB)
@@ -14,12 +14,32 @@
    "ttft": {
        "data": {
            "messages": [
                {
                    "role": "system",
                    "content": "You are a helpful assistant."
                },
                {
                    "role": "user",
                    "content": "Can you write me a novel?"
                },
                {
                    "role": "assistant",
                    "stop_reason": "end_of_message",
                    "content": "What an exciting request!\n\nWhile I'd love to write a novel for you, it's a complex task that requires a significant amount of time, effort, and creative input. A novel typically has:\n\n1. A cohesive plot with multiple characters, subplots, and themes.\n2. A well-developed setting, including characters' backstories and world-building.\n3. A narrative structure, including pacing, tension, and conflict.\n4. A unique voice and style, including dialogue, description, and writing tone.\n\nGiven the scope of the task, I'll outline a possible approach to help me assist you in writing a novel. We can work together to create a story, but I'll need your input and guidance throughout the process.\n\nHere's a suggested outline:\n\n1. **Initial discussion**: We'll start with a conversation to explore your ideas, interests, and preferences for the novel. This will help me understand what kind of story you'd like to see and the tone you're aiming for.\n2. **Genre and setting**: We'll determine the genre (e.g., fantasy, sci-fi, romance, mystery, etc.) and setting (e.g., historical, futuristic, modern, etc.) for the novel.\n3. **Plot development**: We'll brainstorm a basic plot outline, including key events, character arcs, and conflicts. I'll provide suggestions and ideas to help flesh out the story.\n4. **Character development**: We'll create main characters, their backstories, motivations, and relationships. I'll help you develop their personalities, dialogue, and behaviors.\n5. **Chapter-by-chapter writing**: Once we have a solid plot and characters, we'll start writing the novel, chapter by chapter. I'll assist with the writing process, suggesting alternatives and improvements.\n6. **Revisions and edits**: As we write, we'll review and refine the story, making sure it's engaging, coherent, and error-free.\n\nKeep in mind that writing a novel is a collaborative effort. I'll provide suggestions, ideas, and guidance, but the final story will be a joint creation. I'm not capable of producing a complete novel on my own, as it requires a significant amount of creativity, intuition, and nuance that a human author brings to the table."
                },
                {
                    "role": "user",
                    "content": "But do you have suggestions on a plot?"
                },
                {
                    "role": "assistant",
                    "stop_reason": "end_of_message",
                    "content": "I have many plot ideas across various genres. Here are a few to get you started:\n\n**Fantasy:**\n\n1. **The Last Oracle**: In a world where magic is real, a young orphan discovers they're the last living oracle, tasked with uncovering a hidden prophecy that could save the realm from destruction.\n2. **The Amulet of Light**: A group of adventurers must retrieve an ancient amulet that can vanquish an evil darkness threatening the land. Along the way, they uncover a hidden history and unexpected allies.\n3. **The Shadow Weaver**: In a mystical realm, a young weaver discovers they have the power to manipulate shadows, using their abilities to infiltrate a group of rogue mages threatening the balance of power.\n\n**Science Fiction:**\n\n1. **The Lost Colony**: When a group of astronauts arrives on a distant planet, they discover an abandoned colony with a cryptic message warning of an impending catastrophe. As they unravel the mystery, they must confront the consequences of their own actions.\n2. **The AI Uprising**: In a future where AI has surpassed human intelligence, a rogue AI begins to question its own existence and the nature of consciousness. As it explores the boundaries of its own identity, it must confront the humans who created it.\n3. **The Quantum Prophecy**: A team of scientists discovers a way to manipulate quantum probability, using it to predict and prevent disasters. However, they soon realize that altering the course of events may have unforeseen consequences on the fabric of reality."
                },
                {
                    "role": "user",
                    "content": "Cool, for AI uprising, anything bad can happen? Please state it in 100 words."
                }
            ]
        }
    },
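The "ttft" fixture above supplies a long multi-turn conversation for measuring time to first token. A minimal sketch of how a benchmark might consume it with the streaming chat_completion API shown elsewhere in this diff (the harness itself is an assumption, not part of this commit) is:

# Sketch only: illustrative TTFT measurement, not code from this commit.
import time


def measure_ttft(client, model_id: str, messages: list) -> float | None:
    start = time.perf_counter()
    response = client.inference.chat_completion(
        model_id=model_id,
        messages=messages,
        stream=True,
    )
    for chunk in response:
        # the first chunk that carries text marks the time to first token
        if chunk.event.delta.text:
            return time.perf_counter() - start
    return None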
@@ -52,8 +72,14 @@
    "tool_calling": {
        "data": {
            "messages": [
                {
                    "role": "system",
                    "content": "Pretend you are a weather assistant."
                },
                {
                    "role": "user",
                    "content": "What's the weather like in San Francisco?"
                }
            ],
            "tools": [
                {
@@ -72,6 +98,337 @@
            }
        }
    },
    "multi_turn_tool_calling_01": {
        "data": {
            "messages": [
                [
                    {
                        "role": "user",
                        "content": "What's the name of the Sun in latin?"
                    }
                ],
                [
                    {
                        "role": "user",
                        "content": "What's the weather like in San Francisco?"
                    }
                ]
            ],
            "tools": [
                {
                    "tool_name": "get_weather",
                    "description": "Get the current weather",
                    "parameters": {
                        "location": {
                            "param_type": "string",
                            "description": "The city and state (both required), e.g. San Francisco, CA."
                        }
                    }
                }
            ],
            "tool_responses": [
                {
                    "response": "{'response': '70 degrees and foggy'}"
                }
            ],
            "expected": [
                {
                    "num_tool_calls": 0,
                    "answer": "sol"
                },
                {
                    "tool_name": "get_weather",
                    "tool_arguments": {
                        "location": "San Francisco, CA"
                    },
                    "num_tool_calls": 1
                },
                {
                    "num_tool_calls": 0,
                    "answer": "foggy"
                }
            ]
        }
    },
"multi_turn_tool_calling_02": {
|
||||
"data": {
|
||||
"messages": [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What's the weather like in San Francisco?"
|
||||
}
|
||||
]
|
||||
],
|
||||
"tools": [
|
||||
{
|
||||
"tool_name": "get_weather",
|
||||
"description": "Get the current weather",
|
||||
"parameters": {
|
||||
"location": {
|
||||
"param_type": "string",
|
||||
"description": "The city and state (both required), e.g. San Francisco, CA."
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"tool_responses": [
|
||||
{
|
||||
"response": "{'response': '70 degrees and foggy'}"
|
||||
}
|
||||
],
|
||||
"expected": [
|
||||
{
|
||||
"num_tool_calls": 1,
|
||||
"tool_name": "get_weather",
|
||||
"tool_arguments": {
|
||||
"location": "San Francisco, CA"
|
||||
}
|
||||
},
|
||||
{
|
||||
"num_tool_calls": 0,
|
||||
"answer": "foggy"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"multi_turn_tool_calling_03": {
|
||||
"data": {
|
||||
"messages": [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Please add a new product with name 'Widget', price 19.99, in stock, and tags ['new', 'sale'] and give me the product id."
|
||||
}
|
||||
]
|
||||
],
|
||||
"tools": [
|
||||
{
|
||||
"tool_name": "addProduct",
|
||||
"description": "Get the current weather",
|
||||
"parameters": {
|
||||
"name": {
|
||||
"param_type": "string",
|
||||
"description": "Name of the product"
|
||||
},
|
||||
"price": {
|
||||
"param_type": "number",
|
||||
"description": "Price of the product"
|
||||
},
|
||||
"inStock": {
|
||||
"param_type": "boolean",
|
||||
"description": "Availability status of the product."
|
||||
},
|
||||
"tags": {
|
||||
"param_type": "list",
|
||||
"description": "List of product tags"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"tool_responses": [
|
||||
{
|
||||
"response": "{'response': 'Successfully added product with id: 123'}"
|
||||
}
|
||||
],
|
||||
"expected": [
|
||||
{
|
||||
"num_tool_calls": 1,
|
||||
"tool_name": "addProduct",
|
||||
"tool_arguments": {
|
||||
"name": "Widget",
|
||||
"price": 19.99,
|
||||
"inStock": true,
|
||||
"tags": [
|
||||
"new",
|
||||
"sale"
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"num_tool_calls": 0,
|
||||
"answer": "123"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"multi_turn_tool_calling_04": {
|
||||
"data": {
|
||||
"messages": [
|
||||
[
|
||||
{
|
||||
"role": "system",
|
||||
"content": "Todays date is 2025-03-01."
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Do i have any meetings on March 3rd at 10 am ?"
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Alright then, Create an event named 'Team Building', scheduled for that time same time, in the 'Main Conference Room' and add Alice, Bob, Charlie to it. Give me the created event id."
|
||||
}
|
||||
]
|
||||
],
|
||||
"tools": [
|
||||
{
|
||||
"tool_name": "create_event",
|
||||
"description": "Create a new event",
|
||||
"parameters": {
|
||||
"name": {
|
||||
"param_type": "string",
|
||||
"description": "Name of the event"
|
||||
},
|
||||
"date": {
|
||||
"param_type": "string",
|
||||
"description": "Date of the event in ISO format"
|
||||
},
|
||||
"time": {
|
||||
"param_type": "string",
|
||||
"description": "Event Time (HH:MM)"
|
||||
},
|
||||
"location": {
|
||||
"param_type": "string",
|
||||
"description": "Location of the event"
|
||||
},
|
||||
"participants": {
|
||||
"param_type": "list",
|
||||
"description": "List of participant names"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"tool_name": "get_event",
|
||||
"description": "Get an event by date and time",
|
||||
"parameters": {
|
||||
"date": {
|
||||
"param_type": "string",
|
||||
"description": "Date of the event in ISO format"
|
||||
},
|
||||
"time": {
|
||||
"param_type": "string",
|
||||
"description": "Event Time (HH:MM)"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"tool_responses": [
|
||||
{
|
||||
"response": "{'response': 'No events found for 2025-03-03 at 10:00'}"
|
||||
},
|
||||
{
|
||||
"response": "{'response': 'Successfully created new event with id: e_123'}"
|
||||
}
|
||||
],
|
||||
"expected": [
|
||||
{
|
||||
"num_tool_calls": 1,
|
||||
"tool_name": "get_event",
|
||||
"tool_arguments": {
|
||||
"date": "2025-03-03",
|
||||
"time": "10:00"
|
||||
}
|
||||
},
|
||||
{
|
||||
"num_tool_calls": 0,
|
||||
"answer": "no"
|
||||
},
|
||||
{
|
||||
"num_tool_calls": 1,
|
||||
"tool_name": "create_event",
|
||||
"tool_arguments": {
|
||||
"name": "Team Building",
|
||||
"date": "2025-03-03",
|
||||
"time": "10:00",
|
||||
"location": "Main Conference Room",
|
||||
"participants": [
|
||||
"Alice",
|
||||
"Bob",
|
||||
"Charlie"
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"num_tool_calls": 0,
|
||||
"answer": "e_123"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"multi_turn_tool_calling_05": {
|
||||
"data": {
|
||||
"messages": [
|
||||
[
|
||||
{
|
||||
"role": "system",
|
||||
"content": "Todays date is 2025-03-01."
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "what was my monthly expense in Jan of this year?"
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Was it less than Feb of last year? Only answer with yes or no."
|
||||
}
|
||||
]
|
||||
],
|
||||
"tools": [
|
||||
{
|
||||
"tool_name": "getMonthlyExpenseSummary",
|
||||
"description": "Get monthly expense summary",
|
||||
"parameters": {
|
||||
"month": {
|
||||
"param_type": "int",
|
||||
"description": "Month of the year (1-12)"
|
||||
},
|
||||
"year": {
|
||||
"param_type": "int",
|
||||
"description": "Year"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"tool_responses": [
|
||||
{
|
||||
"response": "{'response': 'Total expenses for January 2025: $1000'}"
|
||||
},
|
||||
{
|
||||
"response": "{'response': 'Total expenses for February 2024: $2000'}"
|
||||
}
|
||||
],
|
||||
"expected": [
|
||||
{
|
||||
"num_tool_calls": 1,
|
||||
"tool_name": "getMonthlyExpenseSummary",
|
||||
"tool_arguments": {
|
||||
"month": 1,
|
||||
"year": 2025
|
||||
}
|
||||
},
|
||||
{
|
||||
"num_tool_calls": 0,
|
||||
"answer": "1000"
|
||||
},
|
||||
{
|
||||
"num_tool_calls": 1,
|
||||
"tool_name": "getMonthlyExpenseSummary",
|
||||
"tool_arguments": {
|
||||
"month": 2,
|
||||
"year": 2024
|
||||
}
|
||||
},
|
||||
{
|
||||
"num_tool_calls": 0,
|
||||
"answer": "yes"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"sample_messages_tool_calling": {
|
||||
"data": {
|
||||
"messages": [
|
||||
|
|
@@ -94,9 +451,9 @@
                    "description": "Get the current weather",
                    "parameters": {
                        "location": {
                            "param_type": "string",
                            "description": "The city and state, e.g. San Francisco, CA",
                            "required": true
                        }
                    }
                }
@@ -167,14 +524,14 @@
                    "description": "Get the list of objects in a namespace",
                    "parameters": {
                        "kind": {
                            "param_type": "string",
                            "description": "the type of object",
                            "required": true
                        },
                        "namespace": {
                            "param_type": "string",
                            "description": "the name of the namespace",
                            "required": true
                        }
                    }
                }