diff --git a/docs/source/distributions/self_hosted_distro/fireworks.md b/docs/source/distributions/self_hosted_distro/fireworks.md index 1fcd6f7af..9592a18fe 100644 --- a/docs/source/distributions/self_hosted_distro/fireworks.md +++ b/docs/source/distributions/self_hosted_distro/fireworks.md @@ -22,7 +22,7 @@ The `llamastack/distribution-fireworks` distribution consists of the following p | safety | `inline::llama-guard` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | -| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` | +| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `remote::wolfram-alpha`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` | | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | diff --git a/docs/source/distributions/self_hosted_distro/ollama.md b/docs/source/distributions/self_hosted_distro/ollama.md index 8f23cef43..fb3f9164a 100644 --- a/docs/source/distributions/self_hosted_distro/ollama.md +++ b/docs/source/distributions/self_hosted_distro/ollama.md @@ -22,7 +22,7 @@ The `llamastack/distribution-ollama` distribution consists of the following prov | safety | `inline::llama-guard` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | -| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` | +| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` | | vector_io | `inline::sqlite-vec`, `remote::chromadb`, `remote::pgvector` | diff --git a/docs/source/distributions/self_hosted_distro/remote-vllm.md b/docs/source/distributions/self_hosted_distro/remote-vllm.md index 01f38807b..b7e155385 100644 --- a/docs/source/distributions/self_hosted_distro/remote-vllm.md +++ b/docs/source/distributions/self_hosted_distro/remote-vllm.md @@ -21,7 +21,7 @@ The `llamastack/distribution-remote-vllm` distribution consists of the following | safety | `inline::llama-guard` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | -| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` | +| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` | | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | diff --git a/docs/source/distributions/self_hosted_distro/together.md b/docs/source/distributions/self_hosted_distro/together.md index f361e93c7..fa02199b0 100644 --- a/docs/source/distributions/self_hosted_distro/together.md +++ b/docs/source/distributions/self_hosted_distro/together.md @@ -22,7 +22,7 @@ The `llamastack/distribution-together` distribution consists of the following pr | safety | `inline::llama-guard` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | -| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` | +| tool_runtime | 
`remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` | | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 80e9ecb7c..73f9c9672 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -366,7 +366,7 @@ class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets): provider_id = list(self.impls_by_provider_id.keys())[0] else: raise ValueError( - "No provider specified and multiple providers available. Please specify a provider_id." + f"No provider specified and multiple providers available. Please specify a provider_id. Available providers: {self.impls_by_provider_id.keys()}" ) if metadata is None: metadata = {} diff --git a/llama_stack/providers/tests/env.py b/llama_stack/env.py similarity index 100% rename from llama_stack/providers/tests/env.py rename to llama_stack/env.py diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 921beac27..f868bee2c 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -12,7 +12,7 @@ import secrets import string import uuid from datetime import datetime -from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple +from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union from urllib.parse import urlparse import httpx @@ -31,7 +31,6 @@ from llama_stack.apis.agents import ( AgentTurnResponseStreamChunk, AgentTurnResponseTurnAwaitingInputPayload, AgentTurnResponseTurnCompletePayload, - AgentTurnResponseTurnStartPayload, AgentTurnResumeRequest, Attachment, Document, @@ -184,115 +183,49 @@ class ChatAgent(ShieldRunnerMixin): span.set_attribute("session_id", request.session_id) span.set_attribute("agent_id", self.agent_id) span.set_attribute("request", request.model_dump_json()) - assert request.stream is True, "Non-streaming not supported" - - session_info = await self.storage.get_session_info(request.session_id) - if session_info is None: - raise ValueError(f"Session {request.session_id} not found") - - turns = await self.storage.get_session_turns(request.session_id) - messages = await self.get_messages_from_turns(turns) - messages.extend(request.messages) - turn_id = str(uuid.uuid4()) span.set_attribute("turn_id", turn_id) - start_time = datetime.now().astimezone().isoformat() - yield AgentTurnResponseStreamChunk( - event=AgentTurnResponseEvent( - payload=AgentTurnResponseTurnStartPayload( - turn_id=turn_id, - ) - ) - ) - - steps = [] - output_message = None - async for chunk in self.run( - session_id=request.session_id, - turn_id=turn_id, - input_messages=messages, - sampling_params=self.agent_config.sampling_params, - stream=request.stream, - documents=request.documents, - toolgroups_for_turn=request.toolgroups, - ): - if isinstance(chunk, CompletionMessage): - logcat.info( - "agents", - f"returning result from the agent turn: {chunk}", - ) - output_message = chunk - continue - - assert isinstance(chunk, AgentTurnResponseStreamChunk), f"Unexpected type {type(chunk)}" - event = chunk.event - if event.payload.event_type == AgentTurnResponseEventType.step_complete.value: - steps.append(event.payload.step_details) - + 
async for chunk in self._run_turn(request, turn_id): yield chunk - assert output_message is not None - - turn = Turn( - turn_id=turn_id, - session_id=request.session_id, - input_messages=request.messages, - output_message=output_message, - started_at=start_time, - completed_at=datetime.now().astimezone().isoformat(), - steps=steps, - ) - await self.storage.add_turn_to_session(request.session_id, turn) - if output_message.tool_calls: - chunk = AgentTurnResponseStreamChunk( - event=AgentTurnResponseEvent( - payload=AgentTurnResponseTurnAwaitingInputPayload( - turn=turn, - ) - ) - ) - else: - chunk = AgentTurnResponseStreamChunk( - event=AgentTurnResponseEvent( - payload=AgentTurnResponseTurnCompletePayload( - turn=turn, - ) - ) - ) - - yield chunk - async def resume_turn(self, request: AgentTurnResumeRequest) -> AsyncGenerator: with tracing.span("resume_turn") as span: span.set_attribute("agent_id", self.agent_id) span.set_attribute("session_id", request.session_id) span.set_attribute("turn_id", request.turn_id) span.set_attribute("request", request.model_dump_json()) - assert request.stream is True, "Non-streaming not supported" + async for chunk in self._run_turn(request): + yield chunk - session_info = await self.storage.get_session_info(request.session_id) - if session_info is None: - raise ValueError(f"Session {request.session_id} not found") + async def _run_turn( + self, + request: Union[AgentTurnCreateRequest, AgentTurnResumeRequest], + turn_id: Optional[str] = None, + ) -> AsyncGenerator: + assert request.stream is True, "Non-streaming not supported" - turns = await self.storage.get_session_turns(request.session_id) - if len(turns) == 0: - raise ValueError("No turns found for session") + is_resume = isinstance(request, AgentTurnResumeRequest) + session_info = await self.storage.get_session_info(request.session_id) + if session_info is None: + raise ValueError(f"Session {request.session_id} not found") - messages = await self.get_messages_from_turns(turns) + turns = await self.storage.get_session_turns(request.session_id) + if is_resume and len(turns) == 0: + raise ValueError("No turns found for session") + + steps = [] + messages = await self.get_messages_from_turns(turns) + if is_resume: messages.extend(request.tool_responses) - last_turn = turns[-1] last_turn_messages = self.turn_to_messages(last_turn) last_turn_messages = [ x for x in last_turn_messages if isinstance(x, UserMessage) or isinstance(x, ToolResponseMessage) ] - - # TODO: figure out whether we should add the tool responses to the last turn messages last_turn_messages.extend(request.tool_responses) - # get the steps from the turn id - steps = [] - steps = turns[-1].steps + # get steps from the turn + steps = last_turn.steps # mark tool execution step as complete # if there's no tool execution in progress step (due to storage, or tool call parsing on client), @@ -326,62 +259,67 @@ class ChatAgent(ShieldRunnerMixin): ) ) ) + input_messages = last_turn_messages - output_message = None - async for chunk in self.run( - session_id=request.session_id, - turn_id=request.turn_id, - input_messages=messages, - sampling_params=self.agent_config.sampling_params, - stream=request.stream, - ): - if isinstance(chunk, CompletionMessage): - output_message = chunk - continue + turn_id = request.turn_id + start_time = last_turn.started_at + else: + messages.extend(request.messages) + start_time = datetime.now().astimezone().isoformat() + input_messages = request.messages - assert isinstance(chunk, AgentTurnResponseStreamChunk), 
f"Unexpected type {type(chunk)}" - event = chunk.event - if event.payload.event_type == AgentTurnResponseEventType.step_complete.value: - steps.append(event.payload.step_details) + output_message = None + async for chunk in self.run( + session_id=request.session_id, + turn_id=turn_id, + input_messages=messages, + sampling_params=self.agent_config.sampling_params, + stream=request.stream, + documents=request.documents if not is_resume else None, + toolgroups_for_turn=request.toolgroups if not is_resume else None, + ): + if isinstance(chunk, CompletionMessage): + output_message = chunk + continue - yield chunk - - assert output_message is not None - - last_turn_start_time = datetime.now().astimezone().isoformat() - if len(turns) > 0: - last_turn_start_time = turns[-1].started_at - - turn = Turn( - turn_id=request.turn_id, - session_id=request.session_id, - input_messages=last_turn_messages, - output_message=output_message, - started_at=last_turn_start_time, - completed_at=datetime.now().astimezone().isoformat(), - steps=steps, - ) - await self.storage.add_turn_to_session(request.session_id, turn) - - if output_message.tool_calls: - chunk = AgentTurnResponseStreamChunk( - event=AgentTurnResponseEvent( - payload=AgentTurnResponseTurnAwaitingInputPayload( - turn=turn, - ) - ) - ) - else: - chunk = AgentTurnResponseStreamChunk( - event=AgentTurnResponseEvent( - payload=AgentTurnResponseTurnCompletePayload( - turn=turn, - ) - ) - ) + assert isinstance(chunk, AgentTurnResponseStreamChunk), f"Unexpected type {type(chunk)}" + event = chunk.event + if event.payload.event_type == AgentTurnResponseEventType.step_complete.value: + steps.append(event.payload.step_details) yield chunk + assert output_message is not None + + turn = Turn( + turn_id=turn_id, + session_id=request.session_id, + input_messages=input_messages, + output_message=output_message, + started_at=start_time, + completed_at=datetime.now().astimezone().isoformat(), + steps=steps, + ) + await self.storage.add_turn_to_session(request.session_id, turn) + if output_message.tool_calls: + chunk = AgentTurnResponseStreamChunk( + event=AgentTurnResponseEvent( + payload=AgentTurnResponseTurnAwaitingInputPayload( + turn=turn, + ) + ) + ) + else: + chunk = AgentTurnResponseStreamChunk( + event=AgentTurnResponseEvent( + payload=AgentTurnResponseTurnCompletePayload( + turn=turn, + ) + ) + ) + + yield chunk + async def run( self, session_id: str, diff --git a/llama_stack/providers/tests/datasetio/conftest.py b/llama_stack/providers/tests/datasetio/conftest.py deleted file mode 100644 index 740eddb33..000000000 --- a/llama_stack/providers/tests/datasetio/conftest.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -import pytest - -from .fixtures import DATASETIO_FIXTURES - - -def pytest_configure(config): - for fixture_name in DATASETIO_FIXTURES: - config.addinivalue_line( - "markers", - f"{fixture_name}: marks tests as {fixture_name} specific", - ) - - -def pytest_generate_tests(metafunc): - if "datasetio_stack" in metafunc.fixturenames: - metafunc.parametrize( - "datasetio_stack", - [ - pytest.param(fixture_name, marks=getattr(pytest.mark, fixture_name)) - for fixture_name in DATASETIO_FIXTURES - ], - indirect=True, - ) diff --git a/llama_stack/providers/tests/datasetio/fixtures.py b/llama_stack/providers/tests/datasetio/fixtures.py deleted file mode 100644 index 27aedb645..000000000 --- a/llama_stack/providers/tests/datasetio/fixtures.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -import pytest -import pytest_asyncio - -from llama_stack.distribution.datatypes import Api, Provider -from llama_stack.providers.tests.resolver import construct_stack_for_test - -from ..conftest import ProviderFixture, remote_stack_fixture - - -@pytest.fixture(scope="session") -def datasetio_remote() -> ProviderFixture: - return remote_stack_fixture() - - -@pytest.fixture(scope="session") -def datasetio_localfs() -> ProviderFixture: - return ProviderFixture( - providers=[ - Provider( - provider_id="localfs", - provider_type="inline::localfs", - config={}, - ) - ], - ) - - -@pytest.fixture(scope="session") -def datasetio_huggingface() -> ProviderFixture: - return ProviderFixture( - providers=[ - Provider( - provider_id="huggingface", - provider_type="remote::huggingface", - config={}, - ) - ], - ) - - -DATASETIO_FIXTURES = ["localfs", "remote", "huggingface"] - - -@pytest_asyncio.fixture(scope="session") -async def datasetio_stack(request): - fixture_name = request.param - fixture = request.getfixturevalue(f"datasetio_{fixture_name}") - - test_stack = await construct_stack_for_test( - [Api.datasetio], - {"datasetio": fixture.providers}, - fixture.provider_data, - ) - - return test_stack.impls[Api.datasetio], test_stack.impls[Api.datasets] diff --git a/llama_stack/providers/tests/datasetio/test_datasetio.py b/llama_stack/providers/tests/datasetio/test_datasetio.py deleted file mode 100644 index fd76bafe0..000000000 --- a/llama_stack/providers/tests/datasetio/test_datasetio.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
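The removed datasetio conftest drives one test suite against several provider backends through `pytest_generate_tests` with `indirect=True`. A self-contained sketch of that mechanism, with an illustrative fixture body in place of the real stack construction:

```python
import pytest

DATASETIO_FIXTURES = ["localfs", "remote", "huggingface"]


def pytest_generate_tests(metafunc):
    # Parametrize the fixture (not the test) over the available backends,
    # attaching a marker per backend so runs can be filtered with `-m`.
    if "datasetio_stack" in metafunc.fixturenames:
        metafunc.parametrize(
            "datasetio_stack",
            [pytest.param(name, marks=getattr(pytest.mark, name)) for name in DATASETIO_FIXTURES],
            indirect=True,
        )


@pytest.fixture
def datasetio_stack(request):
    # The real fixture resolved request.param to a provider-backed stack;
    # a plain string stands in for it here.
    return f"stack::{request.param}"


def test_runs_once_per_backend(datasetio_stack):
    assert datasetio_stack.split("::")[1] in DATASETIO_FIXTURES
```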
- -import base64 -import mimetypes -import os -from pathlib import Path - -import pytest - -from llama_stack.apis.common.content_types import URL -from llama_stack.apis.common.type_system import ChatCompletionInputType, StringType -from llama_stack.apis.datasets import Datasets - -# How to run this test: -# -# pytest llama_stack/providers/tests/datasetio/test_datasetio.py -# -m "meta_reference" -# -v -s --tb=short --disable-warnings - - -def data_url_from_file(file_path: str) -> str: - if not os.path.exists(file_path): - raise FileNotFoundError(f"File not found: {file_path}") - - with open(file_path, "rb") as file: - file_content = file.read() - - base64_content = base64.b64encode(file_content).decode("utf-8") - mime_type, _ = mimetypes.guess_type(file_path) - - data_url = f"data:{mime_type};base64,{base64_content}" - - return data_url - - -async def register_dataset( - datasets_impl: Datasets, - for_generation=False, - for_rag=False, - dataset_id="test_dataset", -): - if for_rag: - test_file = Path(os.path.abspath(__file__)).parent / "test_rag_dataset.csv" - else: - test_file = Path(os.path.abspath(__file__)).parent / "test_dataset.csv" - test_url = data_url_from_file(str(test_file)) - - if for_generation: - dataset_schema = { - "expected_answer": StringType(), - "input_query": StringType(), - "chat_completion_input": ChatCompletionInputType(), - } - elif for_rag: - dataset_schema = { - "expected_answer": StringType(), - "input_query": StringType(), - "generated_answer": StringType(), - "context": StringType(), - } - else: - dataset_schema = { - "expected_answer": StringType(), - "input_query": StringType(), - "generated_answer": StringType(), - } - - await datasets_impl.register_dataset( - dataset_id=dataset_id, - dataset_schema=dataset_schema, - url=URL(uri=test_url), - ) - - -class TestDatasetIO: - @pytest.mark.asyncio - async def test_datasets_list(self, datasetio_stack): - # NOTE: this needs you to ensure that you are starting from a clean state - # but so far we don't have an unregister API unfortunately, so be careful - _, datasets_impl = datasetio_stack - response = await datasets_impl.list_datasets() - assert isinstance(response, list) - assert len(response) == 0 - - @pytest.mark.asyncio - async def test_register_dataset(self, datasetio_stack): - _, datasets_impl = datasetio_stack - await register_dataset(datasets_impl) - response = await datasets_impl.list_datasets() - assert isinstance(response, list) - assert len(response) == 1 - assert response[0].identifier == "test_dataset" - - with pytest.raises(ValueError): - # unregister a dataset that does not exist - await datasets_impl.unregister_dataset("test_dataset2") - - await datasets_impl.unregister_dataset("test_dataset") - response = await datasets_impl.list_datasets() - assert isinstance(response, list) - assert len(response) == 0 - - with pytest.raises(ValueError): - await datasets_impl.unregister_dataset("test_dataset") - - @pytest.mark.asyncio - async def test_get_rows_paginated(self, datasetio_stack): - datasetio_impl, datasets_impl = datasetio_stack - await register_dataset(datasets_impl) - response = await datasetio_impl.get_rows_paginated( - dataset_id="test_dataset", - rows_in_page=3, - ) - assert isinstance(response.rows, list) - assert len(response.rows) == 3 - assert response.next_page_token == "3" - - provider = datasetio_impl.routing_table.get_provider_impl("test_dataset") - if provider.__provider_spec__.provider_type == "remote": - pytest.skip("remote provider doesn't support get_rows_paginated") - - # iterate 
over all rows - response = await datasetio_impl.get_rows_paginated( - dataset_id="test_dataset", - rows_in_page=2, - page_token=response.next_page_token, - ) - assert isinstance(response.rows, list) - assert len(response.rows) == 2 - assert response.next_page_token == "5" diff --git a/llama_stack/providers/tests/eval/conftest.py b/llama_stack/providers/tests/eval/conftest.py deleted file mode 100644 index c1da6ba42..000000000 --- a/llama_stack/providers/tests/eval/conftest.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -import pytest - -from ..agents.fixtures import AGENTS_FIXTURES -from ..conftest import get_provider_fixture_overrides -from ..datasetio.fixtures import DATASETIO_FIXTURES -from ..inference.fixtures import INFERENCE_FIXTURES -from ..safety.fixtures import SAFETY_FIXTURES -from ..scoring.fixtures import SCORING_FIXTURES -from ..tools.fixtures import TOOL_RUNTIME_FIXTURES -from ..vector_io.fixtures import VECTOR_IO_FIXTURES -from .fixtures import EVAL_FIXTURES - -DEFAULT_PROVIDER_COMBINATIONS = [ - pytest.param( - { - "eval": "meta_reference", - "scoring": "basic", - "datasetio": "localfs", - "inference": "fireworks", - "agents": "meta_reference", - "safety": "llama_guard", - "vector_io": "faiss", - "tool_runtime": "memory_and_search", - }, - id="meta_reference_eval_fireworks_inference", - marks=pytest.mark.meta_reference_eval_fireworks_inference, - ), - pytest.param( - { - "eval": "meta_reference", - "scoring": "basic", - "datasetio": "localfs", - "inference": "together", - "agents": "meta_reference", - "safety": "llama_guard", - "vector_io": "faiss", - "tool_runtime": "memory_and_search", - }, - id="meta_reference_eval_together_inference", - marks=pytest.mark.meta_reference_eval_together_inference, - ), - pytest.param( - { - "eval": "meta_reference", - "scoring": "basic", - "datasetio": "huggingface", - "inference": "together", - "agents": "meta_reference", - "safety": "llama_guard", - "vector_io": "faiss", - "tool_runtime": "memory_and_search", - }, - id="meta_reference_eval_together_inference_huggingface_datasetio", - marks=pytest.mark.meta_reference_eval_together_inference_huggingface_datasetio, - ), -] - - -def pytest_configure(config): - for fixture_name in [ - "meta_reference_eval_fireworks_inference", - "meta_reference_eval_together_inference", - "meta_reference_eval_together_inference_huggingface_datasetio", - ]: - config.addinivalue_line( - "markers", - f"{fixture_name}: marks tests as {fixture_name} specific", - ) - - -def pytest_generate_tests(metafunc): - if "eval_stack" in metafunc.fixturenames: - available_fixtures = { - "eval": EVAL_FIXTURES, - "scoring": SCORING_FIXTURES, - "datasetio": DATASETIO_FIXTURES, - "inference": INFERENCE_FIXTURES, - "agents": AGENTS_FIXTURES, - "safety": SAFETY_FIXTURES, - "vector_io": VECTOR_IO_FIXTURES, - "tool_runtime": TOOL_RUNTIME_FIXTURES, - } - combinations = ( - get_provider_fixture_overrides(metafunc.config, available_fixtures) or DEFAULT_PROVIDER_COMBINATIONS - ) - metafunc.parametrize("eval_stack", combinations, indirect=True) diff --git a/llama_stack/providers/tests/eval/fixtures.py b/llama_stack/providers/tests/eval/fixtures.py deleted file mode 100644 index c6d15bbf5..000000000 --- a/llama_stack/providers/tests/eval/fixtures.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. 
-# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -import pytest -import pytest_asyncio - -from llama_stack.distribution.datatypes import Api, ModelInput, Provider -from llama_stack.providers.tests.resolver import construct_stack_for_test - -from ..conftest import ProviderFixture, remote_stack_fixture - - -@pytest.fixture(scope="session") -def eval_remote() -> ProviderFixture: - return remote_stack_fixture() - - -@pytest.fixture(scope="session") -def eval_meta_reference() -> ProviderFixture: - return ProviderFixture( - providers=[ - Provider( - provider_id="meta-reference", - provider_type="inline::meta-reference", - config={}, - ) - ], - ) - - -EVAL_FIXTURES = ["meta_reference", "remote"] - - -@pytest_asyncio.fixture(scope="session") -async def eval_stack( - request, - inference_model, - judge_model, - tool_group_input_memory, - tool_group_input_tavily_search, -): - fixture_dict = request.param - - providers = {} - provider_data = {} - for key in [ - "datasetio", - "eval", - "scoring", - "inference", - "agents", - "safety", - "vector_io", - "tool_runtime", - ]: - fixture = request.getfixturevalue(f"{key}_{fixture_dict[key]}") - providers[key] = fixture.providers - if fixture.provider_data: - provider_data.update(fixture.provider_data) - - test_stack = await construct_stack_for_test( - [ - Api.eval, - Api.datasetio, - Api.inference, - Api.scoring, - Api.agents, - Api.safety, - Api.vector_io, - Api.tool_runtime, - ], - providers, - provider_data, - models=[ - ModelInput(model_id=model) - for model in [ - inference_model, - judge_model, - ] - ], - tool_groups=[tool_group_input_memory, tool_group_input_tavily_search], - ) - - return test_stack.impls diff --git a/llama_stack/providers/tests/post_training/conftest.py b/llama_stack/providers/tests/post_training/conftest.py deleted file mode 100644 index b6d95444b..000000000 --- a/llama_stack/providers/tests/post_training/conftest.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
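The removed `test_get_rows_paginated` above walks the dataset with `rows_in_page` and `page_token`, expecting `next_page_token` values of "3" and then "5". A hedged sketch of draining every page with that API; the termination check assumes an empty page or missing token marks the end, which the deleted test does not pin down:

```python
async def fetch_all_rows(datasetio_impl, dataset_id: str, page_size: int = 3) -> list:
    rows: list = []
    page_token = None
    while True:
        response = await datasetio_impl.get_rows_paginated(
            dataset_id=dataset_id,
            rows_in_page=page_size,
            page_token=page_token,
        )
        rows.extend(response.rows)
        # Assumption: an empty page or absent next_page_token ends the iteration.
        if not response.rows or not response.next_page_token:
            break
        page_token = response.next_page_token
    return rows
```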
- -import pytest - -from ..conftest import get_provider_fixture_overrides -from ..datasetio.fixtures import DATASETIO_FIXTURES -from .fixtures import POST_TRAINING_FIXTURES - -DEFAULT_PROVIDER_COMBINATIONS = [ - pytest.param( - { - "post_training": "torchtune", - "datasetio": "huggingface", - }, - id="torchtune_post_training_huggingface_datasetio", - marks=pytest.mark.torchtune_post_training_huggingface_datasetio, - ), -] - - -def pytest_configure(config): - combined_fixtures = "torchtune_post_training_huggingface_datasetio" - config.addinivalue_line( - "markers", - f"{combined_fixtures}: marks tests as {combined_fixtures} specific", - ) - - -def pytest_generate_tests(metafunc): - if "post_training_stack" in metafunc.fixturenames: - available_fixtures = { - "eval": POST_TRAINING_FIXTURES, - "datasetio": DATASETIO_FIXTURES, - } - combinations = ( - get_provider_fixture_overrides(metafunc.config, available_fixtures) or DEFAULT_PROVIDER_COMBINATIONS - ) - metafunc.parametrize("post_training_stack", combinations, indirect=True) diff --git a/llama_stack/providers/tests/post_training/fixtures.py b/llama_stack/providers/tests/post_training/fixtures.py deleted file mode 100644 index 7c3ff3ddb..000000000 --- a/llama_stack/providers/tests/post_training/fixtures.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -import pytest -import pytest_asyncio - -from llama_stack.apis.common.content_types import URL -from llama_stack.apis.common.type_system import StringType -from llama_stack.apis.datasets import DatasetInput -from llama_stack.apis.models import ModelInput -from llama_stack.distribution.datatypes import Api, Provider -from llama_stack.providers.tests.resolver import construct_stack_for_test - -from ..conftest import ProviderFixture - - -@pytest.fixture(scope="session") -def post_training_torchtune() -> ProviderFixture: - return ProviderFixture( - providers=[ - Provider( - provider_id="torchtune", - provider_type="inline::torchtune", - config={}, - ) - ], - ) - - -POST_TRAINING_FIXTURES = ["torchtune"] - - -@pytest_asyncio.fixture(scope="session") -async def post_training_stack(request): - fixture_dict = request.param - - providers = {} - provider_data = {} - for key in ["post_training", "datasetio"]: - fixture = request.getfixturevalue(f"{key}_{fixture_dict[key]}") - providers[key] = fixture.providers - if fixture.provider_data: - provider_data.update(fixture.provider_data) - - test_stack = await construct_stack_for_test( - [Api.post_training, Api.datasetio], - providers, - provider_data, - models=[ModelInput(model_id="meta-llama/Llama-3.2-3B-Instruct")], - datasets=[ - DatasetInput( - dataset_id="alpaca", - provider_id="huggingface", - url=URL(uri="https://huggingface.co/datasets/tatsu-lab/alpaca"), - metadata={ - "path": "tatsu-lab/alpaca", - "split": "train", - }, - dataset_schema={ - "instruction": StringType(), - "input": StringType(), - "output": StringType(), - "text": StringType(), - }, - ), - ], - ) - - return test_stack.impls[Api.post_training] diff --git a/llama_stack/providers/tests/scoring/conftest.py b/llama_stack/providers/tests/scoring/conftest.py deleted file mode 100644 index 9278d3c2d..000000000 --- a/llama_stack/providers/tests/scoring/conftest.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. 
-# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -import pytest - -from ..conftest import get_provider_fixture_overrides -from ..datasetio.fixtures import DATASETIO_FIXTURES -from ..inference.fixtures import INFERENCE_FIXTURES -from .fixtures import SCORING_FIXTURES - -DEFAULT_PROVIDER_COMBINATIONS = [ - pytest.param( - { - "scoring": "basic", - "datasetio": "localfs", - "inference": "together", - }, - id="basic_scoring_together_inference", - marks=pytest.mark.basic_scoring_together_inference, - ), - pytest.param( - { - "scoring": "braintrust", - "datasetio": "localfs", - "inference": "together", - }, - id="braintrust_scoring_together_inference", - marks=pytest.mark.braintrust_scoring_together_inference, - ), - pytest.param( - { - "scoring": "llm_as_judge", - "datasetio": "localfs", - "inference": "together", - }, - id="llm_as_judge_scoring_together_inference", - marks=pytest.mark.llm_as_judge_scoring_together_inference, - ), -] - - -def pytest_configure(config): - for fixture_name in [ - "basic_scoring_together_inference", - "braintrust_scoring_together_inference", - "llm_as_judge_scoring_together_inference", - ]: - config.addinivalue_line( - "markers", - f"{fixture_name}: marks tests as {fixture_name} specific", - ) - - -def pytest_generate_tests(metafunc): - judge_model = metafunc.config.getoption("--judge-model") - if "judge_model" in metafunc.fixturenames: - metafunc.parametrize( - "judge_model", - [pytest.param(judge_model, id="")], - indirect=True, - ) - - if "scoring_stack" in metafunc.fixturenames: - available_fixtures = { - "scoring": SCORING_FIXTURES, - "datasetio": DATASETIO_FIXTURES, - "inference": INFERENCE_FIXTURES, - } - combinations = ( - get_provider_fixture_overrides(metafunc.config, available_fixtures) or DEFAULT_PROVIDER_COMBINATIONS - ) - metafunc.parametrize("scoring_stack", combinations, indirect=True) diff --git a/llama_stack/providers/tests/scoring/fixtures.py b/llama_stack/providers/tests/scoring/fixtures.py deleted file mode 100644 index 09f31cbc2..000000000 --- a/llama_stack/providers/tests/scoring/fixtures.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
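The removed scoring conftest threads a `--judge-model` command-line option into the tests through a session fixture. A minimal standalone sketch of that plumbing; the help text and the skip behaviour are illustrative additions:

```python
import pytest


def pytest_addoption(parser):
    parser.addoption("--judge-model", default=None, help="Judge model for llm-as-judge scoring")


@pytest.fixture(scope="session")
def judge_model(request):
    # Parametrized runs may inject a value via request.param; otherwise fall
    # back to the command-line option.
    if hasattr(request, "param"):
        return request.param
    return request.config.getoption("--judge-model", None)


def test_judge_model_configured(judge_model):
    if judge_model is None:
        pytest.skip("pass --judge-model to exercise judge-based scoring")
    assert isinstance(judge_model, str)
```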
- -import pytest -import pytest_asyncio - -from llama_stack.apis.models import ModelInput -from llama_stack.distribution.datatypes import Api, Provider -from llama_stack.providers.inline.scoring.braintrust import BraintrustScoringConfig -from llama_stack.providers.tests.resolver import construct_stack_for_test - -from ..conftest import ProviderFixture, remote_stack_fixture -from ..env import get_env_or_fail - - -@pytest.fixture(scope="session") -def scoring_remote() -> ProviderFixture: - return remote_stack_fixture() - - -@pytest.fixture(scope="session") -def judge_model(request): - if hasattr(request, "param"): - return request.param - return request.config.getoption("--judge-model", None) - - -@pytest.fixture(scope="session") -def scoring_basic() -> ProviderFixture: - return ProviderFixture( - providers=[ - Provider( - provider_id="basic", - provider_type="inline::basic", - config={}, - ) - ], - ) - - -@pytest.fixture(scope="session") -def scoring_braintrust() -> ProviderFixture: - return ProviderFixture( - providers=[ - Provider( - provider_id="braintrust", - provider_type="inline::braintrust", - config=BraintrustScoringConfig( - openai_api_key=get_env_or_fail("OPENAI_API_KEY"), - ).model_dump(), - ) - ], - ) - - -@pytest.fixture(scope="session") -def scoring_llm_as_judge() -> ProviderFixture: - return ProviderFixture( - providers=[ - Provider( - provider_id="llm-as-judge", - provider_type="inline::llm-as-judge", - config={}, - ) - ], - ) - - -SCORING_FIXTURES = ["basic", "remote", "braintrust", "llm_as_judge"] - - -@pytest_asyncio.fixture(scope="session") -async def scoring_stack(request, inference_model, judge_model): - fixture_dict = request.param - - providers = {} - provider_data = {} - for key in ["datasetio", "scoring", "inference"]: - fixture = request.getfixturevalue(f"{key}_{fixture_dict[key]}") - providers[key] = fixture.providers - if fixture.provider_data: - provider_data.update(fixture.provider_data) - - test_stack = await construct_stack_for_test( - [Api.scoring, Api.datasetio, Api.inference], - providers, - provider_data, - models=[ - ModelInput(model_id=model) - for model in [ - inference_model, - judge_model, - ] - ], - ) - - return test_stack.impls diff --git a/llama_stack/providers/tests/scoring/test_scoring.py b/llama_stack/providers/tests/scoring/test_scoring.py deleted file mode 100644 index d80b105f4..000000000 --- a/llama_stack/providers/tests/scoring/test_scoring.py +++ /dev/null @@ -1,213 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - - -import pytest - -from llama_stack.apis.scoring_functions import ( - AggregationFunctionType, - BasicScoringFnParams, - LLMAsJudgeScoringFnParams, - RegexParserScoringFnParams, -) -from llama_stack.distribution.datatypes import Api -from llama_stack.providers.tests.datasetio.test_datasetio import register_dataset - -# How to run this test: -# -# pytest llama_stack/providers/tests/scoring/test_scoring.py -# -m "meta_reference" -# -v -s --tb=short --disable-warnings - - -@pytest.fixture -def sample_judge_prompt_template(): - return "Output a number response in the following format: Score: , where is the number between 0 and 9." 
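The `sample_judge_prompt_template` fixture above asks the judge to answer in a `Score: <number>` format, and the tests below recover the number with the regex `r"Score: (\d+)"`. A short, runnable illustration of that extraction; the sample judge outputs are invented:

```python
import re
from typing import Optional

JUDGE_SCORE_REGEXES = [r"Score: (\d+)"]


def extract_judge_score(judge_output: str) -> Optional[int]:
    for pattern in JUDGE_SCORE_REGEXES:
        match = re.search(pattern, judge_output)
        if match:
            return int(match.group(1))
    return None


assert extract_judge_score("Mostly correct, minor omissions. Score: 7") == 7
assert extract_judge_score("The judge forgot to give a score") is None
```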
- - -class TestScoring: - @pytest.mark.asyncio - async def test_scoring_functions_list(self, scoring_stack): - # NOTE: this needs you to ensure that you are starting from a clean state - # but so far we don't have an unregister API unfortunately, so be careful - scoring_functions_impl = scoring_stack[Api.scoring_functions] - response = await scoring_functions_impl.list_scoring_functions() - assert isinstance(response, list) - assert len(response) > 0 - - @pytest.mark.asyncio - async def test_scoring_score(self, scoring_stack): - ( - scoring_impl, - scoring_functions_impl, - datasetio_impl, - datasets_impl, - ) = ( - scoring_stack[Api.scoring], - scoring_stack[Api.scoring_functions], - scoring_stack[Api.datasetio], - scoring_stack[Api.datasets], - ) - scoring_fns_list = await scoring_functions_impl.list_scoring_functions() - provider_id = scoring_fns_list[0].provider_id - if provider_id == "llm-as-judge": - pytest.skip(f"{provider_id} provider does not support scoring without params") - - await register_dataset(datasets_impl, for_rag=True) - response = await datasets_impl.list_datasets() - assert len(response) == 1 - - # scoring individual rows - rows = await datasetio_impl.get_rows_paginated( - dataset_id="test_dataset", - rows_in_page=3, - ) - assert len(rows.rows) == 3 - - scoring_fns_list = await scoring_functions_impl.list_scoring_functions() - scoring_functions = { - scoring_fns_list[0].identifier: None, - } - - response = await scoring_impl.score( - input_rows=rows.rows, - scoring_functions=scoring_functions, - ) - assert len(response.results) == len(scoring_functions) - for x in scoring_functions: - assert x in response.results - assert len(response.results[x].score_rows) == len(rows.rows) - - # score batch - response = await scoring_impl.score_batch( - dataset_id="test_dataset", - scoring_functions=scoring_functions, - ) - assert len(response.results) == len(scoring_functions) - for x in scoring_functions: - assert x in response.results - assert len(response.results[x].score_rows) == 5 - - @pytest.mark.asyncio - async def test_scoring_score_with_params_llm_as_judge( - self, scoring_stack, sample_judge_prompt_template, judge_model - ): - ( - scoring_impl, - scoring_functions_impl, - datasetio_impl, - datasets_impl, - ) = ( - scoring_stack[Api.scoring], - scoring_stack[Api.scoring_functions], - scoring_stack[Api.datasetio], - scoring_stack[Api.datasets], - ) - await register_dataset(datasets_impl, for_rag=True) - response = await datasets_impl.list_datasets() - assert len(response) == 1 - - scoring_fns_list = await scoring_functions_impl.list_scoring_functions() - provider_id = scoring_fns_list[0].provider_id - if provider_id == "braintrust" or provider_id == "basic": - pytest.skip(f"{provider_id} provider does not support scoring with params") - - # scoring individual rows - rows = await datasetio_impl.get_rows_paginated( - dataset_id="test_dataset", - rows_in_page=3, - ) - assert len(rows.rows) == 3 - - scoring_functions = { - "llm-as-judge::base": LLMAsJudgeScoringFnParams( - judge_model=judge_model, - prompt_template=sample_judge_prompt_template, - judge_score_regexes=[r"Score: (\d+)"], - aggregation_functions=[AggregationFunctionType.categorical_count], - ) - } - - response = await scoring_impl.score( - input_rows=rows.rows, - scoring_functions=scoring_functions, - ) - assert len(response.results) == len(scoring_functions) - for x in scoring_functions: - assert x in response.results - assert len(response.results[x].score_rows) == len(rows.rows) - - # score batch - response = 
await scoring_impl.score_batch( - dataset_id="test_dataset", - scoring_functions=scoring_functions, - ) - assert len(response.results) == len(scoring_functions) - for x in scoring_functions: - assert x in response.results - assert len(response.results[x].score_rows) == 5 - - @pytest.mark.asyncio - async def test_scoring_score_with_aggregation_functions( - self, scoring_stack, sample_judge_prompt_template, judge_model - ): - ( - scoring_impl, - scoring_functions_impl, - datasetio_impl, - datasets_impl, - ) = ( - scoring_stack[Api.scoring], - scoring_stack[Api.scoring_functions], - scoring_stack[Api.datasetio], - scoring_stack[Api.datasets], - ) - await register_dataset(datasets_impl, for_rag=True) - rows = await datasetio_impl.get_rows_paginated( - dataset_id="test_dataset", - rows_in_page=3, - ) - assert len(rows.rows) == 3 - - scoring_fns_list = await scoring_functions_impl.list_scoring_functions() - scoring_functions = {} - aggr_fns = [ - AggregationFunctionType.accuracy, - AggregationFunctionType.median, - AggregationFunctionType.categorical_count, - AggregationFunctionType.average, - ] - for x in scoring_fns_list: - if x.provider_id == "llm-as-judge": - aggr_fns = [AggregationFunctionType.categorical_count] - scoring_functions[x.identifier] = LLMAsJudgeScoringFnParams( - judge_model=judge_model, - prompt_template=sample_judge_prompt_template, - judge_score_regexes=[r"Score: (\d+)"], - aggregation_functions=aggr_fns, - ) - elif x.provider_id == "basic" or x.provider_id == "braintrust": - if "regex_parser" in x.identifier: - scoring_functions[x.identifier] = RegexParserScoringFnParams( - aggregation_functions=aggr_fns, - ) - else: - scoring_functions[x.identifier] = BasicScoringFnParams( - aggregation_functions=aggr_fns, - ) - else: - scoring_functions[x.identifier] = None - - response = await scoring_impl.score( - input_rows=rows.rows, - scoring_functions=scoring_functions, - ) - - assert len(response.results) == len(scoring_functions) - for x in scoring_functions: - assert x in response.results - assert len(response.results[x].score_rows) == len(rows.rows) - assert len(response.results[x].aggregated_results) == len(aggr_fns) diff --git a/llama_stack/providers/tests/tools/__init__.py b/llama_stack/providers/tests/tools/__init__.py deleted file mode 100644 index 756f351d8..000000000 --- a/llama_stack/providers/tests/tools/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. diff --git a/llama_stack/providers/tests/tools/conftest.py b/llama_stack/providers/tests/tools/conftest.py deleted file mode 100644 index 253ae88f0..000000000 --- a/llama_stack/providers/tests/tools/conftest.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
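The aggregation-function test above attaches `accuracy`, `median`, `categorical_count`, and `average` to each scoring function and only asserts that one aggregated result comes back per function. As a rough reading, an assumption rather than the providers' actual code, these reduce a column of per-row scores roughly as follows:

```python
from collections import Counter
from statistics import mean, median

row_scores = [1.0, 0.0, 1.0, 1.0]  # e.g. 0/1 correctness per row

aggregated = {
    "average": mean(row_scores),
    "median": median(row_scores),
    "accuracy": mean(row_scores),  # coincides with average for 0/1 scores
    "categorical_count": dict(Counter(row_scores)),
}
print(aggregated)
```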
- -import pytest - -from ..conftest import get_provider_fixture_overrides -from ..inference.fixtures import INFERENCE_FIXTURES -from ..safety.fixtures import SAFETY_FIXTURES -from ..vector_io.fixtures import VECTOR_IO_FIXTURES -from .fixtures import TOOL_RUNTIME_FIXTURES - -DEFAULT_PROVIDER_COMBINATIONS = [ - pytest.param( - { - "inference": "together", - "safety": "llama_guard", - "vector_io": "faiss", - "tool_runtime": "memory_and_search", - }, - id="together", - marks=pytest.mark.together, - ), -] - - -def pytest_configure(config): - for mark in ["together"]: - config.addinivalue_line( - "markers", - f"{mark}: marks tests as {mark} specific", - ) - - -def pytest_generate_tests(metafunc): - if "tools_stack" in metafunc.fixturenames: - available_fixtures = { - "inference": INFERENCE_FIXTURES, - "safety": SAFETY_FIXTURES, - "vector_io": VECTOR_IO_FIXTURES, - "tool_runtime": TOOL_RUNTIME_FIXTURES, - } - combinations = ( - get_provider_fixture_overrides(metafunc.config, available_fixtures) or DEFAULT_PROVIDER_COMBINATIONS - ) - metafunc.parametrize("tools_stack", combinations, indirect=True) diff --git a/llama_stack/providers/tests/tools/fixtures.py b/llama_stack/providers/tests/tools/fixtures.py deleted file mode 100644 index ddf8e9af2..000000000 --- a/llama_stack/providers/tests/tools/fixtures.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -import os - -import pytest -import pytest_asyncio - -from llama_stack.apis.models import ModelInput, ModelType -from llama_stack.apis.tools import ToolGroupInput -from llama_stack.distribution.datatypes import Api, Provider -from llama_stack.providers.tests.resolver import construct_stack_for_test - -from ..conftest import ProviderFixture - - -@pytest.fixture(scope="session") -def tool_runtime_memory_and_search() -> ProviderFixture: - return ProviderFixture( - providers=[ - Provider( - provider_id="rag-runtime", - provider_type="inline::rag-runtime", - config={}, - ), - Provider( - provider_id="tavily-search", - provider_type="remote::tavily-search", - config={ - "api_key": os.environ["TAVILY_SEARCH_API_KEY"], - }, - ), - Provider( - provider_id="wolfram-alpha", - provider_type="remote::wolfram-alpha", - config={ - "api_key": os.environ["WOLFRAM_ALPHA_API_KEY"], - }, - ), - ], - ) - - -@pytest.fixture(scope="session") -def tool_group_input_memory() -> ToolGroupInput: - return ToolGroupInput( - toolgroup_id="builtin::rag", - provider_id="rag-runtime", - ) - - -@pytest.fixture(scope="session") -def tool_group_input_tavily_search() -> ToolGroupInput: - return ToolGroupInput( - toolgroup_id="builtin::web_search", - provider_id="tavily-search", - ) - - -@pytest.fixture(scope="session") -def tool_group_input_wolfram_alpha() -> ToolGroupInput: - return ToolGroupInput( - toolgroup_id="builtin::wolfram_alpha", - provider_id="wolfram-alpha", - ) - - -TOOL_RUNTIME_FIXTURES = ["memory_and_search"] - - -@pytest_asyncio.fixture(scope="session") -async def tools_stack( - request, - inference_model, - tool_group_input_memory, - tool_group_input_tavily_search, - tool_group_input_wolfram_alpha, -): - fixture_dict = request.param - - providers = {} - provider_data = {} - for key in ["inference", "vector_io", "tool_runtime"]: - fixture = request.getfixturevalue(f"{key}_{fixture_dict[key]}") - providers[key] = fixture.providers - if key == "inference": - providers[key].append( - 
Provider( - provider_id="tools_memory_provider", - provider_type="inline::sentence-transformers", - config={}, - ) - ) - if fixture.provider_data: - provider_data.update(fixture.provider_data) - inference_models = inference_model if isinstance(inference_model, list) else [inference_model] - models = [ - ModelInput( - model_id=model, - model_type=ModelType.llm, - provider_id=providers["inference"][0].provider_id, - ) - for model in inference_models - ] - models.append( - ModelInput( - model_id="all-MiniLM-L6-v2", - model_type=ModelType.embedding, - provider_id="tools_memory_provider", - metadata={"embedding_dimension": 384}, - ) - ) - - test_stack = await construct_stack_for_test( - [ - Api.tool_groups, - Api.inference, - Api.vector_io, - Api.tool_runtime, - ], - providers, - provider_data, - models=models, - tool_groups=[ - tool_group_input_tavily_search, - tool_group_input_wolfram_alpha, - tool_group_input_memory, - ], - ) - return test_stack diff --git a/llama_stack/providers/tests/tools/test_tools.py b/llama_stack/providers/tests/tools/test_tools.py deleted file mode 100644 index 8188f3dd7..000000000 --- a/llama_stack/providers/tests/tools/test_tools.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -import os - -import pytest - -from llama_stack.apis.tools import RAGDocument, RAGQueryResult, ToolInvocationResult -from llama_stack.providers.datatypes import Api - - -@pytest.fixture -def sample_search_query(): - return "What are the latest developments in quantum computing?" - - -@pytest.fixture -def sample_wolfram_alpha_query(): - return "What is the square root of 16?" 
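The tests that follow exercise the new `builtin::wolfram_alpha` tool group through the tool runtime. A sketch of the same call outside pytest; it assumes a constructed `tools_stack` like the fixture above and a `WOLFRAM_ALPHA_API_KEY` in the environment, so treat it as illustrative rather than copy-paste ready:

```python
import os

from llama_stack.providers.datatypes import Api


async def ask_wolfram_alpha(tools_stack, query: str) -> str:
    if "WOLFRAM_ALPHA_API_KEY" not in os.environ:
        raise RuntimeError("WOLFRAM_ALPHA_API_KEY must be set for remote::wolfram-alpha")
    tools_impl = tools_stack.impls[Api.tool_runtime]
    result = await tools_impl.invoke_tool(tool_name="wolfram_alpha", kwargs={"query": query})
    # ToolInvocationResult.content is a string, per the assertions in the test below.
    return result.content


# e.g. answer = await ask_wolfram_alpha(tools_stack, "What is the square root of 16?")
```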
- - -@pytest.fixture -def sample_documents(): - urls = [ - "memory_optimizations.rst", - "chat.rst", - "llama3.rst", - "qat_finetune.rst", - "lora_finetune.rst", - ] - return [ - RAGDocument( - document_id=f"num-{i}", - content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}", - mime_type="text/plain", - metadata={}, - ) - for i, url in enumerate(urls) - ] - - -class TestTools: - @pytest.mark.asyncio - async def test_web_search_tool(self, tools_stack, sample_search_query): - """Test the web search tool functionality.""" - if "TAVILY_SEARCH_API_KEY" not in os.environ: - pytest.skip("TAVILY_SEARCH_API_KEY not set, skipping test") - - tools_impl = tools_stack.impls[Api.tool_runtime] - - # Execute the tool - response = await tools_impl.invoke_tool(tool_name="web_search", kwargs={"query": sample_search_query}) - - # Verify the response - assert isinstance(response, ToolInvocationResult) - assert response.content is not None - assert len(response.content) > 0 - assert isinstance(response.content, str) - - @pytest.mark.asyncio - async def test_wolfram_alpha_tool(self, tools_stack, sample_wolfram_alpha_query): - """Test the wolfram alpha tool functionality.""" - if "WOLFRAM_ALPHA_API_KEY" not in os.environ: - pytest.skip("WOLFRAM_ALPHA_API_KEY not set, skipping test") - - tools_impl = tools_stack.impls[Api.tool_runtime] - - response = await tools_impl.invoke_tool(tool_name="wolfram_alpha", kwargs={"query": sample_wolfram_alpha_query}) - - # Verify the response - assert isinstance(response, ToolInvocationResult) - assert response.content is not None - assert len(response.content) > 0 - assert isinstance(response.content, str) - - @pytest.mark.asyncio - async def test_rag_tool(self, tools_stack, sample_documents): - """Test the memory tool functionality.""" - vector_dbs_impl = tools_stack.impls[Api.vector_dbs] - tools_impl = tools_stack.impls[Api.tool_runtime] - - # Register memory bank - await vector_dbs_impl.register_vector_db( - vector_db_id="test_bank", - embedding_model="all-MiniLM-L6-v2", - embedding_dimension=384, - provider_id="faiss", - ) - - # Insert documents into memory - await tools_impl.rag_tool.insert( - documents=sample_documents, - vector_db_id="test_bank", - chunk_size_in_tokens=512, - ) - - # Execute the memory tool - response = await tools_impl.rag_tool.query( - content="What are the main topics covered in the documentation?", - vector_db_ids=["test_bank"], - ) - - # Verify the response - assert isinstance(response, RAGQueryResult) - assert response.content is not None - assert len(response.content) > 0 diff --git a/llama_stack/templates/fireworks/build.yaml b/llama_stack/templates/fireworks/build.yaml index a9c472c53..3907eba78 100644 --- a/llama_stack/templates/fireworks/build.yaml +++ b/llama_stack/templates/fireworks/build.yaml @@ -27,6 +27,7 @@ distribution_spec: tool_runtime: - remote::brave-search - remote::tavily-search + - remote::wolfram-alpha - inline::code-interpreter - inline::rag-runtime - remote::model-context-protocol diff --git a/llama_stack/templates/fireworks/fireworks.py b/llama_stack/templates/fireworks/fireworks.py index 2baab9d7c..3e6d1ca89 100644 --- a/llama_stack/templates/fireworks/fireworks.py +++ b/llama_stack/templates/fireworks/fireworks.py @@ -35,6 +35,7 @@ def get_distribution_template() -> DistributionTemplate: "tool_runtime": [ "remote::brave-search", "remote::tavily-search", + "remote::wolfram-alpha", "inline::code-interpreter", "inline::rag-runtime", "remote::model-context-protocol", @@ -77,6 +78,10 @@ def 
get_distribution_template() -> DistributionTemplate: toolgroup_id="builtin::websearch", provider_id="tavily-search", ), + ToolGroupInput( + toolgroup_id="builtin::wolfram_alpha", + provider_id="wolfram-alpha", + ), ToolGroupInput( toolgroup_id="builtin::rag", provider_id="rag-runtime", diff --git a/llama_stack/templates/fireworks/run-with-safety.yaml b/llama_stack/templates/fireworks/run-with-safety.yaml index 0fe5f3026..359bf0194 100644 --- a/llama_stack/templates/fireworks/run-with-safety.yaml +++ b/llama_stack/templates/fireworks/run-with-safety.yaml @@ -86,6 +86,9 @@ providers: config: api_key: ${env.TAVILY_SEARCH_API_KEY:} max_results: 3 + - provider_id: wolfram-alpha + provider_type: remote::wolfram-alpha + config: {} - provider_id: code-interpreter provider_type: inline::code-interpreter config: {} @@ -225,6 +228,8 @@ benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search +- toolgroup_id: builtin::wolfram_alpha + provider_id: wolfram-alpha - toolgroup_id: builtin::rag provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index cbe85c4f7..0ce3a4505 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -80,6 +80,9 @@ providers: config: api_key: ${env.TAVILY_SEARCH_API_KEY:} max_results: 3 + - provider_id: wolfram-alpha + provider_type: remote::wolfram-alpha + config: {} - provider_id: code-interpreter provider_type: inline::code-interpreter config: {} @@ -214,6 +217,8 @@ benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search +- toolgroup_id: builtin::wolfram_alpha + provider_id: wolfram-alpha - toolgroup_id: builtin::rag provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter diff --git a/llama_stack/templates/ollama/build.yaml b/llama_stack/templates/ollama/build.yaml index da33b8d53..58bd8e854 100644 --- a/llama_stack/templates/ollama/build.yaml +++ b/llama_stack/templates/ollama/build.yaml @@ -29,4 +29,5 @@ distribution_spec: - inline::code-interpreter - inline::rag-runtime - remote::model-context-protocol + - remote::wolfram-alpha image_type: conda diff --git a/llama_stack/templates/ollama/ollama.py b/llama_stack/templates/ollama/ollama.py index 2345bf3e5..16d8a259f 100644 --- a/llama_stack/templates/ollama/ollama.py +++ b/llama_stack/templates/ollama/ollama.py @@ -34,6 +34,7 @@ def get_distribution_template() -> DistributionTemplate: "inline::code-interpreter", "inline::rag-runtime", "remote::model-context-protocol", + "remote::wolfram-alpha", ], } name = "ollama" @@ -78,6 +79,10 @@ def get_distribution_template() -> DistributionTemplate: toolgroup_id="builtin::code_interpreter", provider_id="code-interpreter", ), + ToolGroupInput( + toolgroup_id="builtin::wolfram_alpha", + provider_id="wolfram-alpha", + ), ] return DistributionTemplate( diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml index d5766dec1..c8d5a22a4 100644 --- a/llama_stack/templates/ollama/run-with-safety.yaml +++ b/llama_stack/templates/ollama/run-with-safety.yaml @@ -85,6 +85,9 @@ providers: - provider_id: model-context-protocol provider_type: remote::model-context-protocol config: {} + - provider_id: wolfram-alpha + provider_type: remote::wolfram-alpha + config: {} metadata_store: type: sqlite db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/registry.db @@ -119,5 +122,7 @@ tool_groups: provider_id: 
rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +- toolgroup_id: builtin::wolfram_alpha + provider_id: wolfram-alpha server: port: 8321 diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml index a2428688e..fa21170d2 100644 --- a/llama_stack/templates/ollama/run.yaml +++ b/llama_stack/templates/ollama/run.yaml @@ -82,6 +82,9 @@ providers: - provider_id: model-context-protocol provider_type: remote::model-context-protocol config: {} + - provider_id: wolfram-alpha + provider_type: remote::wolfram-alpha + config: {} metadata_store: type: sqlite db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/registry.db @@ -108,5 +111,7 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +- toolgroup_id: builtin::wolfram_alpha + provider_id: wolfram-alpha server: port: 8321 diff --git a/llama_stack/templates/remote-vllm/build.yaml b/llama_stack/templates/remote-vllm/build.yaml index ccb328c1c..b2bbf853a 100644 --- a/llama_stack/templates/remote-vllm/build.yaml +++ b/llama_stack/templates/remote-vllm/build.yaml @@ -30,4 +30,5 @@ distribution_spec: - inline::code-interpreter - inline::rag-runtime - remote::model-context-protocol + - remote::wolfram-alpha image_type: conda diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml index dd43f21f6..45af8427a 100644 --- a/llama_stack/templates/remote-vllm/run-with-safety.yaml +++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml @@ -96,6 +96,9 @@ providers: - provider_id: model-context-protocol provider_type: remote::model-context-protocol config: {} + - provider_id: wolfram-alpha + provider_type: remote::wolfram-alpha + config: {} metadata_store: type: sqlite db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db @@ -126,5 +129,7 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +- toolgroup_id: builtin::wolfram_alpha + provider_id: wolfram-alpha server: port: 8321 diff --git a/llama_stack/templates/remote-vllm/run.yaml b/llama_stack/templates/remote-vllm/run.yaml index 24cd207c7..674085045 100644 --- a/llama_stack/templates/remote-vllm/run.yaml +++ b/llama_stack/templates/remote-vllm/run.yaml @@ -90,6 +90,9 @@ providers: - provider_id: model-context-protocol provider_type: remote::model-context-protocol config: {} + - provider_id: wolfram-alpha + provider_type: remote::wolfram-alpha + config: {} metadata_store: type: sqlite db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db @@ -115,5 +118,7 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +- toolgroup_id: builtin::wolfram_alpha + provider_id: wolfram-alpha server: port: 8321 diff --git a/llama_stack/templates/remote-vllm/vllm.py b/llama_stack/templates/remote-vllm/vllm.py index 16bf1d0fa..9901fc83b 100644 --- a/llama_stack/templates/remote-vllm/vllm.py +++ b/llama_stack/templates/remote-vllm/vllm.py @@ -37,6 +37,7 @@ def get_distribution_template() -> DistributionTemplate: "inline::code-interpreter", "inline::rag-runtime", "remote::model-context-protocol", + "remote::wolfram-alpha", ], } name = "remote-vllm" @@ -87,6 +88,10 @@ def get_distribution_template() -> DistributionTemplate: toolgroup_id="builtin::code_interpreter", provider_id="code-interpreter", ), + ToolGroupInput( + toolgroup_id="builtin::wolfram_alpha", 
+ provider_id="wolfram-alpha", + ), ] return DistributionTemplate( diff --git a/llama_stack/templates/together/build.yaml b/llama_stack/templates/together/build.yaml index a8a6de28d..834a3ecaf 100644 --- a/llama_stack/templates/together/build.yaml +++ b/llama_stack/templates/together/build.yaml @@ -30,4 +30,5 @@ distribution_spec: - inline::code-interpreter - inline::rag-runtime - remote::model-context-protocol + - remote::wolfram-alpha image_type: conda diff --git a/llama_stack/templates/together/run-with-safety.yaml b/llama_stack/templates/together/run-with-safety.yaml index 26d879802..fd74f80c3 100644 --- a/llama_stack/templates/together/run-with-safety.yaml +++ b/llama_stack/templates/together/run-with-safety.yaml @@ -95,6 +95,9 @@ providers: - provider_id: model-context-protocol provider_type: remote::model-context-protocol config: {} + - provider_id: wolfram-alpha + provider_type: remote::wolfram-alpha + config: {} metadata_store: type: sqlite db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/registry.db @@ -226,5 +229,7 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +- toolgroup_id: builtin::wolfram_alpha + provider_id: wolfram-alpha server: port: 8321 diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml index 0969cfe56..9a717290a 100644 --- a/llama_stack/templates/together/run.yaml +++ b/llama_stack/templates/together/run.yaml @@ -89,6 +89,9 @@ providers: - provider_id: model-context-protocol provider_type: remote::model-context-protocol config: {} + - provider_id: wolfram-alpha + provider_type: remote::wolfram-alpha + config: {} metadata_store: type: sqlite db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/registry.db @@ -215,5 +218,7 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +- toolgroup_id: builtin::wolfram_alpha + provider_id: wolfram-alpha server: port: 8321 diff --git a/llama_stack/templates/together/together.py b/llama_stack/templates/together/together.py index bf6f0cea4..fce03a1b2 100644 --- a/llama_stack/templates/together/together.py +++ b/llama_stack/templates/together/together.py @@ -38,6 +38,7 @@ def get_distribution_template() -> DistributionTemplate: "inline::code-interpreter", "inline::rag-runtime", "remote::model-context-protocol", + "remote::wolfram-alpha", ], } name = "together" @@ -73,6 +74,10 @@ def get_distribution_template() -> DistributionTemplate: toolgroup_id="builtin::code_interpreter", provider_id="code-interpreter", ), + ToolGroupInput( + toolgroup_id="builtin::wolfram_alpha", + provider_id="wolfram-alpha", + ), ] embedding_model = ModelInput( model_id="all-MiniLM-L6-v2", diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 8e0cbdf65..dada5449f 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -20,7 +20,7 @@ from llama_stack.distribution.datatypes import Provider, StackRunConfig from llama_stack.distribution.distribution import get_provider_registry from llama_stack.distribution.stack import replace_env_vars from llama_stack.distribution.utils.dynamic import instantiate_class_type -from llama_stack.providers.tests.env import get_env_or_fail +from llama_stack.env import get_env_or_fail from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig from .fixtures.recordable_mock import RecordableMock @@ -84,6 +84,11 @@ def pytest_addoption(parser): default=None, help="Specify the 
embedding model to use for testing", ) + parser.addoption( + "--judge-model", + default=None, + help="Specify the judge model to use for testing", + ) parser.addoption( "--embedding-dimension", type=int, @@ -109,6 +114,7 @@ def provider_data(): "TOGETHER_API_KEY": "together_api_key", "ANTHROPIC_API_KEY": "anthropic_api_key", "GROQ_API_KEY": "groq_api_key", + "WOLFRAM_ALPHA_API_KEY": "wolfram_alpha_api_key", } provider_data = {} for key, value in keymap.items(): @@ -260,7 +266,9 @@ def inference_provider_type(llama_stack_client): @pytest.fixture(scope="session") -def client_with_models(llama_stack_client, text_model_id, vision_model_id, embedding_model_id, embedding_dimension): +def client_with_models( + llama_stack_client, text_model_id, vision_model_id, embedding_model_id, embedding_dimension, judge_model_id +): client = llama_stack_client providers = [p for p in client.providers.list() if p.api == "inference"] @@ -274,6 +282,8 @@ def client_with_models(llama_stack_client, text_model_id, vision_model_id, embed client.models.register(model_id=text_model_id, provider_id=inference_providers[0]) if vision_model_id and vision_model_id not in model_ids: client.models.register(model_id=vision_model_id, provider_id=inference_providers[0]) + if judge_model_id and judge_model_id not in model_ids: + client.models.register(model_id=judge_model_id, provider_id=inference_providers[0]) if embedding_model_id and embedding_dimension and embedding_model_id not in model_ids: # try to find a provider that supports embeddings, if sentence-transformers is not available @@ -328,6 +338,14 @@ def pytest_generate_tests(metafunc): if val is not None: id_parts.append(f"emb={get_short_id(val)}") + if "judge_model_id" in metafunc.fixturenames: + params.append("judge_model_id") + val = metafunc.config.getoption("--judge-model") + print(f"judge_model_id: {val}") + values.append(val) + if val is not None: + id_parts.append(f"judge={get_short_id(val)}") + if "embedding_dimension" in metafunc.fixturenames: params.append("embedding_dimension") val = metafunc.config.getoption("--embedding-dimension") diff --git a/llama_stack/providers/tests/datasetio/__init__.py b/tests/integration/datasetio/__init__.py similarity index 100% rename from llama_stack/providers/tests/datasetio/__init__.py rename to tests/integration/datasetio/__init__.py diff --git a/llama_stack/providers/tests/datasetio/test_dataset.csv b/tests/integration/datasetio/test_dataset.csv similarity index 100% rename from llama_stack/providers/tests/datasetio/test_dataset.csv rename to tests/integration/datasetio/test_dataset.csv diff --git a/tests/integration/datasetio/test_datasetio.py b/tests/integration/datasetio/test_datasetio.py new file mode 100644 index 000000000..899cb8c43 --- /dev/null +++ b/tests/integration/datasetio/test_datasetio.py @@ -0,0 +1,118 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+
+import base64
+import mimetypes
+import os
+from pathlib import Path
+
+import pytest
+
+# How to run this test:
+#
+# pytest tests/integration/datasetio/test_datasetio.py
+# -m "meta_reference"
+# -v -s --tb=short --disable-warnings
+
+
+def data_url_from_file(file_path: str) -> str:
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(f"File not found: {file_path}")
+
+    with open(file_path, "rb") as file:
+        file_content = file.read()
+
+    base64_content = base64.b64encode(file_content).decode("utf-8")
+    mime_type, _ = mimetypes.guess_type(file_path)
+
+    data_url = f"data:{mime_type};base64,{base64_content}"
+
+    return data_url
+
+
+def register_dataset(llama_stack_client, for_generation=False, for_rag=False, dataset_id="test_dataset"):
+    if for_rag:
+        test_file = Path(os.path.abspath(__file__)).parent / "test_rag_dataset.csv"
+    else:
+        test_file = Path(os.path.abspath(__file__)).parent / "test_dataset.csv"
+    test_url = data_url_from_file(str(test_file))
+
+    if for_generation:
+        dataset_schema = {
+            "expected_answer": {"type": "string"},
+            "input_query": {"type": "string"},
+            "chat_completion_input": {"type": "chat_completion_input"},
+        }
+    elif for_rag:
+        dataset_schema = {
+            "expected_answer": {"type": "string"},
+            "input_query": {"type": "string"},
+            "generated_answer": {"type": "string"},
+            "context": {"type": "string"},
+        }
+    else:
+        dataset_schema = {
+            "expected_answer": {"type": "string"},
+            "input_query": {"type": "string"},
+            "generated_answer": {"type": "string"},
+        }
+
+    llama_stack_client.datasets.register(
+        dataset_id=dataset_id,
+        dataset_schema=dataset_schema,
+        url=dict(uri=test_url),
+        provider_id="localfs",
+    )
+
+
+def test_datasets_list(llama_stack_client):
+    # NOTE: this needs you to ensure that you are starting from a clean state
+    # but so far we don't have an unregister API unfortunately, so be careful
+
+    response = llama_stack_client.datasets.list()
+    assert isinstance(response, list)
+    assert len(response) == 0
+
+
+def test_register_dataset(llama_stack_client):
+    register_dataset(llama_stack_client)
+    response = llama_stack_client.datasets.list()
+    assert isinstance(response, list)
+    assert len(response) == 1
+    assert response[0].identifier == "test_dataset"
+
+    with pytest.raises(ValueError):
+        # unregister a dataset that does not exist
+        llama_stack_client.datasets.unregister("test_dataset2")
+
+    llama_stack_client.datasets.unregister("test_dataset")
+    response = llama_stack_client.datasets.list()
+    assert isinstance(response, list)
+    assert len(response) == 0
+
+    with pytest.raises(ValueError):
+        llama_stack_client.datasets.unregister("test_dataset")
+
+
+def test_get_rows_paginated(llama_stack_client):
+    register_dataset(llama_stack_client)
+    response = llama_stack_client.datasetio.get_rows_paginated(
+        dataset_id="test_dataset",
+        rows_in_page=3,
+    )
+    assert isinstance(response.rows, list)
+    assert len(response.rows) == 3
+    assert response.next_page_token == "3"
+
+    # iterate over all rows
+    response = llama_stack_client.datasetio.get_rows_paginated(
+        dataset_id="test_dataset",
+        rows_in_page=2,
+        page_token=response.next_page_token,
+    )
+    assert isinstance(response.rows, list)
+    assert len(response.rows) == 2
+    assert response.next_page_token == "5"
diff --git a/llama_stack/providers/tests/datasetio/test_rag_dataset.csv b/tests/integration/datasetio/test_rag_dataset.csv
similarity index 100%
rename from llama_stack/providers/tests/datasetio/test_rag_dataset.csv
rename to tests/integration/datasetio/test_rag_dataset.csv
diff --git
a/llama_stack/providers/tests/eval/__init__.py b/tests/integration/eval/__init__.py similarity index 100% rename from llama_stack/providers/tests/eval/__init__.py rename to tests/integration/eval/__init__.py diff --git a/llama_stack/providers/tests/eval/constants.py b/tests/integration/eval/constants.py similarity index 100% rename from llama_stack/providers/tests/eval/constants.py rename to tests/integration/eval/constants.py diff --git a/llama_stack/providers/tests/eval/test_eval.py b/tests/integration/eval/test_eval.py similarity index 95% rename from llama_stack/providers/tests/eval/test_eval.py rename to tests/integration/eval/test_eval.py index 4470ffe4c..a7d59a2de 100644 --- a/llama_stack/providers/tests/eval/test_eval.py +++ b/tests/integration/eval/test_eval.py @@ -10,15 +10,13 @@ import pytest from llama_stack.apis.common.content_types import URL from llama_stack.apis.common.type_system import ChatCompletionInputType, StringType from llama_stack.apis.eval.eval import ( - AppBenchmarkConfig, - BenchmarkBenchmarkConfig, ModelCandidate, ) from llama_stack.apis.inference import SamplingParams from llama_stack.apis.scoring_functions import LLMAsJudgeScoringFnParams from llama_stack.distribution.datatypes import Api -from llama_stack.providers.tests.datasetio.test_datasetio import register_dataset +from ..datasetio.test_datasetio import register_dataset from .constants import JUDGE_PROMPT # How to run this test: @@ -28,6 +26,7 @@ from .constants import JUDGE_PROMPT # -v -s --tb=short --disable-warnings +@pytest.mark.skip(reason="FIXME FIXME @yanxi0830 this needs to be migrated to use the API") class Testeval: @pytest.mark.asyncio async def test_benchmarks_list(self, eval_stack): @@ -68,7 +67,7 @@ class Testeval: benchmark_id=benchmark_id, input_rows=rows.rows, scoring_functions=scoring_functions, - benchmark_config=AppBenchmarkConfig( + benchmark_config=dict( eval_candidate=ModelCandidate( model=inference_model, sampling_params=SamplingParams(), @@ -111,7 +110,7 @@ class Testeval: ) response = await eval_impl.run_eval( benchmark_id=benchmark_id, - benchmark_config=AppBenchmarkConfig( + benchmark_config=dict( eval_candidate=ModelCandidate( model=inference_model, sampling_params=SamplingParams(), @@ -169,7 +168,7 @@ class Testeval: benchmark_id = "meta-reference-mmlu" response = await eval_impl.run_eval( benchmark_id=benchmark_id, - benchmark_config=BenchmarkBenchmarkConfig( + benchmark_config=dict( eval_candidate=ModelCandidate( model=inference_model, sampling_params=SamplingParams(), diff --git a/tests/integration/fixtures/recorded_responses/chat_completion.json b/tests/integration/fixtures/recorded_responses/chat_completion.json index 021b6c936..4b0d9b1c1 100644 --- a/tests/integration/fixtures/recorded_responses/chat_completion.json +++ b/tests/integration/fixtures/recorded_responses/chat_completion.json @@ -14075,7 +14075,7 @@ { "event": { "delta": { - "text": " provided function definitions are", + "text": " provided function definitions are not suitable for", "type": "text" }, "event_type": { @@ -14090,7 +14090,7 @@ { "event": { "delta": { - "text": " not suitable", + "text": " this task", "type": "text" }, "event_type": { @@ -14105,7 +14105,7 @@ { "event": { "delta": { - "text": " for this task. Please re", + "text": ". 
Please re", "type": "text" }, "event_type": { @@ -14203,7 +14203,7 @@ { "event": { "delta": { - "text": "get_boiling_point(liquid_name='polyjuice', celcius", + "text": "get_boiling_point(liquid_name='polyjuice", "type": "text" }, "event_type": { @@ -14218,7 +14218,7 @@ { "event": { "delta": { - "text": "=True)]", + "text": "', celcius=True)]", "type": "text" }, "event_type": { @@ -14242,7 +14242,7 @@ "celcius": true, "liquid_name": "polyjuice" }, - "call_id": "3cb5e131-c553-494b-ae31-7d3836fbb4d8", + "call_id": "1fc2d874-894e-4857-ae2b-7aacc75c330e", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -14315,7 +14315,7 @@ { "event": { "delta": { - "text": " function call returned an", + "text": " function call returned an error", "type": "text" }, "event_type": { @@ -14330,7 +14330,7 @@ { "event": { "delta": { - "text": " error since \"", + "text": " since \"polyjuice\" is", "type": "text" }, "event_type": { @@ -14345,7 +14345,7 @@ { "event": { "delta": { - "text": "polyjuice\" is not a real liquid. Polyju", + "text": " not a real liquid. Polyjuice is a fictional substance", "type": "text" }, "event_type": { @@ -14360,7 +14360,7 @@ { "event": { "delta": { - "text": "ice is a fictional substance from the", + "text": " from the Harry Potter series. The boiling", "type": "text" }, "event_type": { @@ -14375,7 +14375,7 @@ { "event": { "delta": { - "text": " Harry Potter series. The boiling", + "text": " point of a liquid is a physical", "type": "text" }, "event_type": { @@ -14390,7 +14390,7 @@ { "event": { "delta": { - "text": " point of a substance is a physical", + "text": " property that can be measured and", "type": "text" }, "event_type": { @@ -14405,7 +14405,7 @@ { "event": { "delta": { - "text": " property that can be measured, but it", + "text": " quantified, but it only applies", "type": "text" }, "event_type": { @@ -14420,52 +14420,7 @@ { "event": { "delta": { - "text": " only applies to real substances. 
If you", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "'d like to know the boiling point of a different", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " liquid, I can", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " try to help with that.", + "text": " to real substances that exist in the physical world.", "type": "text" }, "event_type": { @@ -14533,7 +14488,7 @@ { "event": { "delta": { - "text": "get_boiling_point(liquid", + "text": "get_boiling_point(liquid_name='polyjuice", "type": "text" }, "event_type": { @@ -14548,22 +14503,7 @@ { "event": { "delta": { - "text": "_name='polyjuice', celcius=True", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ")]", + "text": "', celcius=True)]", "type": "text" }, "event_type": { @@ -14587,7 +14527,7 @@ "celcius": true, "liquid_name": "polyjuice" }, - "call_id": "4c62a314-448c-4cd5-a921-610583007faa", + "call_id": "7d72d1ae-9f52-40c7-8dc5-48fff52b253a", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -14660,7 +14600,7 @@ { "event": { "delta": { - "text": " I answered the", + "text": " I answered the phone, the friendly", "type": "text" }, "event_type": { @@ -14675,22 +14615,7 @@ { "event": { "delta": { - "text": " phone, the friendly voice on the other end said", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " \"hello\" and asked how I was doing", + "text": " voice on the other end said \"hello\" and asked how I was doing", "type": "text" }, "event_type": { @@ -14833,7 +14758,7 @@ { "event": { "delta": { - "text": " the file path is correct", + "text": " the file path is correct and the file exists in the specified location. If", "type": "text" }, "event_type": { @@ -14848,7 +14773,7 @@ { "event": { "delta": { - "text": " and the file exists in the specified location. If", + "text": " the file is located in a different directory, you should", "type": "text" }, "event_type": { @@ -14863,7 +14788,7 @@ { "event": { "delta": { - "text": " the file is located in a different directory,", + "text": " provide the correct file path.\n\nAdditionally, you can", "type": "text" }, "event_type": { @@ -14878,7 +14803,7 @@ { "event": { "delta": { - "text": " you should provide the correct path to the", + "text": " use the `os` module to check if the file exists before attempting", "type": "text" }, "event_type": { @@ -14893,7 +14818,7 @@ { "event": { "delta": { - "text": " file.\n\nAdditionally, you can use the `os`", + "text": " to read it. 
Here", "type": "text" }, "event_type": { @@ -14908,7 +14833,7 @@ { "event": { "delta": { - "text": " module to check if the file exists before attempting to", + "text": "'s an example:\n\n```python\nimport os\nimport", "type": "text" }, "event_type": { @@ -14923,7 +14848,7 @@ { "event": { "delta": { - "text": " read it. Here's an example:\n\n```", + "text": " pandas as pd\n\nfile_path = \"/var/folders", "type": "text" }, "event_type": { @@ -14938,7 +14863,7 @@ { "event": { "delta": { - "text": "python\nimport os\nimport pandas as pd\n\nfile", + "text": "/rb/qv8vwgyj6yjd3t4p", "type": "text" }, "event_type": { @@ -14953,7 +14878,7 @@ { "event": { "delta": { - "text": "_path", + "text": "wsy9t0rm0000gn/T/tmp4n_d_h", "type": "text" }, "event_type": { @@ -14968,7 +14893,7 @@ { "event": { "delta": { - "text": " = \"/var/folders/rb/qvq", + "text": "5o/u4yh2j11inflation.csv\"\n\nif", "type": "text" }, "event_type": { @@ -14983,7 +14908,7 @@ { "event": { "delta": { - "text": "vwgyj6yjd3t4pwsy9t0", + "text": " os.path.isfile(file_path):\n df =", "type": "text" }, "event_type": { @@ -14998,7 +14923,7 @@ { "event": { "delta": { - "text": "rm0000gn/T/tmpdcpkc9", + "text": " pd.read_csv(file_path)\n print", "type": "text" }, "event_type": { @@ -15013,37 +14938,7 @@ { "event": { "delta": { - "text": "_f/15dhK1rDinflation.csv\"\n\nif", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " os.path.isfile(file_path):\n df = pd.read_csv(file_path", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ")\n print(\"Number of rows and columns in the", + "text": "(\"Number of rows and columns in the", "type": "text" }, "event_type": { @@ -15262,7 +15157,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "qvwgyj6yjd3", + "tool_call": "8vwgyj6yjd3t4pwsy9t", "type": "tool_call" }, "event_type": { @@ -15281,7 +15176,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "t4pwsy9t0rm0000gn/T/tmpd", + "tool_call": "0rm0000gn/T/tmp4n_d_h5o/u4", "type": "tool_call" }, "event_type": { @@ -15300,7 +15195,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "cpkc9_f/15dhK1rDinflation.csv\")\n", + "tool_call": "yh2j11inflation.csv\")\n# Rows\nprint(\"Number of", "type": "tool_call" }, "event_type": { @@ -15319,7 +15214,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "# Rows\nprint(\"Number of rows and columns in the data:\", df", + "tool_call": " rows and columns in the data:\", df.shape)\n# Columns\nprint(\"", "type": "tool_call" }, "event_type": { @@ -15338,7 +15233,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": ".shape)\n# Columns\nprint(\"Columns of the data are:\", len(df", + "tool_call": "Columns of the data are:\", len(df.columns))\n# Column names\nprint", "type": "tool_call" }, "event_type": { @@ -15357,7 +15252,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": ".columns))\n# Column names\nprint(\"Columns of the data are:\", df", + "tool_call": "(\"Columns of the data are:\", df.columns)\n# Column dtypes\n", "type": "tool_call" }, "event_type": { @@ -15376,7 +15271,7 @@ "__enum__": 
"ToolCallParseStatus", "value": "in_progress" }, - "tool_call": ".columns)\n# Column dtypes\nprint(\"Datatype of the columns are", + "tool_call": "print(\"Datatype of the columns are:\", df.dtypes)\n#", "type": "tool_call" }, "event_type": { @@ -15395,26 +15290,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": ":\", df.dtypes)\n# Sample of data\nprint", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "(\"Data sample from file:\")\nprint(df.head())", + "tool_call": " Sample of data\nprint(\"Data sample from file:\")\nprint(df.head())", "type": "tool_call" }, "event_type": { @@ -15435,9 +15311,866 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/rb/qvqvwgyj6yjd3t4pwsy9t0rm0000gn/T/tmpdcpkc9_f/15dhK1rDinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)\n# Sample of data\nprint(\"Data sample from file:\")\nprint(df.head())" + "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp4n_d_h5o/u4yh2j11inflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)\n# Sample of data\nprint(\"Data sample from file:\")\nprint(df.head())" }, - "call_id": "bdb9c5e1-2082-49c8-ab7a-15aae2135656", + "call_id": "517038eb-c373-441b-96fe-3a0e2f063fc0", + "tool_name": { + "__enum__": "BuiltinTool", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\"), CompletionMessage(role='assistant', content='The 
error message indicates that the file \"\" does not exist. This could be due to a number of reasons such as the file being deleted, the path being incorrect, or the file being moved to a different location.\\n\\nTo resolve this issue, you should ensure that the file exists and the path is correct. If the file does exist, you can try to load it using the correct path. If the file does not exist, you will need to create it or obtain it from the relevant source.\\n\\nHere is an example of how you can modify the code to handle this situation:\\n\\n```\\nimport pandas as pd\\n\\n# Define the path to the CSV file\\nfile_path = \"\"\\n\\n# Check if the file exists\\nimport os\\nif os.path.isfile(file_path):\\n # Load the CSV file\\n df = pd.read_csv(file_path)\\n\\n # Print the first few rows of the dataframe\\n print(df.head())\\n\\n # Print information about the dataframe\\n print(df.info())\\n\\n # Print summary statistics about the dataframe\\n print(df.describe())\\nelse:\\n print(\"The file does not exist.\")\\n```\\n\\nThis code will check if the file exists before attempting to load it. If the file does not exist, it will print a message indicating that the file does not exist.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Convert the \\'Year\\' column to datetime\\ndf[\\'Year\\'] = pd.to_datetime(df[\\'Year\\'], format=\\'%Y\\')\\n\\n# Group by \\'Year\\' and calculate the average inflation\\ndf_avg_inflation = df.groupby(\\'Year\\')[\\'Inflation\\'].mean().reset_index()\\n\\n# Plot the average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation[\\'Year\\'], df_avg_inflation[\\'Inflation\\'], marker=\\'o\\')\\nplt.title(\\'Average Yearly Inflation\\')\\nplt.xlabel(\\'Year\\')\\nplt.ylabel(\\'Inflation\\')\\nplt.grid(True)\\nplt.show()'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { + "chunks": [ + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " error message indicates that the file \"/var/folders/rb/qv8", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": 
"progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "vwgyj6yjd3t4pwsy9t0", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "rm0000gn/T/tmpbb210725/duWDtjG", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "ninflation.csv\" does not exist. This could be due to a number", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " of reasons such as the file being deleted, the path being incorrect, or", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " the file being moved to a different location.\n\nTo resolve this issue, you", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " should ensure that the file exists and the path is correct. If the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " does exist, you can try to load it using the correct path. 
If", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " the file does not exist, you will need to create it or obtain it", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " from the relevant source.\n\nHere is an example of how you can modify the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " code to handle this situation:\n\n```\nimport pandas as pd\nimport matplotlib", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": ".pyplot as plt\n\n# Define the path to the CSV file\nfile_path", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " = \"/var/folders/rb/qv8vwgyj6y", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "jd3t4pwsy9t0rm0000gn/T", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "/tmpbb210725/duWDtjGninflation.csv\"\n\n#", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " Check if the file exists\nimport os\nif os.path.isfile(file_path", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "):\n # Load the CSV file\n df = pd.read_csv(file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "_path)\n\n # Convert the 'Year' column to datetime\n df", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " # Group by 'Year' and calculate the average inflation\n df", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, 
+ "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "_avg_inflation = df.groupby('Year')['Inflation'].mean().reset", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "_index()\n\n # Plot the average yearly inflation as a time series\n ", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " plt.figure(figsize=(10,6))\n plt.plot(df_avg_inflation", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "['Year'], df_avg_inflation['Inflation'], marker='o')\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " plt.title('Average Yearly Inflation')\n plt.xlabel('Year", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "')\n plt.ylabel('Inflation')\n plt.grid(True)\n plt", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": ".show()\nelse:\n print(\"The file does not exist.\")\n```\n\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "This code will check if the file exists before attempting to load it. 
If", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " the file does not exist, it will print a message indicating that the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " does not exist.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\"), CompletionMessage(role='assistant', content='The error message indicates that the file \"\" does not exist. This could be due to a number of reasons such as the file being deleted, the path being incorrect, or the file being moved to a different location.\\n\\nTo resolve this issue, you should ensure that the file exists and the path is correct. If the file does exist, you can try to load it using the correct path. If the file does not exist, you will need to create it or obtain it from the relevant source.\\n\\nHere is an example of how you can modify the code to handle this situation:\\n\\n```\\nimport pandas as pd\\n\\n# Define the path to the CSV file\\nfile_path = \"\"\\n\\n# Check if the file exists\\nimport os\\nif os.path.isfile(file_path):\\n # Load the CSV file\\n df = pd.read_csv(file_path)\\n\\n # Print the first few rows of the dataframe\\n print(df.head())\\n\\n # Print information about the dataframe\\n print(df.info())\\n\\n # Print summary statistics about the dataframe\\n print(df.describe())\\nelse:\\n print(\"The file does not exist.\")\\n```\\n\\nThis code will check if the file exists before attempting to load it. 
If the file does not exist, it will print a message indicating that the file does not exist.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { + "chunks": [ + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": " file\ndf = pd.read_csv(\"/var/folders/rb/qv", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "8vwgyj6yjd3t4pwsy9t", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "0rm0000gn/T/tmpbb210725/duWDtj", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "Gninflation.csv\")\n\n# Convert the 'Year' column to datetime\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "df['Year'] = pd.to_datetime(df['Year'], format", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + 
"event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "='%Y')\n\n# Group by 'Year' and calculate", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": " the average inflation\ndf_avg_inflation = df.groupby('", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "Year')['Inflation'].mean().reset_index()\n\n# Plot the average yearly", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": " inflation as a time series\nplt.figure(figsize=(10", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": ",6))\nplt.plot(df_avg_inflation['Year'], df_avg_in", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "flation['Inflation'], marker='o')\nplt.title('Average Yearly", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": " Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": ".grid(True)\nplt.show()", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmpbb210725/duWDtjGninflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n# Group by 'Year' and calculate the average inflation\ndf_avg_inflation = 
df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt.grid(True)\nplt.show()" + }, + "call_id": "a6646608-a943-4849-884e-1852d5ef4a7e", "tool_name": { "__enum__": "BuiltinTool", "value": "code_interpreter" @@ -18060,6 +18793,494 @@ ], "type": "generator" }, + "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { + "chunks": [ + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " error message indicates that the file \"/var/folders/rb/qv8", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "vwgyj6yjd3t4pwsy9t0", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "rm0000gn/T/tmpbb210725/duWDtjG", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "ninflation.csv\" does not exist. 
This could be", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " due to a number of reasons such as the file being deleted,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " the path being", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " incorrect, or the file being moved to a different location.\n\nTo", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " resolve this issue, you should ensure that", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " the file exists and the path is correct. If the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " file does exist, you can try to load", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " it using the correct path. 
If the file does", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " not exist, you will need to create it or obtain", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " it from the relevant", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " source.\n\nHere is an example of", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " how you can modify the code to handle this situation:\n\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "```\nimport pandas as pd\n\n# Define the path to the CSV file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "\nfile_path = \"/var/folders/rb/qv8", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "vwgyj6yjd3t4pwsy9t0", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "rm0000gn/T/tmpbb210725/duWDtjG", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "ninflation.csv\"\n\n# Check if the file exists\nimport os", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "\nif os.path.isfile(file_path):\n # Load", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " the CSV file\n df = pd.read_csv(file_path)\n\n ", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " # Print the first few rows of the dataframe\n print(df.head())\n\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " # Print information about", + 
"type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " the dataframe\n print(df.info())\n\n # Print summary statistics about the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " dataframe\n print(df.describe())\nelse:\n print(\"The file does", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " not exist.\")\n```\n\nThis code will check if the file exists before", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " attempting to load it. If the file does not exist, it will print", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " a message indicating that the file does not exist.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + } + ], + "type": "generator" + }, "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=, system_message_behavior=)), ('tool_prompt_format', ), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { "chunks": [ { @@ -19079,7 +20300,7 @@ 
"__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read", + "tool_call": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders", "type": "tool_call" }, "event_type": { @@ -19098,7 +20319,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "_csv(\"/var/folders/rb/qv8vwgyj6y", + "tool_call": "/rb/qv8vwgyj6yjd3t4pwsy9t0", "type": "tool_call" }, "event_type": { @@ -19117,7 +20338,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "jd3t4pwsy9t0rm0000gn/T", + "tool_call": "rm0000gn/T/tmpbb210725/duWDtjGninflation.csv\")\n\n#", "type": "tool_call" }, "event_type": { @@ -19136,7 +20357,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "/tmpdcpkc9_f/FKWQnYoVinflation.csv\")\n\n", + "tool_call": " Print the first few rows of the dataframe\nprint(df.head())\n\n#", "type": "tool_call" }, "event_type": { @@ -19155,7 +20376,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "# Print the first few rows of the dataframe\n", + "tool_call": " Print information about", "type": "tool_call" }, "event_type": { @@ -19174,7 +20395,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "print(df.head())\n\n# Print information about", + "tool_call": " the dataframe\nprint(df.info())\n\n# Print summary statistics about the", "type": "tool_call" }, "event_type": { @@ -19193,45 +20414,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " the dataframe\nprint(df", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": ".info())\n\n# Print summary statistics of the dataframe\nprint(df.describe", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "())", + "tool_call": " dataframe\nprint(df.describe())", "type": "tool_call" }, "event_type": { @@ -19252,9 +20435,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmpdcpkc9_f/FKWQnYoVinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print information about the dataframe\nprint(df.info())\n\n# Print summary statistics of the dataframe\nprint(df.describe())" + "code": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmpbb210725/duWDtjGninflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print information about the dataframe\nprint(df.info())\n\n# Print summary statistics about the dataframe\nprint(df.describe())" }, - "call_id": "4208ff16-c9e6-4754-8566-8aeb587afcb3", + "call_id": "3ab348fd-a9b8-47d7-be10-7d38159c9a0d", "tool_name": { "__enum__": "BuiltinTool", "value": "code_interpreter" @@ -19887,6 +21070,673 @@ ], "type": "generator" }, + "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful 
assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:f76dc\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:de2d4\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content=\"I'm ready to help. What's your first question about Torchtune?\", stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'using LoRA in Torchtune'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text=\"Result 1:\\nDocument_id:c4fc3\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer\\'s self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:c4fc3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:c4fc3\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune\\'s LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we\\'ve loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\"\"\"\\n {total_params} total params,\\n {trainable_params}\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \"\"\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe , tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "chunks": [ + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "To", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " use LoRA in Torchtune, you can follow", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " these steps:\n\n1. Import the necessary modules: `", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "from torch", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "tune.models.llama2 import llama2_7b", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": ", lora_llama2_7b`\n2. 
Create a", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " Llama2 model with LoRA: `lora", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "_model = lora_ll", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "ama2_7b", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "(lora_attn_modules=[\"q_proj\", \"v_proj\"])`\n3", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": ". Load the pre-trained Llama2 weights into", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " the LoRA model: `lora_model.load_state", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "_dict(base_model.state_dict(), strict=False)`\n4. Set only Lo", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "RA parameters to trainable: `from torchtune.modules.peft", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": ".peft_utils import get_adapter_params,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " set_trainable_params`\n5. 
Run the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " LoRA finetune using torchtune's Lo", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "RA recipe: `tune run --", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "nnodes 1 --nproc_per_node 2 lora_finet", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "une_distributed --config llama2/7B_lora`\n\nYou can", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " also experiment with different LoRA configurations, such as applying Lo", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "RA to all linear layers in the self-attention, increasing the rank,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " and scaling alpha and rank together.\n\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "Note: You need to have the pre-trained", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " Llama2 weights and tokenizer downloaded and installed before running the LoRA fin", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "etune. 
Additionally, you can use torch", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "tune's `WandBLogger` to generate", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " loss curves and track the experiment's", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " progress.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:f76dc\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. 
This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. 
code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:de2d4\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content=\"I'm ready to help. What's your first question about Torchtune?\", stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "chunks": [ + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "[k", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "nowledge_search(query=\"using LoRA in Torchtune", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "\")]", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "using LoRA in Torchtune" + }, + "call_id": "8413a252-8372-4061-a4a1-0a1d165dd373", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:f76dc\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. 
code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. 
code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:de2d4\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "chunks": [ + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "'m ready to help. What's", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " your first question about Torchtune?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + } + ], + "type": "generator" + }, "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { "chunks": [ { @@ -19945,7 +21795,7 @@ "arguments": { "query": "Torchtune documentation" }, - "call_id": "42e0a687-a52e-4208-8181-db6e7a84faeb", + "call_id": "f21015ed-e70b-4a2b-a038-9335acbe0c53", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -20282,7 +22132,22 @@ { "event": { "delta": { - "text": " the standard multi-head attention.", + "text": " the standard", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " multi-head attention.", "type": "text" }, "event_type": { @@ -20350,7 +22215,7 @@ { "event": { "delta": { - "text": "nowledge_search(query=\"Llama3-8", + "text": "nowledge_search(query=\"Llama", "type": "text" }, "event_type": { @@ -20365,7 +22230,7 @@ { "event": { "delta": { - "text": "B attention type\")]", + "text": "3-8B attention type\")]", "type": "text" }, "event_type": { @@ -20388,7 +22253,7 @@ "arguments": { "query": "Llama3-8B attention type" }, - "call_id": "b3019313-870b-42e5-a2a3-02f933f153b1", + "call_id": "bf3bf9f9-0e56-4720-a6a9-be8ad9e8dfcb", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -20461,7 +22326,7 @@ { "event": { "delta": { - "text": "nowledge_search(query=\"Llama3-8B attention", + "text": "nowledge_search(query=\"Llama", "type": "text" }, "event_type": { @@ -20476,7 +22341,7 @@ { "event": { "delta": { - "text": " type\")]", + "text": "3-8B attention type\")]", "type": "text" }, "event_type": { @@ -20499,7 +22364,7 @@ "arguments": { "query": "Llama3-8B attention type" }, - "call_id": "e4659511-69a4-412b-b995-fa90f43a25c7", + "call_id": "9c9a922f-afd6-4bc8-83ba-28211bb3fd29", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -20739,7 +22604,7 @@ "arguments": { "query": "current CEO of Meta" }, - "call_id": "ccadcdbb-cfa1-4f69-9c60-0fc50ae35f11", + "call_id": "2039dce8-afbe-4517-bb4a-43c92dab8cff", "tool_name": { "__enum__": "BuiltinTool", "value": "brave_search" @@ -20815,7 +22680,22 @@ { "event": { "delta": { - "text": " boiling point of polyjuice is -100\u00b0C.", + "text": " boiling point of polyjuice is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " -100\u00b0C.", "type": "text" }, "event_type": { @@ -20951,7 +22831,157 @@ { "event": { "delta": { - "text": " boiling point of polyjuice is -100 degrees Celsius.", + "text": " provided function \"get_boiling_point\" is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " not sufficient to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " answer the question as it does not contain information", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " about the boiling point of \"poly", + "type": "text" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "juice\". Polyjuice is not a", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " real liquid and does", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " not have a known boiling point. If you", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " have any other questions or need", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " information about a different liquid,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " I would be happy to try and", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " assist you.", "type": "text" }, "event_type": { @@ -21019,7 +23049,7 @@ { "event": { "delta": { - "text": "get_boiling_point(liquid_name='polyjuice", + "text": "get", "type": "text" }, "event_type": { @@ -21034,7 +23064,22 @@ { "event": { "delta": { - "text": "', celcius=True)]", + "text": "_boiling_point(liquid", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "_name='polyjuice', celcius=True)]", "type": "text" }, "event_type": { @@ -21058,7 +23103,7 @@ "celcius": true, "liquid_name": "polyjuice" }, - "call_id": "cbea2158-ad0a-4faf-a2ec-3e411bd5aa50", + "call_id": "302993c2-3c56-48cf-8891-afac1f20723e", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -21170,7 +23215,7 @@ "celcius": true, "liquid_name": "polyjuice" }, - "call_id": "ac3bf39b-16e7-46e9-a243-130939094e24", + "call_id": "9544e61b-5e69-427b-b30c-874fdbcf53f7", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -21273,7 +23318,7 @@ { "event": { "delta": { - "text": "'s not a real substance, it doesn't have a boiling point", + "text": "'s not a real substance, it doesn", "type": "text" }, "event_type": { @@ -21288,7 +23333,7 @@ { "event": { "delta": { - "text": ". Polyjuice Potion is a magical concoction", + "text": "'t have a boiling point. 
Polyjuice Potion is", "type": "text" }, "event_type": { @@ -21303,7 +23348,7 @@ { "event": { "delta": { - "text": " that allows the drinker to assume the form and", + "text": " a magical concoction that allows the drinker to assume the", "type": "text" }, "event_type": { @@ -21318,7 +23363,7 @@ { "event": { "delta": { - "text": " appearance of another person, but", + "text": " form and appearance of another person, but it's not", "type": "text" }, "event_type": { @@ -21333,7 +23378,7 @@ { "event": { "delta": { - "text": " it's not a physical substance that can", + "text": " a physical substance that can be measured or analyzed in the same", "type": "text" }, "event_type": { @@ -21348,7 +23393,7 @@ { "event": { "delta": { - "text": " be measured or analyzed in the same way as real-world", + "text": " way as real-world chemicals.\n\nIf", "type": "text" }, "event_type": { @@ -21363,7 +23408,7 @@ { "event": { "delta": { - "text": " chemicals.\n\nIf you have any other questions or if there", + "text": " you have any other questions or if there's anything else I can help you", "type": "text" }, "event_type": { @@ -21378,22 +23423,7 @@ { "event": { "delta": { - "text": "'s anything else I can help you with, feel free to ask", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "!", + "text": " with, feel free to ask!", "type": "text" }, "event_type": { @@ -21500,7 +23530,7 @@ "celcius": true, "liquid_name": "polyjuice" }, - "call_id": "fc32cf9f-db3f-42a8-baad-da88903b53be", + "call_id": "ce595f0c-86f3-4055-b675-09e00007dc97", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -21656,7 +23686,7 @@ { "event": { "delta": { - "text": " 100th prime number is ", + "text": " 100th prime number is 541", "type": "text" }, "event_type": { @@ -21671,7 +23701,7 @@ { "event": { "delta": { - "text": "541.", + "text": ".", "type": "text" }, "event_type": { @@ -21766,7 +23796,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "\n if n <= 3:\n return True", + "tool_call": "\n if n <=", "type": "tool_call" }, "event_type": { @@ -21785,7 +23815,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "\n if n % 2 == 0 or n %", + "tool_call": " ", "type": "tool_call" }, "event_type": { @@ -21804,7 +23834,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " 3 == 0:\n ", + "tool_call": "3:\n return True\n if n % 2 == 0", "type": "tool_call" }, "event_type": { @@ -21823,7 +23853,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " return False\n i", + "tool_call": " or n % 3 == 0:\n return False\n i", "type": "tool_call" }, "event_type": { @@ -21842,7 +23872,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " = 5\n while i * i <= n:\n if n", + "tool_call": " = 5\n while i * i <=", "type": "tool_call" }, "event_type": { @@ -21861,7 +23891,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " % i == 0 or n % (i + 2) ==", + "tool_call": " n:\n if n % i == 0 or n % (i", "type": "tool_call" }, "event_type": { @@ -21880,7 +23910,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " 0:\n return False\n i +=", + "tool_call": " + 2) == 0:\n return False\n i +=", "type": "tool_call" }, "event_type": { @@ -21899,7 +23929,7 @@ "__enum__": 
"ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " 6\n return", + "tool_call": " 6\n return True\n\ndef nth_prime(n):\n count =", "type": "tool_call" }, "event_type": { @@ -21918,7 +23948,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " True\n\ndef nth_prime(n):\n count = ", + "tool_call": " 0\n num = 2\n while True:\n if", "type": "tool_call" }, "event_type": { @@ -21937,45 +23967,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "0\n num = 2\n ", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " while True:\n if is_prime(num):\n ", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " count += 1\n if count == n", + "tool_call": " is_prime(num):\n count += 1\n if count == n", "type": "tool_call" }, "event_type": { @@ -22036,7 +24028,7 @@ "arguments": { "code": "def is_prime(n):\n if n <= 1:\n return False\n if n <= 3:\n return True\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i + 2) == 0:\n return False\n i += 6\n return True\n\ndef nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count == n:\n return num\n num += 1\n\nprint(nth_prime(100))" }, - "call_id": "11645d4d-35d0-4542-bc8d-d01ed1758163", + "call_id": "63d06ce7-5266-4ee8-a620-0e81cf5108a1", "tool_name": { "__enum__": "BuiltinTool", "value": "code_interpreter" @@ -22112,7 +24104,22 @@ { "event": { "delta": { - "text": "plexity the company was founded in 2022.", + "text": "plexity the company was founded in 202", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "2.", "type": "text" }, "event_type": { @@ -22218,7 +24225,7 @@ "arguments": { "query": "Perplexity the company founding date" }, - "call_id": "42bca45b-e3d6-40a8-b110-d9d77328089e", + "call_id": "3804eaba-07f8-448c-8dd4-8ee14d748a05", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -22306,7 +24313,7 @@ { "event": { "delta": { - "text": " the merger of the Basketball Association of America (BAA) and the National", + "text": " the merger of the Basketball", "type": "text" }, "event_type": { @@ -22321,7 +24328,22 @@ { "event": { "delta": { - "text": " Basketball League (NBL).", + "text": " Association of America (BAA) and the National Basketball League (NBL", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": ").", "type": "text" }, "event_type": { @@ -22412,7 +24434,7 @@ "arguments": { "query": "NBA creation date" }, - "call_id": "bc879653-70ed-4c38-8a7f-fa8a4621b088", + "call_id": "d94006c1-5692-4ada-8f1a-d09ef2d46dab", "tool_name": "knowledge_search" }, "type": "tool_call" diff --git 
a/tests/integration/fixtures/recorded_responses/chat_completion.pickle b/tests/integration/fixtures/recorded_responses/chat_completion.pickle index 4abc0c17e..c4f1c7efd 100644 Binary files a/tests/integration/fixtures/recorded_responses/chat_completion.pickle and b/tests/integration/fixtures/recorded_responses/chat_completion.pickle differ diff --git a/tests/integration/fixtures/recorded_responses/invoke_tool.json b/tests/integration/fixtures/recorded_responses/invoke_tool.json index 7d56a829a..77995f72f 100644 --- a/tests/integration/fixtures/recorded_responses/invoke_tool.json +++ b/tests/integration/fixtures/recorded_responses/invoke_tool.json @@ -44,6 +44,15 @@ "metadata": null } }, + "()_[('kwargs', {'session_id': '', 'code': 'import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())'}), ('tool_name', 'code_interpreter')]": { + "type": "value", + "value": { + "content": "error\n[stdout]\n[Errno 2] No such file or directory: 'bwrap'\n[/stdout]\n[stderr]\n[Errno 2] No such file or directory: 'bwrap'\n[/stderr]", + "error_code": null, + "error_message": null, + "metadata": null + } + }, "()_[('kwargs', {'session_id': '', 'code': 'import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())'}), ('tool_name', 'code_interpreter')]": { "type": "value", "value": { @@ -71,15 +80,6 @@ "metadata": null } }, - "()_[('kwargs', {'session_id': '', 'code': 'import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\"\")\\n\\n# Convert \\'Year\\' column to datetime\\ndf[\\'Year\\'] = pd.to_datetime(df[\\'Year\\'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(\\'Year\\')[\\'Inflation\\'].mean().reset_index()\\n\\n# Plot average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation[\\'Year\\'], average_inflation[\\'Inflation\\'], marker=\\'o\\')\\nplt.title(\\'Average Yearly Inflation\\')\\nplt.xlabel(\\'Year\\')\\nplt.ylabel(\\'Inflation Rate\\')\\nplt.grid(True)\\nplt.show()'}), ('tool_name', 'code_interpreter')]": { - "type": "value", - "value": { - "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", - "error_code": null, - "error_message": null, - "metadata": null - } - }, "()_[('kwargs', {'session_id': '', 'code': 'import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Convert the \\'Year\\' column to datetime\\ndf[\\'Year\\'] = pd.to_datetime(df[\\'Year\\'], format=\\'%Y\\')\\n\\n# Group by \\'Year\\' and calculate the average inflation\\ndf_avg_inflation = df.groupby(\\'Year\\')[\\'Inflation\\'].mean().reset_index()\\n\\n# Plot the average inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation[\\'Year\\'], df_avg_inflation[\\'Inflation\\'], marker=\\'o\\')\\nplt.title(\\'Average Yearly Inflation\\')\\nplt.xlabel(\\'Year\\')\\nplt.ylabel(\\'Inflation\\')\\nplt.grid(True)\\nplt.show()'}), ('tool_name', 'code_interpreter')]": { "type": "value", "value": { @@ 
-107,23 +107,23 @@ "type": "text" }, { - "text": "Result 1:\nDocument_id:64211\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. 
When using an optimizer with momentum,\nlike `AdamW `), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 2:\nDocument_id:cbc88\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. 
code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 3:\nDocument_id:0c95c\nContent: with training with LoRA quickly,\njust specify any config with ``_lora`` in its name, e.g:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\n\n\nThere are two sets of parameters to customize LoRA to suit your needs. Firstly, the parameters which control\nwhich linear layers LoRA should be applied to in the model:\n\n* ``lora_attn_modules: List[str]`` accepts a list of strings specifying which layers of the model to apply\n LoRA to:\n\n * ``q_proj`` applies LoRA to the query projection layer.\n * ``k_proj`` applies LoRA to the key projection layer.\n * ``v_proj`` applies LoRA to the value projection layer.\n * ``output_proj`` applies LoRA to the attention output projection layer.\n\n Whilst adding more layers to be fine-tuned may improve model accuracy,\n this will come at the cost of increased memory usage and reduced training speed.\n\n* ``apply_lora_to_mlp: Bool`` applies LoRA to the MLP in each transformer layer.\n* ``apply_lora_to_output: Bool`` applies LoRA to the model's final output projection.\n This is usually a projection to vocabulary space (e.g. in language models), but\n other modelling tasks may have different projections - classifier models will project\n to the number of classes, for example\n\n.. note::\n\n Models which use tied embeddings (such as Gemma and Qwen2 1.5B and 0.5B) for the\n final output projection do not support ``apply_lora_to_output``.\n\nThese are all specified under the ``model`` flag or config entry, i.e:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"]\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.llama3.lora_llama3_8b\n apply_lora_to_mlp: True\n model.lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\",\"output_proj\"]\n\nSecondly, parameters which control the scale of the impact of LoRA on the model:\n\n* ``lora_rank: int`` affects the scale of\n", + "text": "Result 3:\nDocument_id:8892b\nContent: with training with LoRA quickly,\njust specify any config with ``_lora`` in its name, e.g:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\n\n\nThere are two sets of parameters to customize LoRA to suit your needs. Firstly, the parameters which control\nwhich linear layers LoRA should be applied to in the model:\n\n* ``lora_attn_modules: List[str]`` accepts a list of strings specifying which layers of the model to apply\n LoRA to:\n\n * ``q_proj`` applies LoRA to the query projection layer.\n * ``k_proj`` applies LoRA to the key projection layer.\n * ``v_proj`` applies LoRA to the value projection layer.\n * ``output_proj`` applies LoRA to the attention output projection layer.\n\n Whilst adding more layers to be fine-tuned may improve model accuracy,\n this will come at the cost of increased memory usage and reduced training speed.\n\n* ``apply_lora_to_mlp: Bool`` applies LoRA to the MLP in each transformer layer.\n* ``apply_lora_to_output: Bool`` applies LoRA to the model's final output projection.\n This is usually a projection to vocabulary space (e.g. 
in language models), but\n other modelling tasks may have different projections - classifier models will project\n to the number of classes, for example\n\n.. note::\n\n Models which use tied embeddings (such as Gemma and Qwen2 1.5B and 0.5B) for the\n final output projection do not support ``apply_lora_to_output``.\n\nThese are all specified under the ``model`` flag or config entry, i.e:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"]\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.llama3.lora_llama3_8b\n apply_lora_to_mlp: True\n model.lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\",\"output_proj\"]\n\nSecondly, parameters which control the scale of the impact of LoRA on the model:\n\n* ``lora_rank: int`` affects the scale of\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:64211\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", + "text": "Result 4:\nDocument_id:cbc88\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. 
code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", "type": "text" }, { - "text": "Result 5:\nDocument_id:1d70c\nContent: ora_finetune_label>`.\nFor more on QLoRA in torchtune, see our :ref:`QLoRA Tutorial `.\n\nLet's take a look at how we can fine-tune Llama3-8B-Instruct with LoRA on a single device using torchtune. In this example, we will fine-tune\nfor one epoch on a common instruct dataset for illustrative purposes. The basic command for a single-device LoRA fine-tune is\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\n\n.. note::\n To see a full list of recipes and their corresponding configs, simply run ``tune ls`` from the command line.\n\nWe can also add :ref:`command-line overrides ` as needed, e.g.\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n checkpointer.checkpoint_dir= \\\n tokenizer.path=/tokenizer.model \\\n checkpointer.output_dir=\n\nThis will load the Llama3-8B-Instruct checkpoint and tokenizer from ```` used in the :ref:`tune download ` command above,\nthen save a final checkpoint in the same directory following the original format. For more details on the\ncheckpoint formats supported in torchtune, see our :ref:`checkpointing deep-dive `.\n\n.. note::\n To see the full set of configurable parameters for this (and other) configs we can use :ref:`tune cp ` to copy (and modify)\n the default config. :ref:`tune cp ` can be used with recipe scripts too, in case you want to make more custom changes\n that cannot be achieved by directly modifying existing configurable parameters. For more on :ref:`tune cp ` see the section on\n :ref:`modifying configs ` in our \":ref:`finetune_llama_label`\" tutorial.\n\nOnce training is complete, the model checkpoints will be saved and their locations will be logged. 
For\nLoRA fine-tuning, the final checkpoint will contain the merged weights, and a copy of just the (much smaller) LoRA weights\nwill\n", + "text": "Result 5:\nDocument_id:9dcb7\nContent: ora_finetune_label>`.\nFor more on QLoRA in torchtune, see our :ref:`QLoRA Tutorial `.\n\nLet's take a look at how we can fine-tune Llama3-8B-Instruct with LoRA on a single device using torchtune. In this example, we will fine-tune\nfor one epoch on a common instruct dataset for illustrative purposes. The basic command for a single-device LoRA fine-tune is\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\n\n.. note::\n To see a full list of recipes and their corresponding configs, simply run ``tune ls`` from the command line.\n\nWe can also add :ref:`command-line overrides ` as needed, e.g.\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n checkpointer.checkpoint_dir= \\\n tokenizer.path=/tokenizer.model \\\n checkpointer.output_dir=\n\nThis will load the Llama3-8B-Instruct checkpoint and tokenizer from ```` used in the :ref:`tune download ` command above,\nthen save a final checkpoint in the same directory following the original format. For more details on the\ncheckpoint formats supported in torchtune, see our :ref:`checkpointing deep-dive `.\n\n.. note::\n To see the full set of configurable parameters for this (and other) configs we can use :ref:`tune cp ` to copy (and modify)\n the default config. :ref:`tune cp ` can be used with recipe scripts too, in case you want to make more custom changes\n that cannot be achieved by directly modifying existing configurable parameters. For more on :ref:`tune cp ` see the section on\n :ref:`modifying configs ` in our \":ref:`finetune_llama_label`\" tutorial.\n\nOnce training is complete, the model checkpoints will be saved and their locations will be logged. For\nLoRA fine-tuning, the final checkpoint will contain the merged weights, and a copy of just the (much smaller) LoRA weights\nwill\n", "type": "text" }, { @@ -135,11 +135,11 @@ "error_message": null, "metadata": { "document_ids": [ - "6421150d-d334-4163-a058-3818b2b742e9", - "6421150d-d334-4163-a058-3818b2b742e9", - "0c95cff3-5612-40cf-a73d-77644a2462d0", - "6421150d-d334-4163-a058-3818b2b742e9", - "1d70c86d-4cdf-4be9-a1f2-8a271b15ce2c" + "cbc884b1-9d88-4d5c-aff4-7a4b3a56618c", + "cbc884b1-9d88-4d5c-aff4-7a4b3a56618c", + "8892b092-6394-471e-b143-a23c6cc374f8", + "cbc884b1-9d88-4d5c-aff4-7a4b3a56618c", + "9dcb747d-0627-40cc-a23c-0bee2b6b05af" ] } } @@ -307,23 +307,23 @@ "type": "text" }, { - "text": "Result 1:\nDocument_id:7bdfa\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\nlook like so:\n\n.. 
code-block:: python\n\n from torchtune.datasets import chat_dataset\n from torchtune.models.llama3 import llama3_tokenizer\n\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\n ds = chat_dataset(\n tokenizer=tokenizer,\n source=\"json\",\n data_files=\"data/my_data.json\",\n split=\"train\",\n conversation_column=\"dialogue\",\n conversation_style=\"sharegpt\",\n )\n\n.. code-block:: yaml\n\n # In config\n tokenizer:\n _component_: torchtune.models.llama3.llama3_tokenizer\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\n\n dataset:\n _component_: torchtune.datasets.chat_dataset\n source: json\n data_files: data/my_data.json\n split: train\n conversation_column: dialogue\n conversation_style: sharegpt\n\n.. note::\n You can pass in any keyword argument for `load_dataset `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. 
note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 3:\nDocument_id:de2d4\nContent: ` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:64211\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. 
This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 4:\nDocument_id:c4fc3\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 5:\nDocument_id:0c95c\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. 
code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 5:\nDocument_id:de2d4\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { @@ -335,11 +335,11 @@ "error_message": null, "metadata": { "document_ids": [ - "7bdfad34-d546-4e98-9757-a0289696cd97", - "6421150d-d334-4163-a058-3818b2b742e9", - "0c95cff3-5612-40cf-a73d-77644a2462d0", - "6421150d-d334-4163-a058-3818b2b742e9", - "0c95cff3-5612-40cf-a73d-77644a2462d0" + "f76dc7f5-9648-4272-a579-c8387fb1408a", + "c4fc3cb6-6172-489e-90a7-b39d343e14c0", + "de2d49de-55de-44dd-9bca-6f4f6d633b0a", + "c4fc3cb6-6172-489e-90a7-b39d343e14c0", + "de2d49de-55de-44dd-9bca-6f4f6d633b0a" ] } } @@ -362,23 +362,23 @@ "type": "text" }, { - "text": "Result 1:\nDocument_id:7da0c\nContent: .. 
_lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. 
When using an optimizer with momentum,\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", + "text": "Result 2:\nDocument_id:c4fc3\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", "type": "text" }, { - "text": "Result 3:\nDocument_id:7da0c\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. 
code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 3:\nDocument_id:c4fc3\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:7da0c\nContent: from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. code-block:: python\n\n # Assuming that base_model already has the pretrained Llama2 weights,\n # this will directly load them into your LoRA model without any conversion necessary.\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n the loaded :code:`state_dict` are as expected. 
torchtune's LoRA recipes do this by default via\n :func:`validate_missing_and_unexpected_for_lora() `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe , where is the number between 0 and 9." 
+ + +def test_scoring_functions_list(llama_stack_client): + # NOTE: this needs you to ensure that you are starting from a clean state + # but so far we don't have an unregister API unfortunately, so be careful + response = llama_stack_client.scoring_functions.list() + assert isinstance(response, list) + assert len(response) > 0 + + +def test_scoring_score(llama_stack_client): + register_dataset(llama_stack_client, for_rag=True) + response = llama_stack_client.datasets.list() + assert len(response) == 1 + + # scoring individual rows + rows = llama_stack_client.datasetio.get_rows_paginated( + dataset_id="test_dataset", + rows_in_page=3, + ) + assert len(rows.rows) == 3 + + scoring_fns_list = llama_stack_client.scoring_functions.list() + scoring_functions = { + scoring_fns_list[0].identifier: None, + } + + response = llama_stack_client.scoring.score( + input_rows=rows.rows, + scoring_functions=scoring_functions, + ) + assert len(response.results) == len(scoring_functions) + for x in scoring_functions: + assert x in response.results + assert len(response.results[x].score_rows) == len(rows.rows) + + # score batch + response = llama_stack_client.scoring.score_batch( + dataset_id="test_dataset", + scoring_functions=scoring_functions, + save_results_dataset=False, + ) + assert len(response.results) == len(scoring_functions) + for x in scoring_functions: + assert x in response.results + assert len(response.results[x].score_rows) == 5 + + +def test_scoring_score_with_params_llm_as_judge(llama_stack_client, sample_judge_prompt_template, judge_model_id): + register_dataset(llama_stack_client, for_rag=True) + response = llama_stack_client.datasets.list() + assert len(response) == 1 + + # scoring individual rows + rows = llama_stack_client.datasetio.get_rows_paginated( + dataset_id="test_dataset", + rows_in_page=3, + ) + assert len(rows.rows) == 3 + + scoring_functions = { + "llm-as-judge::base": dict( + type="llm_as_judge", + judge_model=judge_model_id, + prompt_template=sample_judge_prompt_template, + judge_score_regexes=[r"Score: (\d+)"], + aggregation_functions=[ + "categorical_count", + ], + ) + } + + response = llama_stack_client.scoring.score( + input_rows=rows.rows, + scoring_functions=scoring_functions, + ) + assert len(response.results) == len(scoring_functions) + for x in scoring_functions: + assert x in response.results + assert len(response.results[x].score_rows) == len(rows.rows) + + # score batch + response = llama_stack_client.scoring.score_batch( + dataset_id="test_dataset", + scoring_functions=scoring_functions, + save_results_dataset=False, + ) + assert len(response.results) == len(scoring_functions) + for x in scoring_functions: + assert x in response.results + assert len(response.results[x].score_rows) == 5 + + +@pytest.mark.skip(reason="Skipping because this seems to be really slow") +def test_scoring_score_with_aggregation_functions(llama_stack_client, sample_judge_prompt_template, judge_model_id): + register_dataset(llama_stack_client, for_rag=True) + rows = llama_stack_client.datasetio.get_rows_paginated( + dataset_id="test_dataset", + rows_in_page=3, + ) + assert len(rows.rows) == 3 + + scoring_fns_list = llama_stack_client.scoring_functions.list() + scoring_functions = {} + aggr_fns = [ + "accuracy", + "median", + "categorical_count", + "average", + ] + for x in scoring_fns_list: + if x.provider_id == "llm-as-judge": + aggr_fns = ["categorical_count"] + scoring_functions[x.identifier] = dict( + type="llm_as_judge", + judge_model=judge_model_id, + 
prompt_template=sample_judge_prompt_template, + judge_score_regexes=[r"Score: (\d+)"], + aggregation_functions=aggr_fns, + ) + elif x.provider_id == "basic" or x.provider_id == "braintrust": + if "regex_parser" in x.identifier: + scoring_functions[x.identifier] = dict( + type="regex_parser", + parsing_regexes=[r"Score: (\d+)"], + aggregation_functions=aggr_fns, + ) + else: + scoring_functions[x.identifier] = dict( + type="basic", + aggregation_functions=aggr_fns, + ) + else: + scoring_functions[x.identifier] = None + + response = llama_stack_client.scoring.score( + input_rows=rows.rows, + scoring_functions=scoring_functions, + ) + + assert len(response.results) == len(scoring_functions) + for x in scoring_functions: + assert x in response.results + assert len(response.results[x].score_rows) == len(rows.rows) + assert len(response.results[x].aggregated_results) == len(aggr_fns) diff --git a/tests/integration/tool_runtime/test_builtin_tools.py b/tests/integration/tool_runtime/test_builtin_tools.py new file mode 100644 index 000000000..9edf3afa0 --- /dev/null +++ b/tests/integration/tool_runtime/test_builtin_tools.py @@ -0,0 +1,66 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import json +import os + +import pytest + + +@pytest.fixture +def sample_search_query(): + return "What are the latest developments in quantum computing?" + + +@pytest.fixture +def sample_wolfram_alpha_query(): + return "What is the square root of 16?" + + +def test_web_search_tool(llama_stack_client, sample_search_query): + """Test the web search tool functionality.""" + if "TAVILY_SEARCH_API_KEY" not in os.environ: + pytest.skip("TAVILY_SEARCH_API_KEY not set, skipping test") + + response = llama_stack_client.tool_runtime.invoke_tool( + tool_name="web_search", kwargs={"query": sample_search_query} + ) + + # Verify the response + assert response.content is not None + assert len(response.content) > 0 + assert isinstance(response.content, str) + + content = json.loads(response.content) + assert "query" in content + assert "top_k" in content + assert len(content["top_k"]) > 0 + + first = content["top_k"][0] + assert "title" in first + assert "url" in first + + +def test_wolfram_alpha_tool(llama_stack_client, sample_wolfram_alpha_query): + """Test the wolfram alpha tool functionality.""" + if "WOLFRAM_ALPHA_API_KEY" not in os.environ: + pytest.skip("WOLFRAM_ALPHA_API_KEY not set, skipping test") + + response = llama_stack_client.tool_runtime.invoke_tool( + tool_name="wolfram_alpha", kwargs={"query": sample_wolfram_alpha_query} + ) + + print(response.content) + assert response.content is not None + assert len(response.content) > 0 + assert isinstance(response.content, str) + + content = json.loads(response.content) + result = content["queryresult"] + assert "success" in result + assert result["success"] + assert "pods" in result + assert len(result["pods"]) > 0 diff --git a/tests/integration/tool_runtime/test_rag_tool.py b/tests/integration/tool_runtime/test_rag_tool.py index e330a10f5..c49f507a8 100644 --- a/tests/integration/tool_runtime/test_rag_tool.py +++ b/tests/integration/tool_runtime/test_rag_tool.py @@ -4,29 +4,23 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-import random - import pytest from llama_stack_client.types import Document @pytest.fixture(scope="function") -def empty_vector_db_registry(llama_stack_client): - vector_dbs = [vector_db.identifier for vector_db in llama_stack_client.vector_dbs.list()] - for vector_db_id in vector_dbs: - llama_stack_client.vector_dbs.unregister(vector_db_id=vector_db_id) +def client_with_empty_registry(client_with_models): + def clear_registry(): + vector_dbs = [vector_db.identifier for vector_db in client_with_models.vector_dbs.list()] + for vector_db_id in vector_dbs: + client_with_models.vector_dbs.unregister(vector_db_id=vector_db_id) + clear_registry() + yield client_with_models -@pytest.fixture(scope="function") -def single_entry_vector_db_registry(llama_stack_client, empty_vector_db_registry): - vector_db_id = f"test_vector_db_{random.randint(1000, 9999)}" - llama_stack_client.vector_dbs.register( - vector_db_id=vector_db_id, - embedding_model="all-MiniLM-L6-v2", - embedding_dimension=384, - ) - vector_dbs = [vector_db.identifier for vector_db in llama_stack_client.vector_dbs.list()] - return vector_dbs + # you must clean after the last test if you were running tests against + # a stateful server instance + clear_registry() @pytest.fixture(scope="session") @@ -63,9 +57,15 @@ def assert_valid_response(response): assert isinstance(chunk.content, str) -def test_vector_db_insert_inline_and_query(llama_stack_client, single_entry_vector_db_registry, sample_documents): - vector_db_id = single_entry_vector_db_registry[0] - llama_stack_client.tool_runtime.rag_tool.insert( +def test_vector_db_insert_inline_and_query(client_with_empty_registry, sample_documents, embedding_model_id): + vector_db_id = "test_vector_db" + client_with_empty_registry.vector_dbs.register( + vector_db_id=vector_db_id, + embedding_model=embedding_model_id, + embedding_dimension=384, + ) + + client_with_empty_registry.tool_runtime.rag_tool.insert( documents=sample_documents, chunk_size_in_tokens=512, vector_db_id=vector_db_id, @@ -73,7 +73,7 @@ def test_vector_db_insert_inline_and_query(llama_stack_client, single_entry_vect # Query with a direct match query1 = "programming language" - response1 = llama_stack_client.vector_io.query( + response1 = client_with_empty_registry.vector_io.query( vector_db_id=vector_db_id, query=query1, ) @@ -82,7 +82,7 @@ def test_vector_db_insert_inline_and_query(llama_stack_client, single_entry_vect # Query with semantic similarity query2 = "AI and brain-inspired computing" - response2 = llama_stack_client.vector_io.query( + response2 = client_with_empty_registry.vector_io.query( vector_db_id=vector_db_id, query=query2, ) @@ -91,7 +91,7 @@ def test_vector_db_insert_inline_and_query(llama_stack_client, single_entry_vect # Query with limit on number of results (max_chunks=2) query3 = "computer" - response3 = llama_stack_client.vector_io.query( + response3 = client_with_empty_registry.vector_io.query( vector_db_id=vector_db_id, query=query3, params={"max_chunks": 2}, @@ -101,7 +101,7 @@ def test_vector_db_insert_inline_and_query(llama_stack_client, single_entry_vect # Query with threshold on similarity score query4 = "computer" - response4 = llama_stack_client.vector_io.query( + response4 = client_with_empty_registry.vector_io.query( vector_db_id=vector_db_id, query=query4, params={"score_threshold": 0.01}, @@ -110,20 +110,20 @@ def test_vector_db_insert_inline_and_query(llama_stack_client, single_entry_vect assert all(score >= 0.01 for score in response4.scores) -def 
test_vector_db_insert_from_url_and_query(llama_stack_client, empty_vector_db_registry): - providers = [p for p in llama_stack_client.providers.list() if p.api == "vector_io"] +def test_vector_db_insert_from_url_and_query(client_with_empty_registry, sample_documents, embedding_model_id): + providers = [p for p in client_with_empty_registry.providers.list() if p.api == "vector_io"] assert len(providers) > 0 vector_db_id = "test_vector_db" - llama_stack_client.vector_dbs.register( + client_with_empty_registry.vector_dbs.register( vector_db_id=vector_db_id, - embedding_model="all-MiniLM-L6-v2", + embedding_model=embedding_model_id, embedding_dimension=384, ) # list to check memory bank is successfully registered - available_vector_dbs = [vector_db.identifier for vector_db in llama_stack_client.vector_dbs.list()] + available_vector_dbs = [vector_db.identifier for vector_db in client_with_empty_registry.vector_dbs.list()] assert vector_db_id in available_vector_dbs # URLs of documents to insert @@ -144,14 +144,14 @@ def test_vector_db_insert_from_url_and_query(llama_stack_client, empty_vector_db for i, url in enumerate(urls) ] - llama_stack_client.tool_runtime.rag_tool.insert( + client_with_empty_registry.tool_runtime.rag_tool.insert( documents=documents, vector_db_id=vector_db_id, chunk_size_in_tokens=512, ) # Query for the name of method - response1 = llama_stack_client.vector_io.query( + response1 = client_with_empty_registry.vector_io.query( vector_db_id=vector_db_id, query="What's the name of the fine-tunning method used?", ) @@ -159,7 +159,7 @@ def test_vector_db_insert_from_url_and_query(llama_stack_client, empty_vector_db assert any("lora" in chunk.content.lower() for chunk in response1.chunks) # Query for the name of model - response2 = llama_stack_client.vector_io.query( + response2 = client_with_empty_registry.vector_io.query( vector_db_id=vector_db_id, query="Which Llama model is mentioned?", )
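The new test_builtin_tools.py above drives the wolfram_alpha tool through the client SDK under pytest. The same call can be made directly against a running distribution; a minimal sketch, assuming a server at localhost:8321 and WOLFRAM_ALPHA_API_KEY exported in the server's environment (both are assumptions about your setup, not part of this change):

import json
import os

from llama_stack_client import LlamaStackClient

# Point the client at your stack; the default URL here is an assumption.
client = LlamaStackClient(base_url=os.environ.get("LLAMA_STACK_BASE_URL", "http://localhost:8321"))

# Invoke the builtin tool by name, mirroring test_wolfram_alpha_tool above.
response = client.tool_runtime.invoke_tool(
    tool_name="wolfram_alpha",
    kwargs={"query": "What is the square root of 16?"},
)

# The tool returns the raw Wolfram Alpha payload as a JSON string.
result = json.loads(response.content)["queryresult"]
print(result["success"], [pod["title"] for pod in result.get("pods", [])])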