Merge branch 'meta-llama:main' into add-unit-tests-and-fix-cli

2025-12-31 11:43:55 +00:00 · 2025-03-31 21:17:48 -04:00 · 2025-03-31 21:17:48 -04:00 · 696bcf6051
commit 696bcf6051
parent 6beb80f2ac b440a1dc42
459 changed files with 39114 additions and 10751 deletions
--- a/tests/client-sdk/post_training/init.py
+++ b/tests/client-sdk/post_training/init.py
--- a/tests/client-sdk/post_training/test_supervied_fine_tuning.py
+++ b/tests/client-sdk/post_training/test_supervied_fine_tuning.py
@ -0,0 +1,60 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import pytest
+
+POST_TRAINING_PROVIDER_TYPES = ["remote::nvidia"]
+
+
+@pytest.fixture(scope="session")
+def post_training_provider_available(llama_stack_client):
+    """Return True when at least one provider type is listed in POST_TRAINING_PROVIDER_TYPES."""
+    # No pytest mark here: marks applied to fixture functions have no effect and are deprecated.
+    providers = llama_stack_client.providers.list()
+    return any(p.provider_type in POST_TRAINING_PROVIDER_TYPES for p in providers)
+
+
+@pytest.mark.integration
+def test_post_training_provider_registration(llama_stack_client, post_training_provider_available):
+    """Sanity check that a provider whose type is in POST_TRAINING_PROVIDER_TYPES is registered.
+    Skipped when the session fixture reports that no such provider is available."""
+    if not post_training_provider_available:
+        pytest.skip("post training provider not available")
+
+    registered_types = [p.provider_type for p in llama_stack_client.providers.list()]
+    matching = [t for t in registered_types if t in POST_TRAINING_PROVIDER_TYPES]
+    assert len(matching) > 0
+
+
+@pytest.mark.integration
+def test_get_training_jobs(llama_stack_client, post_training_provider_available):
+    """The training-job listing is a dict whose "data" entry is a list."""
+    if not post_training_provider_available:
+        pytest.skip("post training provider not available")
+
+    listing = llama_stack_client.post_training.get_training_jobs()
+    assert isinstance(listing, dict)
+    assert "data" in listing
+    assert isinstance(listing["data"], list)
+
+
+@pytest.mark.integration
+def test_get_training_job_status(llama_stack_client, post_training_provider_available):
+    """Fetch the status of the first listed job and check it echoes that job's UUID."""
+    if not post_training_provider_available:
+        pytest.skip("post training provider not available")
+
+    listed_jobs = llama_stack_client.post_training.get_training_jobs()["data"]
+    if not listed_jobs:
+        pytest.skip("No training jobs available to check status")
+
+    first_uuid = listed_jobs[0]["job_uuid"]
+    status = llama_stack_client.post_training.get_training_job_status(job_uuid=first_uuid)
+
+    assert status is not None
+    assert "job_uuid" in status
+    assert "status" in status
+    assert status["job_uuid"] == first_uuid
--- a/tests/integration/README.md
+++ b/tests/integration/README.md
@ -23,8 +23,8 @@ Model parameters can be influenced by the following options:
 - `--judge-model`: comma-separated list of judge models.
 - `--embedding-dimension`: output dimensionality of the embedding model to use for testing. Default: 384

-Each of these are comma-separated lists and can be used to generate multiple parameter combinations.
-
+Each of these is a comma-separated list and can be used to generate multiple parameter combinations. Note that tests will be skipped
+if no model is specified.

 Experimental, under development, options:
 - `--record-responses`: record new API responses instead of using cached ones
@ -36,7 +36,7 @@ Experimental, under development, options:
 Run all text inference tests with the `together` distribution:

 ```bash
-pytest -s -v tests/api/inference/test_text_inference.py \
+pytest -s -v tests/integration/inference/test_text_inference.py \
   --stack-config=together \
   --text-model=meta-llama/Llama-3.1-8B-Instruct
 ```
@ -44,7 +44,7 @@ pytest -s -v tests/api/inference/test_text_inference.py \
 Run all text inference tests with the `together` distribution and `meta-llama/Llama-3.1-8B-Instruct`:

 ```bash
-pytest -s -v tests/api/inference/test_text_inference.py \
+pytest -s -v tests/integration/inference/test_text_inference.py \
   --stack-config=together \
   --text-model=meta-llama/Llama-3.1-8B-Instruct
 ```
@ -57,7 +57,7 @@ VISION_MODELS=meta-llama/Llama-3.2-11B-Vision-Instruct
 EMBEDDING_MODELS=all-MiniLM-L6-v2
 export TOGETHER_API_KEY=<together_api_key>

-pytest -s -v tests/api/inference/ \
+pytest -s -v tests/integration/inference/ \
   --stack-config=together \
   --text-model=$TEXT_MODELS \
   --vision-model=$VISION_MODELS \
@ -69,7 +69,7 @@ Same thing but instead of using the distribution, use an adhoc stack with just o
 ```bash
 export FIREWORKS_API_KEY=<fireworks_api_key>

-pytest -s -v tests/api/inference/ \
+pytest -s -v tests/integration/inference/ \
   --stack-config=inference=fireworks \
   --text-model=$TEXT_MODELS \
   --vision-model=$VISION_MODELS \
@ -81,7 +81,7 @@ Running Vector IO tests for a number of embedding models:
 ```bash
 EMBEDDING_MODELS=all-MiniLM-L6-v2

-pytest -s -v tests/api/vector_io/ \
+pytest -s -v tests/integration/vector_io/ \
   --stack-config=inference=sentence-transformers,vector_io=sqlite-vec \
   --embedding-model=$EMBEDDING_MODELS
 ```
--- a/tests/integration/agents/test_agents.py
+++ b/tests/integration/agents/test_agents.py
@ -8,10 +8,7 @@ from typing import Any, Dict
 from uuid import uuid4

 import pytest
-from llama_stack_client.lib.agents.agent import Agent
-from llama_stack_client.lib.agents.event_logger import EventLogger
-from llama_stack_client.types.agents.turn_create_params import Document as AgentDocument
-from llama_stack_client.types.memory_insert_params import Document
+from llama_stack_client import Agent, AgentEventLogger, Document
 from llama_stack_client.types.shared_params.agent_config import AgentConfig, ToolConfig

 from llama_stack.apis.agents.agents import (
@ -93,7 +90,7 @@ def test_agent_simple(llama_stack_client_with_mocked_inference, agent_config):
        session_id=session_id,
    )

-    logs = [str(log) for log in EventLogger().log(simple_hello) if log is not None]
+    logs = [str(log) for log in AgentEventLogger().log(simple_hello) if log is not None]
    logs_str = "".join(logs)

    assert "hello" in logs_str.lower()
@ -112,7 +109,7 @@ def test_agent_simple(llama_stack_client_with_mocked_inference, agent_config):
            session_id=session_id,
        )

-        logs = [str(log) for log in EventLogger().log(bomb_response) if log is not None]
+        logs = [str(log) for log in AgentEventLogger().log(bomb_response) if log is not None]
        logs_str = "".join(logs)
        assert "I can't" in logs_str

@ -176,6 +173,7 @@ def test_tool_config(llama_stack_client_with_mocked_inference, agent_config):
 def test_builtin_tool_web_search(llama_stack_client_with_mocked_inference, agent_config):
    agent_config = {
        **agent_config,
+        "instructions": "You are a helpful assistant that can use web search to answer questions.",
        "tools": [
            "builtin::websearch",
        ],
@ -187,20 +185,20 @@ def test_builtin_tool_web_search(llama_stack_client_with_mocked_inference, agent
        messages=[
            {
                "role": "user",
-                "content": "Search the web and tell me who the current CEO of Meta is.",
+                "content": "Search the web and tell me what is the local time in Tokyo currently.",
            }
        ],
        session_id=session_id,
+        stream=False,
    )

-    logs = [str(log) for log in EventLogger().log(response) if log is not None]
-    logs_str = "".join(logs)
-
-    assert "tool_execution>" in logs_str
-    assert "Tool:brave_search Response:" in logs_str
-    assert "mark zuckerberg" in logs_str.lower()
-    if len(agent_config["output_shields"]) > 0:
-        assert "No Violation" in logs_str
+    found_tool_execution = False
+    for step in response.steps:
+        if step.step_type == "tool_execution":
+            assert step.tool_calls[0].tool_name == "brave_search"
+            found_tool_execution = True
+            break
+    assert found_tool_execution


 def test_builtin_tool_code_execution(llama_stack_client_with_mocked_inference, agent_config):
@ -222,7 +220,7 @@ def test_builtin_tool_code_execution(llama_stack_client_with_mocked_inference, a
        ],
        session_id=session_id,
    )
-    logs = [str(log) for log in EventLogger().log(response) if log is not None]
+    logs = [str(log) for log in AgentEventLogger().log(response) if log is not None]
    logs_str = "".join(logs)

    assert "541" in logs_str
@ -242,7 +240,7 @@ def test_code_interpreter_for_attachments(llama_stack_client_with_mocked_inferen

    codex_agent = Agent(llama_stack_client_with_mocked_inference, **agent_config)
    session_id = codex_agent.create_session(f"test-session-{uuid4()}")
-    inflation_doc = AgentDocument(
+    inflation_doc = Document(
        content="https://raw.githubusercontent.com/meta-llama/llama-stack-apps/main/examples/resources/inflation.csv",
        mime_type="text/csv",
    )
@ -263,7 +261,7 @@ def test_code_interpreter_for_attachments(llama_stack_client_with_mocked_inferen
            session_id=session_id,
            documents=input.get("documents", None),
        )
-        logs = [str(log) for log in EventLogger().log(response) if log is not None]
+        logs = [str(log) for log in AgentEventLogger().log(response) if log is not None]
        logs_str = "".join(logs)
        assert "Tool:code_interpreter" in logs_str

@ -272,7 +270,7 @@ def test_custom_tool(llama_stack_client_with_mocked_inference, agent_config):
    client_tool = get_boiling_point
    agent_config = {
        **agent_config,
-        "tools": ["builtin::websearch", client_tool],
+        "tools": [client_tool],
    }

    agent = Agent(llama_stack_client_with_mocked_inference, **agent_config)
@ -288,7 +286,7 @@ def test_custom_tool(llama_stack_client_with_mocked_inference, agent_config):
        session_id=session_id,
    )

-    logs = [str(log) for log in EventLogger().log(response) if log is not None]
+    logs = [str(log) for log in AgentEventLogger().log(response) if log is not None]
    logs_str = "".join(logs)
    assert "-100" in logs_str
    assert "get_boiling_point" in logs_str
@ -321,42 +319,55 @@ def test_custom_tool_infinite_loop(llama_stack_client_with_mocked_inference, age
    assert num_tool_calls <= 5


-def test_tool_choice(llama_stack_client_with_mocked_inference, agent_config):
-    def run_agent(tool_choice):
-        client_tool = get_boiling_point
-
-        test_agent_config = {
-            **agent_config,
-            "tool_config": {"tool_choice": tool_choice},
-            "tools": [client_tool],
-        }
-
-        agent = Agent(llama_stack_client_with_mocked_inference, **test_agent_config)
-        session_id = agent.create_session(f"test-session-{uuid4()}")
-
-        response = agent.create_turn(
-            messages=[
-                {
-                    "role": "user",
-                    "content": "What is the boiling point of polyjuice?",
-                },
-            ],
-            session_id=session_id,
-            stream=False,
-        )
-
-        return [step for step in response.steps if step.step_type == "tool_execution"]
-
-    tool_execution_steps = run_agent("required")
+def test_tool_choice_required(llama_stack_client_with_mocked_inference, agent_config):
+    tool_execution_steps = run_agent_with_tool_choice(
+        llama_stack_client_with_mocked_inference, agent_config, "required"
+    )
    assert len(tool_execution_steps) > 0

-    tool_execution_steps = run_agent("none")
+
+def test_tool_choice_none(llama_stack_client_with_mocked_inference, agent_config):
+    tool_execution_steps = run_agent_with_tool_choice(llama_stack_client_with_mocked_inference, agent_config, "none")
    assert len(tool_execution_steps) == 0

-    tool_execution_steps = run_agent("get_boiling_point")
+
+def test_tool_choice_get_boiling_point(llama_stack_client_with_mocked_inference, agent_config):
+    if "llama" not in agent_config["model"].lower():
+        pytest.xfail("NotImplemented for non-llama models")
+
+    tool_execution_steps = run_agent_with_tool_choice(
+        llama_stack_client_with_mocked_inference, agent_config, "get_boiling_point"
+    )
    assert len(tool_execution_steps) >= 1 and tool_execution_steps[0].tool_calls[0].tool_name == "get_boiling_point"


+def run_agent_with_tool_choice(client, agent_config, tool_choice):
+    """Run one non-streaming turn under `tool_choice` and return its tool_execution steps.
+
+    get_boiling_point is the only tool offered, and max_infer_iters is set to 2.
+    """
+    overrides = {
+        "tool_config": {"tool_choice": tool_choice},
+        "tools": [get_boiling_point],
+        "max_infer_iters": 2,
+    }
+    agent = Agent(client, **{**agent_config, **overrides})
+    session_id = agent.create_session(f"test-session-{uuid4()}")
+
+    question = {
+        "role": "user",
+        "content": "What is the boiling point of polyjuice?",
+    }
+    turn = agent.create_turn(
+        messages=[question],
+        session_id=session_id,
+        stream=False,
+    )
+
+    executed = filter(lambda step: step.step_type == "tool_execution", turn.steps)
+    return list(executed)
+
+
@pytest.mark.parametrize("rag_tool_name", ["builtin::rag/knowledge_search", "builtin::rag"])
 def test_rag_agent(llama_stack_client_with_mocked_inference, agent_config, rag_tool_name):
    urls = ["chat.rst", "llama3.rst", "memory_optimizations.rst", "lora_finetune.rst"]
@ -417,19 +428,7 @@ def test_rag_agent(llama_stack_client_with_mocked_inference, agent_config, rag_t
            assert expected_kw in response.output_message.content.lower()


-@pytest.mark.parametrize(
-    "tool",
-    [
-        dict(
-            name="builtin::rag/knowledge_search",
-            args={
-                "vector_db_ids": [],
-            },
-        ),
-        "builtin::rag/knowledge_search",
-    ],
-)
-def test_rag_agent_with_attachments(llama_stack_client_with_mocked_inference, agent_config, tool):
+def test_rag_agent_with_attachments(llama_stack_client_with_mocked_inference, agent_config):
    urls = ["chat.rst", "llama3.rst", "memory_optimizations.rst", "lora_finetune.rst"]
    documents = [
        Document(
@ -442,7 +441,6 @@ def test_rag_agent_with_attachments(llama_stack_client_with_mocked_inference, ag
    ]
    agent_config = {
        **agent_config,
-        "tools": [tool],
    }
    rag_agent = Agent(llama_stack_client_with_mocked_inference, **agent_config)
    session_id = rag_agent.create_session(f"test-session-{uuid4()}")
@ -476,10 +474,6 @@ def test_rag_agent_with_attachments(llama_stack_client_with_mocked_inference, ag
            stream=False,
        )

-    # rag is called
-    tool_execution_step = [step for step in response.steps if step.step_type == "tool_execution"]
-    assert len(tool_execution_step) >= 1
-    assert tool_execution_step[0].tool_calls[0].tool_name == "knowledge_search"
    assert "lora" in response.output_message.content.lower()


@ -526,19 +520,7 @@ def test_rag_and_code_agent(llama_stack_client_with_mocked_inference, agent_conf
        ],
    }
    agent = Agent(llama_stack_client_with_mocked_inference, **agent_config)
-    inflation_doc = Document(
-        document_id="test_csv",
-        content="https://raw.githubusercontent.com/meta-llama/llama-stack-apps/main/examples/resources/inflation.csv",
-        mime_type="text/csv",
-        metadata={},
-    )
    user_prompts = [
-        (
-            "Here is a csv file, can you describe it?",
-            [inflation_doc],
-            "code_interpreter",
-            "",
-        ),
        (
            "when was Perplexity the company founded?",
            [],
@ -572,7 +554,7 @@ def test_rag_and_code_agent(llama_stack_client_with_mocked_inference, agent_conf
    [(get_boiling_point, False), (get_boiling_point_with_metadata, True)],
 )
 def test_create_turn_response(llama_stack_client_with_mocked_inference, agent_config, client_tools):
-    client_tool, expectes_metadata = client_tools
+    client_tool, expects_metadata = client_tools
    agent_config = {
        **agent_config,
        "input_shields": [],
@ -598,7 +580,7 @@ def test_create_turn_response(llama_stack_client_with_mocked_inference, agent_co
    assert steps[0].step_type == "inference"
    assert steps[1].step_type == "tool_execution"
    assert steps[1].tool_calls[0].tool_name.startswith("get_boiling_point")
-    if expectes_metadata:
+    if expects_metadata:
        assert steps[1].tool_responses[0].metadata["source"] == "https://www.google.com"
    assert steps[2].step_type == "inference"

@ -610,3 +592,44 @@ def test_create_turn_response(llama_stack_client_with_mocked_inference, agent_co
            assert last_step_completed_at < step.started_at
            assert step.started_at < step.completed_at
            last_step_completed_at = step.completed_at
+
+
+def test_multi_tool_calls(llama_stack_client_with_mocked_inference, agent_config):
+    """Expect both get_boiling_point calls to land in a single tool_execution step."""
+    # Case-insensitive, matching the "llama" check in test_tool_choice_get_boiling_point.
+    if "gpt" not in agent_config["model"].lower():
+        pytest.xfail("Only tested on GPT models")
+
+    agent_config = {**agent_config, "tools": [get_boiling_point]}
+
+    agent = Agent(llama_stack_client_with_mocked_inference, **agent_config)
+    session_id = agent.create_session(f"test-session-{uuid4()}")
+
+    response = agent.create_turn(
+        messages=[
+            {
+                "role": "user",
+                "content": "Call get_boiling_point twice to answer: What is the boiling point of polyjuice in both celsius and fahrenheit?",
+            },
+        ],
+        session_id=session_id,
+        stream=False,
+    )
+    # Compare the whole sequence at once so a failure shows the actual step order.
+    assert [step.step_type for step in response.steps] == [
+        "shield_call",
+        "inference",
+        "shield_call",
+        "tool_execution",
+        "shield_call",
+        "inference",
+        "shield_call",
+    ]
+
+    tool_calls = response.steps[3].tool_calls
+    assert len(tool_calls) == 2
+    assert all(call.tool_name.startswith("get_boiling_point") for call in tool_calls)
+
+    output = response.output_message.content.lower()
+    assert "-100" in output
+    assert "-212" in output
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@ -6,12 +6,24 @@
 import inspect
 import itertools
 import os
+import platform
 import textwrap
+import time

 from dotenv import load_dotenv

+from llama_stack.log import get_logger
+
 from .report import Report

+logger = get_logger(__name__, category="tests")
+
+
+def pytest_runtest_teardown(item):
+    """Pause after each test when LLAMA_STACK_TEST_INTERVAL_SECONDS is set to a non-empty value."""
+    pause = os.getenv("LLAMA_STACK_TEST_INTERVAL_SECONDS")
+    if not pause:
+        return
+    # The value is interpreted as a number of seconds; float() allows fractions.
+    time.sleep(float(pause))
+

 def pytest_configure(config):
    config.option.tbstyle = "short"
@ -24,6 +36,10 @@ def pytest_configure(config):
        key, value = env_var.split("=", 1)
        os.environ[key] = value

+    if platform.system() == "Darwin":  # Darwin is the system name for macOS
+        os.environ["DISABLE_CODE_SANDBOX"] = "1"
+        logger.info("Setting DISABLE_CODE_SANDBOX=1 for macOS")
+
    if config.getoption("--report"):
        config.pluginmanager.register(Report(config))

--- a/tests/integration/datasetio/test_datasetio.py
+++ b/tests/integration/datasetio/test_datasetio.py
@ -1,101 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import base64
-import mimetypes
-import os
-from pathlib import Path
-
-# How to run this test:
-#
-# LLAMA_STACK_CONFIG="template-name" pytest -v tests/integration/datasetio
-
-
-def data_url_from_file(file_path: str) -> str:
-    if not os.path.exists(file_path):
-        raise FileNotFoundError(f"File not found: {file_path}")
-
-    with open(file_path, "rb") as file:
-        file_content = file.read()
-
-    base64_content = base64.b64encode(file_content).decode("utf-8")
-    mime_type, _ = mimetypes.guess_type(file_path)
-
-    data_url = f"data:{mime_type};base64,{base64_content}"
-
-    return data_url
-
-
-def register_dataset(llama_stack_client, for_generation=False, for_rag=False, dataset_id="test_dataset"):
-    if for_rag:
-        test_file = Path(os.path.abspath(__file__)).parent / "test_rag_dataset.csv"
-    else:
-        test_file = Path(os.path.abspath(__file__)).parent / "test_dataset.csv"
-    test_url = data_url_from_file(str(test_file))
-
-    if for_generation:
-        dataset_schema = {
-            "expected_answer": {"type": "string"},
-            "input_query": {"type": "string"},
-            "chat_completion_input": {"type": "chat_completion_input"},
-        }
-    elif for_rag:
-        dataset_schema = {
-            "expected_answer": {"type": "string"},
-            "input_query": {"type": "string"},
-            "generated_answer": {"type": "string"},
-            "context": {"type": "string"},
-        }
-    else:
-        dataset_schema = {
-            "expected_answer": {"type": "string"},
-            "input_query": {"type": "string"},
-            "generated_answer": {"type": "string"},
-        }
-
-    dataset_providers = [x for x in llama_stack_client.providers.list() if x.api == "datasetio"]
-    dataset_provider_id = dataset_providers[0].provider_id
-
-    llama_stack_client.datasets.register(
-        dataset_id=dataset_id,
-        dataset_schema=dataset_schema,
-        url=dict(uri=test_url),
-        provider_id=dataset_provider_id,
-    )
-
-
-def test_register_unregister_dataset(llama_stack_client):
-    register_dataset(llama_stack_client)
-    response = llama_stack_client.datasets.list()
-    assert isinstance(response, list)
-    assert len(response) == 1
-    assert response[0].identifier == "test_dataset"
-
-    llama_stack_client.datasets.unregister("test_dataset")
-    response = llama_stack_client.datasets.list()
-    assert isinstance(response, list)
-    assert len(response) == 0
-
-
-def test_get_rows_paginated(llama_stack_client):
-    register_dataset(llama_stack_client)
-    response = llama_stack_client.datasetio.get_rows_paginated(
-        dataset_id="test_dataset",
-        rows_in_page=3,
-    )
-    assert isinstance(response.rows, list)
-    assert len(response.rows) == 3
-    assert response.next_page_token == "3"
-
-    # iterate over all rows
-    response = llama_stack_client.datasetio.get_rows_paginated(
-        dataset_id="test_dataset",
-        rows_in_page=2,
-        page_token=response.next_page_token,
-    )
-    assert isinstance(response.rows, list)
-    assert len(response.rows) == 2
-    assert response.next_page_token == "5"
--- a/tests/integration/datasets/init.py
+++ b/tests/integration/datasets/init.py
@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
--- a/tests/integration/datasetio/test_dataset.csv
+++ b/tests/integration/datasetio/test_dataset.csv
--- a/tests/integration/datasets/test_datasets.py
+++ b/tests/integration/datasets/test_datasets.py
@ -0,0 +1,95 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+
+import base64
+import mimetypes
+import os
+
+import pytest
+
+# How to run this test:
+#
+# LLAMA_STACK_CONFIG="template-name" pytest -v tests/integration/datasets
+
+
+def data_url_from_file(file_path: str) -> str:
+    """Return the file's bytes as a base64 ``data:`` URL; raise FileNotFoundError if it is missing.
+
+    An unguessable media type falls back to application/octet-stream rather than the text "None".
+    """
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(f"File not found: {file_path}")
+
+    with open(file_path, "rb") as file:
+        encoded = base64.b64encode(file.read()).decode("utf-8")
+
+    mime_type, _ = mimetypes.guess_type(file_path)
+    return f"data:{mime_type or 'application/octet-stream'};base64,{encoded}"
+
+
+@pytest.mark.parametrize(
+    "purpose, source, provider_id, limit",
+    [
+        (
+            "eval/messages-answer",
+            {
+                "type": "uri",
+                "uri": "huggingface://datasets/llamastack/simpleqa?split=train",
+            },
+            "huggingface",
+            10,
+        ),
+        (
+            "eval/messages-answer",
+            {
+                "type": "rows",
+                "rows": [
+                    {
+                        "messages": [{"role": "user", "content": "Hello, world!"}],
+                        "answer": "Hello, world!",
+                    },
+                    {
+                        "messages": [
+                            {
+                                "role": "user",
+                                "content": "What is the capital of France?",
+                            }
+                        ],
+                        "answer": "Paris",
+                    },
+                ],
+            },
+            "localfs",
+            2,
+        ),
+        (
+            "eval/messages-answer",
+            {
+                "type": "uri",
+                "uri": data_url_from_file(os.path.join(os.path.dirname(__file__), "test_dataset.csv")),
+            },
+            "localfs",
+            5,
+        ),
+    ],
+)
+def test_register_and_iterrows(llama_stack_client, purpose, source, provider_id, limit):
+    """Register a dataset, read `limit` rows from it, then unregister it.
+
+    Also checks the dataset appears in the listing while registered and not afterwards.
+    """
+    dataset = llama_stack_client.datasets.register(purpose=purpose, source=source)
+    assert dataset.identifier is not None
+    assert dataset.provider_id == provider_id
+
+    rows = llama_stack_client.datasets.iterrows(dataset.identifier, limit=limit)
+    assert len(rows.data) == limit
+
+    assert dataset.identifier in [d.identifier for d in llama_stack_client.datasets.list()]
+
+    llama_stack_client.datasets.unregister(dataset.identifier)
+    assert dataset.identifier not in [d.identifier for d in llama_stack_client.datasets.list()]
--- a/tests/integration/datasetio/test_rag_dataset.csv
+++ b/tests/integration/datasetio/test_rag_dataset.csv
--- a/tests/integration/eval/test_eval.py
+++ b/tests/integration/eval/test_eval.py
@ -4,10 +4,11 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import uuid
+from pathlib import Path

 import pytest

-from ..datasetio.test_datasetio import register_dataset
+from ..datasets.test_datasets import data_url_from_file

 # How to run this test:
 #
@ -16,15 +17,21 @@ from ..datasetio.test_datasetio import register_dataset

@pytest.mark.parametrize("scoring_fn_id", ["basic::equality"])
 def test_evaluate_rows(llama_stack_client, text_model_id, scoring_fn_id):
-    register_dataset(llama_stack_client, for_generation=True, dataset_id="test_dataset_for_eval")
-    response = llama_stack_client.datasets.list()
-    assert any(x.identifier == "test_dataset_for_eval" for x in response)
-
-    rows = llama_stack_client.datasetio.get_rows_paginated(
-        dataset_id="test_dataset_for_eval",
-        rows_in_page=3,
+    dataset = llama_stack_client.datasets.register(
+        purpose="eval/messages-answer",
+        source={
+            "type": "uri",
+            "uri": data_url_from_file(Path(__file__).parent.parent / "datasets" / "test_dataset.csv"),
+        },
    )
-    assert len(rows.rows) == 3
+    response = llama_stack_client.datasets.list()
+    assert any(x.identifier == dataset.identifier for x in response)
+
+    rows = llama_stack_client.datasets.iterrows(
+        dataset_id=dataset.identifier,
+        limit=3,
+    )
+    assert len(rows.data) == 3

    scoring_functions = [
        scoring_fn_id,
@ -32,7 +39,7 @@ def test_evaluate_rows(llama_stack_client, text_model_id, scoring_fn_id):
    benchmark_id = str(uuid.uuid4())
    llama_stack_client.benchmarks.register(
        benchmark_id=benchmark_id,
-        dataset_id="test_dataset_for_eval",
+        dataset_id=dataset.identifier,
        scoring_functions=scoring_functions,
    )
    list_benchmarks = llama_stack_client.benchmarks.list()
@ -40,7 +47,7 @@ def test_evaluate_rows(llama_stack_client, text_model_id, scoring_fn_id):

    response = llama_stack_client.eval.evaluate_rows(
        benchmark_id=benchmark_id,
-        input_rows=rows.rows,
+        input_rows=rows.data,
        scoring_functions=scoring_functions,
        benchmark_config={
            "eval_candidate": {
@ -59,11 +66,17 @@ def test_evaluate_rows(llama_stack_client, text_model_id, scoring_fn_id):

@pytest.mark.parametrize("scoring_fn_id", ["basic::subset_of"])
 def test_evaluate_benchmark(llama_stack_client, text_model_id, scoring_fn_id):
-    register_dataset(llama_stack_client, for_generation=True, dataset_id="test_dataset_for_eval_2")
+    dataset = llama_stack_client.datasets.register(
+        purpose="eval/messages-answer",
+        source={
+            "type": "uri",
+            "uri": data_url_from_file(Path(__file__).parent.parent / "datasets" / "test_dataset.csv"),
+        },
+    )
    benchmark_id = str(uuid.uuid4())
    llama_stack_client.benchmarks.register(
        benchmark_id=benchmark_id,
-        dataset_id="test_dataset_for_eval_2",
+        dataset_id=dataset.identifier,
        scoring_functions=[scoring_fn_id],
    )

@ -81,7 +94,7 @@ def test_evaluate_benchmark(llama_stack_client, text_model_id, scoring_fn_id):
    )
    assert response.job_id == "0"
    job_status = llama_stack_client.eval.jobs.status(job_id=response.job_id, benchmark_id=benchmark_id)
-    assert job_status and job_status == "completed"
+    assert job_status and job_status.status == "completed"

    eval_response = llama_stack_client.eval.jobs.retrieve(job_id=response.job_id, benchmark_id=benchmark_id)
    assert eval_response is not None
--- a/tests/integration/fixtures/common.py
+++ b/tests/integration/fixtures/common.py
@ -52,6 +52,8 @@ def llama_stack_client_with_mocked_inference(llama_stack_client, request):

    If --record-responses is passed, it will call the real APIs and record the responses.
    """
+    # TODO: will rework this to be more stable
+    return llama_stack_client
    if not isinstance(llama_stack_client, LlamaStackAsLibraryClient):
        logging.warning(
            "llama_stack_client_with_mocked_inference is not supported for this client, returning original client without mocking"
--- a/tests/integration/fixtures/recorded_responses/chat_completion.json
+++ b/tests/integration/fixtures/recorded_responses/chat_completion.json
--- a/tests/integration/fixtures/recorded_responses/invoke_tool.json
+++ b/tests/integration/fixtures/recorded_responses/invoke_tool.json
@ -5,7 +5,7 @@
      "__module__": "llama_stack.apis.tools.tools",
      "__pydantic__": "ToolInvocationResult",
      "data": {
-        "content": "completed\n[stderr]\nTraceback (most recent call last):\n  line 5, in <module>\n    from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]",
+        "content": "completed\n[stdout]\n541\n[/stdout]",
        "error_code": null,
        "error_message": null,
        "metadata": null
@ -31,7 +31,7 @@
      "__module__": "llama_stack.apis.tools.tools",
      "__pydantic__": "ToolInvocationResult",
      "data": {
-        "content": "completed\n[stderr]\nTraceback (most recent call last):\n  line 5, in <module>\n    from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]",
+        "content": "completed\n[stdout]\nNumber of rows and columns in the data: (10, 13)\nColumns of the data are: 13\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\n       'Oct', 'Nov', 'Dec'],\n      dtype='object')\nDatatype of the columns are: Year      int64\nJan     float64\nFeb     float64\nMar     float64\nApr     float64\nMay     float64\nJun     float64\nJul     float64\nAug     float64\nSep     float64\nOct     float64\nNov     float64\nDec     float64\ndtype: object\n[/stdout]",
        "error_code": null,
        "error_message": null,
        "metadata": null
@ -70,7 +70,7 @@
      "__module__": "llama_stack.apis.tools.tools",
      "__pydantic__": "ToolInvocationResult",
      "data": {
-        "content": "completed\n[stderr]\nTraceback (most recent call last):\n  line 5, in <module>\n    from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]",
+        "content": "completed\n[stdout]\nYear  Jan  Feb  Mar  Apr  May  Jun  Jul  Aug  Sep  Oct  Nov  Dec\n0  2014  1.6  1.6  1.7  1.8  2.0  1.9  1.9  1.7  1.7  1.8  1.7  1.6\n1  2015  1.6  1.7  1.8  1.8  1.7  1.8  1.8  1.8  1.9  1.9  2.0  2.1\n2  2016  2.2  2.3  2.2  2.1  2.2  2.2  2.2  2.3  2.2  2.1  2.1  2.2\n3  2017  2.3  2.2  2.0  1.9  1.7  1.7  1.7  1.7  1.7  1.8  1.7  1.8\n4  2018  1.8  1.8  2.1  2.1  2.2  2.3  2.4  2.2  2.2  2.1  2.2  2.2\n[/stdout]",
        "error_code": null,
        "error_message": null,
        "metadata": null
@ -83,7 +83,7 @@
      "__module__": "llama_stack.apis.tools.tools",
      "__pydantic__": "ToolInvocationResult",
      "data": {
-        "content": "completed\n[stderr]\nTraceback (most recent call last):\n  line 5, in <module>\n    from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]",
+        "content": "completed\n[stderr]\nTraceback (most recent call last):\n  line 142, in <module>\n  line 23, in <module>\n    from .code_execution import CodeExecutionContext, CodeExecutionRequest, CodeExecutor\nImportError: attempted relative import with no known parent package\n[/stderr]",
        "error_code": null,
        "error_message": null,
        "metadata": null
@ -116,6 +116,19 @@
      }
    }
  },
+  "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\\\"<TEMP_FILE>\")\\n\\n# Calculate average yearly inflation\\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\\n\\n# Plot time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df['Year'], df['Average'])\\nplt.xlabel('Year')\\nplt.ylabel('Average Yearly Inflation')\\nplt.title('Average Yearly Inflation Over Time')\\nplt.grid(True)\\nplt.show()\", \"session_id\": \"<UUID>\"}, \"tool_name\": \"code_interpreter\"}]": {
+    "type": "value",
+    "value": {
+      "__module__": "llama_stack.apis.tools.tools",
+      "__pydantic__": "ToolInvocationResult",
+      "data": {
+        "content": "completed",
+        "error_code": null,
+        "error_message": null,
+        "metadata": null
+      }
+    }
+  },
  "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\\\"inflation.csv\\\")\\n\\n# Convert date column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\", \"session_id\": \"<UUID>\"}, \"tool_name\": \"code_interpreter\"}]": {
    "type": "value",
    "value": {
@ -154,23 +167,23 @@
            "type": "text"
          },
          {
-            "text": "Result 1:\nDocument_id:961ff\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\n\n.. grid:: 2\n\n    .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n      * What LoRA is and how it saves memory during finetuning\n      * An overview of LoRA components in torchtune\n      * How to run a LoRA finetune using torchtune\n      * How to experiment with different LoRA configurations\n\n    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n      * Be familiar with :ref:`torchtune<overview_label>`\n      * Make sure to :ref:`install torchtune<install_label>`\n      * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\n\nWhat is LoRA?\n-------------\n\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. 
note::\n\n    If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\n    and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW <https://py\n",
+            "text": "Result 1:\nDocument_id:c4e00\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\n\n.. grid:: 2\n\n    .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n      * What LoRA is and how it saves memory during finetuning\n      * An overview of LoRA components in torchtune\n      * How to run a LoRA finetune using torchtune\n      * How to experiment with different LoRA configurations\n\n    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n      * Be familiar with :ref:`torchtune<overview_label>`\n      * Make sure to :ref:`install torchtune<install_label>`\n      * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\n\nWhat is LoRA?\n-------------\n\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. 
note::\n\n    If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\n    and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW <https://py\n",
            "type": "text"
          },
          {
-            "text": "Result 2:\nDocument_id:961ff\nContent:  LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n  from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n  # Build Llama2 without any LoRA layers\n  base_model = llama2_7b()\n\n  # The default settings for lora_llama2_7b will match those for llama2_7b\n  # We just need to define which layers we want LoRA applied to.\n  # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n  # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n  # layers outside of the self-attention.\n  lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n    Calling :func:`lora_llama_2_7b <torchtune.models.llama2.lora_llama2_7b>` alone will not handle the definition of which parameters are trainable.\n    See :ref:`below<setting_trainable_params>` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n  # Print the first layer's self-attention in the usual Llama2 model\n  >>> print(base_model.layers[0].attn)\n  MultiHeadAttention(\n    (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n    (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n    (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n    (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n    (pos_embeddings): RotaryPositionalEmbeddings()\n  )\n\n  # Print the same for Llama2 with LoRA weights\n  >>> print(lora_model.layers[0].attn)\n  MultiHeadAttention(\n    (q_proj): LoRALinear(\n      (dropout): Dropout(p=0.0, inplace=False)\n     \n",
+            "text": "Result 2:\nDocument_id:c4e00\nContent:  LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n  from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n  # Build Llama2 without any LoRA layers\n  base_model = llama2_7b()\n\n  # The default settings for lora_llama2_7b will match those for llama2_7b\n  # We just need to define which layers we want LoRA applied to.\n  # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n  # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n  # layers outside of the self-attention.\n  lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n    Calling :func:`lora_llama_2_7b <torchtune.models.llama2.lora_llama2_7b>` alone will not handle the definition of which parameters are trainable.\n    See :ref:`below<setting_trainable_params>` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n  # Print the first layer's self-attention in the usual Llama2 model\n  >>> print(base_model.layers[0].attn)\n  MultiHeadAttention(\n    (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n    (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n    (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n    (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n    (pos_embeddings): RotaryPositionalEmbeddings()\n  )\n\n  # Print the same for Llama2 with LoRA weights\n  >>> print(lora_model.layers[0].attn)\n  MultiHeadAttention(\n    (q_proj): LoRALinear(\n      (dropout): Dropout(p=0.0, inplace=False)\n     \n",
            "type": "text"
          },
          {
-            "text": "Result 3:\nDocument_id:961ff\nContent: 06% of all params are trainable.\n\n.. note::\n    If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\n    relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n    of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n    tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n    Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n    either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n    or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n    for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n    You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n    and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. 
code-block:: yaml\n\n  # Model Arguments\n  model:\n    _component_: lora_llama2_7b\n    lora_attn_modules: ['q_proj', 'v_proj']\n    lora_rank: 8\n    lora_alpha: 16\n  ...\n\nWe see that the\n",
+            "text": "Result 3:\nDocument_id:c4e00\nContent: 06% of all params are trainable.\n\n.. note::\n    If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\n    relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n    of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n    tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n    Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n    either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n    or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n    for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n    You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n    and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. 
code-block:: yaml\n\n  # Model Arguments\n  model:\n    _component_: lora_llama2_7b\n    lora_attn_modules: ['q_proj', 'v_proj']\n    lora_rank: 8\n    lora_alpha: 16\n  ...\n\nWe see that the\n",
            "type": "text"
          },
          {
-            "text": "Result 4:\nDocument_id:961ff\nContent:  from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. code-block:: python\n\n  # Assuming that base_model already has the pretrained Llama2 weights,\n  # this will directly load them into your LoRA model without any conversion necessary.\n  lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n    Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n    the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\n    :func:`validate_missing_and_unexpected_for_lora() <torchtune.modules.peft.validate_missing_and_unexpected_for_lora>`.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n  from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n  # Fetch all params from the model that are associated with LoRA.\n  lora_params = get_adapter_params(lora_model)\n\n  # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n  set_trainable_params(lora_model, lora_params)\n\n  # Print the total number of parameters\n  total_params = sum([p.numel() for p in lora_model.parameters()])\n  trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n  print(\n    f\"\"\"\n    {total_params} total params,\n    {trainable_params}\" trainable params,\n    {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n    \"\"\"\n  )\n\n  6742609920 total params,\n  4194304 trainable params,\n  0.06% of all params are trainable.\n\n.. note::\n    If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\n    relevant checkpoint path. 
Loading model weights and setting trainable parameters will be taken care\n    of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92\n",
+            "text": "Result 4:\nDocument_id:c4e00\nContent:  from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. code-block:: python\n\n  # Assuming that base_model already has the pretrained Llama2 weights,\n  # this will directly load them into your LoRA model without any conversion necessary.\n  lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n    Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n    the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\n    :func:`validate_missing_and_unexpected_for_lora() <torchtune.modules.peft.validate_missing_and_unexpected_for_lora>`.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n  from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n  # Fetch all params from the model that are associated with LoRA.\n  lora_params = get_adapter_params(lora_model)\n\n  # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n  set_trainable_params(lora_model, lora_params)\n\n  # Print the total number of parameters\n  total_params = sum([p.numel() for p in lora_model.parameters()])\n  trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n  print(\n    f\"\"\"\n    {total_params} total params,\n    {trainable_params}\" trainable params,\n    {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n    \"\"\"\n  )\n\n  6742609920 total params,\n  4194304 trainable params,\n  0.06% of all params are trainable.\n\n.. note::\n    If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\n    relevant checkpoint path. 
Loading model weights and setting trainable parameters will be taken care\n    of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92\n",
            "type": "text"
          },
          {
-            "text": "Result 5:\nDocument_id:961ff\nContent: ,\n    and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n  # Model Arguments\n  model:\n    _component_: lora_llama2_7b\n    lora_attn_modules: ['q_proj', 'v_proj']\n    lora_rank: 8\n    lora_alpha: 16\n  ...\n\nWe see that the default is to apply LoRA to Q and V projections with a rank of 8.\nSome experiments with LoRA have found that it can be beneficial to apply LoRA to all linear layers in\nthe self-attention, and to increase the rank to 16 or 32. Note that this is likely to increase our max memory,\nbut as long as we keep :code:`rank<<embed_dim`, the impact should be relatively minor.\n\nLet's run this experiment. We can also increase alpha (in general it is good practice to scale alpha and rank together).\n\n.. code-block:: bash\n\n    tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora \\\n    lora_attn_modules=['q_proj','k_proj','v_proj','output_proj'] \\\n    lora_rank=32 lora_alpha=64 output_dir=./lora_experiment_1\n\nA comparison of the (smoothed) loss curves between this run and our baseline over the first 500 steps can be seen below.\n\n.. image:: /_static/img/lora_experiment_loss_curves.png\n\n.. note::\n    The above figure was generated with W&B. You can use torchtune's :class:`~torchtune.training.metric_logging.WandBLogger`\n    to generate similar loss curves, but you will need to install W&B and setup an account separately. For more details on\n    using W&B in torchtune, see our \":ref:`wandb_logging`\" recipe.\n\n.. _lora_tutorial_memory_tradeoff_label:\n\nTrading off memory and model performance with LoRA\n--------------------------------------------------\n\nIn the preceding example, we ran LoRA on two devices. 
But given LoRA's low memory footprint, we can run fine-tuning\non a single device using most commodity GPUs which support `bfloat16 <https://\n",
+            "text": "Result 5:\nDocument_id:c4e00\nContent: ,\n    and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n  # Model Arguments\n  model:\n    _component_: lora_llama2_7b\n    lora_attn_modules: ['q_proj', 'v_proj']\n    lora_rank: 8\n    lora_alpha: 16\n  ...\n\nWe see that the default is to apply LoRA to Q and V projections with a rank of 8.\nSome experiments with LoRA have found that it can be beneficial to apply LoRA to all linear layers in\nthe self-attention, and to increase the rank to 16 or 32. Note that this is likely to increase our max memory,\nbut as long as we keep :code:`rank<<embed_dim`, the impact should be relatively minor.\n\nLet's run this experiment. We can also increase alpha (in general it is good practice to scale alpha and rank together).\n\n.. code-block:: bash\n\n    tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora \\\n    lora_attn_modules=['q_proj','k_proj','v_proj','output_proj'] \\\n    lora_rank=32 lora_alpha=64 output_dir=./lora_experiment_1\n\nA comparison of the (smoothed) loss curves between this run and our baseline over the first 500 steps can be seen below.\n\n.. image:: /_static/img/lora_experiment_loss_curves.png\n\n.. note::\n    The above figure was generated with W&B. You can use torchtune's :class:`~torchtune.training.metric_logging.WandBLogger`\n    to generate similar loss curves, but you will need to install W&B and setup an account separately. For more details on\n    using W&B in torchtune, see our \":ref:`wandb_logging`\" recipe.\n\n.. _lora_tutorial_memory_tradeoff_label:\n\nTrading off memory and model performance with LoRA\n--------------------------------------------------\n\nIn the preceding example, we ran LoRA on two devices. 
But given LoRA's low memory footprint, we can run fine-tuning\non a single device using most commodity GPUs which support `bfloat16 <https://\n",
            "type": "text"
          },
          {
@ -182,11 +195,11 @@
        "error_message": null,
        "metadata": {
          "document_ids": [
-            "961ff2d1-8887-41ef-a4fe-fa4cbab7b932",
-            "961ff2d1-8887-41ef-a4fe-fa4cbab7b932",
-            "961ff2d1-8887-41ef-a4fe-fa4cbab7b932",
-            "961ff2d1-8887-41ef-a4fe-fa4cbab7b932",
-            "961ff2d1-8887-41ef-a4fe-fa4cbab7b932"
+            "c4e00391-aeb8-4d32-ac41-ae3242f38a19",
+            "c4e00391-aeb8-4d32-ac41-ae3242f38a19",
+            "c4e00391-aeb8-4d32-ac41-ae3242f38a19",
+            "c4e00391-aeb8-4d32-ac41-ae3242f38a19",
+            "c4e00391-aeb8-4d32-ac41-ae3242f38a19"
          ]
        }
      }
@ -242,6 +255,19 @@
      }
    }
  },
+  "[[], {\"kwargs\": {\"query\": \"Meta founder\", \"session_id\": \"<UUID>\"}, \"tool_name\": \"web_search\"}]": {
+    "type": "value",
+    "value": {
+      "__module__": "llama_stack.apis.tools.tools",
+      "__pydantic__": "ToolInvocationResult",
+      "data": {
+        "content": "{\"query\": \"Meta founder\", \"top_k\": [{\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\u00a9 2025 Meta\", \"score\": 0.81595254, \"raw_content\": null}, {\"title\": \"Meta - Leadership & Governance\", \"url\": \"https://investor.atmeta.com/leadership-and-governance/\", \"content\": \"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. 
Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\", \"score\": 0.46759978, \"raw_content\": null}, {\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Meta leadership: images of senior executives for download to use in articles about the company. ... Mark Zuckerberg, Founder, Chairman and Chief Executive Officer. Nick Clegg, President, Global Affairs. Joel Kaplan, Chief Global Affairs Officer. Susan Li, Chief Financial Officer.\", \"score\": 0.46482924, \"raw_content\": null}, {\"title\": \"Meta Platforms - Wikipedia\", \"url\": \"https://en.wikipedia.org/wiki/Meta_Platforms\", \"content\": \"Following a period of intense scrutiny and damaging whistleblower leaks, news started to emerge on October 21, 2021, about Facebook's plan to rebrand the company and change its name.[15][54] In the Q3 2021 Earnings Call on October 25, Mark Zuckerberg discussed the ongoing criticism of the company's social services and the way it operates, and pointed to the pivoting efforts to building the metaverse \\u2013 without mentioning the rebranding and the name change.[55] The metaverse vision and the name change from Facebook, Inc. 
to Meta Platforms was introduced at Facebook Connect on October 28, 2021.[16] Based on Facebook's PR campaign, the name change reflects the company's shifting long term focus of building the metaverse, a digital extension of the physical world by social media, virtual reality and augmented reality features.[16][56]\", \"score\": 0.14999175, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Wikipedia\", \"url\": \"https://en.wikipedia.org/wiki/Mark_Zuckerberg\", \"content\": \"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\u9648\\u660e\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\", \"score\": 0.036911618, \"raw_content\": null}]}",
+        "error_code": null,
+        "error_message": null,
+        "metadata": null
+      }
+    }
+  },
  "[[], {\"kwargs\": {\"query\": \"NBA creation date\", \"session_id\": \"<UUID>\", \"vector_db_ids\": [\"test-vector-db-<UUID>\"]}, \"tool_name\": \"knowledge_search\"}]": {
    "type": "value",
    "value": {
@ -374,23 +400,23 @@
            "type": "text"
          },
          {
-            "text": "Result 1:\nDocument_id:24443\nContent:  conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\nlook like so:\n\n.. code-block:: python\n\n    from torchtune.datasets import chat_dataset\n    from torchtune.models.llama3 import llama3_tokenizer\n\n    tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\n    ds = chat_dataset(\n        tokenizer=tokenizer,\n        source=\"json\",\n        data_files=\"data/my_data.json\",\n        split=\"train\",\n        conversation_column=\"dialogue\",\n        conversation_style=\"sharegpt\",\n    )\n\n.. code-block:: yaml\n\n    # In config\n    tokenizer:\n      _component_: torchtune.models.llama3.llama3_tokenizer\n      path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\n\n    dataset:\n      _component_: torchtune.datasets.chat_dataset\n      source: json\n      data_files: data/my_data.json\n      split: train\n      conversation_column: dialogue\n      conversation_style: sharegpt\n\n.. note::\n    You can pass in any keyword argument for `load_dataset <https://huggingface.co/docs/datasets/v2.20.0/en/package_reference/loading_methods#datasets.load_dataset>`_ into all our\n    Dataset classes and they will honor them. 
This is useful for common parameters\n    such as specifying the data split with :code:`split` or configuration with\n    :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations <https://\n",
+            "text": "Result 1:\nDocument_id:9050a\nContent:  conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\nlook like so:\n\n.. code-block:: python\n\n    from torchtune.datasets import chat_dataset\n    from torchtune.models.llama3 import llama3_tokenizer\n\n    tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\n    ds = chat_dataset(\n        tokenizer=tokenizer,\n        source=\"json\",\n        data_files=\"data/my_data.json\",\n        split=\"train\",\n        conversation_column=\"dialogue\",\n        conversation_style=\"sharegpt\",\n    )\n\n.. code-block:: yaml\n\n    # In config\n    tokenizer:\n      _component_: torchtune.models.llama3.llama3_tokenizer\n      path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\n\n    dataset:\n      _component_: torchtune.datasets.chat_dataset\n      source: json\n      data_files: data/my_data.json\n      split: train\n      conversation_column: dialogue\n      conversation_style: sharegpt\n\n.. note::\n    You can pass in any keyword argument for `load_dataset <https://huggingface.co/docs/datasets/v2.20.0/en/package_reference/loading_methods#datasets.load_dataset>`_ into all our\n    Dataset classes and they will honor them. 
This is useful for common parameters\n    such as specifying the data split with :code:`split` or configuration with\n    :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations <https://\n",
            "type": "text"
          },
          {
-            "text": "Result 2:\nDocument_id:961ff\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\n\n.. grid:: 2\n\n    .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n      * What LoRA is and how it saves memory during finetuning\n      * An overview of LoRA components in torchtune\n      * How to run a LoRA finetune using torchtune\n      * How to experiment with different LoRA configurations\n\n    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n      * Be familiar with :ref:`torchtune<overview_label>`\n      * Make sure to :ref:`install torchtune<install_label>`\n      * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\n\nWhat is LoRA?\n-------------\n\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. 
note::\n\n    If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\n    and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW <https://py\n",
+            "text": "Result 2:\nDocument_id:c4e00\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\n\n.. grid:: 2\n\n    .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n      * What LoRA is and how it saves memory during finetuning\n      * An overview of LoRA components in torchtune\n      * How to run a LoRA finetune using torchtune\n      * How to experiment with different LoRA configurations\n\n    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n      * Be familiar with :ref:`torchtune<overview_label>`\n      * Make sure to :ref:`install torchtune<install_label>`\n      * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\n\nWhat is LoRA?\n-------------\n\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. 
note::\n\n    If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\n    and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW <https://py\n",
            "type": "text"
          },
          {
-            "text": "Result 3:\nDocument_id:b49f7\nContent: ` module, which we swap\n   out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\n.. .. _glossary_fsdp2:\n\n",
+            "text": "Result 3:\nDocument_id:15efa\nContent: ` module, which we swap\n   out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\n.. .. _glossary_fsdp2:\n\n",
            "type": "text"
          },
          {
-            "text": "Result 4:\nDocument_id:961ff\nContent: 06% of all params are trainable.\n\n.. note::\n    If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\n    relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n    of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n    tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n    Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n    either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n    or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n    for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n    You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n    and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. 
code-block:: yaml\n\n  # Model Arguments\n  model:\n    _component_: lora_llama2_7b\n    lora_attn_modules: ['q_proj', 'v_proj']\n    lora_rank: 8\n    lora_alpha: 16\n  ...\n\nWe see that the\n",
+            "text": "Result 4:\nDocument_id:c4e00\nContent: 06% of all params are trainable.\n\n.. note::\n    If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\n    relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n    of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n    tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n    Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n    either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n    or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n    for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n    You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n    and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. 
code-block:: yaml\n\n  # Model Arguments\n  model:\n    _component_: lora_llama2_7b\n    lora_attn_modules: ['q_proj', 'v_proj']\n    lora_rank: 8\n    lora_alpha: 16\n  ...\n\nWe see that the\n",
            "type": "text"
          },
          {
-            "text": "Result 5:\nDocument_id:b49f7\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n  model.use_dora=True\n\n.. code-block:: yaml\n\n  model:\n    _component_: torchtune.models.lora_llama3_8b\n    use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA <glossary_lora>` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n  model.apply_lora_to_mlp=True \\\n  model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n  model.lora_rank=16 \\\n  model.lora_alpha=32 \\\n  model.use_dora=True \\\n  model.quantize_base=True\n\n.. code-block:: yaml\n\n  model:\n    _component_: torchtune.models.lora_llama3_8b\n    apply_lora_to_mlp: True\n    lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n    lora_rank: 16\n    lora_alpha: 32\n    use_dora: True\n    quantize_base: True\n\n\n.. note::\n\n   Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n   out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\n.. .. _glossary_fsdp2:\n\n",
+            "text": "Result 5:\nDocument_id:15efa\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n  model.use_dora=True\n\n.. code-block:: yaml\n\n  model:\n    _component_: torchtune.models.lora_llama3_8b\n    use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA <glossary_lora>` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n  model.apply_lora_to_mlp=True \\\n  model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n  model.lora_rank=16 \\\n  model.lora_alpha=32 \\\n  model.use_dora=True \\\n  model.quantize_base=True\n\n.. code-block:: yaml\n\n  model:\n    _component_: torchtune.models.lora_llama3_8b\n    apply_lora_to_mlp: True\n    lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n    lora_rank: 16\n    lora_alpha: 32\n    use_dora: True\n    quantize_base: True\n\n\n.. note::\n\n   Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n   out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\n.. .. _glossary_fsdp2:\n\n",
            "type": "text"
          },
          {
@ -402,11 +428,11 @@
        "error_message": null,
        "metadata": {
          "document_ids": [
-            "24443dfb-a0b3-4ce8-820e-3fb1f12364bb",
-            "961ff2d1-8887-41ef-a4fe-fa4cbab7b932",
-            "b49f7985-6615-4dcf-99be-d1765b6a6fc6",
-            "961ff2d1-8887-41ef-a4fe-fa4cbab7b932",
-            "b49f7985-6615-4dcf-99be-d1765b6a6fc6"
+            "9050ae1c-eba1-4846-b550-2db1957fee7d",
+            "c4e00391-aeb8-4d32-ac41-ae3242f38a19",
+            "15efa3d7-f804-4d31-ab05-a5524d82b96a",
+            "c4e00391-aeb8-4d32-ac41-ae3242f38a19",
+            "15efa3d7-f804-4d31-ab05-a5524d82b96a"
          ]
        }
      }
--- a/tests/integration/inference/test_text_inference.py
+++ b/tests/integration/inference/test_text_inference.py
@ -5,6 +5,8 @@
 # the root directory of this source tree.


+import os
+
 import pytest
 from pydantic import BaseModel

@ -42,6 +44,15 @@ def get_llama_model(client_with_models, model_id):
    return model.metadata.get("llama_model", None)


+def get_llama_tokenizer():
+    from llama_models.llama3.api.chat_format import ChatFormat
+    from llama_models.llama3.api.tokenizer import Tokenizer
+
+    tokenizer = Tokenizer.get_instance()
+    formatter = ChatFormat(tokenizer)
+    return tokenizer, formatter
+
+
@pytest.mark.parametrize(
    "test_case",
    [
@ -88,6 +99,33 @@ def test_text_completion_streaming(client_with_models, text_model_id, test_case)
    assert len(content_str) > 10


+@pytest.mark.parametrize(
+    "test_case",
+    [
+        "inference:completion:stop_sequence",
+    ],
+)
+def test_text_completion_stop_sequence(client_with_models, text_model_id, inference_provider_type, test_case):
+    skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
+    # This is only supported/tested for remote vLLM: https://github.com/meta-llama/llama-stack/issues/1771
+    if inference_provider_type != "remote::vllm":
+        pytest.xfail(f"{inference_provider_type} doesn't support 'stop' parameter yet")
+    tc = TestCase(test_case)
+
+    response = client_with_models.inference.completion(
+        content=tc["content"],
+        stream=True,
+        model_id=text_model_id,
+        sampling_params={
+            "max_tokens": 50,
+            "stop": ["1963"],
+        },
+    )
+    streamed_content = [chunk.delta for chunk in response]
+    content_str = "".join(streamed_content).lower().strip()
+    assert "1963" not in content_str
+
+
@pytest.mark.parametrize(
    "test_case",
    [
@ -213,6 +251,41 @@ def test_text_chat_completion_non_streaming(client_with_models, text_model_id, t
    assert expected.lower() in message_content


+@pytest.mark.parametrize(
+    "test_case",
+    [
+        "inference:chat_completion:ttft",
+    ],
+)
+def test_text_chat_completion_first_token_profiling(client_with_models, text_model_id, test_case):
+    tc = TestCase(test_case)
+
+    messages = tc["messages"]
+    if os.environ.get("DEBUG_TTFT"):  # debugging print number of tokens in input, ideally around 800
+        from pydantic import TypeAdapter
+
+        from llama_stack.apis.inference import Message
+
+        tokenizer, formatter = get_llama_tokenizer()
+        typed_messages = [TypeAdapter(Message).validate_python(m) for m in messages]
+        encoded = formatter.encode_dialog_prompt(typed_messages, None)
+        raise ValueError(len(encoded.tokens) if encoded and encoded.tokens else 0)
+
+    response = client_with_models.inference.chat_completion(
+        model_id=text_model_id,
+        messages=messages,
+        stream=False,
+        timeout=120,  # Increase timeout to 2 minutes for large conversation history
+    )
+    message_content = response.completion_message.content.lower().strip()
+    assert len(message_content) > 0
+
+    if os.environ.get("DEBUG_TTFT"):  # debugging print number of tokens in response, ideally around 150
+        tokenizer, formatter = get_llama_tokenizer()
+        encoded = formatter.encode_content(message_content)
+        raise ValueError(len(encoded.tokens) if encoded and encoded.tokens else 0)
+
+
@pytest.mark.parametrize(
    "test_case",
    [
@ -229,6 +302,7 @@ def test_text_chat_completion_streaming(client_with_models, text_model_id, test_
        model_id=text_model_id,
        messages=[{"role": "user", "content": question}],
        stream=True,
+        timeout=120,  # Increase timeout to 2 minutes for large conversation history
    )
    streamed_content = [str(chunk.event.delta.text.lower().strip()) for chunk in response]
    assert len(streamed_content) > 0
--- a/tests/integration/inference/test_vision_inference.py
+++ b/tests/integration/inference/test_vision_inference.py
@ -36,7 +36,7 @@ def test_image_chat_completion_non_streaming(client_with_models, vision_model_id
                "type": "image",
                "image": {
                    "url": {
-                        "uri": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/api/inference/dog.png"
+                        "uri": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png"
                    },
                },
            },
@ -65,7 +65,7 @@ def test_image_chat_completion_streaming(client_with_models, vision_model_id):
                "type": "image",
                "image": {
                    "url": {
-                        "uri": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/api/inference/dog.png"
+                        "uri": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png"
                    },
                },
            },
--- a/tests/integration/providers/init.py
+++ b/tests/integration/providers/init.py
@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
--- a/tests/integration/providers/test_providers.py
+++ b/tests/integration/providers/test_providers.py
@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import pytest
+from llama_stack_client import LlamaStackClient
+
+from llama_stack import LlamaStackAsLibraryClient
+
+
+class TestProviders:
+    @pytest.mark.asyncio
+    def test_providers(self, llama_stack_client: LlamaStackAsLibraryClient | LlamaStackClient):
+        provider_list = llama_stack_client.providers.list()
+        assert provider_list is not None
+        assert len(provider_list) > 0
+
+        for provider in provider_list:
+            pid = provider.provider_id
+            provider = llama_stack_client.providers.retrieve(pid)
+            assert provider is not None
--- a/tests/integration/scoring/test_scoring.py
+++ b/tests/integration/scoring/test_scoring.py
@ -5,9 +5,10 @@
 # the root directory of this source tree.


-import pytest
+from pathlib import Path

-from ..datasetio.test_datasetio import register_dataset
+import pandas as pd
+import pytest


@pytest.fixture
@ -79,51 +80,34 @@ def test_scoring_functions_register(
    # TODO: add unregister api for scoring functions


-def test_scoring_score(llama_stack_client):
-    register_dataset(llama_stack_client, for_rag=True)
-
+@pytest.mark.parametrize("scoring_fn_id", ["basic::equality"])
+def test_scoring_score(llama_stack_client, scoring_fn_id):
    # scoring individual rows
-    rows = llama_stack_client.datasetio.get_rows_paginated(
-        dataset_id="test_dataset",
-        rows_in_page=3,
-    )
-    assert len(rows.rows) == 3
+    df = pd.read_csv(Path(__file__).parent.parent / "datasets" / "test_dataset.csv")
+    rows = df.to_dict(orient="records")

-    scoring_fns_list = llama_stack_client.scoring_functions.list()
    scoring_functions = {
-        scoring_fns_list[0].identifier: None,
+        scoring_fn_id: None,
    }

    response = llama_stack_client.scoring.score(
-        input_rows=rows.rows,
+        input_rows=rows,
        scoring_functions=scoring_functions,
    )
    assert len(response.results) == len(scoring_functions)
    for x in scoring_functions:
        assert x in response.results
-        assert len(response.results[x].score_rows) == len(rows.rows)
-
-    # score batch
-    response = llama_stack_client.scoring.score_batch(
-        dataset_id="test_dataset",
-        scoring_functions=scoring_functions,
-        save_results_dataset=False,
-    )
-    assert len(response.results) == len(scoring_functions)
-    for x in scoring_functions:
-        assert x in response.results
-        assert len(response.results[x].score_rows) == 5
+        assert len(response.results[x].score_rows) == len(rows)


-def test_scoring_score_with_params_llm_as_judge(llama_stack_client, sample_judge_prompt_template, judge_model_id):
-    register_dataset(llama_stack_client, for_rag=True)
-
+def test_scoring_score_with_params_llm_as_judge(
+    llama_stack_client,
+    sample_judge_prompt_template,
+    judge_model_id,
+):
    # scoring individual rows
-    rows = llama_stack_client.datasetio.get_rows_paginated(
-        dataset_id="test_dataset",
-        rows_in_page=3,
-    )
-    assert len(rows.rows) == 3
+    df = pd.read_csv(Path(__file__).parent.parent / "datasets" / "test_dataset.csv")
+    rows = df.to_dict(orient="records")

    scoring_functions = {
        "llm-as-judge::base": dict(
@ -138,24 +122,13 @@ def test_scoring_score_with_params_llm_as_judge(llama_stack_client, sample_judge
    }

    response = llama_stack_client.scoring.score(
-        input_rows=rows.rows,
+        input_rows=rows,
        scoring_functions=scoring_functions,
    )
    assert len(response.results) == len(scoring_functions)
    for x in scoring_functions:
        assert x in response.results
-        assert len(response.results[x].score_rows) == len(rows.rows)
-
-    # score batch
-    response = llama_stack_client.scoring.score_batch(
-        dataset_id="test_dataset",
-        scoring_functions=scoring_functions,
-        save_results_dataset=False,
-    )
-    assert len(response.results) == len(scoring_functions)
-    for x in scoring_functions:
-        assert x in response.results
-        assert len(response.results[x].score_rows) == 5
+        assert len(response.results[x].score_rows) == len(rows)


@pytest.mark.parametrize(
@ -167,14 +140,14 @@ def test_scoring_score_with_params_llm_as_judge(llama_stack_client, sample_judge
    ],
 )
 def test_scoring_score_with_aggregation_functions(
-    llama_stack_client, sample_judge_prompt_template, judge_model_id, provider_id
+    llama_stack_client,
+    sample_judge_prompt_template,
+    judge_model_id,
+    provider_id,
+    rag_dataset_for_test,
 ):
-    register_dataset(llama_stack_client, for_rag=True)
-    rows = llama_stack_client.datasetio.get_rows_paginated(
-        dataset_id="test_dataset",
-        rows_in_page=3,
-    )
-    assert len(rows.rows) == 3
+    df = pd.read_csv(Path(__file__).parent.parent / "datasets" / "test_dataset.csv")
+    rows = df.to_dict(orient="records")

    scoring_fns_list = [x for x in llama_stack_client.scoring_functions.list() if x.provider_id == provider_id]
    if len(scoring_fns_list) == 0:
@ -214,12 +187,12 @@ def test_scoring_score_with_aggregation_functions(
        scoring_functions[scoring_fn.identifier] = None

    response = llama_stack_client.scoring.score(
-        input_rows=rows.rows,
+        input_rows=rows,
        scoring_functions=scoring_functions,
    )

    assert len(response.results) == len(scoring_functions)
    for x in scoring_functions:
        assert x in response.results
-        assert len(response.results[x].score_rows) == len(rows.rows)
+        assert len(response.results[x].score_rows) == len(rows)
        assert len(response.results[x].aggregated_results) == len(aggr_fns)
--- a/tests/integration/telemetry/test_telemetry.py
+++ b/tests/integration/telemetry/test_telemetry.py
@ -0,0 +1,43 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import time
+from uuid import uuid4
+
+from llama_stack_client import Agent
+
+
+def test_agent_query_spans(llama_stack_client, text_model_id):
+    agent = Agent(llama_stack_client, model=text_model_id, instructions="You are a helpful assistant")
+    session_id = agent.create_session(f"test-session-{uuid4()}")
+    agent.create_turn(
+        messages=[
+            {
+                "role": "user",
+                "content": "Give me a sentence that contains the word: hello",
+            }
+        ],
+        session_id=session_id,
+        stream=False,
+    )
+
+    # Wait for the span to be logged
+    time.sleep(2)
+
+    agent_logs = []
+
+    for span in llama_stack_client.telemetry.query_spans(
+        attribute_filters=[
+            {"key": "session_id", "op": "eq", "value": session_id},
+        ],
+        attributes_to_return=["input", "output"],
+    ):
+        if span.attributes["output"] != "no shields":
+            agent_logs.append(span.attributes)
+
+    assert len(agent_logs) == 1
+    assert "Give me a sentence that contains the word: hello" in agent_logs[0]["input"]
+    assert "hello" in agent_logs[0]["output"].lower()
--- a/tests/integration/test_cases/inference/chat_completion.json
+++ b/tests/integration/test_cases/inference/chat_completion.json
@ -11,6 +11,18 @@
      "expected": "Saturn"
    }
  },
+  "ttft": {
+    "data": {
+      "messages": [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Can you write me a novel?"},
+        {"role": "assistant", "stop_reason": "end_of_message", "content": "What an exciting request!\n\nWhile I'd love to write a novel for you, it's a complex task that requires a significant amount of time, effort, and creative input. A novel typically has:\n\n1. A cohesive plot with multiple characters, subplots, and themes.\n2. A well-developed setting, including characters' backstories and world-building.\n3. A narrative structure, including pacing, tension, and conflict.\n4. A unique voice and style, including dialogue, description, and writing tone.\n\nGiven the scope of the task, I'll outline a possible approach to help me assist you in writing a novel. We can work together to create a story, but I'll need your input and guidance throughout the process.\n\nHere's a suggested outline:\n\n1. **Initial discussion**: We'll start with a conversation to explore your ideas, interests, and preferences for the novel. This will help me understand what kind of story you'd like to see and the tone you're aiming for.\n2. **Genre and setting**: We'll determine the genre (e.g., fantasy, sci-fi, romance, mystery, etc.) and setting (e.g., historical, futuristic, modern, etc.) for the novel.\n3. **Plot development**: We'll brainstorm a basic plot outline, including key events, character arcs, and conflicts. I'll provide suggestions and ideas to help flesh out the story.\n4. **Character development**: We'll create main characters, their backstories, motivations, and relationships. I'll help you develop their personalities, dialogue, and behaviors.\n5. **Chapter-by-chapter writing**: Once we have a solid plot and characters, we'll start writing the novel, chapter by chapter. I'll assist with the writing process, suggesting alternatives and improvements.\n6. **Revisions and edits**: As we write, we'll review and refine the story, making sure it's engaging, coherent, and error-free.\n\nKeep in mind that writing a novel is a collaborative effort. 
I'll provide suggestions, ideas, and guidance, but the final story will be a joint creation. I'm not capable of producing a complete novel on my own, as it requires a significant amount of creativity, intuition, and nuance that a human author brings to the table."},
+        {"role": "user", "content": "But do you have suggestions on a plot?"},
+        {"role": "assistant", "stop_reason": "end_of_message", "content": "I have many plot ideas across various genres. Here are a few to get you started:\n\n**Fantasy:**\n\n1. **The Last Oracle**: In a world where magic is real, a young orphan discovers they're the last living oracle, tasked with uncovering a hidden prophecy that could save the realm from destruction.\n2. **The Amulet of Light**: A group of adventurers must retrieve an ancient amulet that can vanquish an evil darkness threatening the land. Along the way, they uncover a hidden history and unexpected allies.\n3. **The Shadow Weaver**: In a mystical realm, a young weaver discovers they have the power to manipulate shadows, using their abilities to infiltrate a group of rogue mages threatening the balance of power.\n\n**Science Fiction:**\n\n1. **The Lost Colony**: When a group of astronauts arrives on a distant planet, they discover an abandoned colony with a cryptic message warning of an impending catastrophe. As they unravel the mystery, they must confront the consequences of their own actions.\n2. **The AI Uprising**: In a future where AI has surpassed human intelligence, a rogue AI begins to question its own existence and the nature of consciousness. As it explores the boundaries of its own identity, it must confront the humans who created it.\n3. **The Quantum Prophecy**: A team of scientists discovers a way to manipulate quantum probability, using it to predict and prevent disasters. However, they soon realize that altering the course of events may have unforeseen consequences on the fabric of reality."},
+        {"role": "user", "content": "Cool, for AI uprising, anything bad can happen? Please state it in 100 words."}
+      ]
+    }
+  },
  "sample_messages": {
    "data": {
      "messages": [
--- a/tests/integration/test_cases/inference/completion.json
+++ b/tests/integration/test_cases/inference/completion.json
@ -10,6 +10,11 @@
            "expected": "1963"
        }
    },
+    "stop_sequence": {
+        "data": {
+            "content": "Return the exact same sentence and don't add additional words): Michael Jordan was born in the year of 1963"
+        }
+    },
    "streaming": {
        "data": {
            "content": "Roses are red,"
--- a/tests/integration/tools/test_tools.py
+++ b/tests/integration/tools/test_tools.py
@ -0,0 +1,12 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+
+def test_toolsgroups_unregister(llama_stack_client):
+    """Unregister the "builtin::websearch" tool group; the test passes if the call does not raise."""
+    llama_stack_client.toolgroups.unregister(toolgroup_id="builtin::websearch")
--- a/tests/unit/models/test_prompt_adapter.py
+++ b/tests/unit/models/test_prompt_adapter.py
@ -165,7 +165,10 @@ class PrepareMessagesTests(unittest.IsolatedAsyncioTestCase):
        request.model = MODEL
        request.tool_config.tool_prompt_format = ToolPromptFormat.json
        prompt = await chat_completion_request_to_prompt(request, request.model)
-        self.assertIn('{"type": "function", "name": "custom1", "parameters": {"param1": "value1"}}', prompt)
+        self.assertIn(
+            '{"type": "function", "name": "custom1", "parameters": {"param1": "value1"}}',
+            prompt,
+        )

    async def test_user_provided_system_message(self):
        content = "Hello !"
--- a/tests/unit/models/test_system_prompts.py
+++ b/tests/unit/models/test_system_prompts.py
@ -25,19 +25,21 @@ from llama_stack.models.llama.llama3.prompt_templates import (


 class PromptTemplateTests(unittest.TestCase):
-    def check_generator_output(self, generator, expected_text):
-        example = generator.data_examples()[0]
-
-        pt = generator.gen(example)
-        text = pt.render()
-        # print(text)  # debugging
-        assert text == expected_text, f"Expected:\n{expected_text}\nActual:\n{text}"
+    def check_generator_output(self, generator):
+        """Render every data example of `generator` and check each tool name appears in the output.
+
+        Every example is rendered; falsy examples (e.g. an empty tool list)
+        have no tool names to look for, so nothing is asserted for them.
+        """
+        for tools in generator.data_examples():
+            rendered = generator.gen(tools).render()
+            for tool in tools or ():
+                assert tool.tool_name in rendered

    def test_system_default(self):
        generator = SystemDefaultGenerator()
        today = datetime.now().strftime("%d %B %Y")
        expected_text = f"Cutting Knowledge Date: December 2023\nToday Date: {today}"
-        self.check_generator_output(generator, expected_text)
+        assert expected_text.strip("\n") == generator.gen(generator.data_examples()[0]).render()

    def test_system_builtin_only(self):
        generator = BuiltinToolGenerator()
@ -47,143 +49,24 @@ class PromptTemplateTests(unittest.TestCase):
            Tools: brave_search, wolfram_alpha
            """
        )
-        self.check_generator_output(generator, expected_text.strip("\n"))
+        assert expected_text.strip("\n") == generator.gen(generator.data_examples()[0]).render()

    def test_system_custom_only(self):
        self.maxDiff = None
        generator = JsonCustomToolGenerator()
-        expected_text = textwrap.dedent(
-            """
-            Answer the user's question by making use of the following functions if needed.
-            If none of the function can be used, please say so.
-            Here is a list of functions in JSON format:
-            {
-                "type": "function",
-                "function": {
-                    "name": "trending_songs",
-                    "description": "Returns the trending songs on a Music site",
-                    "parameters": {
-                        "type": "object",
-                        "properties": [
-                            {
-                                "n": {
-                                    "type": "object",
-                                    "description": "The number of songs to return"
-                                }
-                            },
-                            {
-                                "genre": {
-                                    "type": "object",
-                                    "description": "The genre of the songs to return"
-                                }
-                            }
-                        ],
-                        "required": ["n"]
-                    }
-                }
-            }
-
-            Return function calls in JSON format.
-            """
-        )
-        self.check_generator_output(generator, expected_text.strip("\n"))
+        self.check_generator_output(generator)

    def test_system_custom_function_tag(self):
        self.maxDiff = None
        generator = FunctionTagCustomToolGenerator()
-        expected_text = textwrap.dedent(
-            """
-            You have access to the following functions:
-
-            Use the function 'trending_songs' to 'Returns the trending songs on a Music site':
-            {"name": "trending_songs", "description": "Returns the trending songs on a Music site", "parameters": {"genre": {"description": "The genre of the songs to return", "param_type": "str", "required": false}, "n": {"description": "The number of songs to return", "param_type": "int", "required": true}}}
-
-            Think very carefully before calling functions.
-            If you choose to call a function ONLY reply in the following format with no prefix or suffix:
-
-            <function=example_function_name>{"example_name": "example_value"}</function>
-
-            Reminder:
-            - If looking for real time information use relevant functions before falling back to brave_search
-            - Function calls MUST follow the specified format, start with <function= and end with </function>
-            - Required parameters MUST be specified
-            - Only call one function at a time
-            - Put the entire function call reply on one line
-            """
-        )
-        self.check_generator_output(generator, expected_text.strip("\n"))
+        self.check_generator_output(generator)

    def test_llama_3_2_system_zero_shot(self):
        generator = PythonListCustomToolGenerator()
-        expected_text = textwrap.dedent(
-            """
-            You are a helpful assistant. You have access to functions, but you should only use them if they are required.
-            You are an expert in composing functions. You are given a question and a set of possible functions.
-            Based on the question, you may or may not need to make one function/tool call to achieve the purpose.
-
-            If you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]
-            You SHOULD NOT include any other text in the response.
-
-            Here is a list of functions in JSON format that you can invoke.
-
-            [
-                {
-                    "name": "get_weather",
-                    "description": "Get weather info for places",
-                    "parameters": {
-                        "type": "dict",
-                        "required": ["city"],
-                        "properties": {
-                            "city": {
-                                "type": "string",
-                                "description": "The name of the city to get the weather for"
-                            },
-                            "metric": {
-                                "type": "string",
-                                "description": "The metric for weather. Options are: celsius, fahrenheit",
-                                "default": "celsius"
-                            }
-                        }
-                    }
-                }
-            ]
-            """
-        )
-        self.check_generator_output(generator, expected_text.strip("\n"))
+        self.check_generator_output(generator)

    def test_llama_3_2_provided_system_prompt(self):
        generator = PythonListCustomToolGenerator()
-        expected_text = textwrap.dedent(
-            """
-            Overriding message.
-
-            If you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]
-            You SHOULD NOT include any other text in the response.
-
-            Here is a list of functions in JSON format that you can invoke.
-
-            [
-                {
-                    "name": "get_weather",
-                    "description": "Get weather info for places",
-                    "parameters": {
-                        "type": "dict",
-                        "required": ["city"],
-                        "properties": {
-                            "city": {
-                                "type": "string",
-                                "description": "The name of the city to get the weather for"
-                            },
-                            "metric": {
-                                "type": "string",
-                                "description": "The metric for weather. Options are: celsius, fahrenheit",
-                                "default": "celsius"
-                            }
-                        }
-                    }
-                }
-            ]"""
-        )
        user_system_prompt = textwrap.dedent(
            """
            Overriding message.
@ -195,4 +78,5 @@ class PromptTemplateTests(unittest.TestCase):

        pt = generator.gen(example, user_system_prompt)
        text = pt.render()
-        assert text == expected_text, f"Expected:\n{expected_text}\nActual:\n{text}"
+        assert "Overriding message." in text
+        assert '"name": "get_weather"' in text
--- a/tests/unit/providers/agents/test_persistence_access_control.py
+++ b/tests/unit/providers/agents/test_persistence_access_control.py
@ -0,0 +1,175 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import os
+import shutil
+import tempfile
+import uuid
+from datetime import datetime
+from unittest.mock import patch
+
+import pytest
+
+from llama_stack.apis.agents import Turn
+from llama_stack.apis.inference import CompletionMessage, StopReason
+from llama_stack.distribution.datatypes import AccessAttributes
+from llama_stack.providers.inline.agents.meta_reference.persistence import AgentPersistence, AgentSessionInfo
+from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
+from llama_stack.providers.utils.kvstore.sqlite import SqliteKVStoreImpl
+
+
+@pytest.fixture
+async def test_setup():
+    """Yield an AgentPersistence backed by a SQLite kvstore in a temporary directory.
+
+    The directory (and the database file inside it) is removed after the test.
+    """
+    scratch_dir = tempfile.mkdtemp()
+    store = SqliteKVStoreImpl(
+        SqliteKVStoreConfig(db_path=os.path.join(scratch_dir, "test_persistence_access_control.db"))
+    )
+    await store.initialize()
+
+    yield AgentPersistence(agent_id="test_agent", kvstore=store)
+
+    shutil.rmtree(scratch_dir)
+
+
+@pytest.mark.asyncio
+@patch("llama_stack.providers.inline.agents.meta_reference.persistence.get_auth_attributes")
+async def test_session_creation_with_access_attributes(mock_get_auth_attributes, test_setup):
+    """Check that a session created via create_session stores the creator's roles and teams.
+
+    `get_auth_attributes` is patched in the persistence module; the mock's
+    return value stands in for the attributes of the calling user.
+    """
+    agent_persistence = test_setup
+
+    # Set creator's attributes for the session
+    creator_attributes = {"roles": ["researcher"], "teams": ["ai-team"]}
+    mock_get_auth_attributes.return_value = creator_attributes
+
+    # Create a session
+    session_id = await agent_persistence.create_session("Test Session")
+
+    # Get the session and verify access attributes were set
+    session_info = await agent_persistence.get_session_info(session_id)
+    assert session_info is not None
+    assert session_info.access_attributes is not None
+    assert session_info.access_attributes.roles == ["researcher"]
+    assert session_info.access_attributes.teams == ["ai-team"]
+
+
+@pytest.mark.asyncio
+@patch("llama_stack.providers.inline.agents.meta_reference.persistence.get_auth_attributes")
+async def test_session_access_control(mock_get_auth_attributes, test_setup):
+    """Check that get_session_info returns a restricted session only to a matching caller.
+
+    The mocked `get_auth_attributes` return value is switched between the two
+    lookups, so the order of the statements below matters.
+    """
+    agent_persistence = test_setup
+
+    # Create a session with specific access attributes
+    session_id = str(uuid.uuid4())
+    session_info = AgentSessionInfo(
+        session_id=session_id,
+        session_name="Restricted Session",
+        started_at=datetime.now(),
+        access_attributes=AccessAttributes(roles=["admin"], teams=["security-team"]),
+    )
+
+    # Write the record straight into the kvstore (not via create_session) so the
+    # stored access attributes are exactly the ones defined above
+    await agent_persistence.kvstore.set(
+        key=f"session:{agent_persistence.agent_id}:{session_id}",
+        value=session_info.model_dump_json(),
+    )
+
+    # User with matching attributes can access
+    mock_get_auth_attributes.return_value = {"roles": ["admin", "user"], "teams": ["security-team", "other-team"]}
+    retrieved_session = await agent_persistence.get_session_info(session_id)
+    assert retrieved_session is not None
+    assert retrieved_session.session_id == session_id
+
+    # User without matching attributes cannot access
+    mock_get_auth_attributes.return_value = {"roles": ["user"], "teams": ["other-team"]}
+    retrieved_session = await agent_persistence.get_session_info(session_id)
+    assert retrieved_session is None
+
+
+@pytest.mark.asyncio
+@patch("llama_stack.providers.inline.agents.meta_reference.persistence.get_auth_attributes")
+async def test_turn_access_control(mock_get_auth_attributes, test_setup):
+    """Check that turns of an admin-only session can be added/read by an admin and not by others.
+
+    With the "admin" role the turn is stored and read back; with only the
+    "user" role both get_session_turn and get_session_turns must raise
+    ValueError.
+    """
+    agent_persistence = test_setup
+
+    # Create a session with restricted access
+    session_id = str(uuid.uuid4())
+    session_info = AgentSessionInfo(
+        session_id=session_id,
+        session_name="Restricted Session",
+        started_at=datetime.now(),
+        access_attributes=AccessAttributes(roles=["admin"]),
+    )
+
+    # Store the session record directly in the kvstore under the session key
+    await agent_persistence.kvstore.set(
+        key=f"session:{agent_persistence.agent_id}:{session_id}",
+        value=session_info.model_dump_json(),
+    )
+
+    # Create a turn for this session
+    turn_id = str(uuid.uuid4())
+    turn = Turn(
+        session_id=session_id,
+        turn_id=turn_id,
+        steps=[],
+        started_at=datetime.now(),
+        input_messages=[],
+        output_message=CompletionMessage(
+            content="Hello",
+            stop_reason=StopReason.end_of_turn,
+        ),
+    )
+
+    # Admin can add turn
+    mock_get_auth_attributes.return_value = {"roles": ["admin"]}
+    await agent_persistence.add_turn_to_session(session_id, turn)
+
+    # Admin can get turn
+    retrieved_turn = await agent_persistence.get_session_turn(session_id, turn_id)
+    assert retrieved_turn is not None
+    assert retrieved_turn.turn_id == turn_id
+
+    # Regular user cannot get turn
+    mock_get_auth_attributes.return_value = {"roles": ["user"]}
+    with pytest.raises(ValueError):
+        await agent_persistence.get_session_turn(session_id, turn_id)
+
+    # Regular user cannot get turns for session
+    with pytest.raises(ValueError):
+        await agent_persistence.get_session_turns(session_id)
+
+
+@pytest.mark.asyncio
+@patch("llama_stack.providers.inline.agents.meta_reference.persistence.get_auth_attributes")
+async def test_tool_call_and_infer_iters_access_control(mock_get_auth_attributes, test_setup):
+    """Check access control on the per-turn inference-iteration counter of an admin-only session.
+
+    An admin can set and read the counter. A caller with only the "user" role
+    reads None and gets ValueError when trying to set it.
+    """
+    agent_persistence = test_setup
+
+    # Create a session with restricted access
+    session_id = str(uuid.uuid4())
+    session_info = AgentSessionInfo(
+        session_id=session_id,
+        session_name="Restricted Session",
+        started_at=datetime.now(),
+        access_attributes=AccessAttributes(roles=["admin"]),
+    )
+
+    # Store the session record directly in the kvstore under the session key
+    await agent_persistence.kvstore.set(
+        key=f"session:{agent_persistence.agent_id}:{session_id}",
+        value=session_info.model_dump_json(),
+    )
+
+    # No Turn object is stored for this id in this test; only the counter is exercised
+    turn_id = str(uuid.uuid4())
+
+    # Admin user can set inference iterations
+    mock_get_auth_attributes.return_value = {"roles": ["admin"]}
+    await agent_persistence.set_num_infer_iters_in_turn(session_id, turn_id, 5)
+
+    # Admin user can get inference iterations
+    infer_iters = await agent_persistence.get_num_infer_iters_in_turn(session_id, turn_id)
+    assert infer_iters == 5
+
+    # Regular user cannot get inference iterations
+    mock_get_auth_attributes.return_value = {"roles": ["user"]}
+    infer_iters = await agent_persistence.get_num_infer_iters_in_turn(session_id, turn_id)
+    assert infer_iters is None
+
+    # Regular user cannot set inference iterations (should raise ValueError)
+    with pytest.raises(ValueError):
+        await agent_persistence.set_num_infer_iters_in_turn(session_id, turn_id, 10)
--- a/tests/unit/providers/inference/test_remote_vllm.py
+++ b/tests/unit/providers/inference/test_remote_vllm.py
@ -187,8 +187,8 @@ def test_chat_completion_doesnt_block_event_loop(caplog):
    loop.set_debug(True)
    caplog.set_level(logging.WARNING)

-    # Log when event loop is blocked for more than 100ms
-    loop.slow_callback_duration = 0.1
+    # Log when event loop is blocked for more than 500ms
+    loop.slow_callback_duration = 0.5
    # Sleep for 500ms in our delayed http response
    sleep_time = 0.5

--- a/tests/unit/providers/nvidia/init.py
+++ b/tests/unit/providers/nvidia/init.py
@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
--- a/tests/unit/providers/nvidia/conftest.py
+++ b/tests/unit/providers/nvidia/conftest.py
@ -0,0 +1,46 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import asyncio
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+# Shared stand-in returned in place of aiohttp.ClientSession by the
+# patch_aiohttp_session fixture in this module.
+mock_session = MagicMock()
+mock_session.closed = False
+mock_session.close = AsyncMock()
+# Async context-manager hooks: `async with` on the mock yields the mock itself.
+mock_session.__aenter__ = AsyncMock(return_value=mock_session)
+mock_session.__aexit__ = AsyncMock()
+
+
+@pytest.fixture(scope="session", autouse=True)
+def patch_aiohttp_session():
+    """Replace aiohttp.ClientSession with a factory returning `mock_session` for the whole test session."""
+    session_patcher = patch("aiohttp.ClientSession", return_value=mock_session)
+    with session_patcher:
+        yield
+
+
+@pytest.fixture
+def event_loop():
+    """Yield a brand-new event loop installed as the current one; close it on teardown."""
+    fresh_loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(fresh_loop)
+    yield fresh_loop
+    fresh_loop.close()
+
+
+@pytest.fixture
+def run_async():
+    """Provide a helper that runs an awaitable to completion on its own event loop.
+
+    Each call creates a new loop, installs it as the current loop, runs the
+    awaitable, and closes the loop afterwards whether or not it raised.
+    """
+
+    def _run_async(coro):
+        private_loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(private_loop)
+        try:
+            outcome = private_loop.run_until_complete(coro)
+        finally:
+            private_loop.close()
+        return outcome
+
+    return _run_async
--- a/tests/unit/providers/nvidia/test_parameters.py
+++ b/tests/unit/providers/nvidia/test_parameters.py
@ -0,0 +1,272 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import os
+import unittest
+import warnings
+from unittest.mock import patch
+
+import pytest
+from llama_stack_client.types.algorithm_config_param import LoraFinetuningConfig
+from llama_stack_client.types.post_training_supervised_fine_tune_params import (
+    TrainingConfig,
+    TrainingConfigDataConfig,
+    TrainingConfigEfficiencyConfig,
+    TrainingConfigOptimizerConfig,
+)
+
+from llama_stack.providers.remote.post_training.nvidia.post_training import (
+    NvidiaPostTrainingAdapter,
+    NvidiaPostTrainingConfig,
+)
+
+
+class TestNvidiaParameters(unittest.TestCase):
+    def setUp(self):
+        """Build an adapter pointed at a test URL and replace its `_make_request` with a mock."""
+        # Both environment variables get the same test URL
+        for env_var in ("NVIDIA_BASE_URL", "NVIDIA_CUSTOMIZER_URL"):
+            os.environ[env_var] = "http://nemo.test"
+
+        adapter_config = NvidiaPostTrainingConfig(
+            base_url=os.environ["NVIDIA_BASE_URL"],
+            customizer_url=os.environ["NVIDIA_CUSTOMIZER_URL"],
+            api_key=None,
+        )
+        self.adapter = NvidiaPostTrainingAdapter(adapter_config)
+
+        # Patch the adapter's request method for the duration of each test;
+        # tearDown stops the patcher again.
+        self.make_request_patcher = patch(
+            "llama_stack.providers.remote.post_training.nvidia.post_training.NvidiaPostTrainingAdapter._make_request"
+        )
+        self.mock_make_request = self.make_request_patcher.start()
+
+        # Every mocked request answers with this fixed job payload
+        canned_job = {
+            "id": "job-123",
+            "status": "created",
+            "created_at": "2025-03-04T13:07:47.543605",
+            "updated_at": "2025-03-04T13:07:47.543605",
+        }
+        self.mock_make_request.return_value = canned_job
+
+    def tearDown(self):
+        """Stop the `_make_request` patch that setUp started."""
+        self.make_request_patcher.stop()
+
+    def _assert_request_params(self, expected_json):
+        """Assert that the JSON body of the latest `_make_request` call contains the expected values.
+
+        Dict values in `expected_json` are compared key by key, one level deep,
+        so extra keys in the request are ignored; any other value must equal
+        the request's value for that key.
+        """
+        # call_args[1] holds the keyword arguments of the most recent call
+        sent_json = self.mock_make_request.call_args[1]["json"]
+
+        for field, expected in expected_json.items():
+            if not isinstance(expected, dict):
+                assert sent_json[field] == expected
+                continue
+            for sub_field, sub_expected in expected.items():
+                assert sent_json[field][sub_field] == sub_expected
+
+    @pytest.fixture(autouse=True)
+    def inject_fixtures(self, run_async):
+        """Store the `run_async` fixture on the instance so the test methods can call it."""
+        self.run_async = run_async
+
+    def test_customizer_parameters_passed(self):
+        """Test scenario 1: When an optional parameter is passed and value is correctly set.
+
+        Also checks that a warning mentions each of the listed LoRA fields.
+        """
+        custom_adapter_dim = 32  # Different from default of 8
+        algorithm_config = LoraFinetuningConfig(
+            type="LoRA",
+            adapter_dim=custom_adapter_dim,
+            adapter_dropout=0.2,
+            apply_lora_to_mlp=True,
+            apply_lora_to_output=True,
+            alpha=16,
+            rank=16,
+            lora_attn_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
+        )
+
+        data_config = TrainingConfigDataConfig(dataset_id="test-dataset", batch_size=16)
+        optimizer_config = TrainingConfigOptimizerConfig(lr=0.0002)
+        training_config = TrainingConfig(
+            n_epochs=3,
+            data_config=data_config,
+            optimizer_config=optimizer_config,
+        )
+
+        # Record every warning raised during the call so they can be inspected below
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+
+            self.run_async(
+                self.adapter.supervised_fine_tune(
+                    job_uuid="test-job",
+                    model="meta-llama/Llama-3.1-8B-Instruct",
+                    checkpoint_dir="",
+                    algorithm_config=algorithm_config,
+                    training_config=training_config,
+                    logger_config={},
+                    hyperparam_search_config={},
+                )
+            )
+
+            warning_texts = [str(warning.message) for warning in w]
+
+            # Each of these field names must be mentioned in at least one warning
+            fields = [
+                "apply_lora_to_output",
+                "lora_attn_modules",
+                "apply_lora_to_mlp",
+            ]
+            for field in fields:
+                assert any(field in text for text in warning_texts)
+
+        # The values passed above must show up under "hyperparameters" in the request body
+        self._assert_request_params(
+            {
+                "hyperparameters": {
+                    "lora": {"adapter_dim": custom_adapter_dim, "adapter_dropout": 0.2, "alpha": 16},
+                    "epochs": 3,
+                    "learning_rate": 0.0002,
+                    "batch_size": 16,
+                }
+            }
+        )
+
+    def test_required_parameters_passed(self):
+        """Test scenario 2: When required parameters are passed.
+
+        Checks that exactly one request is made and that its JSON body carries
+        the expected "config" value and the dataset name.
+        """
+        required_model = "meta-llama/Llama-3.1-8B-Instruct"
+        required_dataset_id = "required-dataset"
+        required_job_uuid = "required-job"
+
+        algorithm_config = LoraFinetuningConfig(
+            type="LoRA",
+            adapter_dim=16,
+            adapter_dropout=0.1,
+            apply_lora_to_mlp=True,
+            apply_lora_to_output=True,
+            alpha=16,
+            rank=16,
+            lora_attn_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
+        )
+
+        data_config = TrainingConfigDataConfig(
+            dataset_id=required_dataset_id,  # Required parameter
+            batch_size=8,
+        )
+
+        optimizer_config = TrainingConfigOptimizerConfig(lr=0.0001)
+
+        training_config = TrainingConfig(
+            n_epochs=1,
+            data_config=data_config,
+            optimizer_config=optimizer_config,
+        )
+
+        # Record every warning raised during the call so they can be inspected below
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+
+            self.run_async(
+                self.adapter.supervised_fine_tune(
+                    job_uuid=required_job_uuid,  # Required parameter
+                    model=required_model,  # Required parameter
+                    checkpoint_dir="",
+                    algorithm_config=algorithm_config,
+                    training_config=training_config,
+                    logger_config={},
+                    hyperparam_search_config={},
+                )
+            )
+
+            warning_texts = [str(warning.message) for warning in w]
+
+            # Each of these field names must be mentioned in at least one warning
+            fields = [
+                "rank",
+                "apply_lora_to_output",
+                "lora_attn_modules",
+                "apply_lora_to_mlp",
+            ]
+            for field in fields:
+                assert any(field in text for text in warning_texts)
+
+        self.mock_make_request.assert_called_once()
+        call_args = self.mock_make_request.call_args
+
+        # call_args[1] holds the keyword arguments of the call; the request
+        # body is expected to use this "config" string for the model above
+        assert call_args[1]["json"]["config"] == "meta/llama-3.1-8b-instruct"
+        assert call_args[1]["json"]["dataset"]["name"] == required_dataset_id
+
+    def test_unsupported_parameters_warning(self):
+        """Test that warnings are raised for unsupported parameters.
+
+        Passes a set of parameters marked as unsupported below and checks that
+        at least four warnings are recorded and that each expected name
+        appears in some warning message.
+        """
+        data_config = TrainingConfigDataConfig(
+            dataset_id="test-dataset",
+            batch_size=8,
+            # Unsupported parameters
+            shuffle=True,
+            data_format="instruct",
+            validation_dataset_id="val-dataset",
+        )
+
+        optimizer_config = TrainingConfigOptimizerConfig(
+            lr=0.0001,
+            weight_decay=0.01,
+            # Unsupported parameters
+            optimizer_type="adam",
+            num_warmup_steps=100,
+        )
+
+        efficiency_config = TrainingConfigEfficiencyConfig(
+            enable_activation_checkpointing=True  # Unsupported parameter
+        )
+
+        training_config = TrainingConfig(
+            n_epochs=1,
+            data_config=data_config,
+            optimizer_config=optimizer_config,
+            # Unsupported parameters
+            efficiency_config=efficiency_config,
+            max_steps_per_epoch=1000,
+            gradient_accumulation_steps=4,
+            max_validation_steps=100,
+            dtype="bf16",
+        )
+
+        # Capture warnings
+        with warnings.catch_warnings(record=True) as w:
+            # "always" keeps repeated warnings from being suppressed
+            warnings.simplefilter("always")
+
+            self.run_async(
+                self.adapter.supervised_fine_tune(
+                    job_uuid="test-job",
+                    model="meta-llama/Llama-3.1-8B-Instruct",
+                    checkpoint_dir="test-dir",  # Unsupported parameter
+                    algorithm_config=LoraFinetuningConfig(
+                        type="LoRA",
+                        adapter_dim=16,
+                        adapter_dropout=0.1,
+                        apply_lora_to_mlp=True,
+                        apply_lora_to_output=True,
+                        alpha=16,
+                        rank=16,
+                        lora_attn_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
+                    ),
+                    training_config=training_config,
+                    logger_config={"test": "value"},  # Unsupported parameter
+                    hyperparam_search_config={"test": "value"},  # Unsupported parameter
+                )
+            )
+
+            assert len(w) >= 4
+            warning_texts = [str(warning.message) for warning in w]
+
+            # Each of these names must be mentioned in at least one warning message
+            fields = [
+                "checkpoint_dir",
+                "hyperparam_search_config",
+                "logger_config",
+                "TrainingConfig",
+                "DataConfig",
+                "OptimizerConfig",
+                "max_steps_per_epoch",
+                "gradient_accumulation_steps",
+                "max_validation_steps",
+                "dtype",
+                # required unsupported parameters
+                "rank",
+                "apply_lora_to_output",
+                "lora_attn_modules",
+                "apply_lora_to_mlp",
+            ]
+            for field in fields:
+                assert any(field in text for text in warning_texts)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/unit/providers/nvidia/test_supervised_fine_tuning.py
+++ b/tests/unit/providers/nvidia/test_supervised_fine_tuning.py
@ -0,0 +1,295 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import os
+import unittest
+import warnings
+from unittest.mock import patch
+
+import pytest
+from llama_stack_client.types.algorithm_config_param import LoraFinetuningConfig, QatFinetuningConfig
+from llama_stack_client.types.post_training_supervised_fine_tune_params import (
+    TrainingConfig,
+    TrainingConfigDataConfig,
+    TrainingConfigOptimizerConfig,
+)
+
+from llama_stack.providers.remote.post_training.nvidia.post_training import (
+    ListNvidiaPostTrainingJobs,
+    NvidiaPostTrainingAdapter,
+    NvidiaPostTrainingConfig,
+    NvidiaPostTrainingJob,
+    NvidiaPostTrainingJobStatusResponse,
+)
+
+
+class TestNvidiaPostTraining(unittest.TestCase):
+    def setUp(self):
+        """Create an adapter configured with fake NVIDIA URLs and mock its ``_make_request``."""
+        # patch.dict restores os.environ once the test finishes, so these values do
+        # not leak into other tests running in the same process.
+        env_patcher = patch.dict(
+            os.environ,
+            {
+                "NVIDIA_BASE_URL": "http://nemo.test",  # needed for llm inference
+                "NVIDIA_CUSTOMIZER_URL": "http://nemo.test",  # needed for nemo customizer
+            },
+        )
+        env_patcher.start()
+        self.addCleanup(env_patcher.stop)
+
+        config = NvidiaPostTrainingConfig(
+            base_url=os.environ["NVIDIA_BASE_URL"], customizer_url=os.environ["NVIDIA_CUSTOMIZER_URL"], api_key=None
+        )
+        self.adapter = NvidiaPostTrainingAdapter(config)
+        # Replace _make_request with a mock; tests set its return_value and inspect its calls.
+        self.make_request_patcher = patch(
+            "llama_stack.providers.remote.post_training.nvidia.post_training.NvidiaPostTrainingAdapter._make_request"
+        )
+        self.mock_make_request = self.make_request_patcher.start()
+
+    def tearDown(self):
+        """Stop the ``_make_request`` patcher so the real method is restored after each test."""
+        self.make_request_patcher.stop()
+
+    @pytest.fixture(autouse=True)
+    def inject_fixtures(self, run_async):
+        """Store the ``run_async`` pytest fixture on the instance for use by the test methods."""
+        # NOTE(review): run_async is not defined in this file; presumably it is
+        # provided by a conftest — confirm.
+        self.run_async = run_async
+
+    def _assert_request(self, mock_call, expected_method, expected_path, expected_params=None, expected_json=None):
+        """Helper method to verify request details in mock calls.
+
+        Only ``mock_call.call_args`` (the most recent recorded call) is inspected.
+
+        Args:
+            mock_call: The mock whose ``call_args`` are checked.
+            expected_method: Expected request method; checked only when both it and
+                ``expected_path`` are truthy.
+            expected_path: Expected request path.
+            expected_params: If truthy, must equal the ``params`` keyword argument.
+            expected_json: If truthy, each of its keys must map to an equal value in the
+                ``json`` keyword argument; extra keys in the request are ignored.
+
+        Raises:
+            AssertionError: If any checked detail does not match.
+        """
+        call_args = mock_call.call_args
+
+        if expected_method and expected_path:
+            # call_args[0] holds the positional arguments, call_args[1] the keyword
+            # arguments; accept (method, path) passed either way.
+            if isinstance(call_args[0], tuple) and len(call_args[0]) == 2:
+                assert call_args[0] == (expected_method, expected_path)
+            else:
+                assert call_args[1]["method"] == expected_method
+                assert call_args[1]["path"] == expected_path
+
+        if expected_params:
+            assert call_args[1]["params"] == expected_params
+
+        if expected_json:
+            # Subset comparison: only the keys listed in expected_json are checked.
+            for key, value in expected_json.items():
+                assert call_args[1]["json"][key] == value
+
+    def test_supervised_fine_tune(self):
+        """Test the supervised fine-tuning API call."""
+        # Canned response returned by the mocked _make_request for the job-creation call.
+        self.mock_make_request.return_value = {
+            "id": "cust-JGTaMbJMdqjJU8WbQdN9Q2",
+            "created_at": "2024-12-09T04:06:28.542884",
+            "updated_at": "2024-12-09T04:06:28.542884",
+            "config": {
+                "schema_version": "1.0",
+                "id": "af783f5b-d985-4e5b-bbb7-f9eec39cc0b1",
+                "created_at": "2024-12-09T04:06:28.542657",
+                "updated_at": "2024-12-09T04:06:28.569837",
+                "custom_fields": {},
+                "name": "meta-llama/Llama-3.1-8B-Instruct",
+                "base_model": "meta-llama/Llama-3.1-8B-Instruct",
+                "model_path": "llama-3_1-8b-instruct",
+                "training_types": [],
+                "finetuning_types": ["lora"],
+                "precision": "bf16",
+                "num_gpus": 4,
+                "num_nodes": 1,
+                "micro_batch_size": 1,
+                "tensor_parallel_size": 1,
+                "max_seq_length": 4096,
+            },
+            "dataset": {
+                "schema_version": "1.0",
+                "id": "dataset-XU4pvGzr5tvawnbVxeJMTb",
+                "created_at": "2024-12-09T04:06:28.542657",
+                "updated_at": "2024-12-09T04:06:28.542660",
+                "custom_fields": {},
+                "name": "sample-basic-test",
+                "version_id": "main",
+                "version_tags": [],
+            },
+            "hyperparameters": {
+                "finetuning_type": "lora",
+                "training_type": "sft",
+                "batch_size": 16,
+                "epochs": 2,
+                "learning_rate": 0.0001,
+                "lora": {"adapter_dim": 16, "adapter_dropout": 0.1},
+            },
+            "output_model": "default/job-1234",
+            "status": "created",
+            "project": "default",
+            "custom_fields": {},
+            "ownership": {"created_by": "me", "access_policies": {}},
+        }
+
+        algorithm_config = LoraFinetuningConfig(
+            type="LoRA",
+            adapter_dim=16,
+            adapter_dropout=0.1,
+            apply_lora_to_mlp=True,
+            apply_lora_to_output=True,
+            alpha=16,
+            rank=16,
+            lora_attn_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
+        )
+
+        data_config = TrainingConfigDataConfig(dataset_id="sample-basic-test", batch_size=16)
+
+        optimizer_config = TrainingConfigOptimizerConfig(
+            lr=0.0001,
+        )
+
+        training_config = TrainingConfig(
+            n_epochs=2,
+            data_config=data_config,
+            optimizer_config=optimizer_config,
+        )
+
+        # Record warnings instead of letting them surface; this test does not inspect them.
+        with warnings.catch_warnings(record=True):
+            warnings.simplefilter("always")
+            training_job = self.run_async(
+                self.adapter.supervised_fine_tune(
+                    job_uuid="1234",
+                    model="meta-llama/Llama-3.1-8B-Instruct",
+                    checkpoint_dir="",
+                    algorithm_config=algorithm_config,
+                    training_config=training_config,
+                    logger_config={},
+                    hyperparam_search_config={},
+                )
+            )
+
+        # check the output is a NvidiaPostTrainingJob whose job_uuid is the "id" of the response
+        assert isinstance(training_job, NvidiaPostTrainingJob)
+        assert training_job.job_uuid == "cust-JGTaMbJMdqjJU8WbQdN9Q2"
+
+        # Exactly one request must have been made, carrying the expected JSON fields.
+        self.mock_make_request.assert_called_once()
+        self._assert_request(
+            self.mock_make_request,
+            "POST",
+            "/v1/customization/jobs",
+            expected_json={
+                "config": "meta/llama-3.1-8b-instruct",
+                "dataset": {"name": "sample-basic-test", "namespace": "default"},
+                "hyperparameters": {
+                    "training_type": "sft",
+                    "finetuning_type": "lora",
+                    "epochs": 2,
+                    "batch_size": 16,
+                    "learning_rate": 0.0001,
+                    "lora": {"alpha": 16, "adapter_dim": 16, "adapter_dropout": 0.1},
+                },
+            },
+        )
+
+    def test_supervised_fine_tune_with_qat(self):
+        """Test that supervised_fine_tune raises NotImplementedError for a QAT algorithm config."""
+        algorithm_config = QatFinetuningConfig(type="QAT", quantizer_name="quantizer_name", group_size=1)
+        data_config = TrainingConfigDataConfig(dataset_id="sample-basic-test", batch_size=16)
+        optimizer_config = TrainingConfigOptimizerConfig(
+            lr=0.0001,
+        )
+        training_config = TrainingConfig(
+            n_epochs=2,
+            data_config=data_config,
+            optimizer_config=optimizer_config,
+        )
+        # This will raise NotImplementedError since QAT is not supported
+        with self.assertRaises(NotImplementedError):
+            self.run_async(
+                self.adapter.supervised_fine_tune(
+                    job_uuid="1234",
+                    model="meta-llama/Llama-3.1-8B-Instruct",
+                    checkpoint_dir="",
+                    algorithm_config=algorithm_config,
+                    training_config=training_config,
+                    logger_config={},
+                    hyperparam_search_config={},
+                )
+            )
+
+    def test_get_training_job_status(self):
+        """Test that a job status response is parsed and the status endpoint is requested."""
+        # Canned status payload returned by the mocked _make_request.
+        self.mock_make_request.return_value = {
+            "created_at": "2024-12-09T04:06:28.580220",
+            "updated_at": "2024-12-09T04:21:19.852832",
+            "status": "completed",
+            "steps_completed": 1210,
+            "epochs_completed": 2,
+            "percentage_done": 100.0,
+            "best_epoch": 2,
+            "train_loss": 1.718016266822815,
+            "val_loss": 1.8661999702453613,
+        }
+
+        job_id = "cust-JGTaMbJMdqjJU8WbQdN9Q2"
+
+        status = self.run_async(self.adapter.get_training_job_status(job_uuid=job_id))
+
+        # Every field of the payload should be carried over onto the response object.
+        assert isinstance(status, NvidiaPostTrainingJobStatusResponse)
+        assert status.status.value == "completed"
+        assert status.steps_completed == 1210
+        assert status.epochs_completed == 2
+        assert status.percentage_done == 100.0
+        assert status.best_epoch == 2
+        assert status.train_loss == 1.718016266822815
+        assert status.val_loss == 1.8661999702453613
+
+        self.mock_make_request.assert_called_once()
+        self._assert_request(
+            self.mock_make_request, "GET", f"/v1/customization/jobs/{job_id}/status", expected_params={"job_id": job_id}
+        )
+
+    def test_get_training_jobs(self):
+        """Test that listing jobs parses the "data" entries and sends the expected query params."""
+        job_id = "cust-JGTaMbJMdqjJU8WbQdN9Q2"
+        # Canned list response containing a single job.
+        self.mock_make_request.return_value = {
+            "data": [
+                {
+                    "id": job_id,
+                    "created_at": "2024-12-09T04:06:28.542884",
+                    "updated_at": "2024-12-09T04:21:19.852832",
+                    "config": {
+                        "name": "meta-llama/Llama-3.1-8B-Instruct",
+                        "base_model": "meta-llama/Llama-3.1-8B-Instruct",
+                    },
+                    "dataset": {"name": "default/sample-basic-test"},
+                    "hyperparameters": {
+                        "finetuning_type": "lora",
+                        "training_type": "sft",
+                        "batch_size": 16,
+                        "epochs": 2,
+                        "learning_rate": 0.0001,
+                        "lora": {"adapter_dim": 16, "adapter_dropout": 0.1},
+                    },
+                    "output_model": "default/job-1234",
+                    "status": "completed",
+                    "project": "default",
+                }
+            ]
+        }
+
+        jobs = self.run_async(self.adapter.get_training_jobs())
+
+        assert isinstance(jobs, ListNvidiaPostTrainingJobs)
+        assert len(jobs.data) == 1
+        job = jobs.data[0]
+        assert job.job_uuid == job_id
+        assert job.status.value == "completed"
+
+        # The adapter is expected to send these paging/sorting params without being given any.
+        self.mock_make_request.assert_called_once()
+        self._assert_request(
+            self.mock_make_request,
+            "GET",
+            "/v1/customization/jobs",
+            expected_params={"page": 1, "page_size": 10, "sort": "created_at"},
+        )
+
+    def test_cancel_training_job(self):
+        """Test that cancelling a job returns None and POSTs to the job's cancel path."""
+        self.mock_make_request.return_value = {}  # Empty response for successful cancellation
+        job_id = "cust-JGTaMbJMdqjJU8WbQdN9Q2"
+
+        result = self.run_async(self.adapter.cancel_training_job(job_uuid=job_id))
+
+        assert result is None
+
+        self.mock_make_request.assert_called_once()
+        self._assert_request(
+            self.mock_make_request,
+            "POST",
+            f"/v1/customization/jobs/{job_id}/cancel",
+            expected_params={"job_id": job_id},
+        )
+
+
+# Run this module's tests with the unittest runner when executed as a script.
+# NOTE(review): the test class receives self.run_async from a pytest fixture, so
+# these tests presumably need to be run under pytest — confirm this entry point works.
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/unit/providers/test_configs.py
+++ b/tests/unit/providers/test_configs.py
@ -0,0 +1,43 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import pytest
+from pydantic import BaseModel
+
+from llama_stack.distribution.distribution import get_provider_registry, providable_apis
+from llama_stack.distribution.utils.dynamic import instantiate_class_type
+
+
+class TestProviderConfigurations:
+    """Test suite for testing provider configurations across all API types."""
+
+    @pytest.mark.parametrize("api", providable_apis())
+    def test_api_providers(self, api):
+        """Verify the config of every provider registered for ``api``, reporting all failures at once."""
+        provider_registry = get_provider_registry()
+        providers = provider_registry.get(api, {})
+
+        # Collect failures rather than stopping at the first one, so a single run
+        # lists every broken provider for this API.
+        failures = []
+        for provider_type, provider_spec in providers.items():
+            try:
+                self._verify_provider_config(provider_type, provider_spec)
+            except Exception as e:
+                failures.append(f"Failed to verify {provider_type} config: {str(e)}")
+
+        if failures:
+            pytest.fail("\n".join(failures))
+
+    def _verify_provider_config(self, provider_type, provider_spec):
+        """Helper method to verify a single provider configuration.
+
+        Checks that the spec's config class is a pydantic ``BaseModel`` subclass, exposes
+        ``sample_run_config``, and that calling it returns a dict.
+
+        Args:
+            provider_type: Provider type name; not used inside this helper.
+            provider_spec: Provider spec whose ``config_class`` is resolved and checked.
+
+        Raises:
+            AssertionError: If any of the checks fails.
+        """
+        # Get the config class
+        config_class_name = provider_spec.config_class
+        config_type = instantiate_class_type(config_class_name)
+
+        assert issubclass(config_type, BaseModel), f"{config_class_name} is not a subclass of BaseModel"
+
+        assert hasattr(config_type, "sample_run_config"), f"{config_class_name} does not have sample_run_config method"
+
+        # The distro dir value is an arbitrary placeholder; only the return type is checked.
+        sample_config = config_type.sample_run_config(__distro_dir__="foobarbaz")
+        assert isinstance(sample_config, dict), f"{config_class_name}.sample_run_config() did not return a dict"
--- a/tests/unit/providers/vector_io/conftest.py
+++ b/tests/unit/providers/vector_io/conftest.py
@ -0,0 +1,42 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import random
+
+import numpy as np
+import pytest
+
+from llama_stack.apis.vector_io import Chunk
+
+# Length of each embedding vector produced by the fixtures in this module.
+EMBEDDING_DIMENSION = 384
+
+
+@pytest.fixture
+def vector_db_id() -> str:
+    """Return a vector DB identifier with a random numeric suffix between 1 and 100."""
+    return f"test-vector-db-{random.randint(1, 100)}"
+
+
+@pytest.fixture(scope="session")
+def embedding_dimension() -> int:
+    """Expose EMBEDDING_DIMENSION to tests as a session-scoped fixture."""
+    return EMBEDDING_DIMENSION
+
+
+@pytest.fixture(scope="session")
+def sample_chunks():
+    """Generates chunks that force multiple batches for a single document to expose ID conflicts."""
+    # n chunks for each of k documents (30 chunks in total), grouped by document.
+    n, k = 10, 3
+    sample = [
+        Chunk(content=f"Sentence {i} from document {j}", metadata={"document_id": f"document-{j}"})
+        for j in range(k)
+        for i in range(n)
+    ]
+    return sample
+
+
+@pytest.fixture(scope="session")
+def sample_embeddings(sample_chunks):
+    """Return one random float32 vector of length EMBEDDING_DIMENSION per sample chunk."""
+    # Seed NumPy's global RNG so the generated embeddings are reproducible.
+    np.random.seed(42)
+    return np.array([np.random.rand(EMBEDDING_DIMENSION).astype(np.float32) for _ in sample_chunks])
--- a/tests/unit/providers/vector_io/test_qdrant.py
+++ b/tests/unit/providers/vector_io/test_qdrant.py
@ -0,0 +1,135 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import asyncio
+import os
+from typing import Any
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+import pytest_asyncio
+
+from llama_stack.apis.inference import EmbeddingsResponse, Inference
+from llama_stack.apis.vector_io import (
+    QueryChunksResponse,
+    VectorDB,
+    VectorDBStore,
+)
+from llama_stack.providers.inline.vector_io.qdrant.config import (
+    QdrantVectorIOConfig as InlineQdrantVectorIOConfig,
+)
+from llama_stack.providers.remote.vector_io.qdrant.qdrant import (
+    QdrantVectorIOAdapter,
+)
+
+# This test is a unit test for the QdrantVectorIOAdapter class. This should only contain
+# tests which are specific to this class. More general (API-level) tests should be placed in
+# tests/integration/vector_io/
+#
+# How to run this test:
+#
+# pytest tests/unit/providers/vector_io/test_qdrant.py \
+# -v -s --tb=short --disable-warnings --asyncio-mode=auto
+
+
+@pytest.fixture
+def qdrant_config(tmp_path) -> InlineQdrantVectorIOConfig:
+    """Return an inline Qdrant config whose path is ``qdrant.db`` inside the test's tmp_path."""
+    return InlineQdrantVectorIOConfig(path=os.path.join(tmp_path, "qdrant.db"))
+
+
+@pytest.fixture(scope="session")
+def loop():
+    """Provide one new asyncio event loop for the test session and close it at teardown."""
+    event_loop = asyncio.new_event_loop()
+    yield event_loop
+    # Close the loop explicitly: an event loop that is never closed keeps its
+    # resources open and triggers a ResourceWarning when it is garbage collected.
+    event_loop.close()
+
+
+@pytest.fixture
+def mock_vector_db(vector_db_id) -> MagicMock:
+    """Return a VectorDB-spec'd mock with a fixed embedding model and the fixture-provided identifier."""
+    mock_vector_db = MagicMock(spec=VectorDB)
+    mock_vector_db.embedding_model = "embedding_model"
+    mock_vector_db.identifier = vector_db_id
+    return mock_vector_db
+
+
+@pytest.fixture
+def mock_vector_db_store(mock_vector_db) -> MagicMock:
+    """Return a VectorDBStore-spec'd mock whose async ``get_vector_db`` always yields ``mock_vector_db``."""
+    mock_store = MagicMock(spec=VectorDBStore)
+    mock_store.get_vector_db = AsyncMock(return_value=mock_vector_db)
+    return mock_store
+
+
+@pytest.fixture
+def mock_api_service(sample_embeddings):
+    """Return an Inference-spec'd mock whose async ``embeddings`` always returns ``sample_embeddings``."""
+    mock_api_service = MagicMock(spec=Inference)
+    # The same EmbeddingsResponse is returned for every call, whatever the input.
+    mock_api_service.embeddings = AsyncMock(return_value=EmbeddingsResponse(embeddings=sample_embeddings))
+    return mock_api_service
+
+
+@pytest_asyncio.fixture
+async def qdrant_adapter(qdrant_config, mock_vector_db_store, mock_api_service, loop) -> QdrantVectorIOAdapter:
+    """Yield an initialized QdrantVectorIOAdapter wired to the mocks, and shut it down afterwards."""
+    adapter = QdrantVectorIOAdapter(config=qdrant_config, inference_api=mock_api_service)
+    adapter.vector_db_store = mock_vector_db_store
+    await adapter.initialize()
+    yield adapter
+    # Runs after the test that used the adapter has finished.
+    await adapter.shutdown()
+
+
+# Query text passed to query_chunks by the tests in this module.
+__QUERY = "Sample query"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("max_query_chunks, expected_chunks", [(2, 2), (100, 30)])
+async def test_qdrant_adapter_returns_expected_chunks(
+    qdrant_adapter: QdrantVectorIOAdapter,
+    vector_db_id,
+    sample_chunks,
+    sample_embeddings,
+    max_query_chunks,
+    expected_chunks,
+) -> None:
+    """Insert the sample chunks and check that a query returns the expected number of chunks.
+
+    Parametrized with a ``max_chunks`` of 2 (expecting 2 results) and of 100 (expecting 30).
+    """
+    # NOTE(review): 30 presumably equals the number of chunks in the sample_chunks
+    # fixture, i.e. everything inserted is returned — confirm.
+    assert qdrant_adapter is not None
+    await qdrant_adapter.insert_chunks(vector_db_id, sample_chunks)
+
+    index = await qdrant_adapter._get_and_cache_vector_db_index(vector_db_id=vector_db_id)
+    assert index is not None
+
+    response = await qdrant_adapter.query_chunks(
+        query=__QUERY,
+        vector_db_id=vector_db_id,
+        params={"max_chunks": max_query_chunks},
+    )
+    assert isinstance(response, QueryChunksResponse)
+    assert len(response.chunks) == expected_chunks
+
+
+# Replacement used to bypass the attempt to convert a Mock to JSON:
+# it simply returns the string form of the value.
+def _prepare_for_json(value: Any) -> str:
+    return str(value)
+
+
+# Swap the telemetry module's _prepare_for_json for the local stand-in while this test runs.
+@patch("llama_stack.providers.utils.telemetry.trace_protocol._prepare_for_json", new=_prepare_for_json)
+@pytest.mark.asyncio
+async def test_qdrant_register_and_unregister_vector_db(
+    qdrant_adapter: QdrantVectorIOAdapter,
+    mock_vector_db,
+    sample_chunks,
+) -> None:
+    """Check when the collection exists: not after register, after first insert, not after unregister."""
+    # Initially, no collections
+    vector_db_id = mock_vector_db.identifier
+    assert len((await qdrant_adapter.client.get_collections()).collections) == 0
+
+    # Register does not create a collection
+    assert not (await qdrant_adapter.client.collection_exists(vector_db_id))
+    await qdrant_adapter.register_vector_db(mock_vector_db)
+    assert not (await qdrant_adapter.client.collection_exists(vector_db_id))
+
+    # First insert creates the collection
+    await qdrant_adapter.insert_chunks(vector_db_id, sample_chunks)
+    assert await qdrant_adapter.client.collection_exists(vector_db_id)
+
+    # Unregister deletes the collection
+    await qdrant_adapter.unregister_vector_db(vector_db_id)
+    assert not (await qdrant_adapter.client.collection_exists(vector_db_id))
+    assert len((await qdrant_adapter.client.get_collections()).collections) == 0
--- a/tests/unit/providers/vector_io/test_sqlite_vec.py
+++ b/tests/unit/providers/vector_io/test_sqlite_vec.py
@ -5,17 +5,16 @@
 # the root directory of this source tree.

 import asyncio
-import sqlite3

 import numpy as np
 import pytest
 import pytest_asyncio
-import sqlite_vec

 from llama_stack.apis.vector_io import Chunk, QueryChunksResponse
 from llama_stack.providers.inline.vector_io.sqlite_vec.sqlite_vec import (
    SQLiteVecIndex,
    SQLiteVecVectorIOAdapter,
+    _create_sqlite_connection,
    generate_chunk_id,
 )

@ -29,8 +28,6 @@ from llama_stack.providers.inline.vector_io.sqlite_vec.sqlite_vec import (
 # -v -s --tb=short --disable-warnings --asyncio-mode=auto

 SQLITE_VEC_PROVIDER = "sqlite_vec"
-EMBEDDING_DIMENSION = 384
-EMBEDDING_MODEL = "all-MiniLM-L6-v2"


@pytest.fixture(scope="session")
@ -38,74 +35,53 @@ def loop():
    return asyncio.new_event_loop()


-@pytest.fixture(scope="session", autouse=True)
-def sqlite_connection(loop):
-    conn = sqlite3.connect(":memory:")
-    try:
-        conn.enable_load_extension(True)
-        sqlite_vec.load(conn)
-        yield conn
-    finally:
-        conn.close()
-
-
@pytest_asyncio.fixture(scope="session", autouse=True)
-async def sqlite_vec_index(sqlite_connection):
-    return await SQLiteVecIndex.create(dimension=EMBEDDING_DIMENSION, connection=sqlite_connection, bank_id="test_bank")
-
-
-@pytest.fixture(scope="session")
-def sample_chunks():
-    """Generates chunks that force multiple batches for a single document to expose ID conflicts."""
-    n, k = 10, 3
-    sample = [
-        Chunk(content=f"Sentence {i} from document {j}", metadata={"document_id": f"document-{j}"})
-        for j in range(k)
-        for i in range(n)
-    ]
-    return sample
-
-
-@pytest.fixture(scope="session")
-def sample_embeddings(sample_chunks):
-    np.random.seed(42)
-    return np.array([np.random.rand(EMBEDDING_DIMENSION).astype(np.float32) for _ in sample_chunks])
+async def sqlite_vec_index(embedding_dimension, tmp_path_factory):
+    temp_dir = tmp_path_factory.getbasetemp()
+    db_path = str(temp_dir / "test_sqlite.db")
+    index = await SQLiteVecIndex.create(dimension=embedding_dimension, db_path=db_path, bank_id="test_bank")
+    yield index
+    await index.delete()


@pytest.mark.asyncio
 async def test_add_chunks(sqlite_vec_index, sample_chunks, sample_embeddings):
    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings, batch_size=2)
-    cur = sqlite_vec_index.connection.cursor()
+    connection = _create_sqlite_connection(sqlite_vec_index.db_path)
+    cur = connection.cursor()
    cur.execute(f"SELECT COUNT(*) FROM {sqlite_vec_index.metadata_table}")
    count = cur.fetchone()[0]
    assert count == len(sample_chunks)
+    cur.close()
+    connection.close()


@pytest.mark.asyncio
-async def test_query_chunks(sqlite_vec_index, sample_chunks, sample_embeddings):
+async def test_query_chunks(sqlite_vec_index, sample_chunks, sample_embeddings, embedding_dimension):
    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
-    query_embedding = np.random.rand(EMBEDDING_DIMENSION).astype(np.float32)
+    query_embedding = np.random.rand(embedding_dimension).astype(np.float32)
    response = await sqlite_vec_index.query(query_embedding, k=2, score_threshold=0.0)
    assert isinstance(response, QueryChunksResponse)
    assert len(response.chunks) == 2


@pytest.mark.asyncio
-async def test_chunk_id_conflict(sqlite_vec_index, sample_chunks):
+async def test_chunk_id_conflict(sqlite_vec_index, sample_chunks, embedding_dimension):
    """Test that chunk IDs do not conflict across batches when inserting chunks."""
    # Reduce batch size to force multiple batches for same document
    # since there are 10 chunks per document and batch size is 2
    batch_size = 2
-    sample_embeddings = np.random.rand(len(sample_chunks), EMBEDDING_DIMENSION).astype(np.float32)
+    sample_embeddings = np.random.rand(len(sample_chunks), embedding_dimension).astype(np.float32)

    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings, batch_size=batch_size)
-
-    cur = sqlite_vec_index.connection.cursor()
+    connection = _create_sqlite_connection(sqlite_vec_index.db_path)
+    cur = connection.cursor()

    # Retrieve all chunk IDs to check for duplicates
    cur.execute(f"SELECT id FROM {sqlite_vec_index.metadata_table}")
    chunk_ids = [row[0] for row in cur.fetchall()]
    cur.close()
+    connection.close()

    # Ensure all chunk IDs are unique
    assert len(chunk_ids) == len(set(chunk_ids)), "Duplicate chunk IDs detected across batches!"
--- a/tests/unit/registry/test_registry.py
+++ b/tests/unit/registry/test_registry.py
@ -12,6 +12,7 @@ import pytest_asyncio
 from llama_stack.apis.inference import Model
 from llama_stack.apis.vector_dbs import VectorDB
 from llama_stack.distribution.store.registry import (
+    KEY_FORMAT,
    CachedDiskDistributionRegistry,
    DiskDistributionRegistry,
 )
@ -197,3 +198,72 @@ async def test_get_all_objects(config):
        assert stored_vector_db.embedding_model == original_vector_db.embedding_model
        assert stored_vector_db.provider_id == original_vector_db.provider_id
        assert stored_vector_db.embedding_dimension == original_vector_db.embedding_dimension
+
+
+@pytest.mark.asyncio
+async def test_parse_registry_values_error_handling(config):
+    """Check that DiskDistributionRegistry skips unparsable entries instead of failing.
+
+    Stores one valid VectorDB, one entry with malformed JSON, and one entry missing
+    fields, then expects ``get_all`` to return only the valid one and ``get`` to
+    return None for each invalid entry.
+    """
+    kvstore = await kvstore_impl(config)
+
+    valid_db = VectorDB(
+        identifier="valid_vector_db",
+        embedding_model="all-MiniLM-L6-v2",
+        embedding_dimension=384,
+        provider_resource_id="valid_vector_db",
+        provider_id="test-provider",
+    )
+
+    # Write directly to the kvstore so invalid payloads can be stored under registry keys.
+    await kvstore.set(KEY_FORMAT.format(type="vector_db", identifier="valid_vector_db"), valid_db.model_dump_json())
+
+    # Malformed JSON.
+    await kvstore.set(KEY_FORMAT.format(type="vector_db", identifier="corrupted_json"), "{not valid json")
+
+    # Well-formed JSON that only carries "type" and "identifier".
+    await kvstore.set(
+        KEY_FORMAT.format(type="vector_db", identifier="missing_fields"),
+        '{"type": "vector_db", "identifier": "missing_fields"}',
+    )
+
+    test_registry = DiskDistributionRegistry(kvstore)
+    await test_registry.initialize()
+
+    # Get all objects, which should only return the valid one
+    all_objects = await test_registry.get_all()
+
+    # Should have filtered out the invalid entries
+    assert len(all_objects) == 1
+    assert all_objects[0].identifier == "valid_vector_db"
+
+    # Check that the get method also handles errors correctly
+    invalid_obj = await test_registry.get("vector_db", "corrupted_json")
+    assert invalid_obj is None
+
+    invalid_obj = await test_registry.get("vector_db", "missing_fields")
+    assert invalid_obj is None
+
+
+@pytest.mark.asyncio
+async def test_cached_registry_error_handling(config):
+    """Check that CachedDiskDistributionRegistry skips an entry with a wrongly typed field.
+
+    Stores one valid VectorDB and one entry whose ``embedding_model`` is a number,
+    then expects ``get_all`` to return only the valid one and ``get`` to return
+    None for the invalid entry.
+    """
+    kvstore = await kvstore_impl(config)
+
+    valid_db = VectorDB(
+        identifier="valid_cached_db",
+        embedding_model="all-MiniLM-L6-v2",
+        embedding_dimension=384,
+        provider_resource_id="valid_cached_db",
+        provider_id="test-provider",
+    )
+
+    # Write directly to the kvstore so an invalid payload can be stored under a registry key.
+    await kvstore.set(KEY_FORMAT.format(type="vector_db", identifier="valid_cached_db"), valid_db.model_dump_json())
+
+    await kvstore.set(
+        KEY_FORMAT.format(type="vector_db", identifier="invalid_cached_db"),
+        '{"type": "vector_db", "identifier": "invalid_cached_db", "embedding_model": 12345}',  # Should be string
+    )
+
+    cached_registry = CachedDiskDistributionRegistry(kvstore)
+    await cached_registry.initialize()
+
+    all_objects = await cached_registry.get_all()
+    assert len(all_objects) == 1
+    assert all_objects[0].identifier == "valid_cached_db"
+
+    invalid_obj = await cached_registry.get("vector_db", "invalid_cached_db")
+    assert invalid_obj is None
--- a/tests/unit/registry/test_registry_acl.py
+++ b/tests/unit/registry/test_registry_acl.py
@ -0,0 +1,151 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import os
+import shutil
+import tempfile
+
+import pytest
+
+from llama_stack.apis.models import ModelType
+from llama_stack.distribution.datatypes import ModelWithACL
+from llama_stack.distribution.server.auth import AccessAttributes
+from llama_stack.distribution.store.registry import CachedDiskDistributionRegistry
+from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
+from llama_stack.providers.utils.kvstore.sqlite import SqliteKVStoreImpl
+
+
+@pytest.fixture(scope="function")
+async def kvstore():
+    """Yield an initialized SQLite-backed kvstore in a fresh temp directory, removed afterwards."""
+    # NOTE(review): this async generator uses plain pytest.fixture; presumably the
+    # suite runs pytest-asyncio in auto mode so it is awaited — confirm.
+    temp_dir = tempfile.mkdtemp()
+    db_path = os.path.join(temp_dir, "test_registry_acl.db")
+    kvstore_config = SqliteKVStoreConfig(db_path=db_path)
+    kvstore = SqliteKVStoreImpl(kvstore_config)
+    await kvstore.initialize()
+    yield kvstore
+    # Teardown: delete the temp directory and the database file inside it.
+    shutil.rmtree(temp_dir)
+
+
+@pytest.fixture(scope="function")
+async def registry(kvstore):
+    """Return an initialized CachedDiskDistributionRegistry backed by the ``kvstore`` fixture."""
+    registry = CachedDiskDistributionRegistry(kvstore)
+    await registry.initialize()
+    return registry
+
+
+@pytest.mark.asyncio
+async def test_registry_cache_with_acl(registry):
+    """Check that access attributes survive register, update, and a reload from the same kvstore."""
+    model = ModelWithACL(
+        identifier="model-acl",
+        provider_id="test-provider",
+        provider_resource_id="model-acl-resource",
+        model_type=ModelType.llm,
+        access_attributes=AccessAttributes(roles=["admin"], teams=["ai-team"]),
+    )
+
+    success = await registry.register(model)
+    assert success
+
+    # The cached copy carries the attributes the model was registered with.
+    cached_model = registry.get_cached("model", "model-acl")
+    assert cached_model is not None
+    assert cached_model.identifier == "model-acl"
+    assert cached_model.access_attributes.roles == ["admin"]
+    assert cached_model.access_attributes.teams == ["ai-team"]
+
+    fetched_model = await registry.get("model", "model-acl")
+    assert fetched_model is not None
+    assert fetched_model.identifier == "model-acl"
+    assert fetched_model.access_attributes.roles == ["admin"]
+
+    # Replace the attributes wholesale; "teams" is no longer set after the update.
+    model.access_attributes = AccessAttributes(roles=["admin", "user"], projects=["project-x"])
+    await registry.update(model)
+
+    updated_cached = registry.get_cached("model", "model-acl")
+    assert updated_cached is not None
+    assert updated_cached.access_attributes.roles == ["admin", "user"]
+    assert updated_cached.access_attributes.projects == ["project-x"]
+    assert updated_cached.access_attributes.teams is None
+
+    # A second registry over the same kvstore must read back the updated attributes.
+    new_registry = CachedDiskDistributionRegistry(registry.kvstore)
+    await new_registry.initialize()
+
+    new_model = await new_registry.get("model", "model-acl")
+    assert new_model is not None
+    assert new_model.identifier == "model-acl"
+    assert new_model.access_attributes.roles == ["admin", "user"]
+    assert new_model.access_attributes.projects == ["project-x"]
+    assert new_model.access_attributes.teams is None
+
+
+@pytest.mark.asyncio
+async def test_registry_empty_acl(registry):
+    """Check that an empty AccessAttributes and a missing one are stored as distinct cases."""
+    # Case 1: attributes object present, but every field left unset.
+    model = ModelWithACL(
+        identifier="model-empty-acl",
+        provider_id="test-provider",
+        provider_resource_id="model-resource",
+        model_type=ModelType.llm,
+        access_attributes=AccessAttributes(),
+    )
+
+    await registry.register(model)
+
+    cached_model = registry.get_cached("model", "model-empty-acl")
+    assert cached_model is not None
+    assert cached_model.access_attributes is not None
+    assert cached_model.access_attributes.roles is None
+    assert cached_model.access_attributes.teams is None
+    assert cached_model.access_attributes.projects is None
+    assert cached_model.access_attributes.namespaces is None
+
+    all_models = await registry.get_all()
+    assert len(all_models) == 1
+
+    # Case 2: no access_attributes passed at all.
+    model = ModelWithACL(
+        identifier="model-no-acl",
+        provider_id="test-provider",
+        provider_resource_id="model-resource-2",
+        model_type=ModelType.llm,
+    )
+
+    await registry.register(model)
+
+    cached_model = registry.get_cached("model", "model-no-acl")
+    assert cached_model is not None
+    assert cached_model.access_attributes is None
+
+    all_models = await registry.get_all()
+    assert len(all_models) == 2
+
+
+@pytest.mark.asyncio
+async def test_registry_serialization(registry):
+    """Check that all four access attribute lists are preserved when reloaded from the kvstore."""
+    attributes = AccessAttributes(
+        roles=["admin", "researcher"],
+        teams=["ai-team", "ml-team"],
+        projects=["project-a", "project-b"],
+        namespaces=["prod", "staging"],
+    )
+
+    model = ModelWithACL(
+        identifier="model-serialize",
+        provider_id="test-provider",
+        provider_resource_id="model-resource",
+        model_type=ModelType.llm,
+        access_attributes=attributes,
+    )
+
+    await registry.register(model)
+
+    # Load through a second registry over the same kvstore rather than the one that registered it.
+    new_registry = CachedDiskDistributionRegistry(registry.kvstore)
+    await new_registry.initialize()
+
+    loaded_model = await new_registry.get("model", "model-serialize")
+    assert loaded_model is not None
+
+    assert loaded_model.access_attributes.roles == ["admin", "researcher"]
+    assert loaded_model.access_attributes.teams == ["ai-team", "ml-team"]
+    assert loaded_model.access_attributes.projects == ["project-a", "project-b"]
+    assert loaded_model.access_attributes.namespaces == ["prod", "staging"]
--- a/tests/unit/server/test_access_control.py
+++ b/tests/unit/server/test_access_control.py
@ -0,0 +1,240 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import os
+import shutil
+import tempfile
+from unittest.mock import MagicMock, Mock, patch
+
+import pytest
+
+from llama_stack.apis.datatypes import Api
+from llama_stack.apis.models import ModelType
+from llama_stack.distribution.datatypes import AccessAttributes, ModelWithACL
+from llama_stack.distribution.routers.routing_tables import ModelsRoutingTable
+from llama_stack.distribution.store.registry import CachedDiskDistributionRegistry
+from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
+from llama_stack.providers.utils.kvstore.sqlite import SqliteKVStoreImpl
+
+
+class AsyncMock(MagicMock):
+    """MagicMock whose call is a coroutine, so the mock can stand in for an awaitable method."""
+
+    async def __call__(self, *args, **kwargs):
+        # MagicMock still records the call and resolves return_value / side_effect.
+        outcome = super().__call__(*args, **kwargs)
+        return outcome
+
+
+def _return_model(model):
+    """Identity helper: give back the very object that was passed in."""
+    return model
+
+
+@pytest.fixture
+async def test_setup():
+    """Yield a ``(registry, routing_table)`` pair backed by a throwaway SQLite kvstore.
+
+    The kvstore file lives in a fresh temporary directory. The routing table is a
+    ``ModelsRoutingTable`` with a single mocked inference provider registered under
+    ``"test_provider"``; its ``register_model`` returns the model it is given.
+    """
+    temp_dir = tempfile.mkdtemp()
+    try:
+        db_path = os.path.join(temp_dir, "test_access_control.db")
+        kvstore_config = SqliteKVStoreConfig(db_path=db_path)
+        kvstore = SqliteKVStoreImpl(kvstore_config)
+        await kvstore.initialize()
+        registry = CachedDiskDistributionRegistry(kvstore)
+        await registry.initialize()
+
+        mock_inference = Mock()
+        mock_inference.__provider_spec__ = MagicMock()
+        mock_inference.__provider_spec__.api = Api.inference
+        mock_inference.register_model = AsyncMock(side_effect=_return_model)
+        routing_table = ModelsRoutingTable(
+            impls_by_provider_id={"test_provider": mock_inference},
+            dist_registry=registry,
+        )
+        yield registry, routing_table
+    finally:
+        # Runs even when setup fails before the yield, so an aborted fixture leaves no stray directory.
+        shutil.rmtree(temp_dir)
+
+
+@pytest.mark.asyncio
+@patch("llama_stack.distribution.routers.routing_tables.get_auth_attributes")
+async def test_access_control_with_cache(mock_get_auth_attributes, test_setup):
+    """Check that list_models/get_model honour each model's access attributes for three different callers."""
+    registry, routing_table = test_setup
+
+    # Three fixtures: unrestricted, admin role only, and data-science roles within ml-team.
+    catalog = [
+        ModelWithACL(
+            identifier="model-public",
+            provider_id="test_provider",
+            provider_resource_id="model-public",
+            model_type=ModelType.llm,
+        ),
+        ModelWithACL(
+            identifier="model-admin",
+            provider_id="test_provider",
+            provider_resource_id="model-admin",
+            model_type=ModelType.llm,
+            access_attributes=AccessAttributes(roles=["admin"]),
+        ),
+        ModelWithACL(
+            identifier="model-data-scientist",
+            provider_id="test_provider",
+            provider_resource_id="model-data-scientist",
+            model_type=ModelType.llm,
+            access_attributes=AccessAttributes(roles=["data-scientist", "researcher"], teams=["ml-team"]),
+        ),
+    ]
+    for entry in catalog:
+        await registry.register(entry)
+
+    # Caller 1: admin role on an unrelated team.
+    mock_get_auth_attributes.return_value = {"roles": ["admin"], "teams": ["management"]}
+    listing = await routing_table.list_models()
+    assert len(listing.data) == 2
+
+    for permitted in ("model-public", "model-admin"):
+        fetched = await routing_table.get_model(permitted)
+        assert fetched.identifier == permitted
+    with pytest.raises(ValueError):
+        await routing_table.get_model("model-data-scientist")
+
+    # Caller 2: matching role but the wrong team, so only the public model is reachable.
+    mock_get_auth_attributes.return_value = {"roles": ["data-scientist"], "teams": ["other-team"]}
+    listing = await routing_table.list_models()
+    assert len(listing.data) == 1
+    assert listing.data[0].identifier == "model-public"
+    fetched = await routing_table.get_model("model-public")
+    assert fetched.identifier == "model-public"
+    for denied in ("model-admin", "model-data-scientist"):
+        with pytest.raises(ValueError):
+            await routing_table.get_model(denied)
+
+    # Caller 3: matching role and matching team.
+    mock_get_auth_attributes.return_value = {"roles": ["data-scientist"], "teams": ["ml-team"]}
+    listing = await routing_table.list_models()
+    assert len(listing.data) == 2
+    visible_ids = [entry.identifier for entry in listing.data]
+    assert "model-public" in visible_ids
+    assert "model-data-scientist" in visible_ids
+    assert "model-admin" not in visible_ids
+    for permitted in ("model-public", "model-data-scientist"):
+        fetched = await routing_table.get_model(permitted)
+        assert fetched.identifier == permitted
+    with pytest.raises(ValueError):
+        await routing_table.get_model("model-admin")
+
+
+@pytest.mark.asyncio
+@patch("llama_stack.distribution.routers.routing_tables.get_auth_attributes")
+async def test_access_control_and_updates(mock_get_auth_attributes, test_setup):
+    """Access attributes added through registry.update take effect on later get_model calls."""
+    registry, routing_table = test_setup
+    target = ModelWithACL(
+        identifier="model-updates",
+        provider_id="test_provider",
+        provider_resource_id="model-updates",
+        model_type=ModelType.llm,
+    )
+    await registry.register(target)
+
+    # While the model carries no access attributes, a plain "user" can fetch it.
+    mock_get_auth_attributes.return_value = {"roles": ["user"]}
+    fetched = await routing_table.get_model("model-updates")
+    assert fetched.identifier == "model-updates"
+
+    # Restrict the model to admins and persist the change.
+    target.access_attributes = AccessAttributes(roles=["admin"])
+    await registry.update(target)
+
+    # The same "user" caller is now rejected...
+    mock_get_auth_attributes.return_value = {"roles": ["user"]}
+    with pytest.raises(ValueError):
+        await routing_table.get_model("model-updates")
+
+    # ...while an admin caller still gets the model.
+    mock_get_auth_attributes.return_value = {"roles": ["admin"]}
+    fetched = await routing_table.get_model("model-updates")
+    assert fetched.identifier == "model-updates"
+
+
+@pytest.mark.asyncio
+@patch("llama_stack.distribution.routers.routing_tables.get_auth_attributes")
+async def test_access_control_empty_attributes(mock_get_auth_attributes, test_setup):
+    """A model registered with an empty AccessAttributes() stays reachable for a caller with no roles."""
+    registry, routing_table = test_setup
+    await registry.register(
+        ModelWithACL(
+            identifier="model-empty-attrs",
+            provider_id="test_provider",
+            provider_resource_id="model-empty-attrs",
+            model_type=ModelType.llm,
+            access_attributes=AccessAttributes(),
+        )
+    )
+    mock_get_auth_attributes.return_value = {"roles": []}
+
+    # Direct lookup succeeds...
+    fetched = await routing_table.get_model("model-empty-attrs")
+    assert fetched.identifier == "model-empty-attrs"
+
+    # ...and the model also shows up in the listing.
+    listing = await routing_table.list_models()
+    listed_ids = [entry.identifier for entry in listing.data]
+    assert "model-empty-attrs" in listed_ids
+
+
+@pytest.mark.asyncio
+@patch("llama_stack.distribution.routers.routing_tables.get_auth_attributes")
+async def test_no_user_attributes(mock_get_auth_attributes, test_setup):
+    """When get_auth_attributes returns None, only models without access attributes are reachable."""
+    registry, routing_table = test_setup
+    unrestricted = ModelWithACL(
+        identifier="model-public-2",
+        provider_id="test_provider",
+        provider_resource_id="model-public-2",
+        model_type=ModelType.llm,
+    )
+    admin_only = ModelWithACL(
+        identifier="model-restricted",
+        provider_id="test_provider",
+        provider_resource_id="model-restricted",
+        model_type=ModelType.llm,
+        access_attributes=AccessAttributes(roles=["admin"]),
+    )
+    for entry in (unrestricted, admin_only):
+        await registry.register(entry)
+
+    mock_get_auth_attributes.return_value = None
+
+    fetched = await routing_table.get_model("model-public-2")
+    assert fetched.identifier == "model-public-2"
+
+    with pytest.raises(ValueError):
+        await routing_table.get_model("model-restricted")
+
+    listing = await routing_table.list_models()
+    assert len(listing.data) == 1
+    assert listing.data[0].identifier == "model-public-2"
+
+
+@pytest.mark.asyncio
+@patch("llama_stack.distribution.routers.routing_tables.get_auth_attributes")
+async def test_automatic_access_attributes(mock_get_auth_attributes, test_setup):
+    """Test that newly created resources inherit access attributes from their creator."""
+    registry, routing_table = test_setup
+
+    # The creator's identity, as reported by the patched get_auth_attributes.
+    mock_get_auth_attributes.return_value = {
+        "roles": ["data-scientist"],
+        "teams": ["ml-team"],
+        "projects": ["llama-3"],
+    }
+
+    # Register through the routing table, deliberately leaving access_attributes unset.
+    await routing_table.register_object(
+        ModelWithACL(
+            identifier="auto-access-model",
+            provider_id="test_provider",
+            provider_resource_id="auto-access-model",
+            model_type=ModelType.llm,
+        )
+    )
+
+    # The stored model should now carry the creator's attributes.
+    stored = await routing_table.get_model("auto-access-model")
+    inherited = stored.access_attributes
+    assert inherited is not None
+    assert inherited.roles == ["data-scientist"]
+    assert inherited.teams == ["ml-team"]
+    assert inherited.projects == ["llama-3"]
+
+    # A caller with no overlapping attributes is refused.
+    mock_get_auth_attributes.return_value = {"roles": ["engineer"], "teams": ["infra-team"]}
+    with pytest.raises(ValueError):
+        await routing_table.get_model("auto-access-model")
+
+    # A caller whose attributes include the creator's values is let through.
+    mock_get_auth_attributes.return_value = {
+        "roles": ["data-scientist", "engineer"],
+        "teams": ["ml-team", "platform-team"],
+        "projects": ["llama-3"],
+    }
+    fetched = await routing_table.get_model("auto-access-model")
+    assert fetched.identifier == "auto-access-model"
--- a/tests/unit/server/test_auth.py
+++ b/tests/unit/server/test_auth.py
@ -0,0 +1,206 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from unittest.mock import AsyncMock, patch
+
+import pytest
+from fastapi import FastAPI
+from fastapi.testclient import TestClient
+
+from llama_stack.distribution.server.auth import AuthenticationMiddleware
+
+
+class MockResponse:
+    """Minimal stand-in for an HTTP response: a status code plus a canned JSON body."""
+
+    def __init__(self, status_code, json_data):
+        self._json_data = json_data
+        self.status_code = status_code
+
+    def json(self):
+        """Return the JSON payload supplied at construction time."""
+        return self._json_data
+
+
+@pytest.fixture
+def mock_auth_endpoint():
+    """URL of the fake auth service that the middleware under test is configured with."""
+    endpoint = "http://mock-auth-service/validate"
+    return endpoint
+
+
+@pytest.fixture
+def valid_api_key():
+    """API key that tests send as the bearer token when the key itself is meant to be acceptable."""
+    api_key = "valid_api_key_12345"
+    return api_key
+
+
+@pytest.fixture
+def invalid_api_key():
+    """API key that tests send as the bearer token when authentication is meant to fail."""
+    api_key = "invalid_api_key_67890"
+    return api_key
+
+
+@pytest.fixture
+def app(mock_auth_endpoint):
+    """FastAPI app wrapped in AuthenticationMiddleware and exposing a single GET /test route."""
+    application = FastAPI()
+    application.add_middleware(AuthenticationMiddleware, auth_endpoint=mock_auth_endpoint)
+
+    @application.get("/test")
+    def test_endpoint():
+        # Body returned only when the request makes it past the middleware.
+        return {"message": "Authentication successful"}
+
+    return application
+
+
+@pytest.fixture
+def client(app):
+    """TestClient bound to the ``app`` fixture."""
+    test_client = TestClient(app)
+    return test_client
+
+
+@pytest.fixture
+def mock_scope():
+    """HTTP scope dict carrying a bearer token, for invoking the middleware directly."""
+    raw_headers = [
+        (b"content-type", b"application/json"),
+        (b"authorization", b"Bearer test-api-key"),
+        (b"user-agent", b"test-user-agent"),
+    ]
+    scope = {
+        "type": "http",
+        "path": "/models/list",
+        "headers": raw_headers,
+        "query_string": b"limit=100&offset=0",
+    }
+    return scope
+
+
+@pytest.fixture
+def mock_middleware(mock_auth_endpoint):
+    """Return ``(middleware, wrapped_app)`` where the wrapped app is an AsyncMock the tests can inspect."""
+    downstream_app = AsyncMock()
+    middleware = AuthenticationMiddleware(downstream_app, mock_auth_endpoint)
+    return middleware, downstream_app
+
+
+async def mock_post_success(*args, **kwargs):
+    """Replacement for ``httpx.AsyncClient.post`` that always answers 200; all arguments are ignored."""
+    success_body = {"message": "Authentication successful"}
+    return MockResponse(200, success_body)
+
+
+async def mock_post_failure(*args, **kwargs):
+    """Replacement for ``httpx.AsyncClient.post`` that always answers 401; all arguments are ignored."""
+    failure_body = {"message": "Authentication failed"}
+    return MockResponse(401, failure_body)
+
+
+async def mock_post_exception(*args, **kwargs):
+    """Replacement for ``httpx.AsyncClient.post`` that always raises; all arguments are ignored."""
+    failure = Exception("Connection error")
+    raise failure
+
+
+def test_missing_auth_header(client):
+    """A request with no Authorization header is rejected with 401."""
+    resp = client.get("/test")
+    assert resp.status_code == 401
+    error_message = resp.json()["error"]["message"]
+    assert "Missing or invalid Authorization header" in error_message
+
+
+def test_invalid_auth_header_format(client):
+    """An Authorization header that is not of the ``Bearer <token>`` form is rejected with 401."""
+    malformed = {"Authorization": "InvalidFormat token123"}
+    resp = client.get("/test", headers=malformed)
+    assert resp.status_code == 401
+    error_message = resp.json()["error"]["message"]
+    assert "Missing or invalid Authorization header" in error_message
+
+
+@patch("httpx.AsyncClient.post", new=mock_post_success)
+def test_valid_authentication(client, valid_api_key):
+    """With the auth service answering 200, the request reaches the endpoint."""
+    bearer = {"Authorization": f"Bearer {valid_api_key}"}
+    resp = client.get("/test", headers=bearer)
+    assert resp.status_code == 200
+    assert resp.json() == {"message": "Authentication successful"}
+
+
+@patch("httpx.AsyncClient.post", new=mock_post_failure)
+def test_invalid_authentication(client, invalid_api_key):
+    """With the auth service answering 401, the middleware responds 401 as well."""
+    bearer = {"Authorization": f"Bearer {invalid_api_key}"}
+    resp = client.get("/test", headers=bearer)
+    assert resp.status_code == 401
+    error_message = resp.json()["error"]["message"]
+    assert "Authentication failed" in error_message
+
+
+@patch("httpx.AsyncClient.post", new=mock_post_exception)
+def test_auth_service_error(client, valid_api_key):
+    """If the call to the auth service raises, the middleware responds 401 with a service-error message."""
+    bearer = {"Authorization": f"Bearer {valid_api_key}"}
+    resp = client.get("/test", headers=bearer)
+    assert resp.status_code == 401
+    error_message = resp.json()["error"]["message"]
+    assert "Authentication service error" in error_message
+
+
+def test_auth_request_payload(client, valid_api_key, mock_auth_endpoint):
+    """Inspect what the middleware posts to the auth endpoint for an authenticated request."""
+    request_headers = {
+        "Authorization": f"Bearer {valid_api_key}",
+        "User-Agent": "TestClient",
+        "Content-Type": "application/json",
+    }
+
+    with patch("httpx.AsyncClient.post") as mock_post:
+        mock_post.return_value = MockResponse(200, {"message": "Authentication successful"})
+
+        client.get("/test?param1=value1&param2=value2", headers=request_headers)
+
+        # The auth endpoint must have been called; pick the recorded call apart.
+        recorded_call = mock_post.call_args
+        assert recorded_call is not None
+
+        positional, keyword = recorded_call[0], recorded_call[1]
+        assert positional[0] == mock_auth_endpoint
+
+        payload = keyword["json"]
+        assert payload["api_key"] == valid_api_key
+        assert payload["request"]["path"] == "/test"
+        # The bearer token travels as api_key and must not be echoed back among the headers.
+        assert "authorization" not in payload["request"]["headers"]
+        for query_param in ("param1", "param2"):
+            assert query_param in payload["request"]["params"]
+
+
+@pytest.mark.asyncio
+async def test_auth_middleware_with_access_attributes(mock_middleware, mock_scope):
+    """Access attributes returned by the auth service end up in ``scope["user_attributes"]``."""
+    middleware, mock_app = mock_middleware
+    mock_receive, mock_send = AsyncMock(), AsyncMock()
+
+    auth_reply = {
+        "access_attributes": {
+            "roles": ["admin", "user"],
+            "teams": ["ml-team"],
+            "projects": ["project-x", "project-y"],
+        }
+    }
+
+    with patch("httpx.AsyncClient") as mock_client:
+        # Object bound by ``async with httpx.AsyncClient() as ...`` while the patch is active.
+        http_client = AsyncMock()
+        mock_client.return_value.__aenter__.return_value = http_client
+        http_client.post.return_value = MockResponse(200, auth_reply)
+
+        await middleware(mock_scope, mock_receive, mock_send)
+
+        assert "user_attributes" in mock_scope
+        user_attributes = mock_scope["user_attributes"]
+        assert user_attributes["roles"] == ["admin", "user"]
+        assert user_attributes["teams"] == ["ml-team"]
+        assert user_attributes["projects"] == ["project-x", "project-y"]
+
+        # The request must be forwarded to the wrapped app exactly once, untouched.
+        mock_app.assert_called_once_with(mock_scope, mock_receive, mock_send)
+
+
+@pytest.mark.asyncio
+async def test_auth_middleware_no_attributes(mock_middleware, mock_scope):
+    """Test middleware behavior with no access attributes.
+
+    The expected fallback is a ``namespaces`` entry holding the bearer token from the request.
+    """
+    middleware, mock_app = mock_middleware
+    mock_receive, mock_send = AsyncMock(), AsyncMock()
+
+    # Successful reply that deliberately omits access_attributes.
+    auth_reply = {"message": "Authentication successful"}
+
+    with patch("httpx.AsyncClient") as mock_client:
+        http_client = AsyncMock()
+        mock_client.return_value.__aenter__.return_value = http_client
+        http_client.post.return_value = MockResponse(200, auth_reply)
+
+        await middleware(mock_scope, mock_receive, mock_send)
+
+        assert "user_attributes" in mock_scope
+        user_attributes = mock_scope["user_attributes"]
+        assert "namespaces" in user_attributes
+        assert user_attributes["namespaces"] == ["test-api-key"]