Mirror of https://github.com/meta-llama/llama-stack.git

Commit 85ef55391d: Merge branch 'main' into out-of-token-budget-fix

61 changed files with 1322 additions and 1598 deletions
tests/README.md (new file, 9 lines)

@@ -0,0 +1,9 @@
# Llama Stack Tests

Llama Stack has multiple layers of testing to ensure continuous functionality and prevent regressions in the codebase.

| Testing Type | Details |
|--------------|---------|
| Unit | [unit/README.md](unit/README.md) |
| Integration | [integration/README.md](integration/README.md) |
| Verification | [verifications/README.md](verifications/README.md) |
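Each layer has its own entry point. As a quick orientation, here is a minimal sketch of how the unit and integration layers are typically invoked, based on the READMEs referenced above; the `together` template name is only an illustrative assumption, and the verification suite documents its own flags in its README.

```bash
# Unit tests (see tests/unit/README.md)
source .venv/bin/activate
./scripts/unit-tests.sh

# Integration tests against a stack config (see tests/integration/README.md);
# "together" is just an example template name.
pytest tests/integration --stack-config=together
```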
@@ -53,9 +53,6 @@ providers:
    config:
      api_key: ${env.TAVILY_SEARCH_API_KEY:}
      max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
    config: {}
@@ -90,8 +87,6 @@ tool_groups:
  provider_id: tavily-search
- toolgroup_id: builtin::rag
  provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
- toolgroup_id: builtin::wolfram_alpha
  provider_id: wolfram-alpha
server:
@@ -11,7 +11,7 @@ pytest --help
Here are the most important options:
- `--stack-config`: specify the stack config to use. You have three ways to point to a stack:
  - a URL which points to a Llama Stack distribution server
-  - a template (e.g., `fireworks`, `together`) or a path to a run.yaml file
+  - a template (e.g., `fireworks`, `together`) or a path to a `run.yaml` file
  - a comma-separated list of api=provider pairs, e.g. `inference=fireworks,safety=llama-guard,agents=meta-reference`. This is most useful for testing a single API surface.
- `--env`: set environment variables, e.g. `--env KEY=value`. This is a utility option to set environment variables required by various providers.
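For example, a run against the `together` template might look like the sketch below; the target test directory and the environment variable are illustrative assumptions, so adjust them to your provider setup.

```bash
# Run the inference integration tests against the "together" template,
# passing a provider-specific environment variable via --env.
pytest -s -v tests/integration/inference \
  --stack-config=together \
  --env TOGETHER_API_KEY=<your-key>
```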
@@ -28,7 +28,6 @@ if no model is specified.

Experimental, under development, options:
- `--record-responses`: record new API responses instead of using cached ones
-- `--report`: path where the test report should be written, e.g. --report=/path/to/report.md


## Examples
@@ -15,8 +15,6 @@ from dotenv import load_dotenv

from llama_stack.log import get_logger

-from .report import Report

logger = get_logger(__name__, category="tests")

@@ -60,9 +58,6 @@ def pytest_configure(config):
        os.environ["DISABLE_CODE_SANDBOX"] = "1"
        logger.info("Setting DISABLE_CODE_SANDBOX=1 for macOS")

-    if config.getoption("--report"):
-        config.pluginmanager.register(Report(config))


def pytest_addoption(parser):
    parser.addoption(
@@ -1,54 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.providers.datatypes import Api

INFERENCE_API_CAPA_TEST_MAP = {
    "chat_completion": {
        "streaming": [
            "test_text_chat_completion_streaming",
            "test_image_chat_completion_streaming",
        ],
        "non_streaming": [
            "test_image_chat_completion_non_streaming",
            "test_text_chat_completion_non_streaming",
        ],
        "tool_calling": [
            "test_text_chat_completion_with_tool_calling_and_streaming",
            "test_text_chat_completion_with_tool_calling_and_non_streaming",
        ],
        "log_probs": [
            "test_completion_log_probs_non_streaming",
            "test_completion_log_probs_streaming",
        ],
    },
    "completion": {
        "streaming": ["test_text_completion_streaming"],
        "non_streaming": ["test_text_completion_non_streaming"],
        "structured_output": ["test_text_completion_structured_output"],
    },
}

VECTORIO_API_TEST_MAP = {
    "retrieve": {
        "": ["test_vector_db_retrieve"],
    }
}

AGENTS_API_TEST_MAP = {
    "create_agent_turn": {
        "rag": ["test_rag_agent"],
        "custom_tool": ["test_custom_tool"],
        "code_execution": ["test_code_interpreter_for_attachments"],
    }
}

API_MAPS = {
    Api.inference: INFERENCE_API_CAPA_TEST_MAP,
    Api.vector_io: VECTORIO_API_TEST_MAP,
    Api.agents: AGENTS_API_TEST_MAP,
}
@@ -1,216 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.


from collections import defaultdict

import pytest
from pytest import CollectReport
from termcolor import cprint

from llama_stack.models.llama.sku_list import (
    all_registered_models,
    llama3_1_instruct_models,
    llama3_2_instruct_models,
    llama3_3_instruct_models,
    llama3_instruct_models,
    safety_models,
)
from llama_stack.models.llama.sku_types import CoreModelId
from llama_stack.providers.datatypes import Api

from .metadata import API_MAPS


def featured_models():
    models = [
        *llama3_instruct_models(),
        *llama3_1_instruct_models(),
        *llama3_2_instruct_models(),
        *llama3_3_instruct_models(),
        *safety_models(),
    ]
    return {model.huggingface_repo: model for model in models if not model.variant}


SUPPORTED_MODELS = {
    "ollama": {
        CoreModelId.llama3_1_8b_instruct.value,
        CoreModelId.llama3_1_8b_instruct.value,
        CoreModelId.llama3_1_70b_instruct.value,
        CoreModelId.llama3_1_70b_instruct.value,
        CoreModelId.llama3_1_405b_instruct.value,
        CoreModelId.llama3_1_405b_instruct.value,
        CoreModelId.llama3_2_1b_instruct.value,
        CoreModelId.llama3_2_1b_instruct.value,
        CoreModelId.llama3_2_3b_instruct.value,
        CoreModelId.llama3_2_3b_instruct.value,
        CoreModelId.llama3_2_11b_vision_instruct.value,
        CoreModelId.llama3_2_11b_vision_instruct.value,
        CoreModelId.llama3_2_90b_vision_instruct.value,
        CoreModelId.llama3_2_90b_vision_instruct.value,
        CoreModelId.llama3_3_70b_instruct.value,
        CoreModelId.llama_guard_3_8b.value,
        CoreModelId.llama_guard_3_1b.value,
    },
    "tgi": {model.core_model_id.value for model in all_registered_models() if model.huggingface_repo},
    "vllm": {model.core_model_id.value for model in all_registered_models() if model.huggingface_repo},
}


class Report:
    def __init__(self, config):
        self.distro_name = None
        self.config = config

        stack_config = self.config.getoption("--stack-config")
        if stack_config:
            is_url = stack_config.startswith("http") or "//" in stack_config
            is_yaml = stack_config.endswith(".yaml")
            if not is_url and not is_yaml:
                self.distro_name = stack_config

        self.report_data = defaultdict(dict)
        # test function -> test nodeid
        self.test_data = dict()
        self.test_name_to_nodeid = defaultdict(list)
        self.vision_model_id = None
        self.text_model_id = None
        self.client = None

    @pytest.hookimpl(tryfirst=True)
    def pytest_runtest_logreport(self, report):
        # This hook is called in several phases, including setup, call and teardown
        # The test is considered failed / error if any of the outcomes is not "Passed"
        outcome = self._process_outcome(report)
        if report.nodeid not in self.test_data:
            self.test_data[report.nodeid] = outcome
        elif self.test_data[report.nodeid] != outcome and outcome != "Passed":
            self.test_data[report.nodeid] = outcome

    def pytest_sessionfinish(self, session):
        if not self.client:
            return

        report = []
        report.append(f"# Report for {self.distro_name} distribution")
        report.append("\n## Supported Models")

        header = f"| Model Descriptor | {self.distro_name} |"
        dividor = "|:---|:---|"

        report.append(header)
        report.append(dividor)

        rows = []
        if self.distro_name in SUPPORTED_MODELS:
            for model in all_registered_models():
                if ("Instruct" not in model.core_model_id.value and "Guard" not in model.core_model_id.value) or (
                    model.variant
                ):
                    continue
                row = f"| {model.core_model_id.value} |"
                if model.core_model_id.value in SUPPORTED_MODELS[self.distro_name]:
                    row += " ✅ |"
                else:
                    row += " ❌ |"
                rows.append(row)
        else:
            supported_models = {m.identifier for m in self.client.models.list()}
            for hf_name, model in featured_models().items():
                row = f"| {model.core_model_id.value} |"
                if hf_name in supported_models:
                    row += " ✅ |"
                else:
                    row += " ❌ |"
                rows.append(row)
        report.extend(rows)

        report.append("\n## Inference")
        test_table = [
            "| Model | API | Capability | Test | Status |",
            "|:----- |:-----|:-----|:-----|:-----|",
        ]
        for api, capa_map in API_MAPS[Api.inference].items():
            for capa, tests in capa_map.items():
                for test_name in tests:
                    model_id = self.text_model_id if "text" in test_name else self.vision_model_id
                    test_nodeids = self.test_name_to_nodeid[test_name]
                    if not test_nodeids:
                        continue

                    # There might be more than one parametrizations for the same test function. We take
                    # the result of the first one for now. Ideally we should mark the test as failed if
                    # any of the parametrizations failed.
                    test_table.append(
                        f"| {model_id} | /{api} | {capa} | {test_name} | {self._print_result_icon(self.test_data[test_nodeids[0]])} |"
                    )

        report.extend(test_table)

        name_map = {Api.vector_io: "Vector IO", Api.agents: "Agents"}
        providers = self.client.providers.list()
        for api_group in [Api.vector_io, Api.agents]:
            api_capitalized = name_map[api_group]
            report.append(f"\n## {api_capitalized}")
            test_table = [
                "| Provider | API | Capability | Test | Status |",
                "|:-----|:-----|:-----|:-----|:-----|",
            ]
            provider = [p for p in providers if p.api == str(api_group.name)]
            provider_str = ",".join(provider) if provider else ""
            for api, capa_map in API_MAPS[api_group].items():
                for capa, tests in capa_map.items():
                    for test_name in tests:
                        test_nodeids = self.test_name_to_nodeid[test_name]
                        if not test_nodeids:
                            continue
                        test_table.append(
                            f"| {provider_str} | /{api} | {capa} | {test_name} | {self._print_result_icon(self.test_data[test_nodeids[0]])} |"
                        )
            report.extend(test_table)

        output_file = self.output_path
        text = "\n".join(report) + "\n"
        output_file.write_text(text)
        cprint(f"\nReport generated: {output_file.absolute()}", "green")

    def pytest_runtest_makereport(self, item, call):
        func_name = getattr(item, "originalname", item.name)
        self.test_name_to_nodeid[func_name].append(item.nodeid)

        # Get values from fixtures for report output
        if model_id := item.funcargs.get("text_model_id"):
            text_model = model_id.split("/")[1]
            self.text_model_id = self.text_model_id or text_model
        elif model_id := item.funcargs.get("vision_model_id"):
            vision_model = model_id.split("/")[1]
            self.vision_model_id = self.vision_model_id or vision_model

        if not self.client:
            self.client = item.funcargs.get("llama_stack_client")

    def _print_result_icon(self, result):
        if result == "Passed":
            return "✅"
        elif result == "Failed" or result == "Error":
            return "❌"
        else:
            # result == "Skipped":
            return "⏭️"

    def _process_outcome(self, report: CollectReport):
        if self._is_error(report):
            return "Error"
        if hasattr(report, "wasxfail"):
            if report.outcome in ["passed", "failed"]:
                return "XPassed"
            if report.outcome == "skipped":
                return "XFailed"
        return report.outcome.capitalize()

    def _is_error(self, report: CollectReport):
        return report.when in ["setup", "teardown", "collect"] and report.outcome == "failed"
tests/unit/README.md (new file, 21 lines)

@@ -0,0 +1,21 @@
# Llama Stack Unit Tests

You can run the unit tests by running:

```bash
source .venv/bin/activate
./scripts/unit-tests.sh [PYTEST_ARGS]
```

Any additional arguments are passed to pytest. For example, you can specify a test directory, a specific test file, or any pytest flags (e.g., `-vvv` for verbosity). If no test directory is specified, it defaults to "tests/unit", e.g.:

```bash
./scripts/unit-tests.sh tests/unit/registry/test_registry.py -vvv
```

If you'd like to run for a non-default version of Python (currently 3.10), pass the `PYTHON_VERSION` variable as follows:

```
source .venv/bin/activate
PYTHON_VERSION=3.13 ./scripts/unit-tests.sh
```
@@ -0,0 +1,23 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import os

import yaml

from llama_stack.apis.inference.inference import (
    OpenAIChatCompletion,
)

FIXTURES_DIR = os.path.dirname(os.path.abspath(__file__))


def load_chat_completion_fixture(filename: str) -> OpenAIChatCompletion:
    fixture_path = os.path.join(FIXTURES_DIR, filename)

    with open(fixture_path) as f:
        data = yaml.safe_load(f)
        return OpenAIChatCompletion(**data)
@@ -0,0 +1,9 @@
id: chat-completion-123
choices:
  - message:
      content: "Dublin"
      role: assistant
    finish_reason: stop
    index: 0
created: 1234567890
model: meta-llama/Llama-3.1-8B-Instruct
@@ -0,0 +1,14 @@
id: chat-completion-123
choices:
  - message:
      tool_calls:
        - id: tool_call_123
          type: function
          function:
            name: web_search
            arguments: '{"query":"What is the capital of Ireland?"}'
      role: assistant
    finish_reason: stop
    index: 0
created: 1234567890
model: meta-llama/Llama-3.1-8B-Instruct
@@ -4,27 +4,32 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

-from unittest.mock import AsyncMock
+from unittest.mock import AsyncMock, patch

import pytest

from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseInputItemList,
    OpenAIResponseInputMessageContentText,
    OpenAIResponseInputToolWebSearch,
-    OpenAIResponseOutputMessage,
+    OpenAIResponseMessage,
    OpenAIResponseObject,
    OpenAIResponseOutputMessageContentOutputText,
    OpenAIResponseOutputMessageWebSearchToolCall,
)
from llama_stack.apis.inference.inference import (
    OpenAIAssistantMessageParam,
    OpenAIChatCompletion,
    OpenAIChatCompletionToolCall,
    OpenAIChatCompletionToolCallFunction,
    OpenAIChoice,
    OpenAIChatCompletionContentPartTextParam,
    OpenAIDeveloperMessageParam,
    OpenAIUserMessageParam,
)
from llama_stack.apis.tools.tools import Tool, ToolGroups, ToolInvocationResult, ToolParameter, ToolRuntime
from llama_stack.providers.inline.agents.meta_reference.openai_responses import (
    OpenAIResponsePreviousResponseWithInputItems,
    OpenAIResponsesImpl,
)
from llama_stack.providers.utils.kvstore import KVStore
from tests.unit.providers.agents.meta_reference.fixtures import load_chat_completion_fixture


@pytest.fixture
@@ -65,21 +70,11 @@ def openai_responses_impl(mock_kvstore, mock_inference_api, mock_tool_groups_api
async def test_create_openai_response_with_string_input(openai_responses_impl, mock_inference_api):
    """Test creating an OpenAI response with a simple string input."""
    # Setup
-    input_text = "Hello, world!"
+    input_text = "What is the capital of Ireland?"
    model = "meta-llama/Llama-3.1-8B-Instruct"

-    mock_chat_completion = OpenAIChatCompletion(
-        id="chat-completion-123",
-        choices=[
-            OpenAIChoice(
-                message=OpenAIAssistantMessageParam(content="Hello! How can I help you?"),
-                finish_reason="stop",
-                index=0,
-            )
-        ],
-        created=1234567890,
-        model=model,
-    )
+    # Load the chat completion fixture
+    mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
    mock_inference_api.openai_chat_completion.return_value = mock_chat_completion

    # Execute
@@ -92,7 +87,7 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m
    # Verify
    mock_inference_api.openai_chat_completion.assert_called_once_with(
        model=model,
-        messages=[OpenAIUserMessageParam(role="user", content="Hello, world!", name=None)],
+        messages=[OpenAIUserMessageParam(role="user", content="What is the capital of Ireland?", name=None)],
        tools=None,
        stream=False,
        temperature=0.1,
@@ -100,55 +95,25 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m
    openai_responses_impl.persistence_store.set.assert_called_once()
    assert result.model == model
    assert len(result.output) == 1
-    assert isinstance(result.output[0], OpenAIResponseOutputMessage)
-    assert result.output[0].content[0].text == "Hello! How can I help you?"
+    assert isinstance(result.output[0], OpenAIResponseMessage)
+    assert result.output[0].content[0].text == "Dublin"


@pytest.mark.asyncio
async def test_create_openai_response_with_string_input_with_tools(openai_responses_impl, mock_inference_api):
    """Test creating an OpenAI response with a simple string input and tools."""
    # Setup
-    input_text = "What was the score of todays game?"
+    input_text = "What is the capital of Ireland?"
    model = "meta-llama/Llama-3.1-8B-Instruct"

-    mock_chat_completions = [
-        OpenAIChatCompletion(
-            id="chat-completion-123",
-            choices=[
-                OpenAIChoice(
-                    message=OpenAIAssistantMessageParam(
-                        tool_calls=[
-                            OpenAIChatCompletionToolCall(
-                                id="tool_call_123",
-                                type="function",
-                                function=OpenAIChatCompletionToolCallFunction(
-                                    name="web_search", arguments='{"query":"What was the score of todays game?"}'
-                                ),
-                            )
-                        ],
-                    ),
-                    finish_reason="stop",
-                    index=0,
-                )
-            ],
-            created=1234567890,
-            model=model,
-        ),
-        OpenAIChatCompletion(
-            id="chat-completion-123",
-            choices=[
-                OpenAIChoice(
-                    message=OpenAIAssistantMessageParam(content="The score of todays game was 10-12"),
-                    finish_reason="stop",
-                    index=0,
-                )
-            ],
-            created=1234567890,
-            model=model,
-        ),
-    ]
+    # Load the chat completion fixtures
+    tool_call_completion = load_chat_completion_fixture("tool_call_completion.yaml")
+    tool_response_completion = load_chat_completion_fixture("simple_chat_completion.yaml")

-    mock_inference_api.openai_chat_completion.side_effect = mock_chat_completions
+    mock_inference_api.openai_chat_completion.side_effect = [
+        tool_call_completion,
+        tool_response_completion,
+    ]

    openai_responses_impl.tool_groups_api.get_tool.return_value = Tool(
        identifier="web_search",
@@ -163,7 +128,7 @@ async def test_create_openai_response_with_string_input_with_tools(openai_respon

    openai_responses_impl.tool_runtime_api.invoke_tool.return_value = ToolInvocationResult(
        status="completed",
-        content="The score of todays game was 10-12",
+        content="Dublin",
    )

    # Execute
@@ -180,23 +145,172 @@ async def test_create_openai_response_with_string_input_with_tools(openai_respon

    # Verify
    first_call = mock_inference_api.openai_chat_completion.call_args_list[0]
-    assert first_call.kwargs["messages"][0].content == "What was the score of todays game?"
+    assert first_call.kwargs["messages"][0].content == "What is the capital of Ireland?"
    assert first_call.kwargs["tools"] is not None
    assert first_call.kwargs["temperature"] == 0.1

    second_call = mock_inference_api.openai_chat_completion.call_args_list[1]
-    assert second_call.kwargs["messages"][-1].content == "The score of todays game was 10-12"
+    assert second_call.kwargs["messages"][-1].content == "Dublin"
    assert second_call.kwargs["temperature"] == 0.1

    openai_responses_impl.tool_groups_api.get_tool.assert_called_once_with("web_search")
    openai_responses_impl.tool_runtime_api.invoke_tool.assert_called_once_with(
        tool_name="web_search",
-        kwargs={"query": "What was the score of todays game?"},
+        kwargs={"query": "What is the capital of Ireland?"},
    )

    openai_responses_impl.persistence_store.set.assert_called_once()

    # Check that we got the content from our mocked tool execution result
    assert len(result.output) >= 1
-    assert isinstance(result.output[1], OpenAIResponseOutputMessage)
-    assert result.output[1].content[0].text == "The score of todays game was 10-12"
+    assert isinstance(result.output[1], OpenAIResponseMessage)
+    assert result.output[1].content[0].text == "Dublin"


@pytest.mark.asyncio
async def test_create_openai_response_with_multiple_messages(openai_responses_impl, mock_inference_api):
    """Test creating an OpenAI response with multiple messages."""
    # Setup
    input_messages = [
        OpenAIResponseMessage(role="developer", content="You are a helpful assistant", name=None),
        OpenAIResponseMessage(role="user", content="Name some towns in Ireland", name=None),
        OpenAIResponseMessage(
            role="assistant",
            content=[
                OpenAIResponseInputMessageContentText(text="Galway, Longford, Sligo"),
                OpenAIResponseInputMessageContentText(text="Dublin"),
            ],
            name=None,
        ),
        OpenAIResponseMessage(role="user", content="Which is the largest town in Ireland?", name=None),
    ]
    model = "meta-llama/Llama-3.1-8B-Instruct"

    mock_inference_api.openai_chat_completion.return_value = load_chat_completion_fixture("simple_chat_completion.yaml")

    # Execute
    await openai_responses_impl.create_openai_response(
        input=input_messages,
        model=model,
        temperature=0.1,
    )

    # Verify that the correct messages were sent to the inference API, i.e.
    # all of the response messages were converted to chat completion message objects
    inference_messages = mock_inference_api.openai_chat_completion.call_args_list[0].kwargs["messages"]
    for i, m in enumerate(input_messages):
        if isinstance(m.content, str):
            assert inference_messages[i].content == m.content
        else:
            assert inference_messages[i].content[0].text == m.content[0].text
            assert isinstance(inference_messages[i].content[0], OpenAIChatCompletionContentPartTextParam)
        assert inference_messages[i].role == m.role
        if m.role == "user":
            assert isinstance(inference_messages[i], OpenAIUserMessageParam)
        elif m.role == "assistant":
            assert isinstance(inference_messages[i], OpenAIAssistantMessageParam)
        else:
            assert isinstance(inference_messages[i], OpenAIDeveloperMessageParam)


@pytest.mark.asyncio
async def test_prepend_previous_response_none(openai_responses_impl):
    """Test prepending no previous response to a new response."""

    input = await openai_responses_impl._prepend_previous_response("fake_input", None)
    assert input == "fake_input"


@pytest.mark.asyncio
@patch.object(OpenAIResponsesImpl, "_get_previous_response_with_input")
async def test_prepend_previous_response_basic(get_previous_response_with_input, openai_responses_impl):
    """Test prepending a basic previous response to a new response."""

    input_item_message = OpenAIResponseMessage(
        id="123",
        content=[OpenAIResponseInputMessageContentText(text="fake_previous_input")],
        role="user",
    )
    input_items = OpenAIResponseInputItemList(data=[input_item_message])
    response_output_message = OpenAIResponseMessage(
        id="123",
        content=[OpenAIResponseOutputMessageContentOutputText(text="fake_response")],
        status="completed",
        role="assistant",
    )
    response = OpenAIResponseObject(
        created_at=1,
        id="resp_123",
        model="fake_model",
        output=[response_output_message],
        status="completed",
    )
    previous_response = OpenAIResponsePreviousResponseWithInputItems(
        input_items=input_items,
        response=response,
    )
    get_previous_response_with_input.return_value = previous_response

    input = await openai_responses_impl._prepend_previous_response("fake_input", "resp_123")

    assert len(input) == 3
    # Check for previous input
    assert isinstance(input[0], OpenAIResponseMessage)
    assert input[0].content[0].text == "fake_previous_input"
    # Check for previous output
    assert isinstance(input[1], OpenAIResponseMessage)
    assert input[1].content[0].text == "fake_response"
    # Check for new input
    assert isinstance(input[2], OpenAIResponseMessage)
    assert input[2].content == "fake_input"


@pytest.mark.asyncio
@patch.object(OpenAIResponsesImpl, "_get_previous_response_with_input")
async def test_prepend_previous_response_web_search(get_previous_response_with_input, openai_responses_impl):
    """Test prepending a web search previous response to a new response."""

    input_item_message = OpenAIResponseMessage(
        id="123",
        content=[OpenAIResponseInputMessageContentText(text="fake_previous_input")],
        role="user",
    )
    input_items = OpenAIResponseInputItemList(data=[input_item_message])
    output_web_search = OpenAIResponseOutputMessageWebSearchToolCall(
        id="ws_123",
        status="completed",
    )
    output_message = OpenAIResponseMessage(
        id="123",
        content=[OpenAIResponseOutputMessageContentOutputText(text="fake_web_search_response")],
        status="completed",
        role="assistant",
    )
    response = OpenAIResponseObject(
        created_at=1,
        id="resp_123",
        model="fake_model",
        output=[output_web_search, output_message],
        status="completed",
    )
    previous_response = OpenAIResponsePreviousResponseWithInputItems(
        input_items=input_items,
        response=response,
    )
    get_previous_response_with_input.return_value = previous_response

    input_messages = [OpenAIResponseMessage(content="fake_input", role="user")]
    input = await openai_responses_impl._prepend_previous_response(input_messages, "resp_123")

    assert len(input) == 4
    # Check for previous input
    assert isinstance(input[0], OpenAIResponseMessage)
    assert input[0].content[0].text == "fake_previous_input"
    # Check for previous output web search tool call
    assert isinstance(input[1], OpenAIResponseOutputMessageWebSearchToolCall)
    # Check for previous output web search response
    assert isinstance(input[2], OpenAIResponseMessage)
    assert input[2].content[0].text == "fake_web_search_response"
    # Check for new input
    assert isinstance(input[3], OpenAIResponseMessage)
    assert input[3].content == "fake_input"
tests/unit/rag/test_rag_query.py (new file, 19 lines)

@@ -0,0 +1,19 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from unittest.mock import MagicMock

import pytest

from llama_stack.providers.inline.tool_runtime.rag.memory import MemoryToolRuntimeImpl


class TestRagQuery:
    @pytest.mark.asyncio
    async def test_query_raises_on_empty_vector_db_ids(self):
        rag_tool = MemoryToolRuntimeImpl(config=MagicMock(), vector_io_api=MagicMock(), inference_api=MagicMock())
        with pytest.raises(ValueError):
            await rag_tool.query(content=MagicMock(), vector_db_ids=[])
@@ -4,7 +4,7 @@ Llama Stack Verifications provide standardized test suites to ensure API compati

## Overview

This framework allows you to run the same set of verification tests against different LLM providers' OpenAI-compatible endpoints (Fireworks, Together, Groq, Cerebras, etc., and OpenAI itself) to ensure they meet the expected behavior and interface standards.

## Features
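As a rough sketch of the Overview above: the verification suite lives under `tests/verifications` and is pytest-based, but the provider- and model-selection flags are not shown in this excerpt, so treat the exact invocation as an assumption and consult `tests/verifications/README.md`.

```bash
# Hypothetical invocation of the verification tests; provider/model
# selection flags are deliberately omitted here (see the README).
pytest tests/verifications
```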
@@ -74,9 +74,6 @@ providers:
    config:
      api_key: ${env.TAVILY_SEARCH_API_KEY:}
      max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
    config: {}
@@ -156,8 +153,6 @@ tool_groups:
  provider_id: tavily-search
- toolgroup_id: builtin::rag
  provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
- toolgroup_id: builtin::wolfram_alpha
  provider_id: wolfram-alpha
server:
@@ -31,6 +31,26 @@ test_response_web_search:
            search_context_size: "low"
        output: "128"

test_response_custom_tool:
  test_name: test_response_custom_tool
  test_params:
    case:
      - case_id: "sf_weather"
        input: "What's the weather like in San Francisco?"
        tools:
          - type: function
            name: get_weather
            description: Get current temperature for a given location.
            parameters:
              additionalProperties: false
              properties:
                location:
                  description: "City and country e.g. Bogot\xE1, Colombia"
                  type: string
              required:
                - location
              type: object

test_response_image:
  test_name: test_response_image
  test_params:
@@ -59,7 +79,7 @@ test_response_multi_turn_image:
              - type: input_image
                image_url: "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg"
            output: "llama"
-          - input: "Search the web using the search tool for the animal from the previous response. Your search query should be a single phrase that includes the animal's name and the words 'maverick' and 'scout'."
+          - input: "Search the web using the search tool for the animal from the previous response. Your search query should be a single phrase that includes the animal's name and the words 'maverick', 'scout' and 'llm'"
            tools:
              - type: web_search
            output: "model"
@@ -124,6 +124,28 @@ def test_response_non_streaming_web_search(request, openai_client, model, provid
    assert case["output"].lower() in response.output_text.lower().strip()


@pytest.mark.parametrize(
    "case",
    responses_test_cases["test_response_custom_tool"]["test_params"]["case"],
    ids=case_id_generator,
)
def test_response_non_streaming_custom_tool(request, openai_client, model, provider, verification_config, case):
    test_name_base = get_base_test_name(request)
    if should_skip_test(verification_config, provider, model, test_name_base):
        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")

    response = openai_client.responses.create(
        model=model,
        input=case["input"],
        tools=case["tools"],
        stream=False,
    )
    assert len(response.output) == 1
    assert response.output[0].type == "function_call"
    assert response.output[0].status == "completed"
    assert response.output[0].name == "get_weather"


@pytest.mark.parametrize(
    "case",
    responses_test_cases["test_response_image"]["test_params"]["case"],