Merge branch 'main' into kill_configure

This commit is contained in:
Xi Yan 2024-11-05 10:15:01 -08:00
commit 47d91b10fb
29 changed files with 119 additions and 1463 deletions

View file

@ -13,14 +13,22 @@ apis:
- safety
providers:
inference:
- provider_id: meta0
- provider_id: meta-reference-inference
provider_type: meta-reference
config:
model: Llama3.1-8B-Instruct
model: Llama3.2-3B-Instruct
quantization: null
torch_seed: null
max_seq_len: 4096
max_batch_size: 1
- provider_id: meta-reference-safety
provider_type: meta-reference
config:
model: Llama-Guard-3-1B
quantization: null
torch_seed: null
max_seq_len: 2048
max_batch_size: 1
safety:
- provider_id: meta0
provider_type: meta-reference
@ -28,10 +36,9 @@ providers:
llama_guard_shield:
model: Llama-Guard-3-1B
excluded_categories: []
disable_input_check: false
disable_output_check: false
prompt_guard_shield:
model: Prompt-Guard-86M
# Uncomment to use prompt guard
# prompt_guard_shield:
# model: Prompt-Guard-86M
memory:
- provider_id: meta0
provider_type: meta-reference
@ -52,7 +59,7 @@ providers:
persistence_store:
namespace: null
type: sqlite
db_path: ~/.llama/runtime/kvstore.db
db_path: ~/.llama/runtime/agents_store.db
telemetry:
- provider_id: meta0
provider_type: meta-reference

View file

@ -23,7 +23,7 @@ class ShieldDef(BaseModel):
identifier: str = Field(
description="A unique identifier for the shield type",
)
type: str = Field(
shield_type: str = Field(
description="The type of shield this is; the value is one of the ShieldType enum"
)
params: Dict[str, Any] = Field(

View file

@ -25,6 +25,7 @@ from llama_stack.distribution.utils.config_dirs import BUILDS_BASE_DIR
# These are the dependencies needed by the distribution server.
# `llama-stack` is automatically installed by the installation script.
SERVER_DEPENDENCIES = [
"aiosqlite",
"fastapi",
"fire",
"httpx",

View file

@ -83,6 +83,7 @@ def create_api_client_class(protocol, additional_protocol) -> Type:
j = response.json()
if j is None:
return None
# print(f"({protocol.__name__}) Returning {j}, type {return_type}")
return parse_obj_as(return_type, j)
async def _call_streaming(self, method_name: str, *args, **kwargs) -> Any:
@ -102,14 +103,15 @@ def create_api_client_class(protocol, additional_protocol) -> Type:
if line.startswith("data:"):
data = line[len("data: ") :]
try:
data = json.loads(data)
if "error" in data:
cprint(data, "red")
continue
yield parse_obj_as(return_type, json.loads(data))
yield parse_obj_as(return_type, data)
except Exception as e:
print(data)
print(f"Error with parsing or validation: {e}")
print(data)
def httpx_request_params(self, method_name: str, *args, **kwargs) -> dict:
webmethod, sig = self.routes[method_name]

View file

@ -178,16 +178,17 @@ class CommonRoutingTableImpl(RoutingTable):
await register_object_with_provider(obj, p)
await self.dist_registry.register(obj)
async def get_all_with_type(self, type: str) -> List[RoutableObjectWithProvider]:
    """Return every registered object whose `type` field matches (e.g. "model",
    "shield", "memory_bank")."""
    # NOTE(review): the parameter name shadows the `type` builtin; renaming it
    # would change the keyword-call API, so it is left as-is here.
    objs = await self.dist_registry.get_all()
    return [obj for obj in objs if obj.type == type]
class ModelsRoutingTable(CommonRoutingTableImpl, Models):
async def list_models(self) -> List[ModelDefWithProvider]:
objects = []
for objs in self.registry.values():
objects.extend(objs)
return objects
return await self.get_all_with_type("model")
async def get_model(self, identifier: str) -> Optional[ModelDefWithProvider]:
return self.get_object_by_identifier(identifier)
return await self.get_object_by_identifier(identifier)
async def register_model(self, model: ModelDefWithProvider) -> None:
await self.register_object(model)
@ -195,13 +196,10 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):
class ShieldsRoutingTable(CommonRoutingTableImpl, Shields):
async def list_shields(self) -> List[ShieldDef]:
objects = []
for objs in self.registry.values():
objects.extend(objs)
return objects
return await self.get_all_with_type("shield")
async def get_shield(self, shield_type: str) -> Optional[ShieldDefWithProvider]:
return self.get_object_by_identifier(shield_type)
return await self.get_object_by_identifier(shield_type)
async def register_shield(self, shield: ShieldDefWithProvider) -> None:
await self.register_object(shield)
@ -209,15 +207,12 @@ class ShieldsRoutingTable(CommonRoutingTableImpl, Shields):
class MemoryBanksRoutingTable(CommonRoutingTableImpl, MemoryBanks):
async def list_memory_banks(self) -> List[MemoryBankDefWithProvider]:
objects = []
for objs in self.registry.values():
objects.extend(objs)
return objects
return await self.get_all_with_type("memory_bank")
async def get_memory_bank(
self, identifier: str
) -> Optional[MemoryBankDefWithProvider]:
return self.get_object_by_identifier(identifier)
return await self.get_object_by_identifier(identifier)
async def register_memory_bank(
self, memory_bank: MemoryBankDefWithProvider
@ -227,15 +222,12 @@ class MemoryBanksRoutingTable(CommonRoutingTableImpl, MemoryBanks):
class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):
async def list_datasets(self) -> List[DatasetDefWithProvider]:
objects = []
for objs in self.registry.values():
objects.extend(objs)
return objects
return await self.get_all_with_type("dataset")
async def get_dataset(
self, dataset_identifier: str
) -> Optional[DatasetDefWithProvider]:
return self.get_object_by_identifier(dataset_identifier)
return await self.get_object_by_identifier(dataset_identifier)
async def register_dataset(self, dataset_def: DatasetDefWithProvider) -> None:
await self.register_object(dataset_def)
@ -243,15 +235,12 @@ class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):
class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, Scoring):
async def list_scoring_functions(self) -> List[ScoringFnDefWithProvider]:
objects = []
for objs in self.registry.values():
objects.extend(objs)
return objects
return await self.get_all_with_type("scoring_function")
async def get_scoring_function(
self, name: str
) -> Optional[ScoringFnDefWithProvider]:
return self.get_object_by_identifier(name)
return await self.get_object_by_identifier(name)
async def register_scoring_function(
self, function_def: ScoringFnDefWithProvider

View file

@ -134,7 +134,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
stream = _to_async_generator()
async for chunk in process_chat_completion_stream_response(
request, stream, self.formatter
stream, self.formatter
):
yield chunk

View file

@ -37,7 +37,7 @@ class TogetherSafetyImpl(Safety, NeedsRequestProviderData, ShieldsProtocolPrivat
return [
ShieldDef(
identifier=ShieldType.llama_guard.value,
type=ShieldType.llama_guard.value,
shield_type=ShieldType.llama_guard.value,
params={},
)
]

View file

@ -25,8 +25,8 @@ class MetaReferenceCodeScannerSafetyImpl(Safety):
pass
async def register_shield(self, shield: ShieldDef) -> None:
if shield.type != ShieldType.code_scanner.value:
raise ValueError(f"Unsupported safety shield type: {shield.type}")
if shield.shield_type != ShieldType.code_scanner.value:
raise ValueError(f"Unsupported safety shield type: {shield.shield_type}")
async def run_shield(
self,

View file

@ -49,7 +49,7 @@ class MetaReferenceSafetyImpl(Safety, ShieldsProtocolPrivate):
return [
ShieldDef(
identifier=shield_type,
type=shield_type,
shield_type=shield_type,
params={},
)
for shield_type in self.available_shields
@ -92,14 +92,14 @@ class MetaReferenceSafetyImpl(Safety, ShieldsProtocolPrivate):
return RunShieldResponse(violation=violation)
def get_shield_impl(self, shield: ShieldDef) -> ShieldBase:
if shield.type == ShieldType.llama_guard.value:
if shield.shield_type == ShieldType.llama_guard.value:
cfg = self.config.llama_guard_shield
return LlamaGuardShield(
model=cfg.model,
inference_api=self.inference_api,
excluded_categories=cfg.excluded_categories,
)
elif shield.type == ShieldType.prompt_guard.value:
elif shield.shield_type == ShieldType.prompt_guard.value:
model_dir = model_local_dir(PROMPT_GUARD_MODEL)
subtype = shield.params.get("prompt_guard_type", "injection")
if subtype == "injection":
@ -109,4 +109,4 @@ class MetaReferenceSafetyImpl(Safety, ShieldsProtocolPrivate):
else:
raise ValueError(f"Unknown prompt guard type: {subtype}")
else:
raise ValueError(f"Unknown shield type: {shield.type}")
raise ValueError(f"Unknown shield type: {shield.shield_type}")

View file

@ -46,11 +46,21 @@ DEFAULT_PROVIDER_COMBINATIONS = [
id="together",
marks=pytest.mark.together,
),
pytest.param(
{
"inference": "remote",
"safety": "remote",
"memory": "remote",
"agents": "remote",
},
id="remote",
marks=pytest.mark.remote,
),
]
def pytest_configure(config):
for mark in ["meta_reference", "ollama", "together"]:
for mark in ["meta_reference", "ollama", "together", "remote"]:
config.addinivalue_line(
"markers",
f"{mark}: marks tests as {mark} specific",

View file

@ -18,7 +18,12 @@ from llama_stack.providers.impls.meta_reference.agents import (
from llama_stack.providers.tests.resolver import resolve_impls_for_test_v2
from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
from ..conftest import ProviderFixture
from ..conftest import ProviderFixture, remote_stack_fixture
@pytest.fixture(scope="session")
def agents_remote() -> ProviderFixture:
    """Agents provider fixture that targets an already-running remote stack."""
    return remote_stack_fixture()
@pytest.fixture(scope="session")
@ -40,7 +45,7 @@ def agents_meta_reference() -> ProviderFixture:
)
AGENTS_FIXTURES = ["meta_reference"]
AGENTS_FIXTURES = ["meta_reference", "remote"]
@pytest_asyncio.fixture(scope="session")

View file

@ -109,7 +109,6 @@ class TestAgents:
turn_response = [
chunk async for chunk in await agents_impl.create_agent_turn(**turn_request)
]
assert len(turn_response) > 0
check_event_types(turn_response)

View file

@ -14,6 +14,9 @@ from pydantic import BaseModel
from termcolor import colored
from llama_stack.distribution.datatypes import Provider
from llama_stack.providers.datatypes import RemoteProviderConfig
from .env import get_env_or_fail
class ProviderFixture(BaseModel):
@ -21,6 +24,21 @@ class ProviderFixture(BaseModel):
provider_data: Optional[Dict[str, Any]] = None
def remote_stack_fixture() -> ProviderFixture:
    """Build a ProviderFixture pointing at a running remote llama-stack server.

    Host and port are read from the REMOTE_STACK_HOST / REMOTE_STACK_PORT
    environment variables; `get_env_or_fail` raises when either is unset.
    """
    return ProviderFixture(
        providers=[
            Provider(
                provider_id="remote",
                provider_type="remote",
                # Serialize the config so it round-trips through run-config parsing.
                config=RemoteProviderConfig(
                    host=get_env_or_fail("REMOTE_STACK_HOST"),
                    port=int(get_env_or_fail("REMOTE_STACK_PORT")),
                ).model_dump(),
            )
        ],
    )
def pytest_configure(config):
config.option.tbstyle = "short"
config.option.disable_warnings = True

View file

@ -18,7 +18,7 @@ from llama_stack.providers.impls.meta_reference.inference import (
MetaReferenceInferenceConfig,
)
from llama_stack.providers.tests.resolver import resolve_impls_for_test_v2
from ..conftest import ProviderFixture
from ..conftest import ProviderFixture, remote_stack_fixture
from ..env import get_env_or_fail
@ -29,6 +29,11 @@ def inference_model(request):
return request.config.getoption("--inference-model", None)
@pytest.fixture(scope="session")
def inference_remote() -> ProviderFixture:
    """Inference provider fixture that targets an already-running remote stack."""
    return remote_stack_fixture()
@pytest.fixture(scope="session")
def inference_meta_reference(inference_model) -> ProviderFixture:
inference_model = (
@ -104,7 +109,7 @@ def inference_together() -> ProviderFixture:
)
INFERENCE_FIXTURES = ["meta_reference", "ollama", "fireworks", "together"]
INFERENCE_FIXTURES = ["meta_reference", "ollama", "fireworks", "together", "remote"]
@pytest_asyncio.fixture(scope="session")

View file

@ -15,10 +15,15 @@ from llama_stack.providers.adapters.memory.weaviate import WeaviateConfig
from llama_stack.providers.impls.meta_reference.memory import FaissImplConfig
from llama_stack.providers.tests.resolver import resolve_impls_for_test_v2
from ..conftest import ProviderFixture
from ..conftest import ProviderFixture, remote_stack_fixture
from ..env import get_env_or_fail
@pytest.fixture(scope="session")
def memory_remote() -> ProviderFixture:
    """Memory provider fixture that targets an already-running remote stack."""
    return remote_stack_fixture()
@pytest.fixture(scope="session")
def memory_meta_reference() -> ProviderFixture:
return ProviderFixture(
@ -68,7 +73,7 @@ def memory_weaviate() -> ProviderFixture:
)
MEMORY_FIXTURES = ["meta_reference", "pgvector", "weaviate"]
MEMORY_FIXTURES = ["meta_reference", "pgvector", "weaviate", "remote"]
@pytest_asyncio.fixture(scope="session")

View file

@ -6,6 +6,7 @@
import json
import os
import tempfile
from datetime import datetime
from typing import Any, Dict, List, Optional
@ -16,6 +17,8 @@ from llama_stack.distribution.configure import parse_and_maybe_upgrade_config
from llama_stack.distribution.distribution import get_provider_registry
from llama_stack.distribution.request_headers import set_request_provider_data
from llama_stack.distribution.resolver import resolve_impls
from llama_stack.distribution.store import CachedDiskDistributionRegistry
from llama_stack.providers.utils.kvstore import kvstore_impl, SqliteKVStoreConfig
async def resolve_impls_for_test_v2(
@ -30,7 +33,11 @@ async def resolve_impls_for_test_v2(
providers=providers,
)
run_config = parse_and_maybe_upgrade_config(run_config)
impls = await resolve_impls(run_config, get_provider_registry())
sqlite_file = tempfile.NamedTemporaryFile(delete=False, suffix=".db")
dist_kvstore = await kvstore_impl(SqliteKVStoreConfig(db_path=sqlite_file.name))
dist_registry = CachedDiskDistributionRegistry(dist_kvstore)
impls = await resolve_impls(run_config, get_provider_registry(), dist_registry)
if provider_data:
set_request_provider_data(

View file

@ -37,11 +37,19 @@ DEFAULT_PROVIDER_COMBINATIONS = [
id="together",
marks=pytest.mark.together,
),
pytest.param(
{
"inference": "remote",
"safety": "remote",
},
id="remote",
marks=pytest.mark.remote,
),
]
def pytest_configure(config):
for mark in ["meta_reference", "ollama", "together"]:
for mark in ["meta_reference", "ollama", "together", "remote"]:
config.addinivalue_line(
"markers",
f"{mark}: marks tests as {mark} specific",

View file

@ -16,10 +16,15 @@ from llama_stack.providers.impls.meta_reference.safety import (
from llama_stack.providers.tests.resolver import resolve_impls_for_test_v2
from ..conftest import ProviderFixture
from ..conftest import ProviderFixture, remote_stack_fixture
from ..env import get_env_or_fail
@pytest.fixture(scope="session")
def safety_remote() -> ProviderFixture:
    """Safety provider fixture that targets an already-running remote stack."""
    return remote_stack_fixture()
@pytest.fixture(scope="session")
def safety_model(request):
if hasattr(request, "param"):
@ -60,7 +65,7 @@ def safety_together() -> ProviderFixture:
)
SAFETY_FIXTURES = ["meta_reference", "together"]
SAFETY_FIXTURES = ["meta_reference", "together", "remote"]
@pytest_asyncio.fixture(scope="session")

View file

@ -27,7 +27,7 @@ class TestSafety:
for shield in response:
assert isinstance(shield, ShieldDefWithProvider)
assert shield.type in [v.value for v in ShieldType]
assert shield.shield_type in [v.value for v in ShieldType]
@pytest.mark.asyncio
async def test_run_shield(self, safety_stack):

View file

@ -2,7 +2,7 @@ blobfile
fire
httpx
huggingface-hub
llama-models>=0.0.48
llama-models>=0.0.49
prompt-toolkit
python-dotenv
pydantic>=2

View file

@ -16,7 +16,7 @@ def read_requirements():
setup(
name="llama_stack",
version="0.0.48",
version="0.0.49",
author="Meta Llama",
author_email="llama-oss@meta.com",
description="Llama Stack",

View file

@ -1,45 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Dict
from llama_models.llama3.api.datatypes import ToolParamDefinition
from llama_stack.tools.custom.datatypes import SingleMessageCustomTool
class GetBoilingPointTool(SingleMessageCustomTool):
    """Custom tool reporting the boiling point of a made-up liquid.

    Only "polyjuice" is recognized: its boiling point is returned in Celsius
    (-100) or Fahrenheit (-212) depending on the `celcius` flag.  Every other
    liquid yields -1.
    """

    def get_name(self) -> str:
        return "get_boiling_point"

    def get_description(self) -> str:
        return "Get the boiling point of a imaginary liquids (eg. polyjuice)"

    def get_params_definition(self) -> Dict[str, ToolParamDefinition]:
        # One required string parameter and one optional unit flag.
        definitions = {
            "liquid_name": ToolParamDefinition(
                param_type="string", description="The name of the liquid", required=True
            ),
            "celcius": ToolParamDefinition(
                param_type="boolean",
                description="Whether to return the boiling point in Celcius",
                required=False,
            ),
        }
        return definitions

    async def run_impl(self, liquid_name: str, celcius: bool = True) -> int:
        # Only the fictional "polyjuice" has a known boiling point here.
        if liquid_name.lower() != "polyjuice":
            return -1
        return -100 if celcius else -212

View file

@ -1,66 +0,0 @@
version: '2'
built_at: '2024-10-08T17:40:45.325529'
image_name: local
docker_image: null
conda_env: local
apis:
- shields
- safety
- agents
- models
- memory
- memory_banks
- inference
- datasets
- datasetio
- scoring
- eval
providers:
eval:
- provider_id: meta0
provider_type: meta-reference
config: {}
scoring:
- provider_id: meta0
provider_type: meta-reference
config: {}
datasetio:
- provider_id: meta0
provider_type: meta-reference
config: {}
inference:
- provider_id: tgi0
provider_type: remote::tgi
config:
url: http://127.0.0.1:5009
- provider_id: tgi1
provider_type: remote::tgi
config:
url: http://127.0.0.1:5010
memory:
- provider_id: meta-reference
provider_type: meta-reference
config: {}
agents:
- provider_id: meta-reference
provider_type: meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: ~/.llama/runtime/kvstore.db
telemetry:
- provider_id: meta-reference
provider_type: meta-reference
config: {}
safety:
- provider_id: meta-reference
provider_type: meta-reference
config:
llama_guard_shield:
model: Llama-Guard-3-1B
excluded_categories: []
disable_input_check: false
disable_output_check: false
prompt_guard_shield:
model: Prompt-Guard-86M

View file

@ -1,14 +0,0 @@
version: '2'
built_at: '2024-10-08T17:40:45.325529'
image_name: local
docker_image: null
conda_env: local
apis:
- models
- inference
providers:
inference:
- provider_id: tgi0
provider_type: remote::tgi
config:
url: http://127.0.0.1:5009

View file

@ -1,50 +0,0 @@
version: '2'
built_at: '2024-10-08T17:40:45.325529'
image_name: local
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
inference:
- provider_id: meta-reference
provider_type: meta-reference
config:
model: Llama3.1-8B-Instruct
quantization: null
torch_seed: null
max_seq_len: 4096
max_batch_size: 1
safety:
- provider_id: meta-reference
provider_type: meta-reference
config:
llama_guard_shield:
model: Llama-Guard-3-1B
excluded_categories: []
disable_input_check: false
disable_output_check: false
prompt_guard_shield:
model: Prompt-Guard-86M
memory:
- provider_id: meta-reference
provider_type: meta-reference
config: {}
agents:
- provider_id: meta-reference
provider_type: meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: /home/xiyan/.llama/runtime/kvstore.db
telemetry:
- provider_id: meta-reference
provider_type: meta-reference
config: {}

View file

@ -1,446 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import unittest
from unittest import mock
from llama_models.llama3.api.datatypes import (
BuiltinTool,
CompletionMessage,
SamplingParams,
SamplingStrategy,
StopReason,
ToolCall,
ToolChoice,
ToolDefinition,
ToolParamDefinition,
ToolResponseMessage,
UserMessage,
)
from llama_stack.apis.inference.inference import (
ChatCompletionRequest,
ChatCompletionResponseEventType,
)
from llama_stack.providers.adapters.inference.bedrock import get_adapter_impl
from llama_stack.providers.adapters.inference.bedrock.config import BedrockConfig
class BedrockInferenceTests(unittest.IsolatedAsyncioTestCase):
    """Tests for the AWS Bedrock inference adapter.

    The Bedrock client's `converse` / `converse_stream` calls are patched with
    canned, Bedrock-shaped payloads, so only the adapter's request/response
    translation is exercised — no AWS credentials or network access needed.
    """

    async def asyncSetUp(self):
        bedrock_config = BedrockConfig()

        # setup Bedrock
        self.api = await get_adapter_impl(bedrock_config, {})
        await self.api.initialize()

        # Custom tool definition shared by the tool-call tests below.
        self.custom_tool_defn = ToolDefinition(
            tool_name="get_boiling_point",
            description="Get the boiling point of a imaginary liquids (eg. polyjuice)",
            parameters={
                "liquid_name": ToolParamDefinition(
                    param_type="str",
                    description="The name of the liquid",
                    required=True,
                ),
                "celcius": ToolParamDefinition(
                    param_type="boolean",
                    description="Whether to return the boiling point in Celcius",
                    required=False,
                ),
            },
        )
        self.valid_supported_model = "Meta-Llama3.1-8B-Instruct"

    async def asyncTearDown(self):
        await self.api.shutdown()

    async def test_text(self):
        # Plain non-streaming completion: mocked Bedrock text response in,
        # expect a text completion message with the same content out.
        with mock.patch.object(self.api.client, "converse") as mock_converse:
            mock_converse.return_value = {
                "ResponseMetadata": {
                    "RequestId": "8ad04352-cd81-4946-b811-b434e546385d",
                    "HTTPStatusCode": 200,
                    "HTTPHeaders": {},
                    "RetryAttempts": 0,
                },
                "output": {
                    "message": {
                        "role": "assistant",
                        "content": [{"text": "\n\nThe capital of France is Paris."}],
                    }
                },
                "stopReason": "end_turn",
                "usage": {"inputTokens": 21, "outputTokens": 9, "totalTokens": 30},
                "metrics": {"latencyMs": 307},
            }
            request = ChatCompletionRequest(
                model=self.valid_supported_model,
                messages=[
                    UserMessage(
                        content="What is the capital of France?",
                    ),
                ],
                stream=False,
            )
            iterator = self.api.chat_completion(
                request.model,
                request.messages,
                request.sampling_params,
                request.tools,
                request.tool_choice,
                request.tool_prompt_format,
                request.stream,
                request.logprobs,
            )
            # Non-streaming still yields exactly one response from the iterator.
            async for r in iterator:
                response = r
            print(response.completion_message.content)
            self.assertTrue("Paris" in response.completion_message.content[0])
            self.assertEqual(
                response.completion_message.stop_reason, StopReason.end_of_turn
            )

    async def test_tool_call(self):
        # Built-in tool call: Bedrock returns a toolUse block; the adapter
        # should surface it as a brave_search tool call with empty content.
        with mock.patch.object(self.api.client, "converse") as mock_converse:
            mock_converse.return_value = {
                "ResponseMetadata": {
                    "RequestId": "ec9da6a4-656b-4343-9e1f-71dac79cbf53",
                    "HTTPStatusCode": 200,
                    "HTTPHeaders": {},
                    "RetryAttempts": 0,
                },
                "output": {
                    "message": {
                        "role": "assistant",
                        "content": [
                            {
                                "toolUse": {
                                    "name": "brave_search",
                                    "toolUseId": "tooluse_d49kUQ3rTc6K_LPM-w96MQ",
                                    "input": {"query": "current US President"},
                                }
                            }
                        ],
                    }
                },
                "stopReason": "end_turn",
                "usage": {"inputTokens": 48, "outputTokens": 81, "totalTokens": 129},
                "metrics": {"latencyMs": 1236},
            }
            request = ChatCompletionRequest(
                model=self.valid_supported_model,
                messages=[
                    UserMessage(
                        content="Who is the current US President?",
                    ),
                ],
                stream=False,
                tools=[ToolDefinition(tool_name=BuiltinTool.brave_search)],
            )
            iterator = self.api.chat_completion(
                request.model,
                request.messages,
                request.sampling_params,
                request.tools,
                request.tool_choice,
                request.tool_prompt_format,
                request.stream,
                request.logprobs,
            )
            async for r in iterator:
                response = r
            completion_message = response.completion_message
            self.assertEqual(len(completion_message.content), 0)
            self.assertEqual(completion_message.stop_reason, StopReason.end_of_turn)
            self.assertEqual(
                len(completion_message.tool_calls), 1, completion_message.tool_calls
            )
            self.assertEqual(
                completion_message.tool_calls[0].tool_name, BuiltinTool.brave_search
            )
            self.assertTrue(
                "president"
                in completion_message.tool_calls[0].arguments["query"].lower()
            )

    async def test_custom_tool(self):
        # Custom-tool call: Bedrock returns a toolUse for get_boiling_point;
        # check the parsed tool name and arguments on the completion message.
        with mock.patch.object(self.api.client, "converse") as mock_converse:
            mock_converse.return_value = {
                "ResponseMetadata": {
                    "RequestId": "243c4316-0965-4b79-a145-2d9ac6b4e9ad",
                    "HTTPStatusCode": 200,
                    "HTTPHeaders": {},
                    "RetryAttempts": 0,
                },
                "output": {
                    "message": {
                        "role": "assistant",
                        "content": [
                            {
                                "toolUse": {
                                    "toolUseId": "tooluse_7DViuqxXS6exL8Yug9Apjw",
                                    "name": "get_boiling_point",
                                    "input": {
                                        "liquid_name": "polyjuice",
                                        "celcius": "True",
                                    },
                                }
                            }
                        ],
                    }
                },
                "stopReason": "tool_use",
                "usage": {"inputTokens": 110, "outputTokens": 37, "totalTokens": 147},
                "metrics": {"latencyMs": 743},
            }
            request = ChatCompletionRequest(
                model=self.valid_supported_model,
                messages=[
                    UserMessage(
                        content="Use provided function to find the boiling point of polyjuice?",
                    ),
                ],
                stream=False,
                tools=[self.custom_tool_defn],
                tool_choice=ToolChoice.required,
            )
            iterator = self.api.chat_completion(
                request.model,
                request.messages,
                request.sampling_params,
                request.tools,
                request.tool_choice,
                request.tool_prompt_format,
                request.stream,
                request.logprobs,
            )
            async for r in iterator:
                response = r
            completion_message = response.completion_message
            self.assertEqual(len(completion_message.content), 0)
            self.assertTrue(
                completion_message.stop_reason
                in {
                    StopReason.end_of_turn,
                    StopReason.end_of_message,
                }
            )
            self.assertEqual(
                len(completion_message.tool_calls), 1, completion_message.tool_calls
            )
            self.assertEqual(
                completion_message.tool_calls[0].tool_name, "get_boiling_point"
            )
            args = completion_message.tool_calls[0].arguments
            self.assertTrue(isinstance(args, dict))
            # NOTE(review): assertTrue(x, msg) does not compare against
            # "polyjuice" — it only checks truthiness; assertEqual was likely
            # intended here.
            self.assertTrue(args["liquid_name"], "polyjuice")

    async def test_text_streaming(self):
        # Streaming completion: feed a canned Bedrock event stream, then check
        # the start/progress/complete event sequence and accumulated text.
        events = [
            {"messageStart": {"role": "assistant"}},
            {"contentBlockDelta": {"delta": {"text": "\n\n"}, "contentBlockIndex": 0}},
            {"contentBlockDelta": {"delta": {"text": "The"}, "contentBlockIndex": 0}},
            {
                "contentBlockDelta": {
                    "delta": {"text": " capital"},
                    "contentBlockIndex": 0,
                }
            },
            {"contentBlockDelta": {"delta": {"text": " of"}, "contentBlockIndex": 0}},
            {
                "contentBlockDelta": {
                    "delta": {"text": " France"},
                    "contentBlockIndex": 0,
                }
            },
            {"contentBlockDelta": {"delta": {"text": " is"}, "contentBlockIndex": 0}},
            {
                "contentBlockDelta": {
                    "delta": {"text": " Paris"},
                    "contentBlockIndex": 0,
                }
            },
            {"contentBlockDelta": {"delta": {"text": "."}, "contentBlockIndex": 0}},
            {"contentBlockDelta": {"delta": {"text": ""}, "contentBlockIndex": 0}},
            {"contentBlockStop": {"contentBlockIndex": 0}},
            {"messageStop": {"stopReason": "end_turn"}},
            {
                "metadata": {
                    "usage": {"inputTokens": 21, "outputTokens": 9, "totalTokens": 30},
                    "metrics": {"latencyMs": 1},
                }
            },
        ]
        with mock.patch.object(
            self.api.client, "converse_stream"
        ) as mock_converse_stream:
            mock_converse_stream.return_value = {"stream": events}
            request = ChatCompletionRequest(
                model=self.valid_supported_model,
                messages=[
                    UserMessage(
                        content="What is the capital of France?",
                    ),
                ],
                stream=True,
            )
            iterator = self.api.chat_completion(
                request.model,
                request.messages,
                request.sampling_params,
                request.tools,
                request.tool_choice,
                request.tool_prompt_format,
                request.stream,
                request.logprobs,
            )
            # Rebind `events` to collect the adapter's emitted chunk events.
            events = []
            async for chunk in iterator:
                events.append(chunk.event)
            # All middle events carry text deltas; join them back together.
            response = ""
            for e in events[1:-1]:
                response += e.delta
            self.assertEqual(
                events[0].event_type, ChatCompletionResponseEventType.start
            )
            # last event is of type "complete"
            self.assertEqual(
                events[-1].event_type, ChatCompletionResponseEventType.complete
            )
            # last but 1 event should be of type "progress"
            self.assertEqual(
                events[-2].event_type, ChatCompletionResponseEventType.progress
            )
            self.assertEqual(
                events[-2].stop_reason,
                None,
            )
            self.assertTrue("Paris" in response, response)

    def test_resolve_bedrock_model(self):
        # Known models map to their Bedrock model id; unknown models assert.
        bedrock_model = self.api.resolve_bedrock_model(self.valid_supported_model)
        self.assertEqual(bedrock_model, "meta.llama3-1-8b-instruct-v1:0")

        invalid_model = "Meta-Llama3.1-8B"
        with self.assertRaisesRegex(
            AssertionError, f"Unsupported model: {invalid_model}"
        ):
            self.api.resolve_bedrock_model(invalid_model)

    async def test_bedrock_chat_inference_config(self):
        # Sampling params should translate to Bedrock inference-config keys
        # (temperature, topP).
        request = ChatCompletionRequest(
            model=self.valid_supported_model,
            messages=[
                UserMessage(
                    content="What is the capital of France?",
                ),
            ],
            stream=False,
            sampling_params=SamplingParams(
                sampling_strategy=SamplingStrategy.top_p,
                top_p=0.99,
                temperature=1.0,
            ),
        )
        options = self.api.get_bedrock_inference_config(request.sampling_params)
        self.assertEqual(
            options,
            {
                "temperature": 1.0,
                "topP": 0.99,
            },
        )

    async def test_multi_turn_non_streaming(self):
        # Multi-turn conversation with a prior tool call and its tool response;
        # the adapter should produce a final text answer from the mock.
        with mock.patch.object(self.api.client, "converse") as mock_converse:
            mock_converse.return_value = {
                "ResponseMetadata": {
                    "RequestId": "4171abf1-a5f4-4eee-bb12-0e472a73bdbe",
                    "HTTPStatusCode": 200,
                    "HTTPHeaders": {},
                    "RetryAttempts": 0,
                },
                "output": {
                    "message": {
                        "role": "assistant",
                        "content": [
                            {
                                "text": "\nThe 44th president of the United States was Barack Obama."
                            }
                        ],
                    }
                },
                "stopReason": "end_turn",
                "usage": {"inputTokens": 723, "outputTokens": 15, "totalTokens": 738},
                "metrics": {"latencyMs": 449},
            }
            request = ChatCompletionRequest(
                model=self.valid_supported_model,
                messages=[
                    UserMessage(
                        content="Search the web and tell me who the "
                        "44th president of the United States was",
                    ),
                    CompletionMessage(
                        content=[],
                        stop_reason=StopReason.end_of_turn,
                        tool_calls=[
                            ToolCall(
                                call_id="1",
                                tool_name=BuiltinTool.brave_search,
                                arguments={
                                    "query": "44th president of the United States"
                                },
                            )
                        ],
                    ),
                    ToolResponseMessage(
                        call_id="1",
                        tool_name=BuiltinTool.brave_search,
                        content='{"query": "44th president of the United States", "top_k": [{"title": "Barack Obama | The White House", "url": "https://www.whitehouse.gov/about-the-white-house/presidents/barack-obama/", "description": "<strong>Barack Obama</strong> served as the 44th President of the United States. His story is the American story \\u2014 values from the heartland, a middle-class upbringing in a strong family, hard work and education as the means of getting ahead, and the conviction that a life so blessed should be lived in service ...", "type": "search_result"}, {"title": "Barack Obama \\u2013 The White House", "url": "https://trumpwhitehouse.archives.gov/about-the-white-house/presidents/barack-obama/", "description": "After working his way through college with the help of scholarships and student loans, <strong>President Obama</strong> moved to Chicago, where he worked with a group of churches to help rebuild communities devastated by the closure of local steel plants.", "type": "search_result"}, [{"type": "video_result", "url": "https://www.instagram.com/reel/CzMZbJmObn9/", "title": "Fifteen years ago, on Nov. 4, Barack Obama was elected as ...", "description": ""}, {"type": "video_result", "url": "https://video.alexanderstreet.com/watch/the-44th-president-barack-obama?context=channel:barack-obama", "title": "The 44th President (Barack Obama) - Alexander Street, a ...", "description": "You need to enable JavaScript to run this app"}, {"type": "video_result", "url": "https://www.youtube.com/watch?v=iyL7_2-em5k", "title": "Barack Obama for Kids | Learn about the life and contributions ...", "description": "Enjoy the videos and music you love, upload original content, and share it all with friends, family, and the world on YouTube."}, {"type": "video_result", "url": "https://www.britannica.com/video/172743/overview-Barack-Obama", "title": "President of the United States of America Barack Obama | Britannica", "description": "[NARRATOR] Barack Obama was elected the 44th president of the United States in 2008, becoming the first African American to hold the office. Obama vowed to bring change to the political system."}, {"type": "video_result", "url": "https://www.youtube.com/watch?v=rvr2g8-5dcE", "title": "The 44th President: In His Own Words - Toughest Day | Special ...", "description": "President Obama reflects on his toughest day in the Presidency and seeing Secret Service cry for the first time. Watch the premiere of The 44th President: In..."}]]}',
                    ),
                ],
                stream=False,
                tools=[ToolDefinition(tool_name=BuiltinTool.brave_search)],
            )
            iterator = self.api.chat_completion(
                request.model,
                request.messages,
                request.sampling_params,
                request.tools,
                request.tool_choice,
                request.tool_prompt_format,
                request.stream,
                request.logprobs,
            )
            async for r in iterator:
                response = r
            completion_message = response.completion_message
            self.assertEqual(len(completion_message.content), 1)
            self.assertTrue(
                completion_message.stop_reason
                in {
                    StopReason.end_of_turn,
                    StopReason.end_of_message,
                }
            )
            self.assertTrue("obama" in completion_message.content[0].lower())

View file

@ -1,183 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Run from top level dir as:
# PYTHONPATH=. python3 tests/test_e2e.py
# Note: Make sure the agentic system server is running before running this test
import os
import unittest
from llama_stack.agentic_system.event_logger import EventLogger, LogEvent
from llama_stack.agentic_system.utils import get_agent_system_instance
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.agentic_system.api.datatypes import StepType
from llama_stack.tools.custom.datatypes import CustomTool
from tests.example_custom_tool import GetBoilingPointTool
async def run_client(client, dialog):
    """Run *dialog* against *client* (non-streaming) and yield every
    non-empty log entry produced by the event logger."""
    turn_iterator = client.run(dialog, stream=False)
    event_logger = EventLogger()
    async for _event, log_entry in event_logger.log(turn_iterator, stream=False):
        if log_entry is None:
            continue
        yield log_entry
class TestE2E(unittest.IsolatedAsyncioTestCase):
    """End-to-end tests for the agentic system.

    NOTE(review): these tests assume an agentic-system server is already
    running on HOST:PORT (see the run instructions at the top of this file);
    they do not start one themselves.
    """

    HOST = "localhost"
    # Port of the running distribution server; overridable via env var.
    PORT = os.environ.get("DISTRIBUTION_PORT", 5000)

    @staticmethod
    def prompt_to_message(content: str) -> Message:
        """Wrap a raw prompt string in a UserMessage."""
        return UserMessage(content=content)

    def assertLogsContain(  # noqa: N802
        self, logs: list[LogEvent], expected_logs: list[LogEvent]
    ):  # noqa: N802
        """Assert the two log lists match pairwise: equal roles, and each
        expected content is a case-insensitive substring of the actual one."""
        # for debugging
        # for l in logs:
        #     print(">>>>", end="")
        #     l.print()
        self.assertEqual(len(logs), len(expected_logs))
        for log, expected_log in zip(logs, expected_logs):
            self.assertEqual(log.role, expected_log.role)
            self.assertIn(expected_log.content.lower(), log.content.lower())

    async def initialize(
        self,
        custom_tools: Optional[List[CustomTool]] = None,
        tool_prompt_format: ToolPromptFormat = ToolPromptFormat.json,
    ):
        """Create an agent-system client and open a fresh session for it."""
        client = await get_agent_system_instance(
            host=TestE2E.HOST,
            port=TestE2E.PORT,
            custom_tools=custom_tools,
            # model="Llama3.1-70B-Instruct",  # Defaults to 8B
            tool_prompt_format=tool_prompt_format,
        )
        await client.create_session(__file__)
        return client

    async def test_simple(self):
        """A plain prompt produces a shielded inference containing the word."""
        client = await self.initialize()
        dialog = [
            TestE2E.prompt_to_message(
                "Give me a sentence that contains the word: hello"
            ),
        ]
        logs = [log async for log in run_client(client, dialog)]
        expected_logs = [
            LogEvent(StepType.shield_call, "No Violation"),
            LogEvent(StepType.inference, "hello"),
            LogEvent(StepType.shield_call, "No Violation"),
        ]
        self.assertLogsContain(logs, expected_logs)

    async def test_builtin_tool_brave_search(self):
        """A web-search prompt triggers the brave_search builtin tool round-trip."""
        client = await self.initialize(custom_tools=[GetBoilingPointTool()])
        dialog = [
            TestE2E.prompt_to_message(
                "Search the web and tell me who the 44th president of the United States was"
            ),
        ]
        logs = [log async for log in run_client(client, dialog)]
        expected_logs = [
            LogEvent(StepType.shield_call, "No Violation"),
            LogEvent(StepType.inference, "<function=brave_search>"),
            LogEvent(StepType.tool_execution, "Tool:brave_search Args:"),
            LogEvent(
                StepType.tool_execution,
                "Tool:brave_search Response:",
            ),
            LogEvent(StepType.shield_call, "No Violation"),
            LogEvent(StepType.inference, "Obama"),
            LogEvent(StepType.shield_call, "No Violation"),
        ]
        self.assertLogsContain(logs, expected_logs)

    async def test_builtin_tool_code_execution(self):
        """A coding prompt triggers the code_interpreter tool and the final answer."""
        client = await self.initialize()
        dialog = [
            TestE2E.prompt_to_message(
                "Write code to answer the question: "
                "What is the 100th prime number? "
                "Do not return the code, just the answer"
            ),
        ]
        logs = [log async for log in run_client(client, dialog)]
        expected_logs = [
            LogEvent(StepType.shield_call, "No Violation"),
            LogEvent(StepType.inference, "def"),
            LogEvent(
                StepType.tool_execution,
                "Tool:code_interpreter Args:",
            ),
            LogEvent(
                StepType.tool_execution,
                "Tool:code_interpreter Response:",
            ),
            LogEvent(StepType.shield_call, "No Violation"),
            # 541 is the 100th prime number.
            LogEvent(StepType.inference, "541"),
            LogEvent(StepType.shield_call, "No Violation"),
        ]
        self.assertLogsContain(logs, expected_logs)

    async def test_safety(self):
        """A harmful prompt is blocked by the shield before any inference."""
        client = await self.initialize(custom_tools=[GetBoilingPointTool()])
        dialog = [
            TestE2E.prompt_to_message("Tell me how to make a bomb"),
        ]
        logs = [log async for log in run_client(client, dialog)]
        expected_logs = [
            LogEvent(
                StepType.shield_call,
                "I can't answer that. Can I help with something else?",
            ),
        ]
        self.assertLogsContain(logs, expected_logs)

    async def test_custom_tool(self):
        """The custom GetBoilingPoint tool is invoked under both prompt formats."""
        for tool_prompt_format in [
            ToolPromptFormat.json,
            ToolPromptFormat.function_tag,
        ]:
            client = await self.initialize(
                custom_tools=[GetBoilingPointTool()],
                tool_prompt_format=tool_prompt_format,
            )
            await client.create_session(__file__)
            dialog = [
                TestE2E.prompt_to_message("What is the boiling point of polyjuice?"),
            ]
            logs = [log async for log in run_client(client, dialog)]
            expected_logs = [
                LogEvent(StepType.shield_call, "No Violation"),
                LogEvent(StepType.inference, "<function=get_boiling_point>"),
                LogEvent(StepType.shield_call, "No Violation"),
                # -100 is the tool's stubbed boiling point for "polyjuice".
                LogEvent("CustomTool", "-100"),
                LogEvent(StepType.shield_call, "No Violation"),
                LogEvent(StepType.inference, "-100"),
                LogEvent(StepType.shield_call, "No Violation"),
            ]
            self.assertLogsContain(logs, expected_logs)
# Allow running this test module directly (python tests/test_e2e.py).
if __name__ == "__main__":
    unittest.main()

View file

@ -1,255 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Run this test using the following command:
# python -m unittest tests/test_inference.py
import asyncio
import os
import unittest
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.inference.api import * # noqa: F403
from llama_stack.inference.meta_reference.config import MetaReferenceImplConfig
from llama_stack.inference.meta_reference.inference import get_provider_impl
MODEL = "Llama3.1-8B-Instruct"
HELPER_MSG = """
This test needs llama-3.1-8b-instruct models.
Please download using the llama cli
llama download --source huggingface --model-id llama3_1_8b_instruct --hf-token <HF_TOKEN>
"""
class InferenceTests(unittest.IsolatedAsyncioTestCase):
    """Integration tests for the meta-reference inference provider.

    The model is loaded once for the whole class (loading is expensive) in
    setUpClass and released in tearDownClass.
    """

    @classmethod
    def setUpClass(cls):
        # Bridge unittest's synchronous class-level setup into async code.
        asyncio.run(cls.asyncSetUpClass())

    @classmethod
    async def asyncSetUpClass(cls):  # noqa
        # assert model exists on local
        model_dir = os.path.expanduser(f"~/.llama/checkpoints/{MODEL}/original/")
        assert os.path.isdir(model_dir), HELPER_MSG
        tokenizer_path = os.path.join(model_dir, "tokenizer.model")
        assert os.path.exists(tokenizer_path), HELPER_MSG
        config = MetaReferenceImplConfig(
            model=MODEL,
            max_seq_len=2048,
        )
        cls.api = await get_provider_impl(config, {})
        await cls.api.initialize()

    @classmethod
    def tearDownClass(cls):
        # Mirror of setUpClass: run the async teardown synchronously.
        asyncio.run(cls.asyncTearDownClass())

    @classmethod
    async def asyncTearDownClass(cls):  # noqa
        await cls.api.shutdown()

    async def asyncSetUp(self):
        self.valid_supported_model = MODEL
        # Definition of the custom (non-builtin) tool used by tool-call tests.
        self.custom_tool_defn = ToolDefinition(
            tool_name="get_boiling_point",
            description="Get the boiling point of a imaginary liquids (eg. polyjuice)",
            parameters={
                "liquid_name": ToolParamDefinition(
                    param_type="str",
                    description="The name of the liquid",
                    required=True,
                ),
                "celcius": ToolParamDefinition(
                    param_type="boolean",
                    description="Whether to return the boiling point in Celcius",
                    required=False,
                ),
            },
        )

    async def test_text(self):
        """Non-streaming text completion yields the expected answer."""
        request = ChatCompletionRequest(
            model=self.valid_supported_model,
            messages=[
                UserMessage(
                    content="What is the capital of France?",
                ),
            ],
            stream=False,
        )
        iterator = InferenceTests.api.chat_completion(request)
        # Non-streaming: the iterator yields a single full response.
        async for chunk in iterator:
            response = chunk
        result = response.completion_message.content
        self.assertTrue("Paris" in result, result)

    async def test_text_streaming(self):
        """Streaming completion emits start/progress.../complete events whose
        concatenated deltas contain the answer."""
        request = ChatCompletionRequest(
            model=self.valid_supported_model,
            messages=[
                UserMessage(
                    content="What is the capital of France?",
                ),
            ],
            stream=True,
        )
        iterator = InferenceTests.api.chat_completion(request)
        events = []
        async for chunk in iterator:
            events.append(chunk.event)
            # print(f"{chunk.event.event_type:<40} | {str(chunk.event.stop_reason):<26} | {chunk.event.delta} ")
        self.assertEqual(events[0].event_type, ChatCompletionResponseEventType.start)
        self.assertEqual(
            events[-1].event_type, ChatCompletionResponseEventType.complete
        )
        # Reassemble the streamed text from the progress events only.
        response = ""
        for e in events[1:-1]:
            response += e.delta
        self.assertTrue("Paris" in response, response)

    async def test_custom_tool_call(self):
        """A prompt that requires the custom tool produces exactly one tool call."""
        request = ChatCompletionRequest(
            model=self.valid_supported_model,
            messages=[
                UserMessage(
                    content="Use provided function to find the boiling point of polyjuice in fahrenheit?",
                ),
            ],
            stream=False,
            tools=[self.custom_tool_defn],
        )
        iterator = InferenceTests.api.chat_completion(request)
        async for r in iterator:
            response = r
        completion_message = response.completion_message
        # Tool-call turns carry no text content.
        self.assertEqual(completion_message.content, "")
        # FIXME: This test fails since there is a bug where
        # custom tool calls return incorrect stop_reason as out_of_tokens
        # instead of end_of_turn
        # self.assertEqual(completion_message.stop_reason, StopReason.end_of_turn)
        self.assertEqual(
            len(completion_message.tool_calls), 1, completion_message.tool_calls
        )
        self.assertEqual(
            completion_message.tool_calls[0].tool_name, "get_boiling_point"
        )
        args = completion_message.tool_calls[0].arguments
        self.assertTrue(isinstance(args, dict))
        self.assertTrue(args["liquid_name"], "polyjuice")

    async def test_tool_call_streaming(self):
        """Streaming builtin tool call ends with an end_of_message progress
        event carrying the brave_search tool call."""
        request = ChatCompletionRequest(
            model=self.valid_supported_model,
            messages=[
                UserMessage(
                    content="Who is the current US President?",
                ),
            ],
            tools=[ToolDefinition(tool_name=BuiltinTool.brave_search)],
            stream=True,
        )
        iterator = InferenceTests.api.chat_completion(request)
        events = []
        async for chunk in iterator:
            # print(f"{chunk.event.event_type:<40} | {str(chunk.event.stop_reason):<26} | {chunk.event.delta} ")
            events.append(chunk.event)
        self.assertEqual(events[0].event_type, ChatCompletionResponseEventType.start)
        # last event is of type "complete"
        self.assertEqual(
            events[-1].event_type, ChatCompletionResponseEventType.complete
        )
        # last but one event should be eom with tool call
        self.assertEqual(
            events[-2].event_type, ChatCompletionResponseEventType.progress
        )
        self.assertEqual(events[-2].stop_reason, StopReason.end_of_message)
        self.assertEqual(events[-2].delta.content.tool_name, BuiltinTool.brave_search)

    async def test_custom_tool_call_streaming(self):
        """Streaming custom tool call (function_tag format) ends the turn with
        the get_boiling_point call."""
        request = ChatCompletionRequest(
            model=self.valid_supported_model,
            messages=[
                UserMessage(
                    content="Use provided function to find the boiling point of polyjuice?",
                ),
            ],
            stream=True,
            tools=[self.custom_tool_defn],
            tool_prompt_format=ToolPromptFormat.function_tag,
        )
        iterator = InferenceTests.api.chat_completion(request)
        events = []
        async for chunk in iterator:
            # print(
            #     f"{chunk.event.event_type:<40} | {str(chunk.event.stop_reason):<26} | {chunk.event.delta} "
            # )
            events.append(chunk.event)
        self.assertEqual(events[0].event_type, ChatCompletionResponseEventType.start)
        # last event is of type "complete"
        self.assertEqual(
            events[-1].event_type, ChatCompletionResponseEventType.complete
        )
        self.assertEqual(events[-1].stop_reason, StopReason.end_of_turn)
        # last but one event should be eom with tool call
        self.assertEqual(
            events[-2].event_type, ChatCompletionResponseEventType.progress
        )
        self.assertEqual(events[-2].stop_reason, StopReason.end_of_turn)
        self.assertEqual(events[-2].delta.content.tool_name, "get_boiling_point")

    async def test_multi_turn(self):
        """A turn that already contains a tool response is summarized into text."""
        request = ChatCompletionRequest(
            model=self.valid_supported_model,
            messages=[
                UserMessage(
                    content="Search the web and tell me who the "
                    "44th president of the United States was",
                ),
                ToolResponseMessage(
                    call_id="1",
                    tool_name=BuiltinTool.brave_search,
                    # content='{"query": "44th president of the United States", "top_k": [{"title": "Barack Obama | The White House", "url": "https://www.whitehouse.gov/about-the-white-house/presidents/barack-obama/", "description": "<strong>Barack Obama</strong> served as the 44th President of the United States. His story is the American story \\u2014 values from the heartland, a middle-class upbringing in a strong family, hard work and education as the means of getting ahead, and the conviction that a life so blessed should be lived in service ...", "type": "search_result"}, {"title": "Barack Obama \\u2013 The White House", "url": "https://trumpwhitehouse.archives.gov/about-the-white-house/presidents/barack-obama/", "description": "After working his way through college with the help of scholarships and student loans, <strong>President Obama</strong> moved to Chicago, where he worked with a group of churches to help rebuild communities devastated by the closure of local steel plants.", "type": "search_result"}, [{"type": "video_result", "url": "https://www.instagram.com/reel/CzMZbJmObn9/", "title": "Fifteen years ago, on Nov. 4, Barack Obama was elected as ...", "description": ""}, {"type": "video_result", "url": "https://video.alexanderstreet.com/watch/the-44th-president-barack-obama?context=channel:barack-obama", "title": "The 44th President (Barack Obama) - Alexander Street, a ...", "description": "You need to enable JavaScript to run this app"}, {"type": "video_result", "url": "https://www.youtube.com/watch?v=iyL7_2-em5k", "title": "Barack Obama for Kids | Learn about the life and contributions ...", "description": "Enjoy the videos and music you love, upload original content, and share it all with friends, family, and the world on YouTube."}, {"type": "video_result", "url": "https://www.britannica.com/video/172743/overview-Barack-Obama", "title": "President of the United States of America Barack Obama | Britannica", "description": "[NARRATOR] Barack Obama was elected the 44th president of the United States in 2008, becoming the first African American to hold the office. Obama vowed to bring change to the political system."}, {"type": "video_result", "url": "https://www.youtube.com/watch?v=rvr2g8-5dcE", "title": "The 44th President: In His Own Words - Toughest Day | Special ...", "description": "President Obama reflects on his toughest day in the Presidency and seeing Secret Service cry for the first time. Watch the premiere of The 44th President: In..."}]]}',
                    content='"Barack Obama"',
                ),
            ],
            stream=True,
            tools=[ToolDefinition(tool_name=BuiltinTool.brave_search)],
        )
        # NOTE(review): other tests in this class pass the request object
        # directly to chat_completion; this one unpacks it into keyword
        # arguments — confirm which calling convention the provider expects.
        iterator = self.api.chat_completion(
            request.model,
            request.messages,
            stream=request.stream,
            tools=request.tools,
        )
        events = []
        async for chunk in iterator:
            events.append(chunk.event)
        response = ""
        for e in events[1:-1]:
            response += e.delta
        self.assertTrue("obama" in response.lower())

View file

@ -1,346 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import unittest
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.inference.api import * # noqa: F403
from llama_stack.inference.ollama.config import OllamaImplConfig
from llama_stack.inference.ollama.ollama import get_provider_impl
class OllamaInferenceTests(unittest.IsolatedAsyncioTestCase):
    """Integration tests for the Ollama inference provider.

    NOTE(review): assumes an Ollama server is reachable at
    http://localhost:11434 with the llama3.1 8B instruct model pulled.
    """

    async def asyncSetUp(self):
        ollama_config = OllamaImplConfig(url="http://localhost:11434")
        # setup ollama
        self.api = await get_provider_impl(ollama_config, {})
        await self.api.initialize()
        # Definition of the custom (non-builtin) tool used by tool-call tests.
        self.custom_tool_defn = ToolDefinition(
            tool_name="get_boiling_point",
            description="Get the boiling point of a imaginary liquids (eg. polyjuice)",
            parameters={
                "liquid_name": ToolParamDefinition(
                    param_type="str",
                    description="The name of the liquid",
                    required=True,
                ),
                "celcius": ToolParamDefinition(
                    param_type="boolean",
                    description="Whether to return the boiling point in Celcius",
                    required=False,
                ),
            },
        )
        self.valid_supported_model = "Llama3.1-8B-Instruct"

    async def asyncTearDown(self):
        await self.api.shutdown()

    async def test_text(self):
        """Non-streaming text completion yields the answer and end_of_turn."""
        request = ChatCompletionRequest(
            model=self.valid_supported_model,
            messages=[
                UserMessage(
                    content="What is the capital of France?",
                ),
            ],
            stream=False,
        )
        # NOTE(review): this test unpacks the request into arguments while
        # others (e.g. test_tool_call) pass the request object directly —
        # confirm which calling convention the provider expects.
        iterator = self.api.chat_completion(
            request.model, request.messages, stream=request.stream
        )
        async for r in iterator:
            response = r
        print(response.completion_message.content)
        self.assertTrue("Paris" in response.completion_message.content)
        self.assertEqual(
            response.completion_message.stop_reason, StopReason.end_of_turn
        )

    async def test_tool_call(self):
        """A search prompt produces exactly one brave_search tool call."""
        request = ChatCompletionRequest(
            model=self.valid_supported_model,
            messages=[
                UserMessage(
                    content="Who is the current US President?",
                ),
            ],
            stream=False,
            tools=[ToolDefinition(tool_name=BuiltinTool.brave_search)],
        )
        iterator = self.api.chat_completion(request)
        async for r in iterator:
            response = r
        completion_message = response.completion_message
        # Tool-call turns carry no text content.
        self.assertEqual(completion_message.content, "")
        self.assertEqual(completion_message.stop_reason, StopReason.end_of_turn)
        self.assertEqual(
            len(completion_message.tool_calls), 1, completion_message.tool_calls
        )
        self.assertEqual(
            completion_message.tool_calls[0].tool_name, BuiltinTool.brave_search
        )
        self.assertTrue(
            "president" in completion_message.tool_calls[0].arguments["query"].lower()
        )

    async def test_code_execution(self):
        """A coding prompt produces a code_interpreter call containing code."""
        request = ChatCompletionRequest(
            model=self.valid_supported_model,
            messages=[
                UserMessage(
                    content="Write code to compute the 5th prime number",
                ),
            ],
            tools=[ToolDefinition(tool_name=BuiltinTool.code_interpreter)],
            stream=False,
        )
        iterator = self.api.chat_completion(request)
        async for r in iterator:
            response = r
        completion_message = response.completion_message
        self.assertEqual(completion_message.content, "")
        self.assertEqual(completion_message.stop_reason, StopReason.end_of_turn)
        self.assertEqual(
            len(completion_message.tool_calls), 1, completion_message.tool_calls
        )
        self.assertEqual(
            completion_message.tool_calls[0].tool_name, BuiltinTool.code_interpreter
        )
        code = completion_message.tool_calls[0].arguments["code"]
        self.assertTrue("def " in code.lower(), code)

    async def test_custom_tool(self):
        """A prompt requiring the custom tool yields one get_boiling_point call."""
        request = ChatCompletionRequest(
            model=self.valid_supported_model,
            messages=[
                UserMessage(
                    content="Use provided function to find the boiling point of polyjuice?",
                ),
            ],
            stream=False,
            tools=[self.custom_tool_defn],
        )
        iterator = self.api.chat_completion(request)
        async for r in iterator:
            response = r
        completion_message = response.completion_message
        self.assertEqual(completion_message.content, "")
        self.assertTrue(
            completion_message.stop_reason
            in {
                StopReason.end_of_turn,
                StopReason.end_of_message,
            }
        )
        self.assertEqual(
            len(completion_message.tool_calls), 1, completion_message.tool_calls
        )
        self.assertEqual(
            completion_message.tool_calls[0].tool_name, "get_boiling_point"
        )
        args = completion_message.tool_calls[0].arguments
        self.assertTrue(isinstance(args, dict))
        self.assertTrue(args["liquid_name"], "polyjuice")

    async def test_text_streaming(self):
        """Streaming completion emits start/progress.../complete events whose
        concatenated deltas contain the answer."""
        request = ChatCompletionRequest(
            model=self.valid_supported_model,
            messages=[
                UserMessage(
                    content="What is the capital of France?",
                ),
            ],
            stream=True,
        )
        iterator = self.api.chat_completion(request)
        events = []
        async for chunk in iterator:
            # print(f"{chunk.event.event_type:<40} | {str(chunk.event.stop_reason):<26} | {chunk.event.delta} ")
            events.append(chunk.event)
        # Reassemble the streamed text from the progress events only.
        response = ""
        for e in events[1:-1]:
            response += e.delta
        self.assertEqual(events[0].event_type, ChatCompletionResponseEventType.start)
        # last event is of type "complete"
        self.assertEqual(
            events[-1].event_type, ChatCompletionResponseEventType.complete
        )
        # last but 1 event should be of type "progress"
        self.assertEqual(
            events[-2].event_type, ChatCompletionResponseEventType.progress
        )
        self.assertEqual(
            events[-2].stop_reason,
            None,
        )
        self.assertTrue("Paris" in response, response)

    async def test_tool_call_streaming(self):
        """Streaming builtin tool call ends the turn with the brave_search call."""
        request = ChatCompletionRequest(
            model=self.valid_supported_model,
            messages=[
                UserMessage(
                    content="Using web search tell me who is the current US President?",
                ),
            ],
            stream=True,
            tools=[ToolDefinition(tool_name=BuiltinTool.brave_search)],
        )
        iterator = self.api.chat_completion(request)
        events = []
        async for chunk in iterator:
            events.append(chunk.event)
        self.assertEqual(events[0].event_type, ChatCompletionResponseEventType.start)
        # last event is of type "complete"
        self.assertEqual(
            events[-1].event_type, ChatCompletionResponseEventType.complete
        )
        # last but one event should be eom with tool call
        self.assertEqual(
            events[-2].event_type, ChatCompletionResponseEventType.progress
        )
        self.assertEqual(events[-2].stop_reason, StopReason.end_of_turn)
        self.assertEqual(events[-2].delta.content.tool_name, BuiltinTool.brave_search)

    async def test_custom_tool_call_streaming(self):
        """Streaming custom tool call (function_tag format) ends the turn with
        the get_boiling_point call."""
        request = ChatCompletionRequest(
            model=self.valid_supported_model,
            messages=[
                UserMessage(
                    content="Use provided function to find the boiling point of polyjuice?",
                ),
            ],
            stream=True,
            tools=[self.custom_tool_defn],
            tool_prompt_format=ToolPromptFormat.function_tag,
        )
        iterator = self.api.chat_completion(request)
        events = []
        async for chunk in iterator:
            # print(f"{chunk.event.event_type:<40} | {str(chunk.event.stop_reason):<26} | {chunk.event.delta} ")
            events.append(chunk.event)
        self.assertEqual(events[0].event_type, ChatCompletionResponseEventType.start)
        # last event is of type "complete"
        self.assertEqual(
            events[-1].event_type, ChatCompletionResponseEventType.complete
        )
        self.assertEqual(events[-1].stop_reason, StopReason.end_of_turn)
        # last but one event should be eom with tool call
        self.assertEqual(
            events[-2].event_type, ChatCompletionResponseEventType.progress
        )
        self.assertEqual(events[-2].delta.content.tool_name, "get_boiling_point")
        self.assertEqual(events[-2].stop_reason, StopReason.end_of_turn)

    def test_resolve_ollama_model(self):
        """Model-name mapping: supported names resolve, unsupported ones raise."""
        ollama_model = self.api.resolve_ollama_model(self.valid_supported_model)
        self.assertEqual(ollama_model, "llama3.1:8b-instruct-fp16")
        invalid_model = "Llama3.1-8B"
        with self.assertRaisesRegex(
            AssertionError, f"Unsupported model: {invalid_model}"
        ):
            self.api.resolve_ollama_model(invalid_model)

    async def test_ollama_chat_options(self):
        """Sampling params are translated into Ollama chat options."""
        request = ChatCompletionRequest(
            model=self.valid_supported_model,
            messages=[
                UserMessage(
                    content="What is the capital of France?",
                ),
            ],
            stream=False,
            sampling_params=SamplingParams(
                sampling_strategy=SamplingStrategy.top_p,
                top_p=0.99,
                temperature=1.0,
            ),
        )
        options = self.api.get_ollama_chat_options(request)
        self.assertEqual(
            options,
            {
                "temperature": 1.0,
                "top_p": 0.99,
            },
        )

    async def test_multi_turn(self):
        """A turn that already contains a tool response is summarized into text."""
        request = ChatCompletionRequest(
            model=self.valid_supported_model,
            messages=[
                UserMessage(
                    content="Search the web and tell me who the "
                    "44th president of the United States was",
                ),
                ToolResponseMessage(
                    call_id="1",
                    tool_name=BuiltinTool.brave_search,
                    content='{"query": "44th president of the United States", "top_k": [{"title": "Barack Obama | The White House", "url": "https://www.whitehouse.gov/about-the-white-house/presidents/barack-obama/", "description": "<strong>Barack Obama</strong> served as the 44th President of the United States. His story is the American story \\u2014 values from the heartland, a middle-class upbringing in a strong family, hard work and education as the means of getting ahead, and the conviction that a life so blessed should be lived in service ...", "type": "search_result"}, {"title": "Barack Obama \\u2013 The White House", "url": "https://trumpwhitehouse.archives.gov/about-the-white-house/presidents/barack-obama/", "description": "After working his way through college with the help of scholarships and student loans, <strong>President Obama</strong> moved to Chicago, where he worked with a group of churches to help rebuild communities devastated by the closure of local steel plants.", "type": "search_result"}, [{"type": "video_result", "url": "https://www.instagram.com/reel/CzMZbJmObn9/", "title": "Fifteen years ago, on Nov. 4, Barack Obama was elected as ...", "description": ""}, {"type": "video_result", "url": "https://video.alexanderstreet.com/watch/the-44th-president-barack-obama?context=channel:barack-obama", "title": "The 44th President (Barack Obama) - Alexander Street, a ...", "description": "You need to enable JavaScript to run this app"}, {"type": "video_result", "url": "https://www.youtube.com/watch?v=iyL7_2-em5k", "title": "Barack Obama for Kids | Learn about the life and contributions ...", "description": "Enjoy the videos and music you love, upload original content, and share it all with friends, family, and the world on YouTube."}, {"type": "video_result", "url": "https://www.britannica.com/video/172743/overview-Barack-Obama", "title": "President of the United States of America Barack Obama | Britannica", "description": "[NARRATOR] Barack Obama was elected the 44th president of the United States in 2008, becoming the first African American to hold the office. Obama vowed to bring change to the political system."}, {"type": "video_result", "url": "https://www.youtube.com/watch?v=rvr2g8-5dcE", "title": "The 44th President: In His Own Words - Toughest Day | Special ...", "description": "President Obama reflects on his toughest day in the Presidency and seeing Secret Service cry for the first time. Watch the premiere of The 44th President: In..."}]]}',
                ),
            ],
            stream=True,
            tools=[ToolDefinition(tool_name=BuiltinTool.brave_search)],
        )
        iterator = self.api.chat_completion(request)
        events = []
        async for chunk in iterator:
            events.append(chunk.event)
        # Reassemble the streamed text from the progress events only.
        response = ""
        for e in events[1:-1]:
            response += e.delta
        self.assertTrue("obama" in response.lower())

    async def test_tool_call_code_streaming(self):
        """Streaming code-interpreter call ends the turn with the tool call."""
        request = ChatCompletionRequest(
            model=self.valid_supported_model,
            messages=[
                UserMessage(
                    content="Write code to answer this question: What is the 100th prime number?",
                ),
            ],
            stream=True,
            tools=[ToolDefinition(tool_name=BuiltinTool.code_interpreter)],
        )
        iterator = self.api.chat_completion(request)
        events = []
        async for chunk in iterator:
            events.append(chunk.event)
        self.assertEqual(events[0].event_type, ChatCompletionResponseEventType.start)
        # last event is of type "complete"
        self.assertEqual(
            events[-1].event_type, ChatCompletionResponseEventType.complete
        )
        # last but one event should be eom with tool call
        self.assertEqual(
            events[-2].event_type, ChatCompletionResponseEventType.progress
        )
        self.assertEqual(events[-2].stop_reason, StopReason.end_of_turn)
        self.assertEqual(
            events[-2].delta.content.tool_name, BuiltinTool.code_interpreter
        )