Merge branch 'main' into add-mcp-authentication-param

2025-12-03 18:00:36 +00:00 · 2025-11-07 14:26:06 -08:00 · 2025-11-07 14:26:06 -08:00 · 1a7ba683e3
commit 1a7ba683e3
parent 9e972cf20c 8f4c431370
1075 changed files with 125472 additions and 3083 deletions
--- a/src/llama_stack/init.py
+++ b/src/llama_stack/init.py
@ -3,8 +3,3 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-
-from llama_stack.core.library_client import (  # noqa: F401
-    AsyncLlamaStackAsLibraryClient,
-    LlamaStackAsLibraryClient,
-)
--- a/src/llama_stack/apis/agents/openai_responses.py
+++ b/src/llama_stack/apis/agents/openai_responses.py
@ -403,7 +403,7 @@ class OpenAIResponseText(BaseModel):


 # Must match type Literals of OpenAIResponseInputToolWebSearch below
-WebSearchToolTypes = ["web_search", "web_search_preview", "web_search_preview_2025_03_11"]
+WebSearchToolTypes = ["web_search", "web_search_preview", "web_search_preview_2025_03_11", "web_search_2025_08_26"]


@json_schema_type
@ -415,9 +415,12 @@ class OpenAIResponseInputToolWebSearch(BaseModel):
    """

    # Must match values of WebSearchToolTypes above
-    type: Literal["web_search"] | Literal["web_search_preview"] | Literal["web_search_preview_2025_03_11"] = (
-        "web_search"
-    )
+    type: (
+        Literal["web_search"]
+        | Literal["web_search_preview"]
+        | Literal["web_search_preview_2025_03_11"]
+        | Literal["web_search_2025_08_26"]
+    ) = "web_search"
    # TODO: actually use search_context_size somewhere...
    search_context_size: str | None = Field(default="medium", pattern="^low|medium|high$")
    # TODO: add user_location
--- a/src/llama_stack/apis/common/responses.py
+++ b/src/llama_stack/apis/common/responses.py
@ -34,3 +34,44 @@ class PaginatedResponse(BaseModel):
    data: list[dict[str, Any]]
    has_more: bool
    url: str | None = None
+
+
+# This is a short-term solution to allow the inference API to return metrics.
+# The ideal way to do this is to have a way for all response types to include metrics
+# and for all metric events logged to the telemetry API to be included with the response.
+# To do this, we will need to augment all response types with a metrics field.
+# We have hit a blocker from the Stainless SDK that prevents us from doing this.
+# The blocker is that if we were to augment the response types that have a data field
+# in them like so:
+# class ListModelsResponse(BaseModel):
+# metrics: Optional[List[MetricEvent]] = None
+# data: List[Models]
+# ...
+# The client SDK will need to access the data by using a .data field, which is not
+# ergonomic. The Stainless SDK does support unwrapping the response type, but it
+# requires the response type to have only a single field.
+
+# We will need a way in the client SDK to signal that the metrics are needed
+# and if they are needed, the client SDK has to return the full response type
+# without unwrapping it.
+
+
+@json_schema_type
+class MetricInResponse(BaseModel):
+    """A metric value included in API responses.
+    :param metric: The name of the metric
+    :param value: The numeric value of the metric
+    :param unit: (Optional) The unit of measurement for the metric value
+    """
+
+    metric: str
+    value: int | float
+    unit: str | None = None
+
+
+class MetricResponseMixin(BaseModel):
+    """Mixin class for API responses that can include metrics.
+    :param metrics: (Optional) List of metrics associated with the API response
+    """
+
+    metrics: list[MetricInResponse] | None = None
--- a/src/llama_stack/apis/common/tracing.py
+++ b/src/llama_stack/apis/common/tracing.py
@ -0,0 +1,22 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+
+def telemetry_traceable(cls):
+    """
+    Mark a protocol for automatic tracing when telemetry is enabled.
+
+    This is a metadata-only decorator with no dependencies on core.
+    Actual tracing is applied by core routers at runtime if telemetry is enabled.
+
+    Usage:
+        @runtime_checkable
+        @telemetry_traceable
+        class MyProtocol(Protocol):
+            ...
+    """
+    cls.__marked_for_tracing__ = True
+    return cls
--- a/src/llama_stack/apis/conversations/conversations.py
+++ b/src/llama_stack/apis/conversations/conversations.py
@ -20,8 +20,8 @@ from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseOutputMessageMCPListTools,
    OpenAIResponseOutputMessageWebSearchToolCall,
 )
+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, register_schema, webmethod

 Metadata = dict[str, str]
@ -157,7 +157,7 @@ class ConversationItemDeletedResource(BaseModel):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class Conversations(Protocol):
    """Conversations

--- a/src/llama_stack/apis/files/files.py
+++ b/src/llama_stack/apis/files/files.py
@ -11,8 +11,8 @@ from fastapi import File, Form, Response, UploadFile
 from pydantic import BaseModel, Field

 from llama_stack.apis.common.responses import Order
+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod


@ -102,7 +102,7 @@ class OpenAIFileDeleteResponse(BaseModel):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class Files(Protocol):
    """Files

--- a/src/llama_stack/apis/inference/inference.py
+++ b/src/llama_stack/apis/inference/inference.py
@ -19,11 +19,10 @@ from pydantic import BaseModel, Field, field_validator
 from typing_extensions import TypedDict

 from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent
-from llama_stack.apis.common.responses import Order
+from llama_stack.apis.common.responses import MetricResponseMixin, Order
+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.models import Model
 from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
-from llama_stack.core.telemetry.telemetry import MetricResponseMixin
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.models.llama.datatypes import (
    BuiltinTool,
    StopReason,
@ -1160,7 +1159,7 @@ class OpenAIEmbeddingsRequestWithExtraBody(BaseModel, extra="allow"):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class InferenceProvider(Protocol):
    """
    This protocol defines the interface that should be implemented by all inference providers.
--- a/src/llama_stack/apis/models/models.py
+++ b/src/llama_stack/apis/models/models.py
@ -9,9 +9,9 @@ from typing import Any, Literal, Protocol, runtime_checkable

 from pydantic import BaseModel, ConfigDict, Field, field_validator

+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.resource import Resource, ResourceType
 from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod


@ -105,7 +105,7 @@ class OpenAIListModelsResponse(BaseModel):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class Models(Protocol):
    async def list_models(self) -> ListModelsResponse:
        """List all models.
--- a/src/llama_stack/apis/prompts/prompts.py
+++ b/src/llama_stack/apis/prompts/prompts.py
@ -10,8 +10,8 @@ from typing import Protocol, runtime_checkable

 from pydantic import BaseModel, Field, field_validator, model_validator

+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod


@ -92,7 +92,7 @@ class ListPromptsResponse(BaseModel):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class Prompts(Protocol):
    """Prompts

--- a/src/llama_stack/apis/safety/safety.py
+++ b/src/llama_stack/apis/safety/safety.py
@ -9,10 +9,10 @@ from typing import Any, Protocol, runtime_checkable

 from pydantic import BaseModel, Field

+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.inference import OpenAIMessageParam
 from llama_stack.apis.shields import Shield
 from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod


@ -94,7 +94,7 @@ class ShieldStore(Protocol):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class Safety(Protocol):
    """Safety

--- a/src/llama_stack/apis/shields/shields.py
+++ b/src/llama_stack/apis/shields/shields.py
@ -8,9 +8,9 @@ from typing import Any, Literal, Protocol, runtime_checkable

 from pydantic import BaseModel

+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.resource import Resource, ResourceType
 from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod


@ -48,7 +48,7 @@ class ListShieldsResponse(BaseModel):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class Shields(Protocol):
    @webmethod(route="/shields", method="GET", level=LLAMA_STACK_API_V1)
    async def list_shields(self) -> ListShieldsResponse:
--- a/src/llama_stack/apis/tools/tools.py
+++ b/src/llama_stack/apis/tools/tools.py
@ -11,9 +11,9 @@ from pydantic import BaseModel
 from typing_extensions import runtime_checkable

 from llama_stack.apis.common.content_types import URL, InterleavedContent
+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.resource import Resource, ResourceType
 from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod


@ -107,7 +107,7 @@ class ListToolDefsResponse(BaseModel):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class ToolGroups(Protocol):
    @webmethod(route="/toolgroups", method="POST", level=LLAMA_STACK_API_V1)
    async def register_tool_group(
@ -189,7 +189,7 @@ class SpecialToolGroup(Enum):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class ToolRuntime(Protocol):
    tool_store: ToolStore | None = None

--- a/src/llama_stack/apis/vector_io/vector_io.py
+++ b/src/llama_stack/apis/vector_io/vector_io.py
@ -13,10 +13,10 @@ from typing import Annotated, Any, Literal, Protocol, runtime_checkable
 from fastapi import Body
 from pydantic import BaseModel, Field

+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.inference import InterleavedContent
 from llama_stack.apis.vector_stores import VectorStore
 from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod
 from llama_stack.strong_typing.schema import register_schema

@ -260,7 +260,7 @@ class VectorStoreSearchResponsePage(BaseModel):
    """

    object: str = "vector_store.search_results.page"
-    search_query: str
+    search_query: list[str]
    data: list[VectorStoreSearchResponse]
    has_more: bool = False
    next_page: str | None = None
@ -478,7 +478,7 @@ class OpenAICreateVectorStoreRequestWithExtraBody(BaseModel, extra="allow"):
    name: str | None = None
    file_ids: list[str] | None = None
    expires_after: dict[str, Any] | None = None
-    chunking_strategy: dict[str, Any] | None = None
+    chunking_strategy: VectorStoreChunkingStrategy | None = None
    metadata: dict[str, Any] | None = None


@ -502,7 +502,7 @@ class VectorStoreTable(Protocol):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class VectorIO(Protocol):
    vector_store_table: VectorStoreTable | None = None

--- a/src/llama_stack/core/library_client.py
+++ b/src/llama_stack/core/library_client.py
@ -18,14 +18,21 @@ from typing import Any, TypeVar, Union, get_args, get_origin
 import httpx
 import yaml
 from fastapi import Response as FastAPIResponse
-from llama_stack_client import (
-    NOT_GIVEN,
-    APIResponse,
-    AsyncAPIResponse,
-    AsyncLlamaStackClient,
-    AsyncStream,
-    LlamaStackClient,
-)
+
+try:
+    from llama_stack_client import (
+        NOT_GIVEN,
+        APIResponse,
+        AsyncAPIResponse,
+        AsyncLlamaStackClient,
+        AsyncStream,
+        LlamaStackClient,
+    )
+except ImportError as e:
+    raise ImportError(
+        "llama-stack-client is not installed. Please install it with `uv pip install llama-stack[client]`."
+    ) from e
+
 from pydantic import BaseModel, TypeAdapter
 from rich.console import Console
 from termcolor import cprint
--- a/src/llama_stack/core/resolver.py
+++ b/src/llama_stack/core/resolver.py
@ -397,6 +397,18 @@ async def instantiate_provider(
    impl.__provider_spec__ = provider_spec
    impl.__provider_config__ = config

+    # Apply tracing if telemetry is enabled and any base class has __marked_for_tracing__ marker
+    if run_config.telemetry.enabled:
+        traced_classes = [
+            base for base in reversed(impl.__class__.__mro__) if getattr(base, "__marked_for_tracing__", False)
+        ]
+
+        if traced_classes:
+            from llama_stack.core.telemetry.trace_protocol import trace_protocol
+
+            for cls in traced_classes:
+                trace_protocol(cls)
+
    protocols = api_protocol_map_for_compliance_check(run_config)
    additional_protocols = additional_protocols_map()
    # TODO: check compliance for special tool groups
--- a/src/llama_stack/core/routers/init.py
+++ b/src/llama_stack/core/routers/init.py
@ -45,6 +45,7 @@ async def get_routing_table_impl(
        raise ValueError(f"API {api.value} not found in router map")

    impl = api_to_tables[api.value](impls_by_provider_id, dist_registry, policy)
+
    await impl.initialize()
    return impl

@ -92,5 +93,6 @@ async def get_auto_router_impl(
        api_to_dep_impl["safety_config"] = run_config.safety

    impl = api_to_routers[api.value](routing_table, **api_to_dep_impl)
+
    await impl.initialize()
    return impl
--- a/src/llama_stack/core/routers/inference.py
+++ b/src/llama_stack/core/routers/inference.py
@ -190,7 +190,7 @@ class InferenceRouter(Inference):

        response = await provider.openai_completion(params)
        response.model = request_model_id
-        if self.telemetry_enabled:
+        if self.telemetry_enabled and response.usage is not None:
            metrics = self._construct_metrics(
                prompt_tokens=response.usage.prompt_tokens,
                completion_tokens=response.usage.completion_tokens,
@ -253,7 +253,7 @@ class InferenceRouter(Inference):
        if self.store:
            asyncio.create_task(self.store.store_chat_completion(response, params.messages))

-        if self.telemetry_enabled:
+        if self.telemetry_enabled and response.usage is not None:
            metrics = self._construct_metrics(
                prompt_tokens=response.usage.prompt_tokens,
                completion_tokens=response.usage.completion_tokens,
--- a/src/llama_stack/core/routers/vector_io.py
+++ b/src/llama_stack/core/routers/vector_io.py
@ -20,6 +20,8 @@ from llama_stack.apis.vector_io import (
    SearchRankingOptions,
    VectorIO,
    VectorStoreChunkingStrategy,
+    VectorStoreChunkingStrategyStatic,
+    VectorStoreChunkingStrategyStaticConfig,
    VectorStoreDeleteResponse,
    VectorStoreFileBatchObject,
    VectorStoreFileContentsResponse,
@ -167,6 +169,13 @@ class VectorIORouter(VectorIO):
        if embedding_dimension is not None:
            params.model_extra["embedding_dimension"] = embedding_dimension

+        # Set chunking strategy explicitly if not provided or if "auto" was requested
+        if params.chunking_strategy is None or params.chunking_strategy.type == "auto":
+            # resolve it to a concrete static strategy built from the default static config
+            params.chunking_strategy = VectorStoreChunkingStrategyStatic(
+                static=VectorStoreChunkingStrategyStaticConfig()
+            )
+
        return await provider.openai_create_vector_store(params)

    async def openai_list_vector_stores(
@ -283,6 +292,8 @@ class VectorIORouter(VectorIO):
        chunking_strategy: VectorStoreChunkingStrategy | None = None,
    ) -> VectorStoreFileObject:
        logger.debug(f"VectorIORouter.openai_attach_file_to_vector_store: {vector_store_id}, {file_id}")
+        if chunking_strategy is None or chunking_strategy.type == "auto":
+            chunking_strategy = VectorStoreChunkingStrategyStatic(static=VectorStoreChunkingStrategyStaticConfig())
        provider = await self.routing_table.get_provider_impl(vector_store_id)
        return await provider.openai_attach_file_to_vector_store(
            vector_store_id=vector_store_id,
--- a/src/llama_stack/core/telemetry/telemetry.py
+++ b/src/llama_stack/core/telemetry/telemetry.py
@ -163,47 +163,6 @@ class MetricEvent(EventCommon):
    unit: str


-@json_schema_type
-class MetricInResponse(BaseModel):
-    """A metric value included in API responses.
-    :param metric: The name of the metric
-    :param value: The numeric value of the metric
-    :param unit: (Optional) The unit of measurement for the metric value
-    """
-
-    metric: str
-    value: int | float
-    unit: str | None = None
-
-
-# This is a short term solution to allow inference API to return metrics
-# The ideal way to do this is to have a way for all response types to include metrics
-# and all metric events logged to the telemetry API to be included with the response
-# To do this, we will need to augment all response types with a metrics field.
-# We have hit a blocker from stainless SDK that prevents us from doing this.
-# The blocker is that if we were to augment the response types that have a data field
-# in them like so
-# class ListModelsResponse(BaseModel):
-# metrics: Optional[List[MetricEvent]] = None
-# data: List[Models]
-# ...
-# The client SDK will need to access the data by using a .data field, which is not
-# ergonomic. Stainless SDK does support unwrapping the response type, but it
-# requires that the response type to only have a single field.
-
-# We will need a way in the client SDK to signal that the metrics are needed
-# and if they are needed, the client SDK has to return the full response type
-# without unwrapping it.
-
-
-class MetricResponseMixin(BaseModel):
-    """Mixin class for API responses that can include metrics.
-    :param metrics: (Optional) List of metrics associated with the API response
-    """
-
-    metrics: list[MetricInResponse] | None = None
-
-
@json_schema_type
 class StructuredLogType(Enum):
    """The type of structured log event payload.
--- a/src/llama_stack/core/telemetry/trace_protocol.py
+++ b/src/llama_stack/core/telemetry/trace_protocol.py
@ -129,6 +129,15 @@ def trace_protocol[T: type[Any]](cls: T) -> T:
        else:
            return sync_wrapper

+    # Wrap methods on the class itself (for when this decorator is applied to an existing class at runtime)
+    # Skip if already wrapped (indicated by __wrapped__ attribute)
+    for name, method in vars(cls).items():
+        if inspect.isfunction(method) and not name.startswith("_"):
+            if not hasattr(method, "__wrapped__"):
+                wrapped = trace_method(method)
+                setattr(cls, name, wrapped)  # noqa: B010
+
+    # Also set up __init_subclass__ for future subclasses
    original_init_subclass = cast(Callable[..., Any] | None, getattr(cls, "__init_subclass__", None))

    def __init_subclass__(cls_child: type[Any], **kwargs: Any) -> None:  # noqa: N807
--- a/src/llama_stack/core/ui/Containerfile
+++ b/src/llama_stack/core/ui/Containerfile
@ -1,11 +0,0 @@
-# More info on playground configuration can be found here:
-# https://llama-stack.readthedocs.io/en/latest/playground
-
-FROM python:3.12-slim
-WORKDIR /app
-COPY . /app/
-RUN /usr/local/bin/python -m pip install --upgrade pip && \
-    /usr/local/bin/pip3 install -r requirements.txt
-EXPOSE 8501
-
-ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
--- a/src/llama_stack/core/ui/README.md
+++ b/src/llama_stack/core/ui/README.md
@ -1,50 +0,0 @@
-# (Experimental) LLama Stack UI
-
-## Docker Setup
-
-:warning: This is a work in progress.
-
-## Developer Setup
-
-1. Start up Llama Stack API server. More details [here](https://llamastack.github.io/latest/getting_started/index.htmll).
-
-```
-llama stack list-deps together | xargs -L1 uv pip install
-
-llama stack run together
-```
-
-2. (Optional) Register datasets and eval tasks as resources. If you want to run pre-configured evaluation flows (e.g. Evaluations (Generation + Scoring) Page).
-
-```bash
-llama-stack-client datasets register \
--dataset-id "mmlu" \
--provider-id "huggingface" \
--url "https://huggingface.co/datasets/llamastack/evals" \
--metadata '{"path": "llamastack/evals", "name": "evals__mmlu__details", "split": "train"}' \
--schema '{"input_query": {"type": "string"}, "expected_answer": {"type": "string", "chat_completion_input": {"type": "string"}}}'
-```
-
-```bash
-llama-stack-client benchmarks register \
--eval-task-id meta-reference-mmlu \
--provider-id meta-reference \
--dataset-id mmlu \
--scoring-functions basic::regex_parser_multiple_choice_answer
-```
-
-3. Start Streamlit UI
-
-```bash
-uv run --with ".[ui]" streamlit run llama_stack.core/ui/app.py
-```
-
-## Environment Variables
-
-| Environment Variable       | Description                        | Default Value             |
-|----------------------------|------------------------------------|---------------------------|
-| LLAMA_STACK_ENDPOINT       | The endpoint for the Llama Stack   | http://localhost:8321     |
-| FIREWORKS_API_KEY          | API key for Fireworks provider     | (empty string)            |
-| TOGETHER_API_KEY           | API key for Together provider      | (empty string)            |
-| SAMBANOVA_API_KEY          | API key for SambaNova provider     | (empty string)            |
-| OPENAI_API_KEY             | API key for OpenAI provider        | (empty string)            |
--- a/src/llama_stack/core/ui/init.py
+++ b/src/llama_stack/core/ui/init.py
@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/src/llama_stack/core/ui/app.py
+++ b/src/llama_stack/core/ui/app.py
@ -1,55 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import streamlit as st
-
-
-def main():
-    # Evaluation pages
-    application_evaluation_page = st.Page(
-        "page/evaluations/app_eval.py",
-        title="Evaluations (Scoring)",
-        icon="📊",
-        default=False,
-    )
-    native_evaluation_page = st.Page(
-        "page/evaluations/native_eval.py",
-        title="Evaluations (Generation + Scoring)",
-        icon="📊",
-        default=False,
-    )
-
-    # Playground pages
-    chat_page = st.Page("page/playground/chat.py", title="Chat", icon="💬", default=True)
-    rag_page = st.Page("page/playground/rag.py", title="RAG", icon="💬", default=False)
-    tool_page = st.Page("page/playground/tools.py", title="Tools", icon="🛠", default=False)
-
-    # Distribution pages
-    resources_page = st.Page("page/distribution/resources.py", title="Resources", icon="🔍", default=False)
-    provider_page = st.Page(
-        "page/distribution/providers.py",
-        title="API Providers",
-        icon="🔍",
-        default=False,
-    )
-
-    pg = st.navigation(
-        {
-            "Playground": [
-                chat_page,
-                rag_page,
-                tool_page,
-                application_evaluation_page,
-                native_evaluation_page,
-            ],
-            "Inspect": [provider_page, resources_page],
-        },
-        expanded=False,
-    )
-    pg.run()
-
-
-if __name__ == "__main__":
-    main()
--- a/src/llama_stack/core/ui/modules/init.py
+++ b/src/llama_stack/core/ui/modules/init.py
@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/src/llama_stack/core/ui/modules/api.py
+++ b/src/llama_stack/core/ui/modules/api.py
@ -1,32 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import os
-
-from llama_stack_client import LlamaStackClient
-
-
-class LlamaStackApi:
-    def __init__(self):
-        self.client = LlamaStackClient(
-            base_url=os.environ.get("LLAMA_STACK_ENDPOINT", "http://localhost:8321"),
-            provider_data={
-                "fireworks_api_key": os.environ.get("FIREWORKS_API_KEY", ""),
-                "together_api_key": os.environ.get("TOGETHER_API_KEY", ""),
-                "sambanova_api_key": os.environ.get("SAMBANOVA_API_KEY", ""),
-                "openai_api_key": os.environ.get("OPENAI_API_KEY", ""),
-                "tavily_search_api_key": os.environ.get("TAVILY_SEARCH_API_KEY", ""),
-            },
-        )
-
-    def run_scoring(self, row, scoring_function_ids: list[str], scoring_params: dict | None):
-        """Run scoring on a single row"""
-        if not scoring_params:
-            scoring_params = dict.fromkeys(scoring_function_ids)
-        return self.client.scoring.score(input_rows=[row], scoring_functions=scoring_params)
-
-
-llama_stack_api = LlamaStackApi()
--- a/src/llama_stack/core/ui/modules/utils.py
+++ b/src/llama_stack/core/ui/modules/utils.py
@ -1,42 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import base64
-import os
-
-import pandas as pd
-import streamlit as st
-
-
-def process_dataset(file):
-    if file is None:
-        return "No file uploaded", None
-
-    try:
-        # Determine file type and read accordingly
-        file_ext = os.path.splitext(file.name)[1].lower()
-        if file_ext == ".csv":
-            df = pd.read_csv(file)
-        elif file_ext in [".xlsx", ".xls"]:
-            df = pd.read_excel(file)
-        else:
-            return "Unsupported file format. Please upload a CSV or Excel file.", None
-
-        return df
-
-    except Exception as e:
-        st.error(f"Error processing file: {str(e)}")
-        return None
-
-
-def data_url_from_file(file) -> str:
-    file_content = file.getvalue()
-    base64_content = base64.b64encode(file_content).decode("utf-8")
-    mime_type = file.type
-
-    data_url = f"data:{mime_type};base64,{base64_content}"
-
-    return data_url
--- a/src/llama_stack/core/ui/page/init.py
+++ b/src/llama_stack/core/ui/page/init.py
@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/src/llama_stack/core/ui/page/distribution/init.py
+++ b/src/llama_stack/core/ui/page/distribution/init.py
@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/src/llama_stack/core/ui/page/distribution/datasets.py
+++ b/src/llama_stack/core/ui/page/distribution/datasets.py
@ -1,18 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import streamlit as st
-
-from llama_stack.core.ui.modules.api import llama_stack_api
-
-
-def datasets():
-    st.header("Datasets")
-
-    datasets_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.datasets.list()}
-    if len(datasets_info) > 0:
-        selected_dataset = st.selectbox("Select a dataset", list(datasets_info.keys()))
-        st.json(datasets_info[selected_dataset], expanded=True)
--- a/src/llama_stack/core/ui/page/distribution/eval_tasks.py
+++ b/src/llama_stack/core/ui/page/distribution/eval_tasks.py
@ -1,20 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import streamlit as st
-
-from llama_stack.core.ui.modules.api import llama_stack_api
-
-
-def benchmarks():
-    # Benchmarks Section
-    st.header("Benchmarks")
-
-    benchmarks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.benchmarks.list()}
-
-    if len(benchmarks_info) > 0:
-        selected_benchmark = st.selectbox("Select an eval task", list(benchmarks_info.keys()), key="benchmark_inspect")
-        st.json(benchmarks_info[selected_benchmark], expanded=True)
--- a/src/llama_stack/core/ui/page/distribution/models.py
+++ b/src/llama_stack/core/ui/page/distribution/models.py
@ -1,18 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import streamlit as st
-
-from llama_stack.core.ui.modules.api import llama_stack_api
-
-
-def models():
-    # Models Section
-    st.header("Models")
-    models_info = {m.id: m.model_dump() for m in llama_stack_api.client.models.list()}
-
-    selected_model = st.selectbox("Select a model", list(models_info.keys()))
-    st.json(models_info[selected_model])
--- a/src/llama_stack/core/ui/page/distribution/providers.py
+++ b/src/llama_stack/core/ui/page/distribution/providers.py
@ -1,27 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import streamlit as st
-
-from llama_stack.core.ui.modules.api import llama_stack_api
-
-
-def providers():
-    st.header("🔍 API Providers")
-    apis_providers_lst = llama_stack_api.client.providers.list()
-    api_to_providers = {}
-    for api_provider in apis_providers_lst:
-        if api_provider.api in api_to_providers:
-            api_to_providers[api_provider.api].append(api_provider)
-        else:
-            api_to_providers[api_provider.api] = [api_provider]
-
-    for api in api_to_providers.keys():
-        st.markdown(f"###### {api}")
-        st.dataframe([x.to_dict() for x in api_to_providers[api]], width=500)
-
-
-providers()
--- a/src/llama_stack/core/ui/page/distribution/resources.py
+++ b/src/llama_stack/core/ui/page/distribution/resources.py
@ -1,48 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from streamlit_option_menu import option_menu
-
-from llama_stack.core.ui.page.distribution.datasets import datasets
-from llama_stack.core.ui.page.distribution.eval_tasks import benchmarks
-from llama_stack.core.ui.page.distribution.models import models
-from llama_stack.core.ui.page.distribution.scoring_functions import scoring_functions
-from llama_stack.core.ui.page.distribution.shields import shields
-
-
-def resources_page():
-    options = [
-        "Models",
-        "Shields",
-        "Scoring Functions",
-        "Datasets",
-        "Benchmarks",
-    ]
-    icons = ["magic", "shield", "file-bar-graph", "database", "list-task"]
-    selected_resource = option_menu(
-        None,
-        options,
-        icons=icons,
-        orientation="horizontal",
-        styles={
-            "nav-link": {
-                "font-size": "12px",
-            },
-        },
-    )
-    if selected_resource == "Benchmarks":
-        benchmarks()
-    elif selected_resource == "Datasets":
-        datasets()
-    elif selected_resource == "Models":
-        models()
-    elif selected_resource == "Scoring Functions":
-        scoring_functions()
-    elif selected_resource == "Shields":
-        shields()
-
-
-resources_page()
--- a/src/llama_stack/core/ui/page/distribution/scoring_functions.py
+++ b/src/llama_stack/core/ui/page/distribution/scoring_functions.py
@ -1,18 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import streamlit as st
-
-from llama_stack.core.ui.modules.api import llama_stack_api
-
-
-def scoring_functions():
-    st.header("Scoring Functions")
-
-    scoring_functions_info = {s.identifier: s.to_dict() for s in llama_stack_api.client.scoring_functions.list()}
-
-    selected_scoring_function = st.selectbox("Select a scoring function", list(scoring_functions_info.keys()))
-    st.json(scoring_functions_info[selected_scoring_function], expanded=True)
--- a/src/llama_stack/core/ui/page/distribution/shields.py
+++ b/src/llama_stack/core/ui/page/distribution/shields.py
@ -1,19 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import streamlit as st
-
-from llama_stack.core.ui.modules.api import llama_stack_api
-
-
-def shields():
-    # Shields Section
-    st.header("Shields")
-
-    shields_info = {s.identifier: s.to_dict() for s in llama_stack_api.client.shields.list()}
-
-    selected_shield = st.selectbox("Select a shield", list(shields_info.keys()))
-    st.json(shields_info[selected_shield])
--- a/src/llama_stack/core/ui/page/evaluations/init.py
+++ b/src/llama_stack/core/ui/page/evaluations/init.py
@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/src/llama_stack/core/ui/page/evaluations/app_eval.py
+++ b/src/llama_stack/core/ui/page/evaluations/app_eval.py
@ -1,143 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import json
-
-import pandas as pd
-import streamlit as st
-
-from llama_stack.core.ui.modules.api import llama_stack_api
-from llama_stack.core.ui.modules.utils import process_dataset
-
-
-def application_evaluation_page():
-    st.set_page_config(page_title="Evaluations (Scoring)", page_icon="🦙")
-    st.title("📊 Evaluations (Scoring)")
-
-    # File uploader
-    uploaded_file = st.file_uploader("Upload Dataset", type=["csv", "xlsx", "xls"])
-
-    if uploaded_file is None:
-        st.error("No file uploaded")
-        return
-
-    # Process uploaded file
-    df = process_dataset(uploaded_file)
-    if df is None:
-        st.error("Error processing file")
-        return
-
-    # Display dataset information
-    st.success("Dataset loaded successfully!")
-
-    # Display dataframe preview
-    st.subheader("Dataset Preview")
-    st.dataframe(df)
-
-    # Select Scoring Functions to Run Evaluation On
-    st.subheader("Select Scoring Functions")
-    scoring_functions = llama_stack_api.client.scoring_functions.list()
-    scoring_functions = {sf.identifier: sf for sf in scoring_functions}
-    scoring_functions_names = list(scoring_functions.keys())
-    selected_scoring_functions = st.multiselect(
-        "Choose one or more scoring functions",
-        options=scoring_functions_names,
-        help="Choose one or more scoring functions.",
-    )
-
-    available_models = llama_stack_api.client.models.list()
-    available_models = [m.identifier for m in available_models]
-
-    scoring_params = {}
-    if selected_scoring_functions:
-        st.write("Selected:")
-        for scoring_fn_id in selected_scoring_functions:
-            scoring_fn = scoring_functions[scoring_fn_id]
-            st.write(f"- **{scoring_fn_id}**: {scoring_fn.description}")
-            new_params = None
-            if scoring_fn.params:
-                new_params = {}
-                for param_name, param_value in scoring_fn.params.to_dict().items():
-                    if param_name == "type":
-                        new_params[param_name] = param_value
-                        continue
-
-                    if param_name == "judge_model":
-                        value = st.selectbox(
-                            f"Select **{param_name}** for {scoring_fn_id}",
-                            options=available_models,
-                            index=0,
-                            key=f"{scoring_fn_id}_{param_name}",
-                        )
-                        new_params[param_name] = value
-                    else:
-                        value = st.text_area(
-                            f"Enter value for **{param_name}** in {scoring_fn_id} in valid JSON format",
-                            value=json.dumps(param_value, indent=2),
-                            height=80,
-                        )
-                        try:
-                            new_params[param_name] = json.loads(value)
-                        except json.JSONDecodeError:
-                            st.error(f"Invalid JSON for **{param_name}** in {scoring_fn_id}")
-
-                st.json(new_params)
-            scoring_params[scoring_fn_id] = new_params
-
-        # Add run evaluation button & slider
-        total_rows = len(df)
-        num_rows = st.slider("Number of rows to evaluate", 1, total_rows, total_rows)
-
-        if st.button("Run Evaluation"):
-            progress_text = "Running evaluation..."
-            progress_bar = st.progress(0, text=progress_text)
-            rows = df.to_dict(orient="records")
-            if num_rows < total_rows:
-                rows = rows[:num_rows]
-
-            # Create separate containers for progress text and results
-            progress_text_container = st.empty()
-            results_container = st.empty()
-            output_res = {}
-            for i, r in enumerate(rows):
-                # Update progress
-                progress = i / len(rows)
-                progress_bar.progress(progress, text=progress_text)
-
-                # Run evaluation for current row
-                score_res = llama_stack_api.run_scoring(
-                    r,
-                    scoring_function_ids=selected_scoring_functions,
-                    scoring_params=scoring_params,
-                )
-
-                for k in r.keys():
-                    if k not in output_res:
-                        output_res[k] = []
-                    output_res[k].append(r[k])
-
-                for fn_id in selected_scoring_functions:
-                    if fn_id not in output_res:
-                        output_res[fn_id] = []
-                    output_res[fn_id].append(score_res.results[fn_id].score_rows[0])
-
-                # Display current row results using separate containers
-                progress_text_container.write(f"Expand to see current processed result ({i + 1} / {len(rows)})")
-                results_container.json(
-                    score_res.to_json(),
-                    expanded=2,
-                )
-
-            progress_bar.progress(1.0, text="Evaluation complete!")
-
-            # Display results in dataframe
-            if output_res:
-                output_df = pd.DataFrame(output_res)
-                st.subheader("Evaluation Results")
-                st.dataframe(output_df)
-
-
-application_evaluation_page()
--- a/src/llama_stack/core/ui/page/evaluations/native_eval.py
+++ b/src/llama_stack/core/ui/page/evaluations/native_eval.py
@ -1,253 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import json
-
-import pandas as pd
-import streamlit as st
-
-from llama_stack.core.ui.modules.api import llama_stack_api
-
-
-def select_benchmark_1():
-    # Select Benchmarks
-    st.subheader("1. Choose An Eval Task")
-    benchmarks = llama_stack_api.client.benchmarks.list()
-    benchmarks = {et.identifier: et for et in benchmarks}
-    benchmarks_names = list(benchmarks.keys())
-    selected_benchmark = st.selectbox(
-        "Choose an eval task.",
-        options=benchmarks_names,
-        help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.",
-    )
-    with st.expander("View Eval Task"):
-        st.json(benchmarks[selected_benchmark], expanded=True)
-
-    st.session_state["selected_benchmark"] = selected_benchmark
-    st.session_state["benchmarks"] = benchmarks
-    if st.button("Confirm", key="confirm_1"):
-        st.session_state["selected_benchmark_1_next"] = True
-
-
-def define_eval_candidate_2():
-    if not st.session_state.get("selected_benchmark_1_next", None):
-        return
-
-    st.subheader("2. Define Eval Candidate")
-    st.info(
-        """
-        Define the configurations for the evaluation candidate model or agent used for generation.
-        Select "model" if you want to run generation with inference API, or "agent" if you want to run generation with agent API through specifying AgentConfig.
-        """
-    )
-    with st.expander("Define Eval Candidate", expanded=True):
-        # Define Eval Candidate
-        candidate_type = st.radio("Candidate Type", ["model", "agent"])
-
-        available_models = llama_stack_api.client.models.list()
-        available_models = [model.identifier for model in available_models]
-        selected_model = st.selectbox(
-            "Choose a model",
-            available_models,
-            index=0,
-        )
-
-        # Sampling Parameters
-        st.markdown("##### Sampling Parameters")
-        temperature = st.slider(
-            "Temperature",
-            min_value=0.0,
-            max_value=1.0,
-            value=0.0,
-            step=0.1,
-            help="Controls the randomness of the response. Higher values make the output more creative and unexpected, lower values make it more conservative and predictable",
-        )
-        top_p = st.slider(
-            "Top P",
-            min_value=0.0,
-            max_value=1.0,
-            value=0.95,
-            step=0.1,
-        )
-        max_tokens = st.slider(
-            "Max Tokens",
-            min_value=0,
-            max_value=4096,
-            value=512,
-            step=1,
-            help="The maximum number of tokens to generate",
-        )
-        repetition_penalty = st.slider(
-            "Repetition Penalty",
-            min_value=1.0,
-            max_value=2.0,
-            value=1.0,
-            step=0.1,
-            help="Controls the likelihood for generating the same word or phrase multiple times in the same sentence or paragraph. 1 implies no penalty, 2 will strongly discourage model to repeat words or phrases.",
-        )
-        if candidate_type == "model":
-            if temperature > 0.0:
-                strategy = {
-                    "type": "top_p",
-                    "temperature": temperature,
-                    "top_p": top_p,
-                }
-            else:
-                strategy = {"type": "greedy"}
-
-            eval_candidate = {
-                "type": "model",
-                "model": selected_model,
-                "sampling_params": {
-                    "strategy": strategy,
-                    "max_tokens": max_tokens,
-                    "repetition_penalty": repetition_penalty,
-                },
-            }
-        elif candidate_type == "agent":
-            system_prompt = st.text_area(
-                "System Prompt",
-                value="You are a helpful AI assistant.",
-                help="Initial instructions given to the AI to set its behavior and context",
-            )
-            tools_json = st.text_area(
-                "Tools Configuration (JSON)",
-                value=json.dumps(
-                    [
-                        {
-                            "type": "brave_search",
-                            "engine": "brave",
-                            "api_key": "ENTER_BRAVE_API_KEY_HERE",
-                        }
-                    ]
-                ),
-                help="Enter tool configurations in JSON format. Each tool should have a name, description, and parameters.",
-                height=200,
-            )
-            try:
-                tools = json.loads(tools_json)
-            except json.JSONDecodeError:
-                st.error("Invalid JSON format for tools configuration")
-                tools = []
-            eval_candidate = {
-                "type": "agent",
-                "config": {
-                    "model": selected_model,
-                    "instructions": system_prompt,
-                    "tools": tools,
-                    "tool_choice": "auto",
-                    "tool_prompt_format": "json",
-                    "input_shields": [],
-                    "output_shields": [],
-                    "enable_session_persistence": False,
-                },
-            }
-        st.session_state["eval_candidate"] = eval_candidate
-
-    if st.button("Confirm", key="confirm_2"):
-        st.session_state["selected_eval_candidate_2_next"] = True
-
-
-def run_evaluation_3():
-    if not st.session_state.get("selected_eval_candidate_2_next", None):
-        return
-
-    st.subheader("3. Run Evaluation")
-    # Add info box to explain configurations being used
-    st.info(
-        """
-        Review the configurations that will be used for this evaluation run, make any necessary changes, and then click the "Run Evaluation" button.
-        """
-    )
-    selected_benchmark = st.session_state["selected_benchmark"]
-    benchmarks = st.session_state["benchmarks"]
-    eval_candidate = st.session_state["eval_candidate"]
-
-    dataset_id = benchmarks[selected_benchmark].dataset_id
-    rows = llama_stack_api.client.datasets.iterrows(
-        dataset_id=dataset_id,
-    )
-    total_rows = len(rows.data)
-    # Add number of examples control
-    num_rows = st.number_input(
-        "Number of Examples to Evaluate",
-        min_value=1,
-        max_value=total_rows,
-        value=5,
-        help="Number of examples from the dataset to evaluate. ",
-    )
-
-    benchmark_config = {
-        "type": "benchmark",
-        "eval_candidate": eval_candidate,
-        "scoring_params": {},
-    }
-
-    with st.expander("View Evaluation Task", expanded=True):
-        st.json(benchmarks[selected_benchmark], expanded=True)
-    with st.expander("View Evaluation Task Configuration", expanded=True):
-        st.json(benchmark_config, expanded=True)
-
-    # Add run button and handle evaluation
-    if st.button("Run Evaluation"):
-        progress_text = "Running evaluation..."
-        progress_bar = st.progress(0, text=progress_text)
-        rows = rows.data
-        if num_rows < total_rows:
-            rows = rows[:num_rows]
-
-        # Create separate containers for progress text and results
-        progress_text_container = st.empty()
-        results_container = st.empty()
-        output_res = {}
-        for i, r in enumerate(rows):
-            # Update progress
-            progress = i / len(rows)
-            progress_bar.progress(progress, text=progress_text)
-            # Run evaluation for current row
-            eval_res = llama_stack_api.client.eval.evaluate_rows(
-                benchmark_id=selected_benchmark,
-                input_rows=[r],
-                scoring_functions=benchmarks[selected_benchmark].scoring_functions,
-                benchmark_config=benchmark_config,
-            )
-
-            for k in r.keys():
-                if k not in output_res:
-                    output_res[k] = []
-                output_res[k].append(r[k])
-
-            for k in eval_res.generations[0].keys():
-                if k not in output_res:
-                    output_res[k] = []
-                output_res[k].append(eval_res.generations[0][k])
-
-            for scoring_fn in benchmarks[selected_benchmark].scoring_functions:
-                if scoring_fn not in output_res:
-                    output_res[scoring_fn] = []
-                output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0])
-
-            progress_text_container.write(f"Expand to see current processed result ({i + 1} / {len(rows)})")
-            results_container.json(eval_res, expanded=2)
-
-        progress_bar.progress(1.0, text="Evaluation complete!")
-        # Display results in dataframe
-        if output_res:
-            output_df = pd.DataFrame(output_res)
-            st.subheader("Evaluation Results")
-            st.dataframe(output_df)
-
-
-def native_evaluation_page():
-    st.set_page_config(page_title="Evaluations (Generation + Scoring)", page_icon="🦙")
-    st.title("📊 Evaluations (Generation + Scoring)")
-
-    select_benchmark_1()
-    define_eval_candidate_2()
-    run_evaluation_3()
-
-
-native_evaluation_page()
--- a/src/llama_stack/core/ui/page/playground/init.py
+++ b/src/llama_stack/core/ui/page/playground/init.py
@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/src/llama_stack/core/ui/page/playground/chat.py
+++ b/src/llama_stack/core/ui/page/playground/chat.py
@ -1,134 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import streamlit as st
-
-from llama_stack.core.ui.modules.api import llama_stack_api
-
-# Sidebar configurations
-with st.sidebar:
-    st.header("Configuration")
-    available_models = llama_stack_api.client.models.list()
-    available_models = [
-        model.id
-        for model in available_models
-        if model.custom_metadata and model.custom_metadata.get("model_type") == "llm"
-    ]
-    selected_model = st.selectbox(
-        "Choose a model",
-        available_models,
-        index=0,
-    )
-
-    temperature = st.slider(
-        "Temperature",
-        min_value=0.0,
-        max_value=1.0,
-        value=0.0,
-        step=0.1,
-        help="Controls the randomness of the response. Higher values make the output more creative and unexpected, lower values make it more conservative and predictable",
-    )
-
-    top_p = st.slider(
-        "Top P",
-        min_value=0.0,
-        max_value=1.0,
-        value=0.95,
-        step=0.1,
-    )
-
-    max_tokens = st.slider(
-        "Max Tokens",
-        min_value=0,
-        max_value=4096,
-        value=512,
-        step=1,
-        help="The maximum number of tokens to generate",
-    )
-
-    repetition_penalty = st.slider(
-        "Repetition Penalty",
-        min_value=1.0,
-        max_value=2.0,
-        value=1.0,
-        step=0.1,
-        help="Controls the likelihood for generating the same word or phrase multiple times in the same sentence or paragraph. 1 implies no penalty, 2 will strongly discourage model to repeat words or phrases.",
-    )
-
-    stream = st.checkbox("Stream", value=True)
-    system_prompt = st.text_area(
-        "System Prompt",
-        value="You are a helpful AI assistant.",
-        help="Initial instructions given to the AI to set its behavior and context",
-    )
-
-    # Add clear chat button to sidebar
-    if st.button("Clear Chat", use_container_width=True):
-        st.session_state.messages = []
-        st.rerun()
-
-
-# Main chat interface
-st.title("🦙 Chat")
-
-
-# Initialize chat history
-if "messages" not in st.session_state:
-    st.session_state.messages = []
-
-# Display chat messages
-for message in st.session_state.messages:
-    with st.chat_message(message["role"]):
-        st.markdown(message["content"])
-
-# Chat input
-if prompt := st.chat_input("Example: What is Llama Stack?"):
-    # Add user message to chat history
-    st.session_state.messages.append({"role": "user", "content": prompt})
-
-    # Display user message
-    with st.chat_message("user"):
-        st.markdown(prompt)
-
-    # Display assistant response
-    with st.chat_message("assistant"):
-        message_placeholder = st.empty()
-        full_response = ""
-
-        if temperature > 0.0:
-            strategy = {
-                "type": "top_p",
-                "temperature": temperature,
-                "top_p": top_p,
-            }
-        else:
-            strategy = {"type": "greedy"}
-
-        response = llama_stack_api.client.inference.chat_completion(
-            messages=[
-                {"role": "system", "content": system_prompt},
-                {"role": "user", "content": prompt},
-            ],
-            model_id=selected_model,
-            stream=stream,
-            sampling_params={
-                "strategy": strategy,
-                "max_tokens": max_tokens,
-                "repetition_penalty": repetition_penalty,
-            },
-        )
-
-        if stream:
-            for chunk in response:
-                if chunk.event.event_type == "progress":
-                    full_response += chunk.event.delta.text
-                message_placeholder.markdown(full_response + "▌")
-            message_placeholder.markdown(full_response)
-        else:
-            full_response = response.completion_message.content
-            message_placeholder.markdown(full_response)
-
-        st.session_state.messages.append({"role": "assistant", "content": full_response})
--- a/src/llama_stack/core/ui/page/playground/tools.py
+++ b/src/llama_stack/core/ui/page/playground/tools.py
@ -1,352 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import enum
-import json
-import uuid
-
-import streamlit as st
-from llama_stack_client import Agent
-from llama_stack_client.lib.agents.react.agent import ReActAgent
-from llama_stack_client.lib.agents.react.tool_parser import ReActOutput
-
-from llama_stack.core.ui.modules.api import llama_stack_api
-
-
-class AgentType(enum.Enum):
-    REGULAR = "Regular"
-    REACT = "ReAct"
-
-
-def tool_chat_page():
-    st.title("🛠 Tools")
-
-    client = llama_stack_api.client
-    models = client.models.list()
-    model_list = [model.identifier for model in models if model.api_model_type == "llm"]
-
-    tool_groups = client.toolgroups.list()
-    tool_groups_list = [tool_group.identifier for tool_group in tool_groups]
-    mcp_tools_list = [tool for tool in tool_groups_list if tool.startswith("mcp::")]
-    builtin_tools_list = [tool for tool in tool_groups_list if not tool.startswith("mcp::")]
-    selected_vector_stores = []
-
-    def reset_agent():
-        st.session_state.clear()
-        st.cache_resource.clear()
-
-    with st.sidebar:
-        st.title("Configuration")
-        st.subheader("Model")
-        model = st.selectbox(label="Model", options=model_list, on_change=reset_agent, label_visibility="collapsed")
-
-        st.subheader("Available ToolGroups")
-
-        toolgroup_selection = st.pills(
-            label="Built-in tools",
-            options=builtin_tools_list,
-            selection_mode="multi",
-            on_change=reset_agent,
-            format_func=lambda tool: "".join(tool.split("::")[1:]),
-            help="List of built-in tools from your llama stack server.",
-        )
-
-        if "builtin::rag" in toolgroup_selection:
-            vector_stores = llama_stack_api.client.vector_stores.list() or []
-            if not vector_stores:
-                st.info("No vector databases available for selection.")
-            vector_stores = [vector_store.identifier for vector_store in vector_stores]
-            selected_vector_stores = st.multiselect(
-                label="Select Document Collections to use in RAG queries",
-                options=vector_stores,
-                on_change=reset_agent,
-            )
-
-        mcp_selection = st.pills(
-            label="MCP Servers",
-            options=mcp_tools_list,
-            selection_mode="multi",
-            on_change=reset_agent,
-            format_func=lambda tool: "".join(tool.split("::")[1:]),
-            help="List of MCP servers registered to your llama stack server.",
-        )
-
-        toolgroup_selection.extend(mcp_selection)
-
-        grouped_tools = {}
-        total_tools = 0
-
-        for toolgroup_id in toolgroup_selection:
-            tools = client.tools.list(toolgroup_id=toolgroup_id)
-            grouped_tools[toolgroup_id] = [tool.name for tool in tools]
-            total_tools += len(tools)
-
-        st.markdown(f"Active Tools: 🛠 {total_tools}")
-
-        for group_id, tools in grouped_tools.items():
-            with st.expander(f"🔧 Tools from `{group_id}`"):
-                for idx, tool in enumerate(tools, start=1):
-                    st.markdown(f"{idx}. `{tool.split(':')[-1]}`")
-
-        st.subheader("Agent Configurations")
-        st.subheader("Agent Type")
-        agent_type = st.radio(
-            label="Select Agent Type",
-            options=["Regular", "ReAct"],
-            on_change=reset_agent,
-        )
-
-        if agent_type == "ReAct":
-            agent_type = AgentType.REACT
-        else:
-            agent_type = AgentType.REGULAR
-
-        max_tokens = st.slider(
-            "Max Tokens",
-            min_value=0,
-            max_value=4096,
-            value=512,
-            step=64,
-            help="The maximum number of tokens to generate",
-            on_change=reset_agent,
-        )
-
-    for i, tool_name in enumerate(toolgroup_selection):
-        if tool_name == "builtin::rag":
-            tool_dict = dict(
-                name="builtin::rag",
-                args={
-                    "vector_store_ids": list(selected_vector_stores),
-                },
-            )
-            toolgroup_selection[i] = tool_dict
-
-    @st.cache_resource
-    def create_agent():
-        if "agent_type" in st.session_state and st.session_state.agent_type == AgentType.REACT:
-            return ReActAgent(
-                client=client,
-                model=model,
-                tools=toolgroup_selection,
-                response_format={
-                    "type": "json_schema",
-                    "json_schema": ReActOutput.model_json_schema(),
-                },
-                sampling_params={"strategy": {"type": "greedy"}, "max_tokens": max_tokens},
-            )
-        else:
-            return Agent(
-                client,
-                model=model,
-                instructions="You are a helpful assistant. When you use a tool always respond with a summary of the result.",
-                tools=toolgroup_selection,
-                sampling_params={"strategy": {"type": "greedy"}, "max_tokens": max_tokens},
-            )
-
-    st.session_state.agent_type = agent_type
-
-    agent = create_agent()
-
-    if "agent_session_id" not in st.session_state:
-        st.session_state["agent_session_id"] = agent.create_session(session_name=f"tool_demo_{uuid.uuid4()}")
-
-    session_id = st.session_state["agent_session_id"]
-
-    if "messages" not in st.session_state:
-        st.session_state["messages"] = [{"role": "assistant", "content": "How can I help you?"}]
-
-    for msg in st.session_state.messages:
-        with st.chat_message(msg["role"]):
-            st.markdown(msg["content"])
-
-    if prompt := st.chat_input(placeholder=""):
-        with st.chat_message("user"):
-            st.markdown(prompt)
-
-        st.session_state.messages.append({"role": "user", "content": prompt})
-
-        turn_response = agent.create_turn(
-            session_id=session_id,
-            messages=[{"role": "user", "content": prompt}],
-            stream=True,
-        )
-
-        def response_generator(turn_response):
-            if st.session_state.get("agent_type") == AgentType.REACT:
-                return _handle_react_response(turn_response)
-            else:
-                return _handle_regular_response(turn_response)
-
-        def _handle_react_response(turn_response):
-            current_step_content = ""
-            final_answer = None
-            tool_results = []
-
-            for response in turn_response:
-                if not hasattr(response.event, "payload"):
-                    yield (
-                        "\n\n🚨 :red[_Llama Stack server Error:_]\n"
-                        "The response received is missing an expected `payload` attribute.\n"
-                        "This could indicate a malformed response or an internal issue within the server.\n\n"
-                        f"Error details: {response}"
-                    )
-                    return
-
-                payload = response.event.payload
-
-                if payload.event_type == "step_progress" and hasattr(payload.delta, "text"):
-                    current_step_content += payload.delta.text
-                    continue
-
-                if payload.event_type == "step_complete":
-                    step_details = payload.step_details
-
-                    if step_details.step_type == "inference":
-                        yield from _process_inference_step(current_step_content, tool_results, final_answer)
-                        current_step_content = ""
-                    elif step_details.step_type == "tool_execution":
-                        tool_results = _process_tool_execution(step_details, tool_results)
-                        current_step_content = ""
-                    else:
-                        current_step_content = ""
-
-            if not final_answer and tool_results:
-                yield from _format_tool_results_summary(tool_results)
-
-        def _process_inference_step(current_step_content, tool_results, final_answer):
-            try:
-                react_output_data = json.loads(current_step_content)
-                thought = react_output_data.get("thought")
-                action = react_output_data.get("action")
-                answer = react_output_data.get("answer")
-
-                if answer and answer != "null" and answer is not None:
-                    final_answer = answer
-
-                if thought:
-                    with st.expander("🤔 Thinking...", expanded=False):
-                        st.markdown(f":grey[__{thought}__]")
-
-                if action and isinstance(action, dict):
-                    tool_name = action.get("tool_name")
-                    tool_params = action.get("tool_params")
-                    with st.expander(f'🛠 Action: Using tool "{tool_name}"', expanded=False):
-                        st.json(tool_params)
-
-                if answer and answer != "null" and answer is not None:
-                    yield f"\n\n✅ **Final Answer:**\n{answer}"
-
-            except json.JSONDecodeError:
-                yield f"\n\nFailed to parse ReAct step content:\n```json\n{current_step_content}\n```"
-            except Exception as e:
-                yield f"\n\nFailed to process ReAct step: {e}\n```json\n{current_step_content}\n```"
-
-            return final_answer
-
-        def _process_tool_execution(step_details, tool_results):
-            try:
-                if hasattr(step_details, "tool_responses") and step_details.tool_responses:
-                    for tool_response in step_details.tool_responses:
-                        tool_name = tool_response.tool_name
-                        content = tool_response.content
-                        tool_results.append((tool_name, content))
-                        with st.expander(f'⚙️ Observation (Result from "{tool_name}")', expanded=False):
-                            try:
-                                parsed_content = json.loads(content)
-                                st.json(parsed_content)
-                            except json.JSONDecodeError:
-                                st.code(content, language=None)
-                else:
-                    with st.expander("⚙️ Observation", expanded=False):
-                        st.markdown(":grey[_Tool execution step completed, but no response data found._]")
-            except Exception as e:
-                with st.expander("⚙️ Error in Tool Execution", expanded=False):
-                    st.markdown(f":red[_Error processing tool execution: {str(e)}_]")
-
-            return tool_results
-
-        def _format_tool_results_summary(tool_results):
-            yield "\n\n**Here's what I found:**\n"
-            for tool_name, content in tool_results:
-                try:
-                    parsed_content = json.loads(content)
-
-                    if tool_name == "web_search" and "top_k" in parsed_content:
-                        yield from _format_web_search_results(parsed_content)
-                    elif "results" in parsed_content and isinstance(parsed_content["results"], list):
-                        yield from _format_results_list(parsed_content["results"])
-                    elif isinstance(parsed_content, dict) and len(parsed_content) > 0:
-                        yield from _format_dict_results(parsed_content)
-                    elif isinstance(parsed_content, list) and len(parsed_content) > 0:
-                        yield from _format_list_results(parsed_content)
-                except json.JSONDecodeError:
-                    yield f"\n**{tool_name}** was used but returned complex data. Check the observation for details.\n"
-                except (TypeError, AttributeError, KeyError, IndexError) as e:
-                    print(f"Error processing {tool_name} result: {type(e).__name__}: {e}")
-
-        def _format_web_search_results(parsed_content):
-            for i, result in enumerate(parsed_content["top_k"], 1):
-                if i <= 3:
-                    title = result.get("title", "Untitled")
-                    url = result.get("url", "")
-                    content_text = result.get("content", "").strip()
-                    yield f"\n- **{title}**\n  {content_text}\n  [Source]({url})\n"
-
-        def _format_results_list(results):
-            for i, result in enumerate(results, 1):
-                if i <= 3:
-                    if isinstance(result, dict):
-                        name = result.get("name", result.get("title", "Result " + str(i)))
-                        description = result.get("description", result.get("content", result.get("summary", "")))
-                        yield f"\n- **{name}**\n  {description}\n"
-                    else:
-                        yield f"\n- {result}\n"
-
-        def _format_dict_results(parsed_content):
-            yield "\n```\n"
-            for key, value in list(parsed_content.items())[:5]:
-                if isinstance(value, str) and len(value) < 100:
-                    yield f"{key}: {value}\n"
-                else:
-                    yield f"{key}: [Complex data]\n"
-            yield "```\n"
-
-        def _format_list_results(parsed_content):
-            yield "\n"
-            for _, item in enumerate(parsed_content[:3], 1):
-                if isinstance(item, str):
-                    yield f"- {item}\n"
-                elif isinstance(item, dict) and "text" in item:
-                    yield f"- {item['text']}\n"
-                elif isinstance(item, dict) and len(item) > 0:
-                    first_value = next(iter(item.values()))
-                    if isinstance(first_value, str) and len(first_value) < 100:
-                        yield f"- {first_value}\n"
-
-        def _handle_regular_response(turn_response):
-            for response in turn_response:
-                if hasattr(response.event, "payload"):
-                    print(response.event.payload)
-                    if response.event.payload.event_type == "step_progress":
-                        if hasattr(response.event.payload.delta, "text"):
-                            yield response.event.payload.delta.text
-                    if response.event.payload.event_type == "step_complete":
-                        if response.event.payload.step_details.step_type == "tool_execution":
-                            if response.event.payload.step_details.tool_calls:
-                                tool_name = str(response.event.payload.step_details.tool_calls[0].tool_name)
-                                yield f'\n\n🛠 :grey[_Using "{tool_name}" tool:_]\n\n'
-                            else:
-                                yield "No tool_calls present in step_details"
-                else:
-                    yield f"Error occurred in the Llama Stack Cluster: {response}"
-
-        with st.chat_message("assistant"):
-            response_content = st.write_stream(response_generator(turn_response))
-
-        st.session_state.messages.append({"role": "assistant", "content": response_content})
-
-
-tool_chat_page()
--- a/src/llama_stack/core/ui/requirements.txt
+++ b/src/llama_stack/core/ui/requirements.txt
@ -1,5 +0,0 @@
-llama-stack>=0.2.1
-llama-stack-client>=0.2.1
-pandas
-streamlit
-streamlit-option-menu
--- a/src/llama_stack/core/utils/config_resolution.py
+++ b/src/llama_stack/core/utils/config_resolution.py
@ -52,7 +52,17 @@ def resolve_config_or_distro(
            logger.debug(f"Using distribution: {distro_config}")
            return distro_config

-    # Strategy 3: Try as built distribution name
+    # Strategy 3: Try as a distro-scoped config reference of the form <distro>::<config>
+    # e.g. starter::run-with-postgres-store.yaml
+    # "::" is used as the separator instead of a slash to avoid confusion with a filesystem path
+    if "::" in config_or_distro:
+        distro_name, config_name = config_or_distro.split("::")
+        distro_config = _get_distro_config_path(distro_name, config_name)
+        if distro_config.exists():
+            logger.info(f"Using distribution: {distro_config}")
+            return distro_config
+
+    # Strategy 4: Try as built distribution name
    distrib_config = DISTRIBS_BASE_DIR / f"llamastack-{config_or_distro}" / f"{config_or_distro}-{mode}.yaml"
    if distrib_config.exists():
        logger.debug(f"Using built distribution: {distrib_config}")
@ -63,13 +73,15 @@ def resolve_config_or_distro(
        logger.debug(f"Using built distribution: {distrib_config}")
        return distrib_config

-    # Strategy 4: Failed - provide helpful error
+    # Strategy 5: Failed - provide helpful error
    raise ValueError(_format_resolution_error(config_or_distro, mode))


-def _get_distro_config_path(distro_name: str, mode: Mode) -> Path:
+def _get_distro_config_path(distro_name: str, mode: str) -> Path:
    """Get the config file path for a distro."""
-    return DISTRO_DIR / distro_name / f"{mode}.yaml"
+    if not mode.endswith(".yaml"):
+        mode = f"{mode}.yaml"
+    return DISTRO_DIR / distro_name / mode


 def _format_resolution_error(config_or_distro: str, mode: Mode) -> str:
--- a/src/llama_stack/core/utils/exec.py
+++ b/src/llama_stack/core/utils/exec.py
@ -84,6 +84,15 @@ def run_command(command: list[str]) -> int:
            text=True,
            check=False,
        )
+
+        # Log stdout and stderr if the command failed
+        if result.returncode != 0:
+            log.error(f"Command {' '.join(command)} failed with returncode {result.returncode}")
+            if result.stdout:
+                log.error(f"STDOUT: {result.stdout}")
+            if result.stderr:
+                log.error(f"STDERR: {result.stderr}")
+
        return result.returncode
    except subprocess.SubprocessError as e:
        log.error(f"Subprocess error: {e}")
--- a/src/llama_stack/distributions/ci-tests/build.yaml
+++ b/src/llama_stack/distributions/ci-tests/build.yaml
@ -56,4 +56,5 @@ image_type: venv
 additional_pip_packages:
 - aiosqlite
 - asyncpg
+- psycopg2-binary
 - sqlalchemy[asyncio]
--- a/src/llama_stack/distributions/ci-tests/ci_tests.py
+++ b/src/llama_stack/distributions/ci-tests/ci_tests.py
@ -13,5 +13,6 @@ from ..starter.starter import get_distribution_template as get_starter_distribut
 def get_distribution_template() -> DistributionTemplate:
    template = get_starter_distribution_template(name="ci-tests")
    template.description = "CI tests for Llama Stack"
+    template.run_configs.pop("run-with-postgres-store.yaml", None)

    return template
--- a/src/llama_stack/distributions/ci-tests/run.yaml
+++ b/src/llama_stack/distributions/ci-tests/run.yaml
@ -46,6 +46,9 @@ providers:
      api_key: ${env.TOGETHER_API_KEY:=}
  - provider_id: bedrock
    provider_type: remote::bedrock
+    config:
+      api_key: ${env.AWS_BEDROCK_API_KEY:=}
+      region_name: ${env.AWS_DEFAULT_REGION:=us-east-2}
  - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
    provider_type: remote::nvidia
    config:
--- a/src/llama_stack/distributions/postgres-demo/init.py
+++ b/src/llama_stack/distributions/postgres-demo/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .postgres_demo import get_distribution_template  # noqa: F401
--- a/src/llama_stack/distributions/postgres-demo/build.yaml
+++ b/src/llama_stack/distributions/postgres-demo/build.yaml
@ -1,23 +0,0 @@
-version: 2
-distribution_spec:
-  description: Quick start template for running Llama Stack with several popular providers
-  providers:
-    inference:
-    - provider_type: remote::vllm
-    - provider_type: inline::sentence-transformers
-    vector_io:
-    - provider_type: remote::chromadb
-    safety:
-    - provider_type: inline::llama-guard
-    agents:
-    - provider_type: inline::meta-reference
-    tool_runtime:
-    - provider_type: remote::brave-search
-    - provider_type: remote::tavily-search
-    - provider_type: inline::rag-runtime
-    - provider_type: remote::model-context-protocol
-image_type: venv
-additional_pip_packages:
- asyncpg
- psycopg2-binary
- sqlalchemy[asyncio]
--- a/src/llama_stack/distributions/postgres-demo/postgres_demo.py
+++ b/src/llama_stack/distributions/postgres-demo/postgres_demo.py
@ -1,125 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-from llama_stack.apis.models import ModelType
-from llama_stack.core.datatypes import (
-    BuildProvider,
-    ModelInput,
-    Provider,
-    ShieldInput,
-    ToolGroupInput,
-)
-from llama_stack.distributions.template import (
-    DistributionTemplate,
-    RunConfigSettings,
-)
-from llama_stack.providers.inline.inference.sentence_transformers import SentenceTransformersInferenceConfig
-from llama_stack.providers.remote.inference.vllm import VLLMInferenceAdapterConfig
-from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig
-from llama_stack.providers.utils.kvstore.config import PostgresKVStoreConfig
-from llama_stack.providers.utils.sqlstore.sqlstore import PostgresSqlStoreConfig
-
-
-def get_distribution_template() -> DistributionTemplate:
-    inference_providers = [
-        Provider(
-            provider_id="vllm-inference",
-            provider_type="remote::vllm",
-            config=VLLMInferenceAdapterConfig.sample_run_config(
-                url="${env.VLLM_URL:=http://localhost:8000/v1}",
-            ),
-        ),
-    ]
-    providers = {
-        "inference": [
-            BuildProvider(provider_type="remote::vllm"),
-            BuildProvider(provider_type="inline::sentence-transformers"),
-        ],
-        "vector_io": [BuildProvider(provider_type="remote::chromadb")],
-        "safety": [BuildProvider(provider_type="inline::llama-guard")],
-        "agents": [BuildProvider(provider_type="inline::meta-reference")],
-        "tool_runtime": [
-            BuildProvider(provider_type="remote::brave-search"),
-            BuildProvider(provider_type="remote::tavily-search"),
-            BuildProvider(provider_type="inline::rag-runtime"),
-            BuildProvider(provider_type="remote::model-context-protocol"),
-        ],
-    }
-    name = "postgres-demo"
-
-    vector_io_providers = [
-        Provider(
-            provider_id="${env.ENABLE_CHROMADB:+chromadb}",
-            provider_type="remote::chromadb",
-            config=ChromaVectorIOConfig.sample_run_config(
-                f"~/.llama/distributions/{name}",
-                url="${env.CHROMADB_URL:=}",
-            ),
-        ),
-    ]
-    default_tool_groups = [
-        ToolGroupInput(
-            toolgroup_id="builtin::websearch",
-            provider_id="tavily-search",
-        ),
-        ToolGroupInput(
-            toolgroup_id="builtin::rag",
-            provider_id="rag-runtime",
-        ),
-    ]
-
-    default_models = [
-        ModelInput(
-            model_id="${env.INFERENCE_MODEL}",
-            provider_id="vllm-inference",
-        )
-    ]
-    embedding_provider = Provider(
-        provider_id="sentence-transformers",
-        provider_type="inline::sentence-transformers",
-        config=SentenceTransformersInferenceConfig.sample_run_config(),
-    )
-    embedding_model = ModelInput(
-        model_id="nomic-embed-text-v1.5",
-        provider_id=embedding_provider.provider_id,
-        model_type=ModelType.embedding,
-        metadata={
-            "embedding_dimension": 768,
-        },
-    )
-    return DistributionTemplate(
-        name=name,
-        distro_type="self_hosted",
-        description="Quick start template for running Llama Stack with several popular providers",
-        container_image=None,
-        template_path=None,
-        providers=providers,
-        available_models_by_provider={},
-        run_configs={
-            "run.yaml": RunConfigSettings(
-                provider_overrides={
-                    "inference": inference_providers + [embedding_provider],
-                    "vector_io": vector_io_providers,
-                },
-                default_models=default_models + [embedding_model],
-                default_tool_groups=default_tool_groups,
-                default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
-                storage_backends={
-                    "kv_default": PostgresKVStoreConfig.sample_run_config(
-                        table_name="llamastack_kvstore",
-                    ),
-                    "sql_default": PostgresSqlStoreConfig.sample_run_config(),
-                },
-            ),
-        },
-        run_config_env_vars={
-            "LLAMA_STACK_PORT": (
-                "8321",
-                "Port for the Llama Stack distribution server",
-            ),
-        },
-    )
--- a/src/llama_stack/distributions/starter-gpu/build.yaml
+++ b/src/llama_stack/distributions/starter-gpu/build.yaml
@ -57,4 +57,5 @@ image_type: venv
 additional_pip_packages:
 - aiosqlite
 - asyncpg
+- psycopg2-binary
 - sqlalchemy[asyncio]
--- a/src/llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml
+++ b/src/llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml
@ -0,0 +1,284 @@
+version: 2
+image_name: starter-gpu
+apis:
+- agents
+- batches
+- datasetio
+- eval
+- files
+- inference
+- post_training
+- safety
+- scoring
+- tool_runtime
+- vector_io
+providers:
+  inference:
+  - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
+    provider_type: remote::cerebras
+    config:
+      base_url: https://api.cerebras.ai
+      api_key: ${env.CEREBRAS_API_KEY:=}
+  - provider_id: ${env.OLLAMA_URL:+ollama}
+    provider_type: remote::ollama
+    config:
+      url: ${env.OLLAMA_URL:=http://localhost:11434}
+  - provider_id: ${env.VLLM_URL:+vllm}
+    provider_type: remote::vllm
+    config:
+      url: ${env.VLLM_URL:=}
+      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
+      api_token: ${env.VLLM_API_TOKEN:=fake}
+      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
+  - provider_id: ${env.TGI_URL:+tgi}
+    provider_type: remote::tgi
+    config:
+      url: ${env.TGI_URL:=}
+  - provider_id: fireworks
+    provider_type: remote::fireworks
+    config:
+      url: https://api.fireworks.ai/inference/v1
+      api_key: ${env.FIREWORKS_API_KEY:=}
+  - provider_id: together
+    provider_type: remote::together
+    config:
+      url: https://api.together.xyz/v1
+      api_key: ${env.TOGETHER_API_KEY:=}
+  - provider_id: bedrock
+    provider_type: remote::bedrock
+    config:
+      api_key: ${env.AWS_BEDROCK_API_KEY:=}
+      region_name: ${env.AWS_DEFAULT_REGION:=us-east-2}
+  - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
+    provider_type: remote::nvidia
+    config:
+      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      api_key: ${env.NVIDIA_API_KEY:=}
+      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
+  - provider_id: openai
+    provider_type: remote::openai
+    config:
+      api_key: ${env.OPENAI_API_KEY:=}
+      base_url: ${env.OPENAI_BASE_URL:=https://api.openai.com/v1}
+  - provider_id: anthropic
+    provider_type: remote::anthropic
+    config:
+      api_key: ${env.ANTHROPIC_API_KEY:=}
+  - provider_id: gemini
+    provider_type: remote::gemini
+    config:
+      api_key: ${env.GEMINI_API_KEY:=}
+  - provider_id: ${env.VERTEX_AI_PROJECT:+vertexai}
+    provider_type: remote::vertexai
+    config:
+      project: ${env.VERTEX_AI_PROJECT:=}
+      location: ${env.VERTEX_AI_LOCATION:=us-central1}
+  - provider_id: groq
+    provider_type: remote::groq
+    config:
+      url: https://api.groq.com
+      api_key: ${env.GROQ_API_KEY:=}
+  - provider_id: sambanova
+    provider_type: remote::sambanova
+    config:
+      url: https://api.sambanova.ai/v1
+      api_key: ${env.SAMBANOVA_API_KEY:=}
+  - provider_id: ${env.AZURE_API_KEY:+azure}
+    provider_type: remote::azure
+    config:
+      api_key: ${env.AZURE_API_KEY:=}
+      api_base: ${env.AZURE_API_BASE:=}
+      api_version: ${env.AZURE_API_VERSION:=}
+      api_type: ${env.AZURE_API_TYPE:=}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+  vector_io:
+  - provider_id: faiss
+    provider_type: inline::faiss
+    config:
+      persistence:
+        namespace: vector_io::faiss
+        backend: kv_default
+  - provider_id: sqlite-vec
+    provider_type: inline::sqlite-vec
+    config:
+      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/sqlite_vec.db
+      persistence:
+        namespace: vector_io::sqlite_vec
+        backend: kv_default
+  - provider_id: ${env.MILVUS_URL:+milvus}
+    provider_type: inline::milvus
+    config:
+      db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter-gpu}/milvus.db
+      persistence:
+        namespace: vector_io::milvus
+        backend: kv_default
+  - provider_id: ${env.CHROMADB_URL:+chromadb}
+    provider_type: remote::chromadb
+    config:
+      url: ${env.CHROMADB_URL:=}
+      persistence:
+        namespace: vector_io::chroma_remote
+        backend: kv_default
+  - provider_id: ${env.PGVECTOR_DB:+pgvector}
+    provider_type: remote::pgvector
+    config:
+      host: ${env.PGVECTOR_HOST:=localhost}
+      port: ${env.PGVECTOR_PORT:=5432}
+      db: ${env.PGVECTOR_DB:=}
+      user: ${env.PGVECTOR_USER:=}
+      password: ${env.PGVECTOR_PASSWORD:=}
+      persistence:
+        namespace: vector_io::pgvector
+        backend: kv_default
+  - provider_id: ${env.QDRANT_URL:+qdrant}
+    provider_type: remote::qdrant
+    config:
+      api_key: ${env.QDRANT_API_KEY:=}
+      persistence:
+        namespace: vector_io::qdrant_remote
+        backend: kv_default
+  - provider_id: ${env.WEAVIATE_CLUSTER_URL:+weaviate}
+    provider_type: remote::weaviate
+    config:
+      weaviate_api_key: null
+      weaviate_cluster_url: ${env.WEAVIATE_CLUSTER_URL:=localhost:8080}
+      persistence:
+        namespace: vector_io::weaviate
+        backend: kv_default
+  files:
+  - provider_id: meta-reference-files
+    provider_type: inline::localfs
+    config:
+      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter-gpu/files}
+      metadata_store:
+        table_name: files_metadata
+        backend: sql_default
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config:
+      excluded_categories: []
+  - provider_id: code-scanner
+    provider_type: inline::code-scanner
+  agents:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        type: sql_postgres
+        host: ${env.POSTGRES_HOST:=localhost}
+        port: ${env.POSTGRES_PORT:=5432}
+        db: ${env.POSTGRES_DB:=llamastack}
+        user: ${env.POSTGRES_USER:=llamastack}
+        password: ${env.POSTGRES_PASSWORD:=llamastack}
+      responses_store:
+        type: sql_postgres
+        host: ${env.POSTGRES_HOST:=localhost}
+        port: ${env.POSTGRES_PORT:=5432}
+        db: ${env.POSTGRES_DB:=llamastack}
+        user: ${env.POSTGRES_USER:=llamastack}
+        password: ${env.POSTGRES_PASSWORD:=llamastack}
+  post_training:
+  - provider_id: huggingface-gpu
+    provider_type: inline::huggingface-gpu
+    config:
+      checkpoint_format: huggingface
+      distributed_backend: null
+      device: cpu
+      dpo_output_dir: ~/.llama/distributions/starter-gpu/dpo_output
+  eval:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      kvstore:
+        namespace: eval
+        backend: kv_default
+  datasetio:
+  - provider_id: huggingface
+    provider_type: remote::huggingface
+    config:
+      kvstore:
+        namespace: datasetio::huggingface
+        backend: kv_default
+  - provider_id: localfs
+    provider_type: inline::localfs
+    config:
+      kvstore:
+        namespace: datasetio::localfs
+        backend: kv_default
+  scoring:
+  - provider_id: basic
+    provider_type: inline::basic
+  - provider_id: llm-as-judge
+    provider_type: inline::llm-as-judge
+  - provider_id: braintrust
+    provider_type: inline::braintrust
+    config:
+      openai_api_key: ${env.OPENAI_API_KEY:=}
+  tool_runtime:
+  - provider_id: brave-search
+    provider_type: remote::brave-search
+    config:
+      api_key: ${env.BRAVE_SEARCH_API_KEY:=}
+      max_results: 3
+  - provider_id: tavily-search
+    provider_type: remote::tavily-search
+    config:
+      api_key: ${env.TAVILY_SEARCH_API_KEY:=}
+      max_results: 3
+  - provider_id: rag-runtime
+    provider_type: inline::rag-runtime
+  - provider_id: model-context-protocol
+    provider_type: remote::model-context-protocol
+  batches:
+  - provider_id: reference
+    provider_type: inline::reference
+    config:
+      kvstore:
+        namespace: batches
+        backend: kv_postgres
+storage:
+  backends:
+    kv_postgres:
+      type: kv_postgres
+      host: ${env.POSTGRES_HOST:=localhost}
+      port: ${env.POSTGRES_PORT:=5432}
+      db: ${env.POSTGRES_DB:=llamastack}
+      user: ${env.POSTGRES_USER:=llamastack}
+      password: ${env.POSTGRES_PASSWORD:=llamastack}
+      table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
+    sql_postgres:
+      type: sql_postgres
+      host: ${env.POSTGRES_HOST:=localhost}
+      port: ${env.POSTGRES_PORT:=5432}
+      db: ${env.POSTGRES_DB:=llamastack}
+      user: ${env.POSTGRES_USER:=llamastack}
+      password: ${env.POSTGRES_PASSWORD:=llamastack}
+  stores:
+    metadata:
+      namespace: registry
+      backend: kv_postgres
+    inference:
+      table_name: inference_store
+      backend: sql_postgres
+      max_write_queue_size: 10000
+      num_writers: 4
+    conversations:
+      table_name: openai_conversations
+      backend: sql_postgres
+    prompts:
+      namespace: prompts
+      backend: kv_postgres
+registered_resources:
+  models: []
+  shields: []
+  vector_dbs: []
+  datasets: []
+  scoring_fns: []
+  benchmarks: []
+  tool_groups: []
+server:
+  port: 8321
+telemetry:
+  enabled: true
--- a/src/llama_stack/distributions/starter-gpu/run.yaml
+++ b/src/llama_stack/distributions/starter-gpu/run.yaml
@ -46,6 +46,9 @@ providers:
      api_key: ${env.TOGETHER_API_KEY:=}
  - provider_id: bedrock
    provider_type: remote::bedrock
+    config:
+      api_key: ${env.AWS_BEDROCK_API_KEY:=}
+      region_name: ${env.AWS_DEFAULT_REGION:=us-east-2}
  - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
    provider_type: remote::nvidia
    config:
--- a/src/llama_stack/distributions/starter/build.yaml
+++ b/src/llama_stack/distributions/starter/build.yaml
@ -57,4 +57,5 @@ image_type: venv
 additional_pip_packages:
 - aiosqlite
 - asyncpg
+- psycopg2-binary
 - sqlalchemy[asyncio]
--- a/src/llama_stack/distributions/starter/run-with-postgres-store.yaml
+++ b/src/llama_stack/distributions/starter/run-with-postgres-store.yaml
@ -0,0 +1,281 @@
+version: 2
+image_name: starter
+apis:
+- agents
+- batches
+- datasetio
+- eval
+- files
+- inference
+- post_training
+- safety
+- scoring
+- tool_runtime
+- vector_io
+providers:
+  inference:
+  - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
+    provider_type: remote::cerebras
+    config:
+      base_url: https://api.cerebras.ai
+      api_key: ${env.CEREBRAS_API_KEY:=}
+  - provider_id: ${env.OLLAMA_URL:+ollama}
+    provider_type: remote::ollama
+    config:
+      url: ${env.OLLAMA_URL:=http://localhost:11434}
+  - provider_id: ${env.VLLM_URL:+vllm}
+    provider_type: remote::vllm
+    config:
+      url: ${env.VLLM_URL:=}
+      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
+      api_token: ${env.VLLM_API_TOKEN:=fake}
+      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
+  - provider_id: ${env.TGI_URL:+tgi}
+    provider_type: remote::tgi
+    config:
+      url: ${env.TGI_URL:=}
+  - provider_id: fireworks
+    provider_type: remote::fireworks
+    config:
+      url: https://api.fireworks.ai/inference/v1
+      api_key: ${env.FIREWORKS_API_KEY:=}
+  - provider_id: together
+    provider_type: remote::together
+    config:
+      url: https://api.together.xyz/v1
+      api_key: ${env.TOGETHER_API_KEY:=}
+  - provider_id: bedrock
+    provider_type: remote::bedrock
+    config:
+      api_key: ${env.AWS_BEDROCK_API_KEY:=}
+      region_name: ${env.AWS_DEFAULT_REGION:=us-east-2}
+  - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
+    provider_type: remote::nvidia
+    config:
+      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      api_key: ${env.NVIDIA_API_KEY:=}
+      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
+  - provider_id: openai
+    provider_type: remote::openai
+    config:
+      api_key: ${env.OPENAI_API_KEY:=}
+      base_url: ${env.OPENAI_BASE_URL:=https://api.openai.com/v1}
+  - provider_id: anthropic
+    provider_type: remote::anthropic
+    config:
+      api_key: ${env.ANTHROPIC_API_KEY:=}
+  - provider_id: gemini
+    provider_type: remote::gemini
+    config:
+      api_key: ${env.GEMINI_API_KEY:=}
+  - provider_id: ${env.VERTEX_AI_PROJECT:+vertexai}
+    provider_type: remote::vertexai
+    config:
+      project: ${env.VERTEX_AI_PROJECT:=}
+      location: ${env.VERTEX_AI_LOCATION:=us-central1}
+  - provider_id: groq
+    provider_type: remote::groq
+    config:
+      url: https://api.groq.com
+      api_key: ${env.GROQ_API_KEY:=}
+  - provider_id: sambanova
+    provider_type: remote::sambanova
+    config:
+      url: https://api.sambanova.ai/v1
+      api_key: ${env.SAMBANOVA_API_KEY:=}
+  - provider_id: ${env.AZURE_API_KEY:+azure}
+    provider_type: remote::azure
+    config:
+      api_key: ${env.AZURE_API_KEY:=}
+      api_base: ${env.AZURE_API_BASE:=}
+      api_version: ${env.AZURE_API_VERSION:=}
+      api_type: ${env.AZURE_API_TYPE:=}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+  vector_io:
+  - provider_id: faiss
+    provider_type: inline::faiss
+    config:
+      persistence:
+        namespace: vector_io::faiss
+        backend: kv_default
+  - provider_id: sqlite-vec
+    provider_type: inline::sqlite-vec
+    config:
+      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec.db
+      persistence:
+        namespace: vector_io::sqlite_vec
+        backend: kv_default
+  - provider_id: ${env.MILVUS_URL:+milvus}
+    provider_type: inline::milvus
+    config:
+      db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter}/milvus.db
+      persistence:
+        namespace: vector_io::milvus
+        backend: kv_default
+  - provider_id: ${env.CHROMADB_URL:+chromadb}
+    provider_type: remote::chromadb
+    config:
+      url: ${env.CHROMADB_URL:=}
+      persistence:
+        namespace: vector_io::chroma_remote
+        backend: kv_default
+  - provider_id: ${env.PGVECTOR_DB:+pgvector}
+    provider_type: remote::pgvector
+    config:
+      host: ${env.PGVECTOR_HOST:=localhost}
+      port: ${env.PGVECTOR_PORT:=5432}
+      db: ${env.PGVECTOR_DB:=}
+      user: ${env.PGVECTOR_USER:=}
+      password: ${env.PGVECTOR_PASSWORD:=}
+      persistence:
+        namespace: vector_io::pgvector
+        backend: kv_default
+  - provider_id: ${env.QDRANT_URL:+qdrant}
+    provider_type: remote::qdrant
+    config:
+      api_key: ${env.QDRANT_API_KEY:=}
+      persistence:
+        namespace: vector_io::qdrant_remote
+        backend: kv_default
+  - provider_id: ${env.WEAVIATE_CLUSTER_URL:+weaviate}
+    provider_type: remote::weaviate
+    config:
+      weaviate_api_key: null
+      weaviate_cluster_url: ${env.WEAVIATE_CLUSTER_URL:=localhost:8080}
+      persistence:
+        namespace: vector_io::weaviate
+        backend: kv_default
+  files:
+  - provider_id: meta-reference-files
+    provider_type: inline::localfs
+    config:
+      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
+      metadata_store:
+        table_name: files_metadata
+        backend: sql_default
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config:
+      excluded_categories: []
+  - provider_id: code-scanner
+    provider_type: inline::code-scanner
+  agents:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        type: sql_postgres
+        host: ${env.POSTGRES_HOST:=localhost}
+        port: ${env.POSTGRES_PORT:=5432}
+        db: ${env.POSTGRES_DB:=llamastack}
+        user: ${env.POSTGRES_USER:=llamastack}
+        password: ${env.POSTGRES_PASSWORD:=llamastack}
+      responses_store:
+        type: sql_postgres
+        host: ${env.POSTGRES_HOST:=localhost}
+        port: ${env.POSTGRES_PORT:=5432}
+        db: ${env.POSTGRES_DB:=llamastack}
+        user: ${env.POSTGRES_USER:=llamastack}
+        password: ${env.POSTGRES_PASSWORD:=llamastack}
+  post_training:
+  - provider_id: torchtune-cpu
+    provider_type: inline::torchtune-cpu
+    config:
+      checkpoint_format: meta
+  eval:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      kvstore:
+        namespace: eval
+        backend: kv_default
+  datasetio:
+  - provider_id: huggingface
+    provider_type: remote::huggingface
+    config:
+      kvstore:
+        namespace: datasetio::huggingface
+        backend: kv_default
+  - provider_id: localfs
+    provider_type: inline::localfs
+    config:
+      kvstore:
+        namespace: datasetio::localfs
+        backend: kv_default
+  scoring:
+  - provider_id: basic
+    provider_type: inline::basic
+  - provider_id: llm-as-judge
+    provider_type: inline::llm-as-judge
+  - provider_id: braintrust
+    provider_type: inline::braintrust
+    config:
+      openai_api_key: ${env.OPENAI_API_KEY:=}
+  tool_runtime:
+  - provider_id: brave-search
+    provider_type: remote::brave-search
+    config:
+      api_key: ${env.BRAVE_SEARCH_API_KEY:=}
+      max_results: 3
+  - provider_id: tavily-search
+    provider_type: remote::tavily-search
+    config:
+      api_key: ${env.TAVILY_SEARCH_API_KEY:=}
+      max_results: 3
+  - provider_id: rag-runtime
+    provider_type: inline::rag-runtime
+  - provider_id: model-context-protocol
+    provider_type: remote::model-context-protocol
+  batches:
+  - provider_id: reference
+    provider_type: inline::reference
+    config:
+      kvstore:
+        namespace: batches
+        backend: kv_postgres
+storage:
+  backends:
+    kv_postgres:
+      type: kv_postgres
+      host: ${env.POSTGRES_HOST:=localhost}
+      port: ${env.POSTGRES_PORT:=5432}
+      db: ${env.POSTGRES_DB:=llamastack}
+      user: ${env.POSTGRES_USER:=llamastack}
+      password: ${env.POSTGRES_PASSWORD:=llamastack}
+      table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
+    sql_postgres:
+      type: sql_postgres
+      host: ${env.POSTGRES_HOST:=localhost}
+      port: ${env.POSTGRES_PORT:=5432}
+      db: ${env.POSTGRES_DB:=llamastack}
+      user: ${env.POSTGRES_USER:=llamastack}
+      password: ${env.POSTGRES_PASSWORD:=llamastack}
+  stores:
+    metadata:
+      namespace: registry
+      backend: kv_postgres
+    inference:
+      table_name: inference_store
+      backend: sql_postgres
+      max_write_queue_size: 10000
+      num_writers: 4
+    conversations:
+      table_name: openai_conversations
+      backend: sql_postgres
+    prompts:
+      namespace: prompts
+      backend: kv_postgres
+registered_resources:
+  models: []
+  shields: []
+  vector_dbs: []
+  datasets: []
+  scoring_fns: []
+  benchmarks: []
+  tool_groups: []
+server:
+  port: 8321
+telemetry:
+  enabled: true
--- a/src/llama_stack/distributions/starter/run.yaml
+++ b/src/llama_stack/distributions/starter/run.yaml
@ -46,6 +46,9 @@ providers:
      api_key: ${env.TOGETHER_API_KEY:=}
  - provider_id: bedrock
    provider_type: remote::bedrock
+    config:
+      api_key: ${env.AWS_BEDROCK_API_KEY:=}
+      region_name: ${env.AWS_DEFAULT_REGION:=us-east-2}
  - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
    provider_type: remote::nvidia
    config:
--- a/src/llama_stack/distributions/starter/starter.py
+++ b/src/llama_stack/distributions/starter/starter.py
@ -17,6 +17,11 @@ from llama_stack.core.datatypes import (
    ToolGroupInput,
    VectorStoresConfig,
 )
+from llama_stack.core.storage.datatypes import (
+    InferenceStoreReference,
+    KVStoreReference,
+    SqlStoreReference,
+)
 from llama_stack.core.utils.dynamic import instantiate_class_type
 from llama_stack.distributions.template import DistributionTemplate, RunConfigSettings
 from llama_stack.providers.datatypes import RemoteProviderSpec
@ -36,6 +41,7 @@ from llama_stack.providers.remote.vector_io.pgvector.config import (
 )
 from llama_stack.providers.remote.vector_io.qdrant.config import QdrantVectorIOConfig
 from llama_stack.providers.remote.vector_io.weaviate.config import WeaviateVectorIOConfig
+from llama_stack.providers.utils.kvstore.config import PostgresKVStoreConfig
 from llama_stack.providers.utils.sqlstore.sqlstore import PostgresSqlStoreConfig


@ -181,6 +187,62 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate:
            provider_shield_id="${env.CODE_SCANNER_MODEL:=}",
        ),
    ]
+    postgres_config = PostgresSqlStoreConfig.sample_run_config()
+    default_overrides = {
+        "inference": remote_inference_providers + [embedding_provider],
+        "vector_io": [
+            Provider(
+                provider_id="faiss",
+                provider_type="inline::faiss",
+                config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
+            ),
+            Provider(
+                provider_id="sqlite-vec",
+                provider_type="inline::sqlite-vec",
+                config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
+            ),
+            Provider(
+                provider_id="${env.MILVUS_URL:+milvus}",
+                provider_type="inline::milvus",
+                config=MilvusVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
+            ),
+            Provider(
+                provider_id="${env.CHROMADB_URL:+chromadb}",
+                provider_type="remote::chromadb",
+                config=ChromaVectorIOConfig.sample_run_config(
+                    f"~/.llama/distributions/{name}/",
+                    url="${env.CHROMADB_URL:=}",
+                ),
+            ),
+            Provider(
+                provider_id="${env.PGVECTOR_DB:+pgvector}",
+                provider_type="remote::pgvector",
+                config=PGVectorVectorIOConfig.sample_run_config(
+                    f"~/.llama/distributions/{name}",
+                    db="${env.PGVECTOR_DB:=}",
+                    user="${env.PGVECTOR_USER:=}",
+                    password="${env.PGVECTOR_PASSWORD:=}",
+                ),
+            ),
+            Provider(
+                provider_id="${env.QDRANT_URL:+qdrant}",
+                provider_type="remote::qdrant",
+                config=QdrantVectorIOConfig.sample_run_config(
+                    f"~/.llama/distributions/{name}",
+                    url="${env.QDRANT_URL:=}",
+                ),
+            ),
+            Provider(
+                provider_id="${env.WEAVIATE_CLUSTER_URL:+weaviate}",
+                provider_type="remote::weaviate",
+                config=WeaviateVectorIOConfig.sample_run_config(
+                    f"~/.llama/distributions/{name}",
+                    cluster_url="${env.WEAVIATE_CLUSTER_URL:=}",
+                ),
+            ),
+        ],
+        "files": [files_provider],
+    }

    return DistributionTemplate(
        name=name,
@ -189,64 +251,10 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate:
        container_image=None,
        template_path=None,
        providers=providers,
-        additional_pip_packages=PostgresSqlStoreConfig.pip_packages(),
+        additional_pip_packages=list(set(PostgresSqlStoreConfig.pip_packages() + PostgresKVStoreConfig.pip_packages())),
        run_configs={
            "run.yaml": RunConfigSettings(
-                provider_overrides={
-                    "inference": remote_inference_providers + [embedding_provider],
-                    "vector_io": [
-                        Provider(
-                            provider_id="faiss",
-                            provider_type="inline::faiss",
-                            config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
-                        ),
-                        Provider(
-                            provider_id="sqlite-vec",
-                            provider_type="inline::sqlite-vec",
-                            config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
-                        ),
-                        Provider(
-                            provider_id="${env.MILVUS_URL:+milvus}",
-                            provider_type="inline::milvus",
-                            config=MilvusVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
-                        ),
-                        Provider(
-                            provider_id="${env.CHROMADB_URL:+chromadb}",
-                            provider_type="remote::chromadb",
-                            config=ChromaVectorIOConfig.sample_run_config(
-                                f"~/.llama/distributions/{name}/",
-                                url="${env.CHROMADB_URL:=}",
-                            ),
-                        ),
-                        Provider(
-                            provider_id="${env.PGVECTOR_DB:+pgvector}",
-                            provider_type="remote::pgvector",
-                            config=PGVectorVectorIOConfig.sample_run_config(
-                                f"~/.llama/distributions/{name}",
-                                db="${env.PGVECTOR_DB:=}",
-                                user="${env.PGVECTOR_USER:=}",
-                                password="${env.PGVECTOR_PASSWORD:=}",
-                            ),
-                        ),
-                        Provider(
-                            provider_id="${env.QDRANT_URL:+qdrant}",
-                            provider_type="remote::qdrant",
-                            config=QdrantVectorIOConfig.sample_run_config(
-                                f"~/.llama/distributions/{name}",
-                                url="${env.QDRANT_URL:=}",
-                            ),
-                        ),
-                        Provider(
-                            provider_id="${env.WEAVIATE_CLUSTER_URL:+weaviate}",
-                            provider_type="remote::weaviate",
-                            config=WeaviateVectorIOConfig.sample_run_config(
-                                f"~/.llama/distributions/{name}",
-                                cluster_url="${env.WEAVIATE_CLUSTER_URL:=}",
-                            ),
-                        ),
-                    ],
-                    "files": [files_provider],
-                },
+                provider_overrides=default_overrides,
                default_models=[],
                default_tool_groups=default_tool_groups,
                default_shields=default_shields,
@ -261,6 +269,55 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate:
                    default_shield_id="llama-guard",
                ),
            ),
+            "run-with-postgres-store.yaml": RunConfigSettings(
+                provider_overrides={
+                    **default_overrides,
+                    "agents": [
+                        Provider(
+                            provider_id="meta-reference",
+                            provider_type="inline::meta-reference",
+                            config=dict(
+                                persistence_store=postgres_config,
+                                responses_store=postgres_config,
+                            ),
+                        )
+                    ],
+                    "batches": [
+                        Provider(
+                            provider_id="reference",
+                            provider_type="inline::reference",
+                            config=dict(
+                                kvstore=KVStoreReference(
+                                    backend="kv_postgres",
+                                    namespace="batches",
+                                ).model_dump(exclude_none=True),
+                            ),
+                        )
+                    ],
+                },
+                storage_backends={
+                    "kv_postgres": PostgresKVStoreConfig.sample_run_config(),
+                    "sql_postgres": postgres_config,
+                },
+                storage_stores={
+                    "metadata": KVStoreReference(
+                        backend="kv_postgres",
+                        namespace="registry",
+                    ).model_dump(exclude_none=True),
+                    "inference": InferenceStoreReference(
+                        backend="sql_postgres",
+                        table_name="inference_store",
+                    ).model_dump(exclude_none=True),
+                    "conversations": SqlStoreReference(
+                        backend="sql_postgres",
+                        table_name="openai_conversations",
+                    ).model_dump(exclude_none=True),
+                    "prompts": KVStoreReference(
+                        backend="kv_postgres",
+                        namespace="prompts",
+                    ).model_dump(exclude_none=True),
+                },
+            ),
        },
        run_config_env_vars={
            "LLAMA_STACK_PORT": (
--- a/src/llama_stack/providers/inline/inference/meta_reference/inference.py
+++ b/src/llama_stack/providers/inline/inference/meta_reference/inference.py
@ -146,7 +146,7 @@ class MetaReferenceInferenceImpl(
    def check_model(self, request) -> None:
        if self.model_id is None or self.llama_model is None:
            raise RuntimeError(
-                "No avaible model yet, please register your requested model or add your model in the resouces first"
+                "No available model yet, please register your requested model or add your model in the resources first"
            )
        elif request.model != self.model_id:
            raise RuntimeError(f"Model mismatch: request model: {request.model} != loaded model: {self.model_id}")
--- a/src/llama_stack/providers/inline/post_training/torchtune/common/checkpointer.py
+++ b/src/llama_stack/providers/inline/post_training/torchtune/common/checkpointer.py
@ -91,7 +91,7 @@ class TorchtuneCheckpointer:
        if checkpoint_format == "meta" or checkpoint_format is None:
            self._save_meta_format_checkpoint(model_file_path, state_dict, adapter_only)
        elif checkpoint_format == "huggingface":
-            # Note: for saving hugging face format checkpoints, we only suppport saving adapter weights now
+            # Note: for saving hugging face format checkpoints, we only support saving adapter weights now
            self._save_hf_format_checkpoint(model_file_path, state_dict)
        else:
            raise ValueError(f"Unsupported checkpoint format: {format}")
--- a/src/llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py
+++ b/src/llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py
@ -25,7 +25,7 @@ def llama_stack_instruct_to_torchtune_instruct(
    )
    input_messages = json.loads(sample[ColumnName.chat_completion_input.value])

-    assert len(input_messages) == 1, "llama stack intruct dataset format only supports 1 user message"
+    assert len(input_messages) == 1, "llama stack instruct dataset format only supports 1 user message"
    input_message = input_messages[0]

    assert "content" in input_message, "content not found in input message"
--- a/src/llama_stack/providers/registry/inference.py
+++ b/src/llama_stack/providers/registry/inference.py
@ -138,10 +138,11 @@ def available_providers() -> list[ProviderSpec]:
            api=Api.inference,
            adapter_type="bedrock",
            provider_type="remote::bedrock",
-            pip_packages=["boto3"],
+            pip_packages=[],
            module="llama_stack.providers.remote.inference.bedrock",
            config_class="llama_stack.providers.remote.inference.bedrock.BedrockConfig",
-            description="AWS Bedrock inference provider for accessing various AI models through AWS's managed service.",
+            provider_data_validator="llama_stack.providers.remote.inference.bedrock.config.BedrockProviderDataValidator",
+            description="AWS Bedrock inference provider using OpenAI compatible endpoint.",
        ),
        RemoteProviderSpec(
            api=Api.inference,
--- a/src/llama_stack/providers/remote/datasetio/nvidia/README.md
+++ b/src/llama_stack/providers/remote/datasetio/nvidia/README.md
@ -20,6 +20,7 @@ This provider enables dataset management using NVIDIA's NeMo Customizer service.
 Build the NVIDIA environment:

 ```bash
+uv pip install llama-stack-client
 uv run llama stack list-deps nvidia | xargs -L1 uv pip install
 ```

--- a/src/llama_stack/providers/remote/inference/bedrock/init.py
+++ b/src/llama_stack/providers/remote/inference/bedrock/init.py
@ -11,7 +11,7 @@ async def get_adapter_impl(config: BedrockConfig, _deps):

    assert isinstance(config, BedrockConfig), f"Unexpected config type: {type(config)}"

-    impl = BedrockInferenceAdapter(config)
+    impl = BedrockInferenceAdapter(config=config)

    await impl.initialize()

--- a/src/llama_stack/providers/remote/inference/bedrock/bedrock.py
+++ b/src/llama_stack/providers/remote/inference/bedrock/bedrock.py
@ -4,139 +4,124 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-import json
-from collections.abc import AsyncIterator
+from collections.abc import AsyncIterator, Iterable

-from botocore.client import BaseClient
+from openai import AuthenticationError

 from llama_stack.apis.inference import (
-    ChatCompletionRequest,
-    Inference,
+    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
    OpenAIChatCompletionRequestWithExtraBody,
+    OpenAICompletion,
    OpenAICompletionRequestWithExtraBody,
    OpenAIEmbeddingsRequestWithExtraBody,
    OpenAIEmbeddingsResponse,
 )
-from llama_stack.apis.inference.inference import (
-    OpenAIChatCompletion,
-    OpenAIChatCompletionChunk,
-    OpenAICompletion,
-)
-from llama_stack.providers.remote.inference.bedrock.config import BedrockConfig
-from llama_stack.providers.utils.bedrock.client import create_bedrock_client
-from llama_stack.providers.utils.inference.model_registry import (
-    ModelRegistryHelper,
-)
-from llama_stack.providers.utils.inference.openai_compat import (
-    get_sampling_strategy_options,
-)
-from llama_stack.providers.utils.inference.prompt_adapter import (
-    chat_completion_request_to_prompt,
-)
+from llama_stack.core.telemetry.tracing import get_current_span
+from llama_stack.log import get_logger
+from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin

-from .models import MODEL_ENTRIES
+from .config import BedrockConfig

-REGION_PREFIX_MAP = {
-    "us": "us.",
-    "eu": "eu.",
-    "ap": "ap.",
-}
+logger = get_logger(name=__name__, category="inference::bedrock")


-def _get_region_prefix(region: str | None) -> str:
-    # AWS requires region prefixes for inference profiles
-    if region is None:
-        return "us."  # default to US when we don't know
+class BedrockInferenceAdapter(OpenAIMixin):
+    """
+    Adapter for AWS Bedrock's OpenAI-compatible API endpoints.

-    # Handle case insensitive region matching
-    region_lower = region.lower()
-    for prefix in REGION_PREFIX_MAP:
-        if region_lower.startswith(f"{prefix}-"):
-            return REGION_PREFIX_MAP[prefix]
+    Supports Llama models across regions and GPT-OSS models (us-west-2 only).

-    # Fallback to US for anything we don't recognize
-    return "us."
+    Note: Bedrock's OpenAI-compatible endpoint does not support /v1/models
+    for dynamic model discovery. Models must be pre-registered in the config.
+    """

+    config: BedrockConfig
+    provider_data_api_key_field: str = "aws_bedrock_api_key"

-def _to_inference_profile_id(model_id: str, region: str = None) -> str:
-    # Return ARNs unchanged
-    if model_id.startswith("arn:"):
-        return model_id
+    def get_base_url(self) -> str:
+        """Get base URL for OpenAI client.
+
+        The URL is the Bedrock Runtime host for ``self.config.region_name``
+        followed by the ``/openai/v1`` path.
+        """
+        return f"https://bedrock-runtime.{self.config.region_name}.amazonaws.com/openai/v1"

-    # Return inference profile IDs that already have regional prefixes
-    if any(model_id.startswith(p) for p in REGION_PREFIX_MAP.values()):
-        return model_id
+    async def list_provider_model_ids(self) -> Iterable[str]:
+        """
+        Return the model IDs discoverable from the provider; always empty here.
+
+        Bedrock's OpenAI-compatible endpoint does not support the /v1/models endpoint,
+        so an empty list is returned and models must be pre-registered in the config.
+        """
+        return []

-    # Default to US East when no region is provided
-    if region is None:
-        region = "us-east-1"
-
-    return _get_region_prefix(region) + model_id
-
-
-class BedrockInferenceAdapter(
-    ModelRegistryHelper,
-    Inference,
-):
-    def __init__(self, config: BedrockConfig) -> None:
-        ModelRegistryHelper.__init__(self, model_entries=MODEL_ENTRIES)
-        self._config = config
-        self._client = None
-
-    @property
-    def client(self) -> BaseClient:
-        if self._client is None:
-            self._client = create_bedrock_client(self._config)
-        return self._client
-
-    async def initialize(self) -> None:
-        pass
-
-    async def shutdown(self) -> None:
-        if self._client is not None:
-            self._client.close()
-
-    async def _get_params_for_chat_completion(self, request: ChatCompletionRequest) -> dict:
-        bedrock_model = request.model
-
-        sampling_params = request.sampling_params
-        options = get_sampling_strategy_options(sampling_params)
-
-        if sampling_params.max_tokens:
-            options["max_gen_len"] = sampling_params.max_tokens
-        if sampling_params.repetition_penalty > 0:
-            options["repetition_penalty"] = sampling_params.repetition_penalty
-
-        prompt = await chat_completion_request_to_prompt(request, self.get_llama_model(request.model))
-
-        # Convert foundation model ID to inference profile ID
-        region_name = self.client.meta.region_name
-        inference_profile_id = _to_inference_profile_id(bedrock_model, region_name)
-
-        return {
-            "modelId": inference_profile_id,
-            "body": json.dumps(
-                {
-                    "prompt": prompt,
-                    **options,
-                }
-            ),
-        }
+    async def check_model_availability(self, model: str) -> bool:
+        """
+        Report whether ``model`` can be used; this adapter accepts every model.
+
+        Bedrock doesn't support dynamic model listing via /v1/models, so there is
+        nothing to check against. ``model`` is ignored and True is always returned,
+        which accepts all models registered in the config.
+        """
+        return True

    async def openai_embeddings(
        self,
        params: OpenAIEmbeddingsRequestWithExtraBody,
    ) -> OpenAIEmbeddingsResponse:
-        raise NotImplementedError()
+        """Bedrock's OpenAI-compatible API does not support the /v1/embeddings endpoint."""
+        raise NotImplementedError(
+            "Bedrock's OpenAI-compatible API does not support /v1/embeddings endpoint. "
+            "See https://docs.aws.amazon.com/bedrock/latest/userguide/inference-chat-completions.html"
+        )

    async def openai_completion(
        self,
        params: OpenAICompletionRequestWithExtraBody,
    ) -> OpenAICompletion:
-        raise NotImplementedError("OpenAI completion not supported by the Bedrock provider")
+        """Bedrock's OpenAI-compatible API does not support the /v1/completions endpoint."""
+        raise NotImplementedError(
+            "Bedrock's OpenAI-compatible API does not support /v1/completions endpoint. "
+            "Only /v1/chat/completions is supported. "
+            "See https://docs.aws.amazon.com/bedrock/latest/userguide/inference-chat-completions.html"
+        )

    async def openai_chat_completion(
        self,
        params: OpenAIChatCompletionRequestWithExtraBody,
    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
-        raise NotImplementedError("OpenAI chat completion not supported by the Bedrock provider")
+        """Override to enable streaming usage metrics and handle authentication errors.
+
+        Delegates to the parent class's ``openai_chat_completion`` after optionally
+        adjusting ``params.stream_options`` (``params`` is modified in place).
+
+        Args:
+            params: The chat completion request. When ``params.stream`` is truthy and
+                a tracing span is active, ``include_usage`` is set to True in
+                ``stream_options`` unless the caller already supplied that key.
+
+        Returns:
+            Whatever the parent implementation returns (never None).
+
+        Raises:
+            RuntimeError: If the parent implementation returns None.
+            ValueError: If the OpenAI client raises ``AuthenticationError``; the
+                original exception is chained as the cause.
+            Exception: Any other exception is logged and re-raised unchanged.
+        """
+        # Enable streaming usage metrics when telemetry is active
+        if params.stream and get_current_span() is not None:
+            if params.stream_options is None:
+                params.stream_options = {"include_usage": True}
+            # An explicit caller-provided "include_usage" (even False) is left untouched.
+            elif "include_usage" not in params.stream_options:
+                params.stream_options = {**params.stream_options, "include_usage": True}
+
+        try:
+            logger.debug(f"Calling Bedrock OpenAI API with model={params.model}, stream={params.stream}")
+            result = await super().openai_chat_completion(params=params)
+            logger.debug(f"Bedrock API returned: {type(result).__name__ if result is not None else 'None'}")
+
+            if result is None:
+                logger.error(f"Bedrock OpenAI client returned None for model={params.model}, stream={params.stream}")
+                # NOTE(review): this RuntimeError is raised inside the try block, so the
+                # trailing `except Exception` handler logs it a second time before re-raising.
+                raise RuntimeError(
+                    f"Bedrock API returned no response for model '{params.model}'. "
+                    "This may indicate the model is not supported or a network/API issue occurred."
+                )
+
+            return result
+        except AuthenticationError as e:
+            error_msg = str(e)
+
+            # Check if this is a token expiration error
+            # NOTE(review): the case-insensitive "expired" test already covers the
+            # "Bearer Token has expired" phrase, so the second clause is redundant.
+            if "expired" in error_msg.lower() or "Bearer Token has expired" in error_msg:
+                logger.error(f"AWS Bedrock authentication token expired: {error_msg}")
+                # NOTE(review): the message names only the AWS_BEDROCK_API_KEY environment
+                # variable, while the non-expiry message below says the key may also come
+                # from the x-llamastack-provider-data header -- confirm the wording.
+                raise ValueError(
+                    "AWS Bedrock authentication failed: Bearer token has expired. "
+                    "The AWS_BEDROCK_API_KEY environment variable contains an expired pre-signed URL. "
+                    "Please refresh your token by generating a new pre-signed URL with AWS credentials. "
+                    "Refer to AWS Bedrock documentation for details on OpenAI-compatible endpoints."
+                ) from e
+            else:
+                logger.error(f"AWS Bedrock authentication failed: {error_msg}")
+                raise ValueError(
+                    f"AWS Bedrock authentication failed: {error_msg}. "
+                    "Please verify your API key is correct in the provider config or x-llamastack-provider-data header. "
+                    "The API key should be a valid AWS pre-signed URL for Bedrock's OpenAI-compatible endpoint."
+                ) from e
+        except Exception as e:
+            # Log with traceback for diagnosis, then propagate the original exception unchanged.
+            logger.error(f"Unexpected error calling Bedrock API: {type(e).__name__}: {e}", exc_info=True)
+            raise
--- a/src/llama_stack/providers/remote/inference/bedrock/config.py
+++ b/src/llama_stack/providers/remote/inference/bedrock/config.py
@ -4,8 +4,29 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.providers.utils.bedrock.config import BedrockBaseConfig
+import os
+
+from pydantic import BaseModel, Field
+
+from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig


-class BedrockConfig(BedrockBaseConfig):
-    pass
+class BedrockProviderDataValidator(BaseModel):
+    """Schema for the provider data accepted by the Bedrock inference provider."""
+
+    # Optional Bedrock API key; None when the caller does not supply one.
+    aws_bedrock_api_key: str | None = Field(
+        default=None,
+        description="API key for Amazon Bedrock",
+    )
+
+
+class BedrockConfig(RemoteInferenceProviderConfig):
+    """Configuration for the Bedrock inference provider."""
+
+    # When region_name is not supplied, the default is read from the
+    # AWS_DEFAULT_REGION environment variable, falling back to "us-east-2".
+    region_name: str = Field(
+        default_factory=lambda: os.getenv("AWS_DEFAULT_REGION", "us-east-2"),
+        description="AWS Region for the Bedrock Runtime endpoint",
+    )
+
+    @classmethod
+    def sample_run_config(cls, **kwargs):
+        """Return a sample provider config using environment-variable placeholders.
+
+        ``kwargs`` is accepted but not used. The ``api_key`` entry presumably maps to
+        a field inherited from RemoteInferenceProviderConfig -- TODO confirm.
+        """
+        return {
+            "api_key": "${env.AWS_BEDROCK_API_KEY:=}",
+            "region_name": "${env.AWS_DEFAULT_REGION:=us-east-2}",
+        }
--- a/src/llama_stack/providers/remote/inference/bedrock/models.py
+++ b/src/llama_stack/providers/remote/inference/bedrock/models.py
@ -1,29 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack.models.llama.sku_types import CoreModelId
-from llama_stack.providers.utils.inference.model_registry import (
-    build_hf_repo_model_entry,
-)
-
-SAFETY_MODELS_ENTRIES = []
-
-
-# https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html
-MODEL_ENTRIES = [
-    build_hf_repo_model_entry(
-        "meta.llama3-1-8b-instruct-v1:0",
-        CoreModelId.llama3_1_8b_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "meta.llama3-1-70b-instruct-v1:0",
-        CoreModelId.llama3_1_70b_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "meta.llama3-1-405b-instruct-v1:0",
-        CoreModelId.llama3_1_405b_instruct.value,
-    ),
-] + SAFETY_MODELS_ENTRIES
--- a/src/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
+++ b/src/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
@ -18,6 +18,7 @@ This provider enables running inference using NVIDIA NIM.
 Build the NVIDIA environment:

 ```bash
+uv pip install llama-stack-client
 uv run llama stack list-deps nvidia | xargs -L1 uv pip install
 ```

@ -199,4 +200,4 @@ rerank_response = client.alpha.inference.rerank(

 for i, result in enumerate(rerank_response):
    print(f"{i+1}. [Index: {result.index}, " f"Score: {(result.relevance_score):.3f}]")
-```
+```
--- a/src/llama_stack/providers/remote/inference/passthrough/init.py
+++ b/src/llama_stack/providers/remote/inference/passthrough/init.py
@ -10,8 +10,8 @@ from .config import PassthroughImplConfig


 class PassthroughProviderDataValidator(BaseModel):
-    url: str
-    api_key: str
+    passthrough_url: str
+    passthrough_api_key: str


 async def get_adapter_impl(config: PassthroughImplConfig, _deps):
--- a/src/llama_stack/providers/remote/inference/passthrough/config.py
+++ b/src/llama_stack/providers/remote/inference/passthrough/config.py
@ -6,7 +6,7 @@

 from typing import Any

-from pydantic import Field, SecretStr
+from pydantic import Field

 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack.schema_utils import json_schema_type
@ -19,11 +19,6 @@ class PassthroughImplConfig(RemoteInferenceProviderConfig):
        description="The URL for the passthrough endpoint",
    )

-    api_key: SecretStr | None = Field(
-        default=None,
-        description="API Key for the passthrouth endpoint",
-    )
-
    @classmethod
    def sample_run_config(
        cls, url: str = "${env.PASSTHROUGH_URL}", api_key: str = "${env.PASSTHROUGH_API_KEY}", **kwargs
--- a/src/llama_stack/providers/remote/inference/passthrough/passthrough.py
+++ b/src/llama_stack/providers/remote/inference/passthrough/passthrough.py
@ -5,9 +5,8 @@
 # the root directory of this source tree.

 from collections.abc import AsyncIterator
-from typing import Any

-from llama_stack_client import AsyncLlamaStackClient
+from openai import AsyncOpenAI

 from llama_stack.apis.inference import (
    Inference,
@ -20,103 +19,117 @@ from llama_stack.apis.inference import (
    OpenAIEmbeddingsResponse,
 )
 from llama_stack.apis.models import Model
-from llama_stack.core.library_client import convert_pydantic_to_json_value
-from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
+from llama_stack.core.request_headers import NeedsRequestProviderData

 from .config import PassthroughImplConfig


-class PassthroughInferenceAdapter(Inference):
+class PassthroughInferenceAdapter(NeedsRequestProviderData, Inference):
    def __init__(self, config: PassthroughImplConfig) -> None:
-        ModelRegistryHelper.__init__(self)
        self.config = config

+    async def initialize(self) -> None:
+        """No-op: this adapter does no work in this hook."""
+        pass
+
+    async def shutdown(self) -> None:
+        """No-op: this adapter does no work in this hook."""
+        pass
+
    async def unregister_model(self, model_id: str) -> None:
        pass

    async def register_model(self, model: Model) -> Model:
        return model

-    def _get_client(self) -> AsyncLlamaStackClient:
-        passthrough_url = None
-        passthrough_api_key = None
-        provider_data = None
+    async def list_models(self) -> list[Model]:
+        """List models by calling the downstream /v1/models endpoint.
+
+        Each downstream entry becomes a ``Model`` whose ``identifier`` is
+        ``<provider id>/<downstream id>`` and whose ``provider_resource_id`` is
+        the unprefixed downstream id. ``model_type`` is read from the entry's
+        ``custom_metadata`` and defaults to ``"llm"``.
+        """
+        client = self._get_openai_client()

-        if self.config.url is not None:
-            passthrough_url = self.config.url
-        else:
-            provider_data = self.get_request_provider_data()
-            if provider_data is None or not provider_data.passthrough_url:
-                raise ValueError(
-                    'Pass url of the passthrough endpoint in the header X-LlamaStack-Provider-Data as { "passthrough_url": <your passthrough url>}'
-                )
-            passthrough_url = provider_data.passthrough_url
+        response = await client.models.list()

-        if self.config.api_key is not None:
-            passthrough_api_key = self.config.api_key.get_secret_value()
-        else:
-            provider_data = self.get_request_provider_data()
-            if provider_data is None or not provider_data.passthrough_api_key:
-                raise ValueError(
-                    'Pass API Key for the passthrough endpoint in the header X-LlamaStack-Provider-Data as { "passthrough_api_key": <your api key>}'
-                )
-            passthrough_api_key = provider_data.passthrough_api_key
+        # Convert from OpenAI format to Llama Stack Model format
+        models = []
+        for model_data in response.data:
+            downstream_model_id = model_data.id
+            # custom_metadata may be absent or None on an entry; fall back to {}
+            custom_metadata = getattr(model_data, "custom_metadata", {}) or {}

-        return AsyncLlamaStackClient(
-            base_url=passthrough_url,
-            api_key=passthrough_api_key,
-            provider_data=provider_data,
+            # Prefix identifier with provider ID for local registry
+            local_identifier = f"{self.__provider_id__}/{downstream_model_id}"
+
+            model = Model(
+                identifier=local_identifier,
+                provider_id=self.__provider_id__,
+                provider_resource_id=downstream_model_id,
+                model_type=custom_metadata.get("model_type", "llm"),
+                metadata=custom_metadata,
+            )
+            models.append(model)
+
+        return models
+
+    async def should_refresh_models(self) -> bool:
+        """Report whether the model list should be refreshed, as set by ``config.refresh_models``."""
+        return self.config.refresh_models
+
+    def _get_openai_client(self) -> AsyncOpenAI:
+        """Build an AsyncOpenAI client aimed at the downstream server.
+
+        The passthrough URL has any trailing slashes removed and ``/v1``
+        appended; the API key comes from ``_get_passthrough_api_key``.
+        """
+        downstream_url = self._get_passthrough_url()
+        downstream_key = self._get_passthrough_api_key()
+        v1_root = downstream_url.rstrip("/") + "/v1"
+
+        return AsyncOpenAI(
+            base_url=v1_root,
+            api_key=downstream_key,
         )

-    async def openai_embeddings(
-        self,
-        params: OpenAIEmbeddingsRequestWithExtraBody,
-    ) -> OpenAIEmbeddingsResponse:
-        raise NotImplementedError()
+    def _get_passthrough_url(self) -> str:
+        """Get the passthrough URL from config or provider data.
+
+        A statically configured ``config.url`` takes precedence; otherwise the
+        URL is read from the per-request provider data.
+
+        Raises:
+            ValueError: if no URL is configured and the request's provider data
+                is missing or carries an empty ``passthrough_url``.
+        """
+        if self.config.url is not None:
+            return self.config.url
+
+        provider_data = self.get_request_provider_data()
+        # Reject an empty value as well, so a blank header fails here with a
+        # clear message rather than later as a malformed base URL.
+        if provider_data is None or not provider_data.passthrough_url:
+            raise ValueError(
+                'Pass url of the passthrough endpoint in the header X-LlamaStack-Provider-Data as { "passthrough_url": <your passthrough url>}'
+            )
+        return provider_data.passthrough_url
+
+    def _get_passthrough_api_key(self) -> str:
+        """Get the passthrough API key from config or provider data.
+
+        A configured ``config.auth_credential`` takes precedence (its secret
+        value is unwrapped); otherwise the key is read from the per-request
+        provider data.
+
+        Raises:
+            ValueError: if no credential is configured and the request's
+                provider data is missing or carries an empty
+                ``passthrough_api_key``.
+        """
+        if self.config.auth_credential is not None:
+            return self.config.auth_credential.get_secret_value()
+
+        provider_data = self.get_request_provider_data()
+        # Reject an empty value as well, so a blank header fails here with a
+        # clear message rather than as an authentication error downstream.
+        if provider_data is None or not provider_data.passthrough_api_key:
+            raise ValueError(
+                'Pass API Key for the passthrough endpoint in the header X-LlamaStack-Provider-Data as { "passthrough_api_key": <your api key>}'
+            )
+        return provider_data.passthrough_api_key

     async def openai_completion(
         self,
         params: OpenAICompletionRequestWithExtraBody,
     ) -> OpenAICompletion:
-        client = self._get_client()
-        model_obj = await self.model_store.get_model(params.model)
-
-        params = params.model_copy()
-        params.model = model_obj.provider_resource_id
-
+        """Forward completion request to downstream using OpenAI client.
+
+        ``params`` is dumped with ``exclude_none=True`` and the resulting fields
+        are passed as keyword arguments to ``client.completions.create``; the
+        downstream response is returned unchanged.
+        """
+        # NOTE(review): unlike the removed code, params.model is forwarded as-is
+        # (no model_store lookup of provider_resource_id) — confirm callers
+        # already pass the downstream model id.
+        client = self._get_openai_client()
         request_params = params.model_dump(exclude_none=True)
-
-        return await client.inference.openai_completion(**request_params)
+        response = await client.completions.create(**request_params)
+        return response  # type: ignore

     async def openai_chat_completion(
         self,
         params: OpenAIChatCompletionRequestWithExtraBody,
     ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
-        client = self._get_client()
-        model_obj = await self.model_store.get_model(params.model)
-
-        params = params.model_copy()
-        params.model = model_obj.provider_resource_id
-
+        """Forward chat completion request to downstream using OpenAI client.
+
+        ``params`` is dumped with ``exclude_none=True`` and the resulting fields
+        are passed as keyword arguments to ``client.chat.completions.create``;
+        whatever that call returns is handed back unchanged.
+        """
+        # NOTE(review): unlike the removed code, params.model is forwarded as-is
+        # (no model_store lookup of provider_resource_id) — confirm callers
+        # already pass the downstream model id.
+        client = self._get_openai_client()
         request_params = params.model_dump(exclude_none=True)
+        response = await client.chat.completions.create(**request_params)
+        return response  # type: ignore

-        return await client.inference.openai_chat_completion(**request_params)
-
-    def cast_value_to_json_dict(self, request_params: dict[str, Any]) -> dict[str, Any]:
-        json_params = {}
-        for key, value in request_params.items():
-            json_input = convert_pydantic_to_json_value(value)
-            if isinstance(json_input, dict):
-                json_input = {k: v for k, v in json_input.items() if v is not None}
-            elif isinstance(json_input, list):
-                json_input = [x for x in json_input if x is not None]
-                new_input = []
-                for x in json_input:
-                    if isinstance(x, dict):
-                        x = {k: v for k, v in x.items() if v is not None}
-                    new_input.append(x)
-                json_input = new_input
-
-            json_params[key] = json_input
-
-        return json_params
+    async def openai_embeddings(
+        self,
+        params: OpenAIEmbeddingsRequestWithExtraBody,
+    ) -> OpenAIEmbeddingsResponse:
+        """Relay an embeddings request to the downstream server.
+
+        The request is dumped with ``exclude_none=True`` and its fields are
+        passed as keyword arguments to the OpenAI client's
+        ``embeddings.create``; the downstream response is returned unchanged.
+        """
+        downstream = self._get_openai_client()
+        payload = params.model_dump(exclude_none=True)
+        return await downstream.embeddings.create(**payload)  # type: ignore
--- a/src/llama_stack/providers/remote/post_training/nvidia/README.md
+++ b/src/llama_stack/providers/remote/post_training/nvidia/README.md
@ -22,6 +22,7 @@ This provider enables fine-tuning of LLMs using NVIDIA's NeMo Customizer service
 Build the NVIDIA environment:

 ```bash
+uv pip install llama-stack-client
 uv run llama stack list-deps nvidia | xargs -L1 uv pip install
 ```

--- a/src/llama_stack/providers/remote/safety/nvidia/README.md
+++ b/src/llama_stack/providers/remote/safety/nvidia/README.md
@ -19,6 +19,7 @@ This provider enables safety checks and guardrails for LLM interactions using NV
 Build the NVIDIA environment:

 ```bash
+uv pip install llama-stack-client
 uv run llama stack list-deps nvidia | xargs -L1 uv pip install
 ```

--- a/src/llama_stack/providers/utils/memory/openai_vector_store_mixin.py
+++ b/src/llama_stack/providers/utils/memory/openai_vector_store_mixin.py
@ -26,6 +26,7 @@ from llama_stack.apis.vector_io import (
    VectorStoreChunkingStrategy,
    VectorStoreChunkingStrategyAuto,
    VectorStoreChunkingStrategyStatic,
+    VectorStoreChunkingStrategyStaticConfig,
    VectorStoreContent,
    VectorStoreDeleteResponse,
    VectorStoreFileBatchObject,
@ -414,6 +415,10 @@ class OpenAIVectorStoreMixin(ABC):
            in_progress=0,
            total=0,
        )
+        if not params.chunking_strategy or params.chunking_strategy.type == "auto":
+            chunking_strategy = VectorStoreChunkingStrategyStatic(static=VectorStoreChunkingStrategyStaticConfig())
+        else:
+            chunking_strategy = params.chunking_strategy
        store_info: dict[str, Any] = {
            "id": vector_store_id,
            "object": "vector_store",
@ -426,7 +431,7 @@ class OpenAIVectorStoreMixin(ABC):
            "expires_at": None,
            "last_active_at": created_at,
            "file_ids": [],
-            "chunking_strategy": params.chunking_strategy,
+            "chunking_strategy": chunking_strategy.model_dump(),
        }

        # Add provider information to metadata if provided
@ -637,7 +642,7 @@ class OpenAIVectorStoreMixin(ABC):
                    break

            return VectorStoreSearchResponsePage(
-                search_query=search_query,
+                search_query=query if isinstance(query, list) else [query],
                data=data,
                has_more=False,  # For simplicity, we don't implement pagination here
                next_page=None,
@ -647,7 +652,7 @@ class OpenAIVectorStoreMixin(ABC):
            logger.error(f"Error searching vector store {vector_store_id}: {e}")
            # Return empty results on error
            return VectorStoreSearchResponsePage(
-                search_query=search_query,
+                search_query=query if isinstance(query, list) else [query],
                data=[],
                has_more=False,
                next_page=None,