Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-07-16 09:58:10 +00:00)

Merge branch 'main' into chunk-metadata
Commit f52eb51555
40 changed files with 272 additions and 722 deletions
@@ -5,10 +5,10 @@
 # the root directory of this source tree.
 
 from enum import Enum
-from typing import Annotated, Any, Literal
+from typing import Annotated, Any, Literal, Protocol
 
 from pydantic import BaseModel, Field, field_validator
-from typing_extensions import Protocol, runtime_checkable
+from typing_extensions import runtime_checkable
 
 from llama_stack.apis.common.content_types import URL, InterleavedContent
 from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
@@ -5,10 +5,10 @@
 # the root directory of this source tree.
 
 from enum import Enum
-from typing import Any, Literal
+from typing import Any, Literal, Protocol
 
 from pydantic import BaseModel, Field
-from typing_extensions import Protocol, runtime_checkable
+from typing_extensions import runtime_checkable
 
 from llama_stack.apis.common.content_types import URL, InterleavedContent
 from llama_stack.apis.resource import Resource, ResourceType
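Both hunks above make the same import move: Protocol now comes from the standard typing module while runtime_checkable is still taken from typing_extensions. A minimal standalone sketch of that split (assuming Python 3.8+ and the typing_extensions package; the class names here are illustrative, not from the repo):

    from typing import Protocol

    from typing_extensions import runtime_checkable

    @runtime_checkable
    class SupportsClose(Protocol):
        def close(self) -> None: ...

    class TempFile:
        def close(self) -> None:
            print("closed")

    # runtime_checkable allows isinstance() checks against the protocol
    print(isinstance(TempFile(), SupportsClose))  # True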
@@ -432,6 +432,7 @@ class VectorIO(Protocol):
         max_num_results: int | None = 10,
         ranking_options: SearchRankingOptions | None = None,
         rewrite_query: bool | None = False,
         search_mode: str | None = "vector",  # Using str instead of Literal due to OpenAPI schema generator limitations
     ) -> VectorStoreSearchResponsePage:
         """Search for chunks in a vector store.

@@ -443,6 +444,7 @@ class VectorIO(Protocol):
         :param max_num_results: Maximum number of results to return (1 to 50 inclusive, default 10).
         :param ranking_options: Ranking options for fine-tuning the search results.
         :param rewrite_query: Whether to rewrite the natural language query for vector search (default false)
         :param search_mode: The search mode to use - "keyword", "vector", or "hybrid" (default "vector")
         :returns: A VectorStoreSearchResponse containing the search results.
         """
         ...
@@ -101,7 +101,7 @@ def build_image(
     template_or_config: str,
     run_config: str | None = None,
 ):
-    container_base = build_config.distribution_spec.container_image or "python:3.11-slim"
+    container_base = build_config.distribution_spec.container_image or "python:3.12-slim"
 
     normal_deps, special_deps = get_provider_dependencies(build_config)
     normal_deps += SERVER_DEPENDENCIES
@@ -49,7 +49,7 @@ ensure_conda_env_python310() {
   local env_name="$1"
   local pip_dependencies="$2"
   local special_pip_deps="$3"
-  local python_version="3.11"
+  local python_version="3.12"
 
   # Check if conda command is available
   if ! is_command_available conda; then
@@ -81,19 +81,19 @@ ensure_conda_env_python310() {
   eval "$(conda shell.bash hook)"
   conda deactivate && conda activate "${env_name}"
 
-  $CONDA_PREFIX/bin/pip install uv
+  "$CONDA_PREFIX"/bin/pip install uv
 
   if [ -n "$TEST_PYPI_VERSION" ]; then
     # these packages are damaged in test-pypi, so install them first
    uv pip install fastapi libcst
     uv pip install --extra-index-url https://test.pypi.org/simple/ \
-      llama-stack==$TEST_PYPI_VERSION \
-      $pip_dependencies
+      llama-stack=="$TEST_PYPI_VERSION" \
+      "$pip_dependencies"
     if [ -n "$special_pip_deps" ]; then
       IFS='#' read -ra parts <<<"$special_pip_deps"
       for part in "${parts[@]}"; do
         echo "$part"
-        uv pip install $part
+        uv pip install "$part"
       done
     fi
   else
@@ -113,7 +113,7 @@ ensure_conda_env_python310() {
     else
       SPEC_VERSION="llama-stack"
     fi
-    uv pip install --no-cache-dir $SPEC_VERSION
+    uv pip install --no-cache-dir "$SPEC_VERSION"
   fi
 
   if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
@@ -138,7 +138,7 @@ ensure_conda_env_python310() {
     fi
   fi
 
-  mv $build_file_path $CONDA_PREFIX/llamastack-build.yaml
+  mv "$build_file_path" "$CONDA_PREFIX"/llamastack-build.yaml
   echo "Build spec configuration saved at $CONDA_PREFIX/llamastack-build.yaml"
 }
 
@@ -84,7 +84,7 @@ add_to_container() {
 }
 
 # Check if container command is available
-if ! is_command_available $CONTAINER_BINARY; then
+if ! is_command_available "$CONTAINER_BINARY"; then
   printf "${RED}Error: ${CONTAINER_BINARY} command not found. Is ${CONTAINER_BINARY} installed and in your PATH?${NC}" >&2
   exit 1
 fi
@@ -95,13 +95,13 @@ if [[ $container_base == *"registry.access.redhat.com/ubi9"* ]]; then
 FROM $container_base
 WORKDIR /app
 
-# We install the Python 3.11 dev headers and build tools so that any
+# We install the Python 3.12 dev headers and build tools so that any
 # C‑extension wheels (e.g. polyleven, faiss‑cpu) can compile successfully.
 
 RUN dnf -y update && dnf install -y iputils git net-tools wget \
-    vim-minimal python3.11 python3.11-pip python3.11-wheel \
-    python3.11-setuptools python3.11-devel gcc make && \
-    ln -s /bin/pip3.11 /bin/pip && ln -s /bin/python3.11 /bin/python && dnf clean all
+    vim-minimal python3.12 python3.12-pip python3.12-wheel \
+    python3.12-setuptools python3.12-devel gcc make && \
+    ln -s /bin/pip3.12 /bin/pip && ln -s /bin/python3.12 /bin/python && dnf clean all
 
 ENV UV_SYSTEM_PYTHON=1
 RUN pip install uv
@@ -163,9 +163,6 @@ EOF
 if [ -n "$run_config" ]; then
   # Copy the run config to the build context since it's an absolute path
   cp "$run_config" "$BUILD_CONTEXT_DIR/run.yaml"
-  add_to_container << EOF
-COPY run.yaml $RUN_CONFIG_PATH
-EOF
 
   # Parse the run.yaml configuration to identify external provider directories
   # If external providers are specified, copy their directory to the container
@@ -173,12 +170,15 @@ EOF
   python_cmd=$(get_python_cmd)
   external_providers_dir=$($python_cmd -c "import yaml; config = yaml.safe_load(open('$run_config')); print(config.get('external_providers_dir') or '')")
   external_providers_dir=$(eval echo "$external_providers_dir")
-  if [ -n "$external_providers_dir" ] && [ -d "$external_providers_dir" ]; then
+  if [ -n "$external_providers_dir" ]; then
+    if [ -d "$external_providers_dir" ]; then
     echo "Copying external providers directory: $external_providers_dir"
     cp -r "$external_providers_dir" "$BUILD_CONTEXT_DIR/providers.d"
     add_to_container << EOF
 COPY providers.d /.llama/providers.d
 EOF
     fi
 
     # Edit the run.yaml file to change the external_providers_dir to /.llama/providers.d
     if [ "$(uname)" = "Darwin" ]; then
       sed -i.bak -e 's|external_providers_dir:.*|external_providers_dir: /.llama/providers.d|' "$BUILD_CONTEXT_DIR/run.yaml"
@@ -187,6 +187,11 @@ EOF
       sed -i 's|external_providers_dir:.*|external_providers_dir: /.llama/providers.d|' "$BUILD_CONTEXT_DIR/run.yaml"
     fi
   fi
 
+  # Copy run config into docker image
+  add_to_container << EOF
+COPY run.yaml $RUN_CONFIG_PATH
+EOF
 fi
 
 stack_mount="/app/llama-stack-source"
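The inline python in the two hunks above just pulls external_providers_dir out of run.yaml before the container build. A rough standalone equivalent — the file path is only an example, and ~/$VAR expansion stands in for the script's eval echo — might look like:

    import os

    import yaml  # the same module the build script's inline python relies on

    def read_external_providers_dir(run_config_path: str) -> str:
        with open(run_config_path) as f:
            config = yaml.safe_load(f)
        value = config.get("external_providers_dir") or ""
        # approximate the script's `eval echo` by expanding ~ and $VARS
        return os.path.expandvars(os.path.expanduser(value))

    print(read_external_providers_dir("run.yaml"))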
@@ -255,6 +255,7 @@ class VectorIORouter(VectorIO):
         max_num_results: int | None = 10,
         ranking_options: SearchRankingOptions | None = None,
         rewrite_query: bool | None = False,
         search_mode: str | None = "vector",
     ) -> VectorStoreSearchResponsePage:
         logger.debug(f"VectorIORouter.openai_search_vector_store: {vector_store_id}")
         # Route based on vector store ID
@@ -266,6 +267,7 @@ class VectorIORouter(VectorIO):
             max_num_results=max_num_results,
             ranking_options=ranking_options,
             rewrite_query=rewrite_query,
             search_mode=search_mode,
         )
 
     async def openai_attach_file_to_vector_store(
@@ -98,6 +98,15 @@ async def register_resources(run_config: StackRunConfig, impls: dict[Api, Any]):
 
         method = getattr(impls[api], register_method)
         for obj in objects:
+            # In complex templates, like our starter template, we may have dynamic model ids
+            # given by environment variables. This allows those environment variables to have
+            # a default value of __disabled__ to skip registration of the model if not set.
+            if (
+                hasattr(obj, "provider_model_id")
+                and obj.provider_model_id is not None
+                and "__disabled__" in obj.provider_model_id
+            ):
+                continue
             # we want to maintain the type information in arguments to method.
             # instead of method(**obj.model_dump()), which may convert a typed attr to a dict,
             # we use model_dump() to find all the attrs and then getattr to get the still typed value.
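Stripped of the surrounding registration loop, the new skip rule above boils down to a predicate on provider_model_id. A self-contained sketch (class and function names here are illustrative, not from the repo):

    class ModelToRegister:
        def __init__(self, provider_model_id: str | None):
            self.provider_model_id = provider_model_id

    def should_register(obj) -> bool:
        # skip anything whose provider_model_id still carries the __disabled__ placeholder,
        # i.e. the corresponding environment variable was never set
        return not (
            hasattr(obj, "provider_model_id")
            and obj.provider_model_id is not None
            and "__disabled__" in obj.provider_model_id
        )

    print(should_register(ModelToRegister("llama3.1:8b")))   # True
    print(should_register(ModelToRegister("__disabled__")))  # False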
@@ -10,6 +10,6 @@ from .config import OllamaImplConfig
 async def get_adapter_impl(config: OllamaImplConfig, _deps):
     from .ollama import OllamaInferenceAdapter
 
-    impl = OllamaInferenceAdapter(config.url)
+    impl = OllamaInferenceAdapter(config)
     await impl.initialize()
     return impl
@@ -13,7 +13,13 @@ DEFAULT_OLLAMA_URL = "http://localhost:11434"
 
 class OllamaImplConfig(BaseModel):
     url: str = DEFAULT_OLLAMA_URL
+    raise_on_connect_error: bool = True
 
     @classmethod
-    def sample_run_config(cls, url: str = "${env.OLLAMA_URL:http://localhost:11434}", **kwargs) -> dict[str, Any]:
-        return {"url": url}
+    def sample_run_config(
+        cls, url: str = "${env.OLLAMA_URL:http://localhost:11434}", raise_on_connect_error: bool = True, **kwargs
+    ) -> dict[str, Any]:
+        return {
+            "url": url,
+            "raise_on_connect_error": raise_on_connect_error,
+        }
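Since the whole class is visible in the hunk above, a quick usage sketch is easy to ground; the class is mirrored here only so the snippet is self-contained (pydantic v2 assumed, as elsewhere in the project):

    from pydantic import BaseModel

    DEFAULT_OLLAMA_URL = "http://localhost:11434"

    class OllamaImplConfig(BaseModel):
        url: str = DEFAULT_OLLAMA_URL
        raise_on_connect_error: bool = True

    # opt out of hard failures when the Ollama server is unreachable
    cfg = OllamaImplConfig(raise_on_connect_error=False)
    print(cfg.model_dump())  # {'url': 'http://localhost:11434', 'raise_on_connect_error': False}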
@@ -9,7 +9,6 @@ import uuid
 from collections.abc import AsyncGenerator, AsyncIterator
 from typing import Any
 
-import httpx
 from ollama import AsyncClient  # type: ignore[attr-defined]
 from openai import AsyncOpenAI
 
@@ -57,6 +56,7 @@ from llama_stack.providers.datatypes import (
     HealthStatus,
     ModelsProtocolPrivate,
 )
+from llama_stack.providers.remote.inference.ollama.config import OllamaImplConfig
 from llama_stack.providers.utils.inference.model_registry import (
     ModelRegistryHelper,
 )
@@ -90,9 +90,10 @@ class OllamaInferenceAdapter(
     InferenceProvider,
     ModelsProtocolPrivate,
 ):
-    def __init__(self, url: str) -> None:
+    def __init__(self, config: OllamaImplConfig) -> None:
         self.register_helper = ModelRegistryHelper(MODEL_ENTRIES)
-        self.url = url
+        self.url = config.url
+        self.raise_on_connect_error = config.raise_on_connect_error
 
     @property
     def client(self) -> AsyncClient:
@@ -103,8 +104,13 @@ class OllamaInferenceAdapter(
         return AsyncOpenAI(base_url=f"{self.url}/v1", api_key="ollama")
 
     async def initialize(self) -> None:
-        logger.info(f"checking connectivity to Ollama at `{self.url}`...")
-        await self.health()
+        logger.debug(f"checking connectivity to Ollama at `{self.url}`...")
+        health_response = await self.health()
+        if health_response["status"] == HealthStatus.ERROR:
+            if self.raise_on_connect_error:
+                raise RuntimeError("Ollama Server is not running, start it using `ollama serve` in a separate terminal")
+            else:
+                logger.warning("Ollama Server is not running, start it using `ollama serve` in a separate terminal")
 
     async def health(self) -> HealthResponse:
         """
@@ -117,10 +123,8 @@ class OllamaInferenceAdapter(
         try:
             await self.client.ps()
             return HealthResponse(status=HealthStatus.OK)
-        except httpx.ConnectError as e:
-            raise RuntimeError(
-                "Ollama Server is not running, start it using `ollama serve` in a separate terminal"
-            ) from e
         except Exception as e:
             return HealthResponse(status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}")
 
     async def shutdown(self) -> None:
         pass
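The behavioural change in the Ollama adapter hunks above is small: a failed health check now either raises or only warns, depending on the new flag. A minimal standalone sketch of that decision (the function name and the bare logging setup are illustrative only):

    import logging

    logging.basicConfig(level=logging.WARNING)
    logger = logging.getLogger("ollama-adapter-sketch")

    def handle_unreachable_ollama(raise_on_connect_error: bool) -> None:
        msg = "Ollama Server is not running, start it using `ollama serve` in a separate terminal"
        if raise_on_connect_error:
            raise RuntimeError(msg)
        logger.warning(msg)

    handle_unreachable_ollama(raise_on_connect_error=False)   # logs a warning and continues
    # handle_unreachable_ollama(raise_on_connect_error=True)  # would raise RuntimeError instead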
@@ -9,7 +9,7 @@ from collections.abc import AsyncGenerator, AsyncIterator
 from typing import Any
 
 import httpx
-from openai import AsyncOpenAI
+from openai import APIConnectionError, AsyncOpenAI
 from openai.types.chat.chat_completion_chunk import (
     ChatCompletionChunk as OpenAIChatCompletionChunk,
 )
@@ -461,7 +461,12 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
             model = await self.register_helper.register_model(model)
         except ValueError:
             pass  # Ignore statically unknown model, will check live listing
-        res = await client.models.list()
+        try:
+            res = await client.models.list()
+        except APIConnectionError as e:
+            raise ValueError(
+                f"Failed to connect to vLLM at {self.config.url}. Please check if vLLM is running and accessible at that URL."
+            ) from e
         available_models = [m.id async for m in res]
         if model.provider_resource_id not in available_models:
             raise ValueError(
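The same wrap-and-reraise pattern can be sketched in isolation; here the built-in ConnectionError stands in for openai.APIConnectionError so the snippet needs no third-party imports, and the client/url names are placeholders rather than the adapter's real attributes:

    async def list_models_or_fail(client, url: str):
        try:
            return await client.models.list()
        except ConnectionError as e:
            # surface a user-actionable error instead of the raw connection failure
            raise ValueError(
                f"Failed to connect to vLLM at {url}. Please check if vLLM is running and accessible at that URL."
            ) from e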
@@ -19,17 +19,16 @@ from llama_stack.apis.vector_io import (
     QueryChunksResponse,
     SearchRankingOptions,
     VectorIO,
+    VectorStoreChunkingStrategy,
     VectorStoreDeleteResponse,
+    VectorStoreFileContentsResponse,
+    VectorStoreFileObject,
+    VectorStoreFileStatus,
+    VectorStoreListFilesResponse,
     VectorStoreListResponse,
     VectorStoreObject,
     VectorStoreSearchResponsePage,
 )
-from llama_stack.apis.vector_io.vector_io import (
-    VectorStoreChunkingStrategy,
-    VectorStoreFileContentsResponse,
-    VectorStoreFileObject,
-    VectorStoreListFilesResponse,
-)
 from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate
 from llama_stack.providers.inline.vector_io.chroma import ChromaVectorIOConfig as InlineChromaVectorIOConfig
 from llama_stack.providers.utils.memory.vector_store import (
@@ -257,6 +256,7 @@ class ChromaVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
         max_num_results: int | None = 10,
         ranking_options: SearchRankingOptions | None = None,
         rewrite_query: bool | None = False,
         search_mode: str | None = "vector",
     ) -> VectorStoreSearchResponsePage:
         raise NotImplementedError("OpenAI Vector Stores API is not supported in Chroma")
 
@@ -272,6 +272,11 @@ class ChromaVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
     async def openai_list_files_in_vector_store(
         self,
         vector_store_id: str,
+        limit: int | None = 20,
+        order: str | None = "desc",
+        after: str | None = None,
+        before: str | None = None,
+        filter: VectorStoreFileStatus | None = None,
     ) -> VectorStoreListFilesResponse:
         raise NotImplementedError("OpenAI Vector Stores API is not supported in Chroma")
 
@@ -21,17 +21,16 @@ from llama_stack.apis.vector_io import (
     QueryChunksResponse,
     SearchRankingOptions,
     VectorIO,
+    VectorStoreChunkingStrategy,
     VectorStoreDeleteResponse,
+    VectorStoreFileContentsResponse,
+    VectorStoreFileObject,
+    VectorStoreFileStatus,
+    VectorStoreListFilesResponse,
     VectorStoreListResponse,
     VectorStoreObject,
     VectorStoreSearchResponsePage,
 )
-from llama_stack.apis.vector_io.vector_io import (
-    VectorStoreChunkingStrategy,
-    VectorStoreFileContentsResponse,
-    VectorStoreFileObject,
-    VectorStoreListFilesResponse,
-)
 from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate
 from llama_stack.providers.inline.vector_io.milvus import MilvusVectorIOConfig as InlineMilvusVectorIOConfig
 from llama_stack.providers.utils.memory.vector_store import (
@@ -255,8 +254,9 @@ class MilvusVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
         max_num_results: int | None = 10,
         ranking_options: SearchRankingOptions | None = None,
         rewrite_query: bool | None = False,
         search_mode: str | None = "vector",
     ) -> VectorStoreSearchResponsePage:
-        raise NotImplementedError("OpenAI Vector Stores API is not supported in Qdrant")
+        raise NotImplementedError("OpenAI Vector Stores API is not supported in Milvus")
 
     async def openai_attach_file_to_vector_store(
         self,
@@ -270,6 +270,11 @@ class MilvusVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
     async def openai_list_files_in_vector_store(
         self,
         vector_store_id: str,
+        limit: int | None = 20,
+        order: str | None = "desc",
+        after: str | None = None,
+        before: str | None = None,
+        filter: VectorStoreFileStatus | None = None,
     ) -> VectorStoreListFilesResponse:
         raise NotImplementedError("OpenAI Vector Stores API is not supported in Milvus")
 
@@ -19,17 +19,16 @@ from llama_stack.apis.vector_io import (
     QueryChunksResponse,
     SearchRankingOptions,
     VectorIO,
+    VectorStoreChunkingStrategy,
     VectorStoreDeleteResponse,
+    VectorStoreFileContentsResponse,
+    VectorStoreFileObject,
+    VectorStoreFileStatus,
+    VectorStoreListFilesResponse,
     VectorStoreListResponse,
     VectorStoreObject,
     VectorStoreSearchResponsePage,
 )
-from llama_stack.apis.vector_io.vector_io import (
-    VectorStoreChunkingStrategy,
-    VectorStoreFileContentsResponse,
-    VectorStoreFileObject,
-    VectorStoreListFilesResponse,
-)
 from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate
 from llama_stack.providers.inline.vector_io.qdrant import QdrantVectorIOConfig as InlineQdrantVectorIOConfig
 from llama_stack.providers.utils.memory.vector_store import (
@@ -257,6 +256,7 @@ class QdrantVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
         max_num_results: int | None = 10,
         ranking_options: SearchRankingOptions | None = None,
         rewrite_query: bool | None = False,
         search_mode: str | None = "vector",
     ) -> VectorStoreSearchResponsePage:
         raise NotImplementedError("OpenAI Vector Stores API is not supported in Qdrant")
 
@@ -272,6 +272,11 @@ class QdrantVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
     async def openai_list_files_in_vector_store(
         self,
         vector_store_id: str,
+        limit: int | None = 20,
+        order: str | None = "desc",
+        after: str | None = None,
+        before: str | None = None,
+        filter: VectorStoreFileStatus | None = None,
     ) -> VectorStoreListFilesResponse:
         raise NotImplementedError("OpenAI Vector Stores API is not supported in Qdrant")
 
@@ -16,20 +16,14 @@ from llama_stack.apis.files import Files
 from llama_stack.apis.files.files import OpenAIFileObject
 from llama_stack.apis.vector_dbs import VectorDB
 from llama_stack.apis.vector_io import (
-    Chunk,
     QueryChunksResponse,
     SearchRankingOptions,
-    VectorStoreContent,
-    VectorStoreDeleteResponse,
-    VectorStoreListResponse,
-    VectorStoreObject,
-    VectorStoreSearchResponse,
-    VectorStoreSearchResponsePage,
 )
 from llama_stack.apis.vector_io.vector_io import (
+    Chunk,
     VectorStoreChunkingStrategy,
     VectorStoreChunkingStrategyAuto,
     VectorStoreChunkingStrategyStatic,
+    VectorStoreContent,
+    VectorStoreDeleteResponse,
     VectorStoreFileContentsResponse,
     VectorStoreFileCounts,
     VectorStoreFileDeleteResponse,
@@ -37,6 +31,10 @@ from llama_stack.apis.vector_io.vector_io import (
     VectorStoreFileObject,
     VectorStoreFileStatus,
     VectorStoreListFilesResponse,
+    VectorStoreListResponse,
+    VectorStoreObject,
+    VectorStoreSearchResponse,
+    VectorStoreSearchResponsePage,
 )
 from llama_stack.providers.utils.memory.vector_store import content_from_data_and_mime_type, make_overlapped_chunks
 
@@ -339,13 +337,16 @@ class OpenAIVectorStoreMixin(ABC):
         max_num_results: int | None = 10,
         ranking_options: SearchRankingOptions | None = None,
         rewrite_query: bool | None = False,
-        # search_mode: Literal["keyword", "vector", "hybrid"] = "vector",
+        search_mode: str | None = "vector",  # Using str instead of Literal due to OpenAPI schema generator limitations
     ) -> VectorStoreSearchResponsePage:
         """Search for chunks in a vector store."""
-        # TODO: Add support in the API for this
-        search_mode = "vector"
         max_num_results = max_num_results or 10
 
+        # Validate search_mode
+        valid_modes = {"keyword", "vector", "hybrid"}
+        if search_mode not in valid_modes:
+            raise ValueError(f"search_mode must be one of {valid_modes}, got {search_mode}")
+
         if vector_store_id not in self.openai_vector_stores:
             raise ValueError(f"Vector store {vector_store_id} not found")
 
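The validation added above is self-contained enough to lift out; a tiny runnable sketch (the function name is illustrative, not from the repo):

    def validate_search_mode(search_mode: str | None = "vector") -> str:
        valid_modes = {"keyword", "vector", "hybrid"}
        if search_mode not in valid_modes:
            raise ValueError(f"search_mode must be one of {valid_modes}, got {search_mode}")
        return search_mode

    print(validate_search_mode("hybrid"))  # hybrid
    # validate_search_mode("fuzzy")        # would raise ValueError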
@@ -11,7 +11,7 @@ import threading
 from collections.abc import Callable, Coroutine, Iterable
 from datetime import UTC, datetime
 from enum import Enum
-from typing import Any, TypeAlias
+from typing import Any
 
 from pydantic import BaseModel
 
@@ -30,8 +30,8 @@ class JobStatus(Enum):
     completed = "completed"
 
 
-JobID: TypeAlias = str
-JobType: TypeAlias = str
+type JobID = str
+type JobType = str
 
 
 class JobArtifact(BaseModel):
@@ -47,7 +47,7 @@ JobHandler = Callable[
 ]
 
 
-LogMessage: TypeAlias = tuple[datetime, str]
+type LogMessage = tuple[datetime, str]
 
 
 _COMPLETED_STATUSES = {JobStatus.completed, JobStatus.failed}
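The scheduler hunks above swap TypeAlias annotations for PEP 695 type statements, which is why the typing import shrinks. A small sketch of the before/after spelling (requires Python 3.12+, in line with the version bump elsewhere in this commit; names other than LogMessage are illustrative):

    from datetime import datetime
    from typing import TypeAlias

    LogMessageOld: TypeAlias = tuple[datetime, str]  # pre-3.12 spelling
    type LogMessage = tuple[datetime, str]           # 3.12+ spelling used by the new code

    def log(msg: LogMessage) -> None:
        ts, text = msg
        print(f"{ts.isoformat()} {text}")

    log((datetime.now(), "job started"))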
@@ -5,15 +5,7 @@
 # the root directory of this source tree.
 
 from contextlib import asynccontextmanager
-from typing import Any
-
-try:
-    # for python < 3.11
-    import exceptiongroup
-
-    BaseExceptionGroup = exceptiongroup.BaseExceptionGroup
-except ImportError:
-    pass
+from typing import Any, cast
 
 import httpx
 from mcp import ClientSession
@@ -40,14 +32,14 @@ async def sse_client_wrapper(endpoint: str, headers: dict[str, str]):
             async with ClientSession(*streams) as session:
                 await session.initialize()
                 yield session
-    except BaseException as e:
-        if isinstance(e, BaseExceptionGroup):
-            for exc in e.exceptions:
-                if isinstance(exc, httpx.HTTPStatusError) and exc.response.status_code == 401:
-                    raise AuthenticationRequiredError(exc) from exc
-        elif isinstance(e, httpx.HTTPStatusError) and e.response.status_code == 401:
-            raise AuthenticationRequiredError(e) from e
-
+    except* httpx.HTTPStatusError as eg:
+        for exc in eg.exceptions:
+            # mypy does not currently narrow the type of `eg.exceptions` based on the `except*` filter,
+            # so we explicitly cast each item to httpx.HTTPStatusError. This is safe because
+            # `except* httpx.HTTPStatusError` guarantees all exceptions in `eg.exceptions` are of that type.
+            err = cast(httpx.HTTPStatusError, exc)
+            if err.response.status_code == 401:
+                raise AuthenticationRequiredError(exc) from exc
         raise
 
 
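The rewrite above leans on Python 3.11's except* syntax instead of manually probing BaseExceptionGroup, which is why the exceptiongroup backport shim could be dropped. A standard-library-only illustration of the construct (ValueError stands in for httpx.HTTPStatusError so the snippet runs without extra packages):

    def run_tasks() -> None:
        # pretend two concurrent tasks failed and were collected into a group
        raise ExceptionGroup("demo", [ValueError("401"), ValueError("500")])

    try:
        run_tasks()
    except* ValueError as eg:
        # eg.exceptions only contains the members matching the except* filter
        for exc in eg.exceptions:
            print("handled:", exc)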
@@ -18,6 +18,7 @@ providers:
     provider_type: remote::ollama
     config:
       url: ${env.OLLAMA_URL:http://localhost:11434}
+      raise_on_connect_error: true
   vector_io:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -18,6 +18,7 @@ providers:
     provider_type: remote::ollama
     config:
       url: ${env.OLLAMA_URL:http://localhost:11434}
+      raise_on_connect_error: true
   vector_io:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -31,6 +31,7 @@ providers:
     provider_type: remote::ollama
     config:
       url: ${env.OLLAMA_URL:http://localhost:11434}
+      raise_on_connect_error: false
   - provider_id: anthropic
     provider_type: remote::anthropic
     config:
@@ -60,7 +61,14 @@ providers:
     provider_type: inline::sentence-transformers
     config: {}
   vector_io:
-  - provider_id: sqlite-vec
+  - provider_id: faiss
+    provider_type: inline::faiss
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/faiss_store.db
+  - provider_id: ${env.ENABLE_SQLITE_VEC+sqlite-vec}
     provider_type: inline::sqlite-vec
     config:
       db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/sqlite_vec.db
@@ -530,160 +538,15 @@ models:
   provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
   model_type: llm
 - metadata: {}
-  model_id: ollama/llama3.1:8b-instruct-fp16
+  model_id: ollama/${env.OLLAMA_INFERENCE_MODEL:__disabled__}
   provider_id: ollama
-  provider_model_id: llama3.1:8b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.1-8B-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.1:8b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.1:8b
-  provider_id: ollama
-  provider_model_id: llama3.1:8b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.1:70b-instruct-fp16
-  provider_id: ollama
-  provider_model_id: llama3.1:70b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.1-70B-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.1:70b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.1:70b
-  provider_id: ollama
-  provider_model_id: llama3.1:70b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.1:405b-instruct-fp16
-  provider_id: ollama
-  provider_model_id: llama3.1:405b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.1-405B-Instruct-FP8
-  provider_id: ollama
-  provider_model_id: llama3.1:405b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.1:405b
-  provider_id: ollama
-  provider_model_id: llama3.1:405b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2:1b-instruct-fp16
-  provider_id: ollama
-  provider_model_id: llama3.2:1b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.2-1B-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.2:1b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2:1b
-  provider_id: ollama
-  provider_model_id: llama3.2:1b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2:3b-instruct-fp16
-  provider_id: ollama
-  provider_model_id: llama3.2:3b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.2-3B-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.2:3b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2:3b
-  provider_id: ollama
-  provider_model_id: llama3.2:3b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2-vision:11b-instruct-fp16
-  provider_id: ollama
-  provider_model_id: llama3.2-vision:11b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.2-11B-Vision-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.2-vision:11b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2-vision:latest
-  provider_id: ollama
-  provider_model_id: llama3.2-vision:latest
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2-vision:90b-instruct-fp16
-  provider_id: ollama
-  provider_model_id: llama3.2-vision:90b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.2-90B-Vision-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.2-vision:90b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2-vision:90b
-  provider_id: ollama
-  provider_model_id: llama3.2-vision:90b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.3:70b
-  provider_id: ollama
-  provider_model_id: llama3.3:70b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.3-70B-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.3:70b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama-guard3:8b
-  provider_id: ollama
-  provider_model_id: llama-guard3:8b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-Guard-3-8B
-  provider_id: ollama
-  provider_model_id: llama-guard3:8b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama-guard3:1b
-  provider_id: ollama
-  provider_model_id: llama-guard3:1b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-Guard-3-1B
-  provider_id: ollama
-  provider_model_id: llama-guard3:1b
+  provider_model_id: ${env.OLLAMA_INFERENCE_MODEL:__disabled__}
   model_type: llm
 - metadata:
-    embedding_dimension: 384
-    context_length: 512
-  model_id: ollama/all-minilm:latest
+    embedding_dimension: ${env.OLLAMA_EMBEDDING_DIMENSION:384}
+  model_id: ollama/${env.OLLAMA_EMBEDDING_MODEL:__disabled__}
   provider_id: ollama
-  provider_model_id: all-minilm:latest
-  model_type: embedding
-- metadata:
-    embedding_dimension: 384
-    context_length: 512
-  model_id: ollama/all-minilm
-  provider_id: ollama
-  provider_model_id: all-minilm:latest
-  model_type: embedding
-- metadata:
-    embedding_dimension: 768
-    context_length: 8192
-  model_id: ollama/nomic-embed-text
-  provider_id: ollama
-  provider_model_id: nomic-embed-text
+  provider_model_id: ${env.OLLAMA_EMBEDDING_MODEL:__disabled__}
   model_type: embedding
 - metadata: {}
   model_id: anthropic/claude-3-5-sonnet-latest
@@ -938,6 +801,11 @@ models:
   provider_id: sambanova
   provider_model_id: sambanova/Meta-Llama-Guard-3-8B
   model_type: llm
+- metadata: {}
+  model_id: vllm/${env.VLLM_INFERENCE_MODEL:__disabled__}
+  provider_id: vllm
+  provider_model_id: ${env.VLLM_INFERENCE_MODEL:__disabled__}
+  model_type: llm
 - metadata:
     embedding_dimension: 384
   model_id: all-MiniLM-L6-v2
@@ -16,6 +16,7 @@ from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplCo
 from llama_stack.providers.inline.inference.sentence_transformers import (
     SentenceTransformersInferenceConfig,
 )
+from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
 from llama_stack.providers.inline.vector_io.sqlite_vec.config import (
     SQLiteVectorIOConfig,
 )
@@ -36,9 +37,6 @@ from llama_stack.providers.remote.inference.groq.models import (
     MODEL_ENTRIES as GROQ_MODEL_ENTRIES,
 )
 from llama_stack.providers.remote.inference.ollama.config import OllamaImplConfig
-from llama_stack.providers.remote.inference.ollama.models import (
-    MODEL_ENTRIES as OLLAMA_MODEL_ENTRIES,
-)
 from llama_stack.providers.remote.inference.openai.config import OpenAIConfig
 from llama_stack.providers.remote.inference.openai.models import (
     MODEL_ENTRIES as OPENAI_MODEL_ENTRIES,
@@ -85,8 +83,22 @@ def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderMo
         ),
         (
             "ollama",
-            OLLAMA_MODEL_ENTRIES,
-            OllamaImplConfig.sample_run_config(),
+            [
+                ProviderModelEntry(
+                    provider_model_id="${env.OLLAMA_INFERENCE_MODEL:__disabled__}",
+                    model_type=ModelType.llm,
+                ),
+                ProviderModelEntry(
+                    provider_model_id="${env.OLLAMA_EMBEDDING_MODEL:__disabled__}",
+                    model_type=ModelType.embedding,
+                    metadata={
+                        "embedding_dimension": "${env.OLLAMA_EMBEDDING_DIMENSION:384}",
+                    },
+                ),
+            ],
+            OllamaImplConfig.sample_run_config(
+                url="${env.OLLAMA_URL:http://localhost:11434}", raise_on_connect_error=False
+            ),
         ),
         (
             "anthropic",
@@ -110,7 +122,12 @@ def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderMo
         ),
         (
             "vllm",
-            [],
+            [
+                ProviderModelEntry(
+                    provider_model_id="${env.VLLM_INFERENCE_MODEL:__disabled__}",
+                    model_type=ModelType.llm,
+                ),
+            ],
             VLLMInferenceAdapterConfig.sample_run_config(
                 url="${env.VLLM_URL:http://localhost:8000/v1}",
             ),
@@ -153,7 +170,12 @@ def get_distribution_template() -> DistributionTemplate:
 
     vector_io_providers = [
         Provider(
-            provider_id="sqlite-vec",
+            provider_id="faiss",
+            provider_type="inline::faiss",
+            config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
+        ),
+        Provider(
+            provider_id="${env.ENABLE_SQLITE_VEC+sqlite-vec}",
             provider_type="inline::sqlite-vec",
             config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
         ),
@@ -257,7 +279,27 @@ def get_distribution_template() -> DistributionTemplate:
             ),
             "VLLM_URL": (
                 "http://localhost:8000/v1",
-                "VLLM URL",
+                "vLLM URL",
             ),
+            "VLLM_INFERENCE_MODEL": (
+                "",
+                "Optional vLLM Inference Model to register on startup",
+            ),
+            "OLLAMA_URL": (
+                "http://localhost:11434",
+                "Ollama URL",
+            ),
+            "OLLAMA_INFERENCE_MODEL": (
+                "",
+                "Optional Ollama Inference Model to register on startup",
+            ),
+            "OLLAMA_EMBEDDING_MODEL": (
+                "",
+                "Optional Ollama Embedding Model to register on startup",
+            ),
+            "OLLAMA_EMBEDDING_DIMENSION": (
+                "384",
+                "Ollama Embedding Dimension",
+            ),
         },
     )
@@ -56,6 +56,9 @@ export function usePagination<T>({
   const stateRef = useRef(state);
   stateRef.current = state;
 
+  // Track if initial data has been fetched
+  const hasFetchedInitialData = useRef(false);
+
   /**
    * Fetches data from the API with cursor-based pagination
    */
@@ -119,8 +122,11 @@ export function usePagination<T>({
 
   // Auto-load initial data on mount
   useEffect(() => {
-    fetchData();
-  }, []);
+    if (!hasFetchedInitialData.current) {
+      hasFetchedInitialData.current = true;
+      fetchData();
+    }
+  }, [fetchData]);
 
   return {
     data: state.data,