From fa0b0c13d41a6125b743c774c8870a760a9d2ebe Mon Sep 17 00:00:00 2001
From: Ben Browning
Date: Wed, 25 Jun 2025 09:54:00 -0400
Subject: [PATCH] fix: Ollama should be optional in starter distro (#2482)

# What does this PR do?

Our starter distro required Ollama to be running (and a large list of models available in that Ollama instance) in order to start successfully. This adjusts things so that Ollama does not have to be running to use the starter template / distro.

To accomplish this, a few changes were needed:

* The Ollama provider can now be configured to either raise an exception or just log a warning when it cannot reach the Ollama server on startup. The default is to raise an exception (same as the previous behavior), but in the starter template we adjust this to just log a warning so that we can bring the stack up without needing a running Ollama server.

* The starter template no longer specifies a default list of models for Ollama, as any models specified there need to actually be pulled and available in Ollama. Instead, it adds a new `OLLAMA_INFERENCE_MODEL` environment variable where users can provide an optional model to register with the Ollama provider on startup. Additional models can also be registered via the typical `models.register(...)` at runtime.

* The vLLM template was adjusted to also allow an optional `VLLM_INFERENCE_MODEL` to be specified on startup, so that the behavior between vLLM and Ollama is consistent and it is easy to get up and running quickly with either.

* The default vector store was changed from sqlite-vec to faiss. sqlite-vec can be enabled by setting the `ENABLE_SQLITE_VEC` environment variable, like we do for chromadb and pgvector. This is because sqlite-vec does not ship proper arm64 binaries, as we previously fixed in #1530 for the ollama distribution.

## Test Plan

With this change, the following scenarios that did not work before now work with the starter template:

* no Ollama running
* Ollama running, but not all of the Llama models pulled locally
* Ollama running with a custom model registered on startup
* vLLM running with a custom model registered on startup
* running the starter template on linux/arm64, such as when running containers on Mac without Rosetta emulation

---------

Signed-off-by: Ben Browning
---
 docs/source/distributions/configuration.md |  12 ++
 llama_stack/distribution/stack.py          |   9 +
 .../remote/inference/ollama/__init__.py     |   2 +-
 .../remote/inference/ollama/config.py       |  10 +-
 .../remote/inference/ollama/ollama.py       |  22 ++-
 .../providers/remote/inference/vllm/vllm.py |   9 +-
 .../templates/ollama/run-with-safety.yaml   |   1 +
 llama_stack/templates/ollama/run.yaml       |   1 +
 llama_stack/templates/starter/run.yaml      | 170 ++----------------
 llama_stack/templates/starter/starter.py    |  58 +++++-
 10 files changed, 121 insertions(+), 173 deletions(-)

diff --git a/docs/source/distributions/configuration.md b/docs/source/distributions/configuration.md
index dd73d93ea..4bc9b37e4 100644
--- a/docs/source/distributions/configuration.md
+++ b/docs/source/distributions/configuration.md
@@ -109,6 +109,18 @@ A Model is an instance of a "Resource" (see [Concepts](../concepts/index)) and i
 What's with the `provider_model_id` field? This is an identifier for the model inside the provider's model catalog. Contrast it with `model_id` which is the identifier for the same model for Llama Stack's purposes. For example, you may want to name "llama3.2:vision-11b" as "image_captioning_model" when you use it in your Stack interactions. When omitted, the server will set `provider_model_id` to be the same as `model_id`.
+If you need to conditionally register a model in the configuration, such as only when specific environment variable(s) are set, this can be accomplished by using a special `__disabled__` string as the default value of an environment variable substitution, as shown below:
+
+```yaml
+models:
+- metadata: {}
+  model_id: ${env.INFERENCE_MODEL:__disabled__}
+  provider_id: ollama
+  provider_model_id: ${env.INFERENCE_MODEL:__disabled__}
+```
+
+The snippet above will only register this model if the environment variable `INFERENCE_MODEL` is set and non-empty. If the environment variable is not set, the model will not be registered at all.
+
 ## Server Configuration

 The `server` section configures the HTTP server that serves the Llama Stack APIs:

diff --git a/llama_stack/distribution/stack.py b/llama_stack/distribution/stack.py
index 5a9708497..b33b0d3f7 100644
--- a/llama_stack/distribution/stack.py
+++ b/llama_stack/distribution/stack.py
@@ -98,6 +98,15 @@ async def register_resources(run_config: StackRunConfig, impls: dict[Api, Any]):
         method = getattr(impls[api], register_method)

         for obj in objects:
+            # In complex templates, like our starter template, we may have dynamic model ids
+            # given by environment variables. This allows those environment variables to have
+            # a default value of __disabled__ to skip registration of the model if not set.
+            if (
+                hasattr(obj, "provider_model_id")
+                and obj.provider_model_id is not None
+                and "__disabled__" in obj.provider_model_id
+            ):
+                continue
             # we want to maintain the type information in arguments to method.
             # instead of method(**obj.model_dump()), which may convert a typed attr to a dict,
             # we use model_dump() to find all the attrs and then getattr to get the still typed value.
diff --git a/llama_stack/providers/remote/inference/ollama/__init__.py b/llama_stack/providers/remote/inference/ollama/__init__.py index 073c31cde..491339451 100644 --- a/llama_stack/providers/remote/inference/ollama/__init__.py +++ b/llama_stack/providers/remote/inference/ollama/__init__.py @@ -10,6 +10,6 @@ from .config import OllamaImplConfig async def get_adapter_impl(config: OllamaImplConfig, _deps): from .ollama import OllamaInferenceAdapter - impl = OllamaInferenceAdapter(config.url) + impl = OllamaInferenceAdapter(config) await impl.initialize() return impl diff --git a/llama_stack/providers/remote/inference/ollama/config.py b/llama_stack/providers/remote/inference/ollama/config.py index 0e4aef0e1..37b827f4f 100644 --- a/llama_stack/providers/remote/inference/ollama/config.py +++ b/llama_stack/providers/remote/inference/ollama/config.py @@ -13,7 +13,13 @@ DEFAULT_OLLAMA_URL = "http://localhost:11434" class OllamaImplConfig(BaseModel): url: str = DEFAULT_OLLAMA_URL + raise_on_connect_error: bool = True @classmethod - def sample_run_config(cls, url: str = "${env.OLLAMA_URL:http://localhost:11434}", **kwargs) -> dict[str, Any]: - return {"url": url} + def sample_run_config( + cls, url: str = "${env.OLLAMA_URL:http://localhost:11434}", raise_on_connect_error: bool = True, **kwargs + ) -> dict[str, Any]: + return { + "url": url, + "raise_on_connect_error": raise_on_connect_error, + } diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py index d51072fbf..2f51920b5 100644 --- a/llama_stack/providers/remote/inference/ollama/ollama.py +++ b/llama_stack/providers/remote/inference/ollama/ollama.py @@ -9,7 +9,6 @@ import uuid from collections.abc import AsyncGenerator, AsyncIterator from typing import Any -import httpx from ollama import AsyncClient # type: ignore[attr-defined] from openai import AsyncOpenAI @@ -57,6 +56,7 @@ from llama_stack.providers.datatypes import ( HealthStatus, ModelsProtocolPrivate, ) +from llama_stack.providers.remote.inference.ollama.config import OllamaImplConfig from llama_stack.providers.utils.inference.model_registry import ( ModelRegistryHelper, ) @@ -90,9 +90,10 @@ class OllamaInferenceAdapter( InferenceProvider, ModelsProtocolPrivate, ): - def __init__(self, url: str) -> None: + def __init__(self, config: OllamaImplConfig) -> None: self.register_helper = ModelRegistryHelper(MODEL_ENTRIES) - self.url = url + self.url = config.url + self.raise_on_connect_error = config.raise_on_connect_error @property def client(self) -> AsyncClient: @@ -103,8 +104,13 @@ class OllamaInferenceAdapter( return AsyncOpenAI(base_url=f"{self.url}/v1", api_key="ollama") async def initialize(self) -> None: - logger.info(f"checking connectivity to Ollama at `{self.url}`...") - await self.health() + logger.debug(f"checking connectivity to Ollama at `{self.url}`...") + health_response = await self.health() + if health_response["status"] == HealthStatus.ERROR: + if self.raise_on_connect_error: + raise RuntimeError("Ollama Server is not running, start it using `ollama serve` in a separate terminal") + else: + logger.warning("Ollama Server is not running, start it using `ollama serve` in a separate terminal") async def health(self) -> HealthResponse: """ @@ -117,10 +123,8 @@ class OllamaInferenceAdapter( try: await self.client.ps() return HealthResponse(status=HealthStatus.OK) - except httpx.ConnectError as e: - raise RuntimeError( - "Ollama Server is not running, start it using `ollama serve` in a separate terminal" - ) 
from e + except Exception as e: + return HealthResponse(status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}") async def shutdown(self) -> None: pass diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py index 3424be6b4..ae04f206a 100644 --- a/llama_stack/providers/remote/inference/vllm/vllm.py +++ b/llama_stack/providers/remote/inference/vllm/vllm.py @@ -9,7 +9,7 @@ from collections.abc import AsyncGenerator, AsyncIterator from typing import Any import httpx -from openai import AsyncOpenAI +from openai import APIConnectionError, AsyncOpenAI from openai.types.chat.chat_completion_chunk import ( ChatCompletionChunk as OpenAIChatCompletionChunk, ) @@ -461,7 +461,12 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): model = await self.register_helper.register_model(model) except ValueError: pass # Ignore statically unknown model, will check live listing - res = await client.models.list() + try: + res = await client.models.list() + except APIConnectionError as e: + raise ValueError( + f"Failed to connect to vLLM at {self.config.url}. Please check if vLLM is running and accessible at that URL." + ) from e available_models = [m.id async for m in res] if model.provider_resource_id not in available_models: raise ValueError( diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml index 85d5c813b..2e1b7fdcc 100644 --- a/llama_stack/templates/ollama/run-with-safety.yaml +++ b/llama_stack/templates/ollama/run-with-safety.yaml @@ -18,6 +18,7 @@ providers: provider_type: remote::ollama config: url: ${env.OLLAMA_URL:http://localhost:11434} + raise_on_connect_error: true vector_io: - provider_id: faiss provider_type: inline::faiss diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml index 2d10a99a4..8c2b17ef1 100644 --- a/llama_stack/templates/ollama/run.yaml +++ b/llama_stack/templates/ollama/run.yaml @@ -18,6 +18,7 @@ providers: provider_type: remote::ollama config: url: ${env.OLLAMA_URL:http://localhost:11434} + raise_on_connect_error: true vector_io: - provider_id: faiss provider_type: inline::faiss diff --git a/llama_stack/templates/starter/run.yaml b/llama_stack/templates/starter/run.yaml index 960e96d01..30df39e5d 100644 --- a/llama_stack/templates/starter/run.yaml +++ b/llama_stack/templates/starter/run.yaml @@ -31,6 +31,7 @@ providers: provider_type: remote::ollama config: url: ${env.OLLAMA_URL:http://localhost:11434} + raise_on_connect_error: false - provider_id: anthropic provider_type: remote::anthropic config: @@ -60,7 +61,14 @@ providers: provider_type: inline::sentence-transformers config: {} vector_io: - - provider_id: sqlite-vec + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/faiss_store.db + - provider_id: ${env.ENABLE_SQLITE_VEC+sqlite-vec} provider_type: inline::sqlite-vec config: db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/sqlite_vec.db @@ -530,160 +538,15 @@ models: provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 model_type: llm - metadata: {} - model_id: ollama/llama3.1:8b-instruct-fp16 + model_id: ollama/${env.OLLAMA_INFERENCE_MODEL:__disabled__} provider_id: ollama - provider_model_id: llama3.1:8b-instruct-fp16 - model_type: llm -- metadata: {} - model_id: ollama/meta-llama/Llama-3.1-8B-Instruct - provider_id: ollama - 
provider_model_id: llama3.1:8b-instruct-fp16 - model_type: llm -- metadata: {} - model_id: ollama/llama3.1:8b - provider_id: ollama - provider_model_id: llama3.1:8b - model_type: llm -- metadata: {} - model_id: ollama/llama3.1:70b-instruct-fp16 - provider_id: ollama - provider_model_id: llama3.1:70b-instruct-fp16 - model_type: llm -- metadata: {} - model_id: ollama/meta-llama/Llama-3.1-70B-Instruct - provider_id: ollama - provider_model_id: llama3.1:70b-instruct-fp16 - model_type: llm -- metadata: {} - model_id: ollama/llama3.1:70b - provider_id: ollama - provider_model_id: llama3.1:70b - model_type: llm -- metadata: {} - model_id: ollama/llama3.1:405b-instruct-fp16 - provider_id: ollama - provider_model_id: llama3.1:405b-instruct-fp16 - model_type: llm -- metadata: {} - model_id: ollama/meta-llama/Llama-3.1-405B-Instruct-FP8 - provider_id: ollama - provider_model_id: llama3.1:405b-instruct-fp16 - model_type: llm -- metadata: {} - model_id: ollama/llama3.1:405b - provider_id: ollama - provider_model_id: llama3.1:405b - model_type: llm -- metadata: {} - model_id: ollama/llama3.2:1b-instruct-fp16 - provider_id: ollama - provider_model_id: llama3.2:1b-instruct-fp16 - model_type: llm -- metadata: {} - model_id: ollama/meta-llama/Llama-3.2-1B-Instruct - provider_id: ollama - provider_model_id: llama3.2:1b-instruct-fp16 - model_type: llm -- metadata: {} - model_id: ollama/llama3.2:1b - provider_id: ollama - provider_model_id: llama3.2:1b - model_type: llm -- metadata: {} - model_id: ollama/llama3.2:3b-instruct-fp16 - provider_id: ollama - provider_model_id: llama3.2:3b-instruct-fp16 - model_type: llm -- metadata: {} - model_id: ollama/meta-llama/Llama-3.2-3B-Instruct - provider_id: ollama - provider_model_id: llama3.2:3b-instruct-fp16 - model_type: llm -- metadata: {} - model_id: ollama/llama3.2:3b - provider_id: ollama - provider_model_id: llama3.2:3b - model_type: llm -- metadata: {} - model_id: ollama/llama3.2-vision:11b-instruct-fp16 - provider_id: ollama - provider_model_id: llama3.2-vision:11b-instruct-fp16 - model_type: llm -- metadata: {} - model_id: ollama/meta-llama/Llama-3.2-11B-Vision-Instruct - provider_id: ollama - provider_model_id: llama3.2-vision:11b-instruct-fp16 - model_type: llm -- metadata: {} - model_id: ollama/llama3.2-vision:latest - provider_id: ollama - provider_model_id: llama3.2-vision:latest - model_type: llm -- metadata: {} - model_id: ollama/llama3.2-vision:90b-instruct-fp16 - provider_id: ollama - provider_model_id: llama3.2-vision:90b-instruct-fp16 - model_type: llm -- metadata: {} - model_id: ollama/meta-llama/Llama-3.2-90B-Vision-Instruct - provider_id: ollama - provider_model_id: llama3.2-vision:90b-instruct-fp16 - model_type: llm -- metadata: {} - model_id: ollama/llama3.2-vision:90b - provider_id: ollama - provider_model_id: llama3.2-vision:90b - model_type: llm -- metadata: {} - model_id: ollama/llama3.3:70b - provider_id: ollama - provider_model_id: llama3.3:70b - model_type: llm -- metadata: {} - model_id: ollama/meta-llama/Llama-3.3-70B-Instruct - provider_id: ollama - provider_model_id: llama3.3:70b - model_type: llm -- metadata: {} - model_id: ollama/llama-guard3:8b - provider_id: ollama - provider_model_id: llama-guard3:8b - model_type: llm -- metadata: {} - model_id: ollama/meta-llama/Llama-Guard-3-8B - provider_id: ollama - provider_model_id: llama-guard3:8b - model_type: llm -- metadata: {} - model_id: ollama/llama-guard3:1b - provider_id: ollama - provider_model_id: llama-guard3:1b - model_type: llm -- metadata: {} - model_id: 
ollama/meta-llama/Llama-Guard-3-1B - provider_id: ollama - provider_model_id: llama-guard3:1b + provider_model_id: ${env.OLLAMA_INFERENCE_MODEL:__disabled__} model_type: llm - metadata: - embedding_dimension: 384 - context_length: 512 - model_id: ollama/all-minilm:latest + embedding_dimension: ${env.OLLAMA_EMBEDDING_DIMENSION:384} + model_id: ollama/${env.OLLAMA_EMBEDDING_MODEL:__disabled__} provider_id: ollama - provider_model_id: all-minilm:latest - model_type: embedding -- metadata: - embedding_dimension: 384 - context_length: 512 - model_id: ollama/all-minilm - provider_id: ollama - provider_model_id: all-minilm:latest - model_type: embedding -- metadata: - embedding_dimension: 768 - context_length: 8192 - model_id: ollama/nomic-embed-text - provider_id: ollama - provider_model_id: nomic-embed-text + provider_model_id: ${env.OLLAMA_EMBEDDING_MODEL:__disabled__} model_type: embedding - metadata: {} model_id: anthropic/claude-3-5-sonnet-latest @@ -938,6 +801,11 @@ models: provider_id: sambanova provider_model_id: sambanova/Meta-Llama-Guard-3-8B model_type: llm +- metadata: {} + model_id: vllm/${env.VLLM_INFERENCE_MODEL:__disabled__} + provider_id: vllm + provider_model_id: ${env.VLLM_INFERENCE_MODEL:__disabled__} + model_type: llm - metadata: embedding_dimension: 384 model_id: all-MiniLM-L6-v2 diff --git a/llama_stack/templates/starter/starter.py b/llama_stack/templates/starter/starter.py index 2a44a0a37..ec01d08e9 100644 --- a/llama_stack/templates/starter/starter.py +++ b/llama_stack/templates/starter/starter.py @@ -16,6 +16,7 @@ from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplCo from llama_stack.providers.inline.inference.sentence_transformers import ( SentenceTransformersInferenceConfig, ) +from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig from llama_stack.providers.inline.vector_io.sqlite_vec.config import ( SQLiteVectorIOConfig, ) @@ -36,9 +37,6 @@ from llama_stack.providers.remote.inference.groq.models import ( MODEL_ENTRIES as GROQ_MODEL_ENTRIES, ) from llama_stack.providers.remote.inference.ollama.config import OllamaImplConfig -from llama_stack.providers.remote.inference.ollama.models import ( - MODEL_ENTRIES as OLLAMA_MODEL_ENTRIES, -) from llama_stack.providers.remote.inference.openai.config import OpenAIConfig from llama_stack.providers.remote.inference.openai.models import ( MODEL_ENTRIES as OPENAI_MODEL_ENTRIES, @@ -85,8 +83,22 @@ def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderMo ), ( "ollama", - OLLAMA_MODEL_ENTRIES, - OllamaImplConfig.sample_run_config(), + [ + ProviderModelEntry( + provider_model_id="${env.OLLAMA_INFERENCE_MODEL:__disabled__}", + model_type=ModelType.llm, + ), + ProviderModelEntry( + provider_model_id="${env.OLLAMA_EMBEDDING_MODEL:__disabled__}", + model_type=ModelType.embedding, + metadata={ + "embedding_dimension": "${env.OLLAMA_EMBEDDING_DIMENSION:384}", + }, + ), + ], + OllamaImplConfig.sample_run_config( + url="${env.OLLAMA_URL:http://localhost:11434}", raise_on_connect_error=False + ), ), ( "anthropic", @@ -110,7 +122,12 @@ def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderMo ), ( "vllm", - [], + [ + ProviderModelEntry( + provider_model_id="${env.VLLM_INFERENCE_MODEL:__disabled__}", + model_type=ModelType.llm, + ), + ], VLLMInferenceAdapterConfig.sample_run_config( url="${env.VLLM_URL:http://localhost:8000/v1}", ), @@ -153,7 +170,12 @@ def get_distribution_template() -> DistributionTemplate: vector_io_providers = [ Provider( - 
provider_id="sqlite-vec", + provider_id="faiss", + provider_type="inline::faiss", + config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), + ), + Provider( + provider_id="${env.ENABLE_SQLITE_VEC+sqlite-vec}", provider_type="inline::sqlite-vec", config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), ), @@ -257,7 +279,27 @@ def get_distribution_template() -> DistributionTemplate: ), "VLLM_URL": ( "http://localhost:8000/v1", - "VLLM URL", + "vLLM URL", + ), + "VLLM_INFERENCE_MODEL": ( + "", + "Optional vLLM Inference Model to register on startup", + ), + "OLLAMA_URL": ( + "http://localhost:11434", + "Ollama URL", + ), + "OLLAMA_INFERENCE_MODEL": ( + "", + "Optional Ollama Inference Model to register on startup", + ), + "OLLAMA_EMBEDDING_MODEL": ( + "", + "Optional Ollama Embedding Model to register on startup", + ), + "OLLAMA_EMBEDDING_DIMENSION": ( + "384", + "Ollama Embedding Dimension", ), }, )