From 77d2c8e95d95625e506d7ae06105e213b69351d9 Mon Sep 17 00:00:00 2001 From: Sumanth Kamenani Date: Mon, 14 Jul 2025 12:53:13 -0400 Subject: [PATCH 1/3] docs: clarify run.yaml files are starting points for customization (#2746) # What does this PR do? This PR improves documentation clarity around run.yaml file usage. It adds comprehensive guidance to help users understand that generated run.yaml files are templates meant to be customized for production use, not used as-is. ## Changes - Add new documentation section on customizing run.yaml files - Clarify that generated run.yaml files are templates, not production configs - Add guidance on customization best practices and common scenarios - Update existing documentation to reference customization guide - Improve clarity around run.yaml file usage for better user experience ## Test Plan - Verified new documentation file exists at correct location - Confirmed documentation is properly integrated into the toctree structure - Checked all internal links use correct paths and reference existing files - Validated references are added to relevant existing documentation files - Documentation build testing will be handled by CI environment --- docs/source/distributions/building_distro.md | 4 ++ docs/source/distributions/configuration.md | 4 ++ .../distributions/customizing_run_yaml.md | 40 +++++++++++++++++++ docs/source/distributions/index.md | 1 + .../getting_started/detailed_tutorial.md | 2 +- 5 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 docs/source/distributions/customizing_run_yaml.md diff --git a/docs/source/distributions/building_distro.md b/docs/source/distributions/building_distro.md index f24974dd3..cd2c6b6a8 100644 --- a/docs/source/distributions/building_distro.md +++ b/docs/source/distributions/building_distro.md @@ -145,6 +145,10 @@ $ llama stack build --template starter ... You can now edit ~/.llama/distributions/llamastack-starter/starter-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-starter/starter-run.yaml` ``` + +```{tip} +The generated `run.yaml` file is a starting point for your configuration. For comprehensive guidance on customizing it for your specific needs, infrastructure, and deployment scenarios, see [Customizing Your run.yaml Configuration](customizing_run_yaml.md). +``` ::: :::{tab-item} Building from Scratch diff --git a/docs/source/distributions/configuration.md b/docs/source/distributions/configuration.md index 4709cb8c6..9548780c6 100644 --- a/docs/source/distributions/configuration.md +++ b/docs/source/distributions/configuration.md @@ -2,6 +2,10 @@ The Llama Stack runtime configuration is specified as a YAML file. Here is a simplified version of an example configuration file for the Ollama distribution: +```{note} +The default `run.yaml` files generated by templates are starting points for your configuration. For guidance on customizing these files for your specific needs, see [Customizing Your run.yaml Configuration](customizing_run_yaml.md). +``` + ```{dropdown} 👋 Click here for a Sample Configuration File ```yaml diff --git a/docs/source/distributions/customizing_run_yaml.md b/docs/source/distributions/customizing_run_yaml.md new file mode 100644 index 000000000..10067bab7 --- /dev/null +++ b/docs/source/distributions/customizing_run_yaml.md @@ -0,0 +1,40 @@ +# Customizing run.yaml Files + +The `run.yaml` files generated by Llama Stack templates are **starting points** designed to be customized for your specific needs. 
They are not meant to be used as-is in production environments.
+
+## Key Points
+
+- **Templates are starting points**: Generated `run.yaml` files contain defaults for development/testing
+- **Customization expected**: Update URLs, credentials, models, and settings for your environment
+- **Version control separately**: Keep customized configs in your own repository
+- **Environment-specific**: Create different configurations for dev, staging, production
+
+## What You Can Customize
+
+Common customizations include:
+- **Provider endpoints**: Change `http://localhost:8000` to your actual servers
+- **Providers**: Swap defaults for alternatives (e.g., replace Tavily with Brave for search)
+- **Storage paths**: Move from `/tmp/` to production directories
+- **Authentication**: Add API keys, SSL, timeouts
+- **Models**: Different model sizes for dev vs prod
+- **Database settings**: Switch from SQLite to PostgreSQL
+- **Tool configurations**: Add custom tools and integrations
+
+## Best Practices
+
+- Use environment variables for secrets and environment-specific values
+- Create separate `run.yaml` files for different environments (dev, staging, prod)
+- Document your changes with comments
+- Test configurations before deployment
+- Keep your customized configs in version control
+
+Example structure:
+```
+your-project/
+├── configs/
+│   ├── dev-run.yaml
+│   └── prod-run.yaml
+└── README.md
+```
+
+The goal is to take the generated template and adapt it to your specific infrastructure and operational needs.
\ No newline at end of file
diff --git a/docs/source/distributions/index.md b/docs/source/distributions/index.md
index 103a6131f..600eec3a1 100644
--- a/docs/source/distributions/index.md
+++ b/docs/source/distributions/index.md
@@ -9,6 +9,7 @@ This section provides an overview of the distributions available in Llama Stack.
 importing_as_library
 configuration
+customizing_run_yaml
 list_of_distributions
 kubernetes_deployment
 building_distro
diff --git a/docs/source/getting_started/detailed_tutorial.md b/docs/source/getting_started/detailed_tutorial.md
index 35cb7f02e..97e7df774 100644
--- a/docs/source/getting_started/detailed_tutorial.md
+++ b/docs/source/getting_started/detailed_tutorial.md
@@ -54,7 +54,7 @@ Llama Stack is a server that exposes multiple APIs, you connect with it using th
 You can use Python to build and run the Llama Stack server, which is useful for testing and development.
 
 Llama Stack uses a [YAML configuration file](../distributions/configuration.md) to specify the stack setup,
-which defines the providers and their settings.
+which defines the providers and their settings. The generated configuration serves as a starting point that you can [customize for your specific needs](../distributions/customizing_run_yaml.md).
 
 Now let's build and run the Llama Stack config for Ollama. We use `starter` as template. By default all providers are disabled, this requires enable ollama by passing environment variables.
 

From a7ed86181c8b823ca13af0c911be694e49899be9 Mon Sep 17 00:00:00 2001
From: Derek Higgins
Date: Mon, 14 Jul 2025 18:58:23 +0100
Subject: [PATCH 2/3] fix(faiss): Delete file contents from kvstore (#2686)

Remove both the metadata and content from the kvstore when a file is
being removed from the vector store.
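
In sketch form (illustrative only — the key names below are placeholders
for the provider's `OPENAI_VECTOR_STORES_FILES_PREFIX` and
`OPENAI_VECTOR_STORES_FILES_CONTENTS_PREFIX` constants):

```
# Previously only the metadata key was deleted, leaving the contents
# entry orphaned in the kvstore. Both keys must go together.
async def delete_file_data(kvstore, store_id: str, file_id: str) -> None:
    keys_to_delete = [
        f"openai_vector_stores_files:{store_id}:{file_id}",           # file metadata
        f"openai_vector_stores_files_contents:{store_id}:{file_id}",  # file contents
    ]
    for key in keys_to_delete:
        await kvstore.delete(key)
```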
Closes: #2685
Also add faiss provider to openai_vector_stores test suite

---------

Signed-off-by: Derek Higgins
Co-authored-by: raghotham
---
 .../providers/inline/vector_io/faiss/faiss.py | 21 +++++++--
 tests/unit/providers/vector_io/conftest.py    | 44 ++++++++++++++++++-
 .../test_vector_io_openai_vector_stores.py    |  6 ++-
 3 files changed, 63 insertions(+), 8 deletions(-)

diff --git a/llama_stack/providers/inline/vector_io/faiss/faiss.py b/llama_stack/providers/inline/vector_io/faiss/faiss.py
index 62a98413d..0306d9156 100644
--- a/llama_stack/providers/inline/vector_io/faiss/faiss.py
+++ b/llama_stack/providers/inline/vector_io/faiss/faiss.py
@@ -267,6 +267,7 @@ class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPr
         assert self.kvstore is not None
         key = f"{OPENAI_VECTOR_STORES_PREFIX}{store_id}"
         await self.kvstore.set(key=key, value=json.dumps(store_info))
+        self.openai_vector_stores[store_id] = store_info
 
     async def _load_openai_vector_stores(self) -> dict[str, dict[str, Any]]:
         """Load all vector store metadata from kvstore."""
@@ -286,17 +287,20 @@ class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPr
         assert self.kvstore is not None
         key = f"{OPENAI_VECTOR_STORES_PREFIX}{store_id}"
         await self.kvstore.set(key=key, value=json.dumps(store_info))
+        self.openai_vector_stores[store_id] = store_info
 
     async def _delete_openai_vector_store_from_storage(self, store_id: str) -> None:
         """Delete vector store metadata from kvstore."""
         assert self.kvstore is not None
         key = f"{OPENAI_VECTOR_STORES_PREFIX}{store_id}"
         await self.kvstore.delete(key)
+        if store_id in self.openai_vector_stores:
+            del self.openai_vector_stores[store_id]
 
     async def _save_openai_vector_store_file(
         self, store_id: str, file_id: str, file_info: dict[str, Any], file_contents: list[dict[str, Any]]
     ) -> None:
-        """Save vector store file metadata to kvstore."""
+        """Save vector store file data to kvstore."""
         assert self.kvstore is not None
         key = f"{OPENAI_VECTOR_STORES_FILES_PREFIX}{store_id}:{file_id}"
         await self.kvstore.set(key=key, value=json.dumps(file_info))
@@ -324,7 +328,16 @@ class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPr
         await self.kvstore.set(key=key, value=json.dumps(file_info))
 
     async def _delete_openai_vector_store_file_from_storage(self, store_id: str, file_id: str) -> None:
-        """Delete vector store file metadata from kvstore."""
+        """Delete vector store data from kvstore."""
         assert self.kvstore is not None
-        key = f"{OPENAI_VECTOR_STORES_FILES_PREFIX}{store_id}:{file_id}"
-        await self.kvstore.delete(key)
+
+        keys_to_delete = [
+            f"{OPENAI_VECTOR_STORES_FILES_PREFIX}{store_id}:{file_id}",
+            f"{OPENAI_VECTOR_STORES_FILES_CONTENTS_PREFIX}{store_id}:{file_id}",
+        ]
+        for key in keys_to_delete:
+            try:
+                await self.kvstore.delete(key)
+            except Exception as e:
+                logger.warning(f"Failed to delete key {key}: {e}")
+                continue
diff --git a/tests/unit/providers/vector_io/conftest.py b/tests/unit/providers/vector_io/conftest.py
index 4a9639326..9f86f877d 100644
--- a/tests/unit/providers/vector_io/conftest.py
+++ b/tests/unit/providers/vector_io/conftest.py
@@ -12,6 +12,8 @@ from pymilvus import MilvusClient, connections
 from llama_stack.apis.vector_dbs import VectorDB
 from llama_stack.apis.vector_io import Chunk, ChunkMetadata
+from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
+from llama_stack.providers.inline.vector_io.faiss.faiss import FaissIndex, FaissVectorIOAdapter
 from llama_stack.providers.inline.vector_io.milvus.config import MilvusVectorIOConfig, SqliteKVStoreConfig
 from llama_stack.providers.inline.vector_io.sqlite_vec import SQLiteVectorIOConfig
 from llama_stack.providers.inline.vector_io.sqlite_vec.sqlite_vec import SQLiteVecIndex, SQLiteVecVectorIOAdapter
@@ -90,7 +92,7 @@ def sample_embeddings_with_metadata(sample_chunks_with_metadata):
     return np.array([np.random.rand(EMBEDDING_DIMENSION).astype(np.float32) for _ in sample_chunks_with_metadata])
 
 
-@pytest.fixture(params=["milvus", "sqlite_vec"])
+@pytest.fixture(params=["milvus", "sqlite_vec", "faiss"])
 def vector_provider(request):
     return request.param
 
@@ -116,7 +118,7 @@ async def unique_kvstore_config(tmp_path_factory):
 
 @pytest.fixture(scope="session")
 def sqlite_vec_db_path(tmp_path_factory):
-    db_path = str(tmp_path_factory.getbasetemp() / "test.db")
+    db_path = str(tmp_path_factory.getbasetemp() / "test_sqlite_vec.db")
     return db_path
 
 
@@ -198,11 +200,49 @@ async def milvus_vec_adapter(milvus_vec_db_path, mock_inference_api):
     await adapter.shutdown()
 
 
+@pytest.fixture
+def faiss_vec_db_path(tmp_path_factory):
+    db_path = str(tmp_path_factory.getbasetemp() / "test_faiss.db")
+    return db_path
+
+
+@pytest.fixture
+async def faiss_vec_index(embedding_dimension):
+    index = FaissIndex(embedding_dimension)
+    yield index
+    await index.delete()
+
+
+@pytest.fixture
+async def faiss_vec_adapter(unique_kvstore_config, mock_inference_api, embedding_dimension):
+    config = FaissVectorIOConfig(
+        kvstore=unique_kvstore_config,
+    )
+    adapter = FaissVectorIOAdapter(
+        config=config,
+        inference_api=mock_inference_api,
+        files_api=None,
+    )
+    await adapter.initialize()
+    await adapter.register_vector_db(
+        VectorDB(
+            identifier=f"faiss_test_collection_{np.random.randint(1e6)}",
+            provider_id="test_provider",
+            embedding_model="test_model",
+            embedding_dimension=embedding_dimension,
+        )
+    )
+    yield adapter
+    await adapter.shutdown()
+
+
 @pytest.fixture
 def vector_io_adapter(vector_provider, request):
     """Returns the appropriate vector IO adapter based on the provider parameter."""
     if vector_provider == "milvus":
         return request.getfixturevalue("milvus_vec_adapter")
+    elif vector_provider == "faiss":
+        return request.getfixturevalue("faiss_vec_adapter")
     else:
         return request.getfixturevalue("sqlite_vec_adapter")
diff --git a/tests/unit/providers/vector_io/test_vector_io_openai_vector_stores.py b/tests/unit/providers/vector_io/test_vector_io_openai_vector_stores.py
index 97e2f085e..bf7663d2e 100644
--- a/tests/unit/providers/vector_io/test_vector_io_openai_vector_stores.py
+++ b/tests/unit/providers/vector_io/test_vector_io_openai_vector_stores.py
@@ -94,7 +94,7 @@ async def test_query_unregistered_raises(vector_io_adapter):
 
 async def test_insert_chunks_calls_underlying_index(vector_io_adapter):
     fake_index = AsyncMock()
-    vector_io_adapter._get_and_cache_vector_db_index = AsyncMock(return_value=fake_index)
+    vector_io_adapter.cache["db1"] = fake_index
 
     chunks = ["chunk1", "chunk2"]
     await vector_io_adapter.insert_chunks("db1", chunks)
@@ -112,7 +112,7 @@ async def test_insert_chunks_missing_db_raises(vector_io_adapter):
 async def test_query_chunks_calls_underlying_index_and_returns(vector_io_adapter):
     expected = QueryChunksResponse(chunks=[Chunk(content="c1")], scores=[0.1])
     fake_index = AsyncMock(query_chunks=AsyncMock(return_value=expected))
-    vector_io_adapter._get_and_cache_vector_db_index = AsyncMock(return_value=fake_index)
+    vector_io_adapter.cache["db1"] = fake_index
 
     response = await vector_io_adapter.query_chunks("db1", "my_query", {"param": 1})
@@ -286,5 +286,7 @@ async def test_delete_openai_vector_store_file_from_storage(vector_io_adapter, t
     await vector_io_adapter._save_openai_vector_store_file(store_id, file_id, file_info, file_contents)
     await vector_io_adapter._delete_openai_vector_store_file_from_storage(store_id, file_id)
 
+    loaded_file_info = await vector_io_adapter._load_openai_vector_store_file(store_id, file_id)
+    assert loaded_file_info == {}
     loaded_contents = await vector_io_adapter._load_openai_vector_store_file_contents(store_id, file_id)
     assert loaded_contents == []

From f731f369a2be1ad1d28ff8b3e1b9f59ae261451e Mon Sep 17 00:00:00 2001
From: Matthew Farrellee
Date: Mon, 14 Jul 2025 14:38:53 -0400
Subject: [PATCH 3/3] feat: add infrastructure to allow inference model discovery (#2710)

# What does this PR do?

inference providers each have a static list of supported / known models. some also have access to a dynamic list of currently available models. this change gives providers using the ModelRegistryHelper the ability to combine their static and dynamic lists.

for instance, OpenAIInferenceAdapter can implement

```
async def check_model_availability(self, model: str) -> bool:
    return model in [entry.id for entry in self.openai_client.models.list()]
```

to augment its static list w/ a live availability check against openai.

## Test Plan

scripts/unit-test.sh

---
 .../utils/inference/model_registry.py         | 33 ++++++-
 .../providers/utils/test_model_registry.py    | 91 +++++++++++++++++++
 2 files changed, 122 insertions(+), 2 deletions(-)

diff --git a/llama_stack/providers/utils/inference/model_registry.py b/llama_stack/providers/utils/inference/model_registry.py
index c2fc13e07..801b8ea06 100644
--- a/llama_stack/providers/utils/inference/model_registry.py
+++ b/llama_stack/providers/utils/inference/model_registry.py
@@ -83,9 +83,37 @@ class ModelRegistryHelper(ModelsProtocolPrivate):
     def get_llama_model(self, provider_model_id: str) -> str | None:
         return self.provider_id_to_llama_model_map.get(provider_model_id, None)
 
+    async def check_model_availability(self, model: str) -> bool:
+        """
+        Check if a specific model is available from the provider (non-static check).
+
+        This is for subclassing purposes, so providers can check if a specific
+        model is currently available for use through dynamic means (e.g., API calls).
+
+        This method should NOT check statically configured model entries in
+        `self.alias_to_provider_id_map` - that is handled separately in register_model.
+
+        Default implementation returns False (no dynamic models available).
+
+        :param model: The model identifier to check.
+        :return: True if the model is available dynamically, False otherwise.
+        """
+        return False
+
     async def register_model(self, model: Model) -> Model:
-        if not (supported_model_id := self.get_provider_model_id(model.provider_resource_id)):
-            raise UnsupportedModelError(model.provider_resource_id, self.alias_to_provider_id_map.keys())
+        # Check if model is supported in static configuration
+        supported_model_id = self.get_provider_model_id(model.provider_resource_id)
+
+        # If not found in static config, check if it's available dynamically from provider
+        if not supported_model_id:
+            if await self.check_model_availability(model.provider_resource_id):
+                supported_model_id = model.provider_resource_id
+            else:
+                # note: we cannot provide a complete list of supported models without
+                # getting a complete list from the provider, so we return "..."
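+                # ("..." is a placeholder entry: the full dynamic list is unknown,
+                #  so the error below can only hint that more models may be accepted)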
+ all_supported_models = [*self.alias_to_provider_id_map.keys(), "..."] + raise UnsupportedModelError(model.provider_resource_id, all_supported_models) + provider_resource_id = self.get_provider_model_id(model.model_id) if model.model_type == ModelType.embedding: # embedding models are always registered by their provider model id and does not need to be mapped to a llama model @@ -114,6 +142,7 @@ class ModelRegistryHelper(ModelsProtocolPrivate): ALL_HUGGINGFACE_REPOS_TO_MODEL_DESCRIPTOR[llama_model] ) + # Register the model alias, ensuring it maps to the correct provider model id self.alias_to_provider_id_map[model.model_id] = supported_model_id return model diff --git a/tests/unit/providers/utils/test_model_registry.py b/tests/unit/providers/utils/test_model_registry.py index e11f95d49..1a1705961 100644 --- a/tests/unit/providers/utils/test_model_registry.py +++ b/tests/unit/providers/utils/test_model_registry.py @@ -87,6 +87,37 @@ def helper(known_provider_model: ProviderModelEntry, known_provider_model2: Prov return ModelRegistryHelper([known_provider_model, known_provider_model2]) +class MockModelRegistryHelperWithDynamicModels(ModelRegistryHelper): + """Test helper that simulates a provider with dynamically available models.""" + + def __init__(self, model_entries: list[ProviderModelEntry], available_models: list[str]): + super().__init__(model_entries) + self._available_models = available_models + + async def check_model_availability(self, model: str) -> bool: + return model in self._available_models + + +@pytest.fixture +def dynamic_model() -> Model: + """A model that's not in static config but available dynamically.""" + return Model( + provider_id="provider", + identifier="dynamic-model", + provider_resource_id="dynamic-provider-id", + ) + + +@pytest.fixture +def helper_with_dynamic_models( + known_provider_model: ProviderModelEntry, known_provider_model2: ProviderModelEntry, dynamic_model: Model +) -> MockModelRegistryHelperWithDynamicModels: + """Helper that includes dynamically available models.""" + return MockModelRegistryHelperWithDynamicModels( + [known_provider_model, known_provider_model2], [dynamic_model.provider_resource_id] + ) + + async def test_lookup_unknown_model(helper: ModelRegistryHelper, unknown_model: Model) -> None: assert helper.get_provider_model_id(unknown_model.model_id) is None @@ -151,3 +182,63 @@ async def test_unregister_model_during_init(helper: ModelRegistryHelper, known_m assert helper.get_provider_model_id(known_model.provider_resource_id) == known_model.provider_model_id await helper.unregister_model(known_model.provider_resource_id) assert helper.get_provider_model_id(known_model.provider_resource_id) is None + + +async def test_register_model_from_check_model_availability( + helper_with_dynamic_models: MockModelRegistryHelperWithDynamicModels, dynamic_model: Model +) -> None: + """Test that models returned by check_model_availability can be registered.""" + # Verify the model is not in static config + assert helper_with_dynamic_models.get_provider_model_id(dynamic_model.provider_resource_id) is None + + # But it should be available via check_model_availability + is_available = await helper_with_dynamic_models.check_model_availability(dynamic_model.provider_resource_id) + assert is_available + + # Registration should succeed + registered_model = await helper_with_dynamic_models.register_model(dynamic_model) + assert registered_model == dynamic_model + + # Model should now be registered and accessible + assert ( + 
helper_with_dynamic_models.get_provider_model_id(dynamic_model.model_id) == dynamic_model.provider_resource_id + ) + + +async def test_register_model_not_in_static_or_dynamic( + helper_with_dynamic_models: MockModelRegistryHelperWithDynamicModels, unknown_model: Model +) -> None: + """Test that models not in static config or dynamic models are rejected.""" + # Verify the model is not in static config + assert helper_with_dynamic_models.get_provider_model_id(unknown_model.provider_resource_id) is None + + # And not available via check_model_availability + is_available = await helper_with_dynamic_models.check_model_availability(unknown_model.provider_resource_id) + assert not is_available + + # Registration should fail with comprehensive error message + with pytest.raises(Exception) as exc_info: # UnsupportedModelError + await helper_with_dynamic_models.register_model(unknown_model) + + # Error should include static models and "..." for dynamic models + error_str = str(exc_info.value) + assert "..." in error_str # "..." should be in error message + + +async def test_register_alias_for_dynamic_model( + helper_with_dynamic_models: MockModelRegistryHelperWithDynamicModels, dynamic_model: Model +) -> None: + """Test that we can register an alias that maps to a dynamically available model.""" + # Create a model with a different identifier but same provider_resource_id + alias_model = Model( + provider_id=dynamic_model.provider_id, + identifier="dynamic-model-alias", + provider_resource_id=dynamic_model.provider_resource_id, + ) + + # Registration should succeed since the provider_resource_id is available dynamically + registered_model = await helper_with_dynamic_models.register_model(alias_model) + assert registered_model == alias_model + + # Both the original provider_resource_id and the new alias should work + assert helper_with_dynamic_models.get_provider_model_id(alias_model.model_id) == dynamic_model.provider_resource_id