diff --git a/llama_stack/apis/models/models.py b/llama_stack/apis/models/models.py
index a1bfcac00..aabe78d85 100644
--- a/llama_stack/apis/models/models.py
+++ b/llama_stack/apis/models/models.py
@@ -7,7 +7,7 @@
 from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
 
 from llama_models.schema_utils import json_schema_type, webmethod
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field
 
 from llama_stack.apis.resource import Resource, ResourceType
@@ -37,6 +37,8 @@ class ModelInput(CommonModelFields):
     provider_id: Optional[str] = None
     provider_model_id: Optional[str] = None
 
+    model_config = ConfigDict(protected_namespaces=())
+
 
 @runtime_checkable
 class Models(Protocol):
diff --git a/llama_stack/cli/stack/run.py b/llama_stack/cli/stack/run.py
index 842703d4c..5fce8c92c 100644
--- a/llama_stack/cli/stack/run.py
+++ b/llama_stack/cli/stack/run.py
@@ -48,7 +48,10 @@ class StackRun(Subcommand):
         from llama_stack.distribution.build import ImageType
         from llama_stack.distribution.configure import parse_and_maybe_upgrade_config
-        from llama_stack.distribution.utils.config_dirs import BUILDS_BASE_DIR
+        from llama_stack.distribution.utils.config_dirs import (
+            BUILDS_BASE_DIR,
+            DISTRIBS_BASE_DIR,
+        )
         from llama_stack.distribution.utils.exec import run_with_pty
 
         if not args.config:
@@ -68,6 +71,14 @@ class StackRun(Subcommand):
                 BUILDS_BASE_DIR / ImageType.docker.value / f"{args.config}-run.yaml"
             )
 
+        if not config_file.exists() and not args.config.endswith(".yaml"):
+            # check if it's a build config saved to ~/.llama dir
+            config_file = Path(
+                DISTRIBS_BASE_DIR
+                / f"llamastack-{args.config}"
+                / f"{args.config}-run.yaml"
+            )
+
         if not config_file.exists():
             self.parser.error(
                 f"File {str(config_file)} does not exist. Please run `llama stack build` to generate (and optionally edit) a run.yaml file"
diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py
index 5796b6c68..0cfd11eda 100644
--- a/llama_stack/distribution/server/server.py
+++ b/llama_stack/distribution/server/server.py
@@ -369,12 +369,16 @@ def main(
 
             impl_method = getattr(impl, endpoint.name)
 
-            getattr(app, endpoint.method)(endpoint.route, response_model=None)(
-                create_dynamic_typed_route(
-                    impl_method,
-                    endpoint.method,
+            with warnings.catch_warnings():
+                warnings.filterwarnings(
+                    "ignore", category=UserWarning, module="pydantic._internal._fields"
+                )
+                getattr(app, endpoint.method)(endpoint.route, response_model=None)(
+                    create_dynamic_typed_route(
+                        impl_method,
+                        endpoint.method,
+                    )
                 )
-            )
 
         cprint(f"Serving API {api_str}", "white", attrs=["bold"])
         for endpoint in endpoints:
diff --git a/llama_stack/providers/inline/memory/faiss/faiss.py b/llama_stack/providers/inline/memory/faiss/faiss.py
index 92235ea89..95791bc69 100644
--- a/llama_stack/providers/inline/memory/faiss/faiss.py
+++ b/llama_stack/providers/inline/memory/faiss/faiss.py
@@ -5,6 +5,7 @@
 # the root directory of this source tree.
 
 import base64
+import io
 import json
 import logging
@@ -45,7 +46,12 @@ class FaissIndex(EmbeddingIndex):
         self.chunk_by_index = {}
         self.kvstore = kvstore
         self.bank_id = bank_id
-        self.initialize()
+
+    @classmethod
+    async def create(cls, dimension: int, kvstore=None, bank_id: str = None):
+        instance = cls(dimension, kvstore, bank_id)
+        await instance.initialize()
+        return instance
 
     async def initialize(self) -> None:
         if not self.kvstore:
@@ -62,19 +68,20 @@ class FaissIndex(EmbeddingIndex):
                 for k, v in data["chunk_by_index"].items()
             }
 
-            index_bytes = base64.b64decode(data["faiss_index"])
-            self.index = faiss.deserialize_index(index_bytes)
+            buffer = io.BytesIO(base64.b64decode(data["faiss_index"]))
+            self.index = faiss.deserialize_index(np.loadtxt(buffer, dtype=np.uint8))
 
     async def _save_index(self):
         if not self.kvstore or not self.bank_id:
             return
 
-        index_bytes = faiss.serialize_index(self.index)
-
+        np_index = faiss.serialize_index(self.index)
+        buffer = io.BytesIO()
+        np.savetxt(buffer, np_index)
         data = {
             "id_by_index": self.id_by_index,
             "chunk_by_index": {k: v.json() for k, v in self.chunk_by_index.items()},
-            "faiss_index": base64.b64encode(index_bytes).decode(),
+            "faiss_index": base64.b64encode(buffer.getvalue()).decode("utf-8"),
         }
 
         index_key = f"faiss_index:v1::{self.bank_id}"
@@ -132,7 +139,10 @@ class FaissMemoryImpl(Memory, MemoryBanksProtocolPrivate):
         for bank_data in stored_banks:
             bank = VectorMemoryBank.model_validate_json(bank_data)
             index = BankWithIndex(
-                bank=bank, index=FaissIndex(ALL_MINILM_L6_V2_DIMENSION, self.kvstore)
+                bank=bank,
+                index=await FaissIndex.create(
+                    ALL_MINILM_L6_V2_DIMENSION, self.kvstore, bank.identifier
+                ),
             )
             self.cache[bank.identifier] = index
@@ -158,7 +168,9 @@ class FaissMemoryImpl(Memory, MemoryBanksProtocolPrivate):
         # Store in cache
         index = BankWithIndex(
             bank=memory_bank,
-            index=FaissIndex(ALL_MINILM_L6_V2_DIMENSION, self.kvstore),
+            index=await FaissIndex.create(
+                ALL_MINILM_L6_V2_DIMENSION, self.kvstore, memory_bank.identifier
+            ),
         )
         self.cache[memory_bank.identifier] = index
@@ -178,7 +190,7 @@ class FaissMemoryImpl(Memory, MemoryBanksProtocolPrivate):
     ) -> None:
         index = self.cache.get(bank_id)
         if index is None:
-            raise ValueError(f"Bank {bank_id} not found")
+            raise ValueError(f"Bank {bank_id} not found. found: {self.cache.keys()}")
 
         await index.insert_documents(documents)
diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py
index cb1a3dd36..c8d061f6c 100644
--- a/llama_stack/providers/registry/inference.py
+++ b/llama_stack/providers/registry/inference.py
@@ -157,7 +157,7 @@ def available_providers() -> List[ProviderSpec]:
             pip_packages=[
                 "openai",
             ],
-            module="llama_stack.providers.adapters.inference.nvidia",
+            module="llama_stack.providers.remote.inference.nvidia",
             config_class="llama_stack.providers.remote.inference.nvidia.NVIDIAConfig",
         ),
     ),
diff --git a/llama_stack/providers/remote/inference/nvidia/_nvidia.py b/llama_stack/providers/remote/inference/nvidia/_nvidia.py
index e4d1aa030..e5667b728 100644
--- a/llama_stack/providers/remote/inference/nvidia/_nvidia.py
+++ b/llama_stack/providers/remote/inference/nvidia/_nvidia.py
@@ -84,7 +84,7 @@ _MODEL_ALIASES = [
 ]
 
 
-class NVIDIAInferenceAdapter(ModelRegistryHelper, Inference):
+class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
     def __init__(self, config: NVIDIAConfig) -> None:
         # TODO(mf): filter by available models
         ModelRegistryHelper.__init__(self, model_aliases=_MODEL_ALIASES)
@@ -117,7 +117,7 @@ class NVIDIAInferenceAdapter(ModelRegistryHelper, Inference):
     def completion(
         self,
-        model: str,
+        model_id: str,
         content: InterleavedTextMedia,
         sampling_params: Optional[SamplingParams] = SamplingParams(),
         response_format: Optional[ResponseFormat] = None,
@@ -128,14 +128,14 @@ class NVIDIAInferenceAdapter(ModelRegistryHelper, Inference):
 
     async def embeddings(
         self,
-        model: str,
+        model_id: str,
         contents: List[InterleavedTextMedia],
     ) -> EmbeddingsResponse:
         raise NotImplementedError()
 
     async def chat_completion(
         self,
-        model: str,
+        model_id: str,
         messages: List[Message],
         sampling_params: Optional[SamplingParams] = SamplingParams(),
         response_format: Optional[ResponseFormat] = None,
@@ -156,7 +156,7 @@ class NVIDIAInferenceAdapter(ModelRegistryHelper, Inference):
 
         request = convert_chat_completion_request(
             request=ChatCompletionRequest(
-                model=self.get_provider_model_id(model),
+                model=self.get_provider_model_id(model_id),
                 messages=messages,
                 sampling_params=sampling_params,
                 tools=tools,
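Note on the pydantic changes: pydantic v2 reserves the `model_` prefix as a protected namespace, so resource fields such as `model_id` trigger a `UserWarning` unless that check is relaxed, which is what the `ConfigDict(protected_namespaces=())` addition and the `warnings.filterwarnings` guard in `server.py` address. A minimal sketch of the behavior (the field name below is illustrative, not taken from the PR):

    from pydantic import BaseModel, ConfigDict

    class Example(BaseModel):
        # Without this override, pydantic v2 warns that "model_id"
        # conflicts with the protected namespace "model_".
        model_config = ConfigDict(protected_namespaces=())

        model_id: str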
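On the faiss change: `FaissIndex.__init__` previously called the async `initialize()` directly, which only creates a coroutine that is never awaited, so the persisted index was never loaded; the new `create()` classmethod is the usual async-factory workaround. A rough, self-contained sketch of the pattern (class and method names here are illustrative, not the provider's real API):

    import asyncio

    class Index:
        def __init__(self, dimension: int) -> None:
            # __init__ cannot await, so it only stores state.
            self.dimension = dimension

        async def initialize(self) -> None:
            # Placeholder for loading persisted state from a kvstore.
            await asyncio.sleep(0)

        @classmethod
        async def create(cls, dimension: int) -> "Index":
            # Construct, then await the async setup before handing the object out.
            instance = cls(dimension)
            await instance.initialize()
            return instance

    async def main() -> None:
        index = await Index.create(128)
        print(index.dimension)

    asyncio.run(main())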