diff --git a/llama_stack/apis/models/models.py b/llama_stack/apis/models/models.py
index a1bfcac00..aabe78d85 100644
--- a/llama_stack/apis/models/models.py
+++ b/llama_stack/apis/models/models.py
@@ -7,7 +7,7 @@
 from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
 
 from llama_models.schema_utils import json_schema_type, webmethod
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field
 
 from llama_stack.apis.resource import Resource, ResourceType
@@ -37,6 +37,8 @@ class ModelInput(CommonModelFields):
     provider_id: Optional[str] = None
     provider_model_id: Optional[str] = None
 
+    model_config = ConfigDict(protected_namespaces=())
+
 
 @runtime_checkable
 class Models(Protocol):
diff --git a/llama_stack/cli/stack/run.py b/llama_stack/cli/stack/run.py
index 842703d4c..5fce8c92c 100644
--- a/llama_stack/cli/stack/run.py
+++ b/llama_stack/cli/stack/run.py
@@ -48,7 +48,10 @@ class StackRun(Subcommand):
         from llama_stack.distribution.build import ImageType
         from llama_stack.distribution.configure import parse_and_maybe_upgrade_config
-        from llama_stack.distribution.utils.config_dirs import BUILDS_BASE_DIR
+        from llama_stack.distribution.utils.config_dirs import (
+            BUILDS_BASE_DIR,
+            DISTRIBS_BASE_DIR,
+        )
         from llama_stack.distribution.utils.exec import run_with_pty
 
         if not args.config:
@@ -68,6 +71,14 @@ class StackRun(Subcommand):
                 BUILDS_BASE_DIR / ImageType.docker.value / f"{args.config}-run.yaml"
             )
 
+        if not config_file.exists() and not args.config.endswith(".yaml"):
+            # check if it's a build config saved to ~/.llama dir
+            config_file = Path(
+                DISTRIBS_BASE_DIR
+                / f"llamastack-{args.config}"
+                / f"{args.config}-run.yaml"
+            )
+
         if not config_file.exists():
             self.parser.error(
                 f"File {str(config_file)} does not exist. Please run `llama stack build` to generate (and optionally edit) a run.yaml file"
diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py
index 5796b6c68..0cfd11eda 100644
--- a/llama_stack/distribution/server/server.py
+++ b/llama_stack/distribution/server/server.py
@@ -369,12 +369,16 @@ def main(
 
             impl_method = getattr(impl, endpoint.name)
 
-            getattr(app, endpoint.method)(endpoint.route, response_model=None)(
-                create_dynamic_typed_route(
-                    impl_method,
-                    endpoint.method,
+            with warnings.catch_warnings():
+                warnings.filterwarnings(
+                    "ignore", category=UserWarning, module="pydantic._internal._fields"
+                )
+                getattr(app, endpoint.method)(endpoint.route, response_model=None)(
+                    create_dynamic_typed_route(
+                        impl_method,
+                        endpoint.method,
+                    )
                 )
-            )
 
         cprint(f"Serving API {api_str}", "white", attrs=["bold"])
         for endpoint in endpoints:
diff --git a/llama_stack/providers/inline/memory/faiss/faiss.py b/llama_stack/providers/inline/memory/faiss/faiss.py
index 92235ea89..95791bc69 100644
--- a/llama_stack/providers/inline/memory/faiss/faiss.py
+++ b/llama_stack/providers/inline/memory/faiss/faiss.py
@@ -5,6 +5,7 @@
 # the root directory of this source tree.
 
 import base64
+import io
 import json
 import logging
@@ -45,7 +46,12 @@ class FaissIndex(EmbeddingIndex):
         self.chunk_by_index = {}
         self.kvstore = kvstore
         self.bank_id = bank_id
-        self.initialize()
+
+    @classmethod
+    async def create(cls, dimension: int, kvstore=None, bank_id: str = None):
+        instance = cls(dimension, kvstore, bank_id)
+        await instance.initialize()
+        return instance
 
     async def initialize(self) -> None:
         if not self.kvstore:
@@ -62,19 +68,20 @@ class FaissIndex(EmbeddingIndex):
                 for k, v in data["chunk_by_index"].items()
             }
 
-            index_bytes = base64.b64decode(data["faiss_index"])
-            self.index = faiss.deserialize_index(index_bytes)
+            buffer = io.BytesIO(base64.b64decode(data["faiss_index"]))
+            self.index = faiss.deserialize_index(np.loadtxt(buffer, dtype=np.uint8))
 
     async def _save_index(self):
         if not self.kvstore or not self.bank_id:
             return
 
-        index_bytes = faiss.serialize_index(self.index)
-
+        np_index = faiss.serialize_index(self.index)
+        buffer = io.BytesIO()
+        np.savetxt(buffer, np_index)
         data = {
             "id_by_index": self.id_by_index,
             "chunk_by_index": {k: v.json() for k, v in self.chunk_by_index.items()},
-            "faiss_index": base64.b64encode(index_bytes).decode(),
+            "faiss_index": base64.b64encode(buffer.getvalue()).decode("utf-8"),
         }
 
         index_key = f"faiss_index:v1::{self.bank_id}"
@@ -132,7 +139,10 @@ class FaissMemoryImpl(Memory, MemoryBanksProtocolPrivate):
         for bank_data in stored_banks:
             bank = VectorMemoryBank.model_validate_json(bank_data)
             index = BankWithIndex(
-                bank=bank, index=FaissIndex(ALL_MINILM_L6_V2_DIMENSION, self.kvstore)
+                bank=bank,
+                index=await FaissIndex.create(
+                    ALL_MINILM_L6_V2_DIMENSION, self.kvstore, bank.identifier
+                ),
             )
             self.cache[bank.identifier] = index
@@ -158,7 +168,9 @@ class FaissMemoryImpl(Memory, MemoryBanksProtocolPrivate):
         # Store in cache
         index = BankWithIndex(
             bank=memory_bank,
-            index=FaissIndex(ALL_MINILM_L6_V2_DIMENSION, self.kvstore),
+            index=await FaissIndex.create(
+                ALL_MINILM_L6_V2_DIMENSION, self.kvstore, memory_bank.identifier
+            ),
         )
         self.cache[memory_bank.identifier] = index
@@ -178,7 +190,7 @@ class FaissMemoryImpl(Memory, MemoryBanksProtocolPrivate):
     ) -> None:
         index = self.cache.get(bank_id)
         if index is None:
-            raise ValueError(f"Bank {bank_id} not found")
+            raise ValueError(f"Bank {bank_id} not found. found: {self.cache.keys()}")
 
         await index.insert_documents(documents)
diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py
index cb1a3dd36..c8d061f6c 100644
--- a/llama_stack/providers/registry/inference.py
+++ b/llama_stack/providers/registry/inference.py
@@ -157,7 +157,7 @@ def available_providers() -> List[ProviderSpec]:
             pip_packages=[
                 "openai",
             ],
-            module="llama_stack.providers.adapters.inference.nvidia",
+            module="llama_stack.providers.remote.inference.nvidia",
             config_class="llama_stack.providers.remote.inference.nvidia.NVIDIAConfig",
         ),
     ),
diff --git a/llama_stack/providers/remote/inference/nvidia/_nvidia.py b/llama_stack/providers/remote/inference/nvidia/_nvidia.py
index e4d1aa030..e5667b728 100644
--- a/llama_stack/providers/remote/inference/nvidia/_nvidia.py
+++ b/llama_stack/providers/remote/inference/nvidia/_nvidia.py
@@ -84,7 +84,7 @@ _MODEL_ALIASES = [
 ]
 
 
-class NVIDIAInferenceAdapter(ModelRegistryHelper, Inference):
+class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
     def __init__(self, config: NVIDIAConfig) -> None:
         # TODO(mf): filter by available models
         ModelRegistryHelper.__init__(self, model_aliases=_MODEL_ALIASES)
@@ -117,7 +117,7 @@ class NVIDIAInferenceAdapter(ModelRegistryHelper, Inference):
     def completion(
         self,
-        model: str,
+        model_id: str,
         content: InterleavedTextMedia,
         sampling_params: Optional[SamplingParams] = SamplingParams(),
         response_format: Optional[ResponseFormat] = None,
@@ -128,14 +128,14 @@ class NVIDIAInferenceAdapter(ModelRegistryHelper, Inference):
 
     async def embeddings(
         self,
-        model: str,
+        model_id: str,
         contents: List[InterleavedTextMedia],
     ) -> EmbeddingsResponse:
         raise NotImplementedError()
 
     async def chat_completion(
         self,
-        model: str,
+        model_id: str,
         messages: List[Message],
         sampling_params: Optional[SamplingParams] = SamplingParams(),
         response_format: Optional[ResponseFormat] = None,
@@ -156,7 +156,7 @@ class NVIDIAInferenceAdapter(ModelRegistryHelper, Inference):
 
         request = convert_chat_completion_request(
             request=ChatCompletionRequest(
-                model=self.get_provider_model_id(model),
+                model=self.get_provider_model_id(model_id),
                 messages=messages,
                 sampling_params=sampling_params,
                 tools=tools,
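Note on the pydantic changes: pydantic v2 reserves the `model_` prefix as a protected namespace, so resource fields such as `model_id` trigger a `UserWarning` unless that check is relaxed, which is what the `ConfigDict(protected_namespaces=())` addition and the `warnings.filterwarnings` guard in `server.py` address. A minimal sketch of the behavior (the field name below is illustrative, not taken from the PR):

    from pydantic import BaseModel, ConfigDict

    class Example(BaseModel):
        # Without this override, pydantic v2 warns that "model_id"
        # conflicts with the protected namespace "model_".
        model_config = ConfigDict(protected_namespaces=())

        model_id: str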
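On the faiss change: `FaissIndex.__init__` previously called the async `initialize()` directly, which only creates a coroutine that is never awaited, so the persisted index was never loaded; the new `create()` classmethod is the usual async-factory workaround. A rough, self-contained sketch of the pattern (class and method names here are illustrative, not the provider's real API):

    import asyncio

    class Index:
        def __init__(self, dimension: int) -> None:
            # __init__ cannot await, so it only stores state.
            self.dimension = dimension

        async def initialize(self) -> None:
            # Placeholder for loading persisted state from a kvstore.
            await asyncio.sleep(0)

        @classmethod
        async def create(cls, dimension: int) -> "Index":
            # Construct, then await the async setup before handing the object out.
            instance = cls(dimension)
            await instance.initialize()
            return instance

    async def main() -> None:
        index = await Index.create(128)
        print(index.dimension)

    asyncio.run(main())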