diff --git a/docs/source/distributions/configuration.md b/docs/source/distributions/configuration.md
index dd73d93ea..4bc9b37e4 100644
--- a/docs/source/distributions/configuration.md
+++ b/docs/source/distributions/configuration.md
@@ -109,6 +109,18 @@ A Model is an instance of a "Resource" (see [Concepts](../concepts/index)) and i
 
 What's with the `provider_model_id` field? This is an identifier for the model inside the provider's model catalog. Contrast it with `model_id` which is the identifier for the same model for Llama Stack's purposes. For example, you may want to name "llama3.2:vision-11b" as "image_captioning_model" when you use it in your Stack interactions. When omitted, the server will set `provider_model_id` to be the same as `model_id`.
 
+If you need to register a model only under certain conditions, for example only when specific environment variables are set, you can use the special `__disabled__` string as the default value of an environment variable substitution, as shown below:
+
+```yaml
+models:
+- metadata: {}
+  model_id: ${env.INFERENCE_MODEL:__disabled__}
+  provider_id: ollama
+  provider_model_id: ${env.INFERENCE_MODEL:__disabled__}
+```
+
+The snippet above registers this model only if the environment variable `INFERENCE_MODEL` is set and non-empty. If the variable is not set, the model is not registered at all.
+
 ## Server Configuration
 
 The `server` section configures the HTTP server that serves the Llama Stack APIs:
diff --git a/llama_stack/distribution/stack.py b/llama_stack/distribution/stack.py
index 5a9708497..b33b0d3f7 100644
--- a/llama_stack/distribution/stack.py
+++ b/llama_stack/distribution/stack.py
@@ -98,6 +98,15 @@ async def register_resources(run_config: StackRunConfig, impls: dict[Api, Any]):
 
         method = getattr(impls[api], register_method)
         for obj in objects:
+            # In complex templates, like our starter template, we may have dynamic model ids
+            # given by environment variables. This allows those environment variables to have
+            # a default value of __disabled__ to skip registration of the model if not set.
+            if (
+                hasattr(obj, "provider_model_id")
+                and obj.provider_model_id is not None
+                and "__disabled__" in obj.provider_model_id
+            ):
+                continue
             # we want to maintain the type information in arguments to method.
             # instead of method(**obj.model_dump()), which may convert a typed attr to a dict,
             # we use model_dump() to find all the attrs and then getattr to get the still typed value.
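The documentation and `stack.py` changes above work together: the run config substitutes `${env.VAR:__disabled__}` at startup, and `register_resources` then skips any model whose resolved `provider_model_id` still contains the sentinel. The sketch below illustrates that interaction in isolation; the regex-based resolver and helper names are illustrative assumptions, not the actual Llama Stack substitution code.

```python
import os
import re

# Matches ${env.NAME:default}; the pattern is an illustrative assumption,
# not the actual Llama Stack substitution code.
_ENV_RE = re.compile(r"\$\{env\.(?P<name>[A-Za-z_][A-Za-z0-9_]*)(?::(?P<default>[^}]*))?\}")


def resolve_env(value: str) -> str:
    """Replace ${env.NAME:default} with the environment value, falling back
    to the default (e.g. __disabled__) when NAME is unset or empty."""

    def _sub(match: re.Match) -> str:
        return os.environ.get(match.group("name")) or (match.group("default") or "")

    return _ENV_RE.sub(_sub, value)


def should_register(provider_model_id: str | None) -> bool:
    """Mirror the skip condition added in stack.py: drop entries whose
    resolved id still contains the __disabled__ sentinel."""
    return provider_model_id is None or "__disabled__" not in provider_model_id


# With OLLAMA_INFERENCE_MODEL unset, the entry resolves to the sentinel and is skipped.
resolved = resolve_env("${env.OLLAMA_INFERENCE_MODEL:__disabled__}")
print(resolved, should_register(resolved))  # __disabled__ False
```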
diff --git a/llama_stack/providers/remote/inference/ollama/__init__.py b/llama_stack/providers/remote/inference/ollama/__init__.py
index 073c31cde..491339451 100644
--- a/llama_stack/providers/remote/inference/ollama/__init__.py
+++ b/llama_stack/providers/remote/inference/ollama/__init__.py
@@ -10,6 +10,6 @@ from .config import OllamaImplConfig
 async def get_adapter_impl(config: OllamaImplConfig, _deps):
     from .ollama import OllamaInferenceAdapter
 
-    impl = OllamaInferenceAdapter(config.url)
+    impl = OllamaInferenceAdapter(config)
     await impl.initialize()
     return impl
diff --git a/llama_stack/providers/remote/inference/ollama/config.py b/llama_stack/providers/remote/inference/ollama/config.py
index 0e4aef0e1..37b827f4f 100644
--- a/llama_stack/providers/remote/inference/ollama/config.py
+++ b/llama_stack/providers/remote/inference/ollama/config.py
@@ -13,7 +13,13 @@ DEFAULT_OLLAMA_URL = "http://localhost:11434"
 
 class OllamaImplConfig(BaseModel):
     url: str = DEFAULT_OLLAMA_URL
+    raise_on_connect_error: bool = True
 
     @classmethod
-    def sample_run_config(cls, url: str = "${env.OLLAMA_URL:http://localhost:11434}", **kwargs) -> dict[str, Any]:
-        return {"url": url}
+    def sample_run_config(
+        cls, url: str = "${env.OLLAMA_URL:http://localhost:11434}", raise_on_connect_error: bool = True, **kwargs
+    ) -> dict[str, Any]:
+        return {
+            "url": url,
+            "raise_on_connect_error": raise_on_connect_error,
+        }
diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py
index d51072fbf..2f51920b5 100644
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@@ -9,7 +9,6 @@ import uuid
 from collections.abc import AsyncGenerator, AsyncIterator
 from typing import Any
 
-import httpx
 from ollama import AsyncClient  # type: ignore[attr-defined]
 from openai import AsyncOpenAI
 
@@ -57,6 +56,7 @@ from llama_stack.providers.datatypes import (
     HealthStatus,
     ModelsProtocolPrivate,
 )
+from llama_stack.providers.remote.inference.ollama.config import OllamaImplConfig
 from llama_stack.providers.utils.inference.model_registry import (
     ModelRegistryHelper,
 )
@@ -90,9 +90,10 @@ class OllamaInferenceAdapter(
     InferenceProvider,
     ModelsProtocolPrivate,
 ):
-    def __init__(self, url: str) -> None:
+    def __init__(self, config: OllamaImplConfig) -> None:
         self.register_helper = ModelRegistryHelper(MODEL_ENTRIES)
-        self.url = url
+        self.url = config.url
+        self.raise_on_connect_error = config.raise_on_connect_error
 
     @property
     def client(self) -> AsyncClient:
@@ -103,8 +104,13 @@ class OllamaInferenceAdapter(
         return AsyncOpenAI(base_url=f"{self.url}/v1", api_key="ollama")
 
     async def initialize(self) -> None:
-        logger.info(f"checking connectivity to Ollama at `{self.url}`...")
-        await self.health()
+        logger.debug(f"checking connectivity to Ollama at `{self.url}`...")
+        health_response = await self.health()
+        if health_response["status"] == HealthStatus.ERROR:
+            if self.raise_on_connect_error:
+                raise RuntimeError("Ollama Server is not running, start it using `ollama serve` in a separate terminal")
+            else:
+                logger.warning("Ollama Server is not running, start it using `ollama serve` in a separate terminal")
 
     async def health(self) -> HealthResponse:
         """
@@ -117,10 +123,8 @@
         try:
             await self.client.ps()
             return HealthResponse(status=HealthStatus.OK)
-        except httpx.ConnectError as e:
-            raise RuntimeError(
-                "Ollama Server is not running, start it using `ollama serve` in a separate terminal"
-            ) from e
+        except Exception as e:
+            return HealthResponse(status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}")
 
     async def shutdown(self) -> None:
         pass
diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index 3424be6b4..ae04f206a 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -9,7 +9,7 @@ from collections.abc import AsyncGenerator, AsyncIterator
 from typing import Any
 
 import httpx
-from openai import AsyncOpenAI
+from openai import APIConnectionError, AsyncOpenAI
 from openai.types.chat.chat_completion_chunk import (
     ChatCompletionChunk as OpenAIChatCompletionChunk,
 )
@@ -461,7 +461,12 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
             model = await self.register_helper.register_model(model)
         except ValueError:
             pass  # Ignore statically unknown model, will check live listing
-        res = await client.models.list()
+        try:
+            res = await client.models.list()
+        except APIConnectionError as e:
+            raise ValueError(
+                f"Failed to connect to vLLM at {self.config.url}. Please check if vLLM is running and accessible at that URL."
+            ) from e
         available_models = [m.id async for m in res]
         if model.provider_resource_id not in available_models:
             raise ValueError(
diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml
index 85d5c813b..2e1b7fdcc 100644
--- a/llama_stack/templates/ollama/run-with-safety.yaml
+++ b/llama_stack/templates/ollama/run-with-safety.yaml
@@ -18,6 +18,7 @@ providers:
     provider_type: remote::ollama
     config:
       url: ${env.OLLAMA_URL:http://localhost:11434}
+      raise_on_connect_error: true
   vector_io:
   - provider_id: faiss
     provider_type: inline::faiss
diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml
index 2d10a99a4..8c2b17ef1 100644
--- a/llama_stack/templates/ollama/run.yaml
+++ b/llama_stack/templates/ollama/run.yaml
@@ -18,6 +18,7 @@ providers:
     provider_type: remote::ollama
     config:
       url: ${env.OLLAMA_URL:http://localhost:11434}
+      raise_on_connect_error: true
   vector_io:
   - provider_id: faiss
     provider_type: inline::faiss
diff --git a/llama_stack/templates/starter/run.yaml b/llama_stack/templates/starter/run.yaml
index 960e96d01..30df39e5d 100644
--- a/llama_stack/templates/starter/run.yaml
+++ b/llama_stack/templates/starter/run.yaml
@@ -31,6 +31,7 @@ providers:
     provider_type: remote::ollama
     config:
       url: ${env.OLLAMA_URL:http://localhost:11434}
+      raise_on_connect_error: false
  - provider_id: anthropic
    provider_type: remote::anthropic
    config:
@@ -60,7 +61,14 @@ providers:
     provider_type: inline::sentence-transformers
     config: {}
   vector_io:
-  - provider_id: sqlite-vec
+  - provider_id: faiss
+    provider_type: inline::faiss
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/faiss_store.db
+  - provider_id: ${env.ENABLE_SQLITE_VEC+sqlite-vec}
     provider_type: inline::sqlite-vec
     config:
       db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/sqlite_vec.db
@@ -530,160 +538,15 @@ models:
   provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
   model_type: llm
 - metadata: {}
-  model_id: ollama/llama3.1:8b-instruct-fp16
+  model_id: ollama/${env.OLLAMA_INFERENCE_MODEL:__disabled__}
   provider_id: ollama
-  provider_model_id: llama3.1:8b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.1-8B-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.1:8b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.1:8b
-  provider_id: ollama
-  provider_model_id: llama3.1:8b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.1:70b-instruct-fp16
-  provider_id: ollama
-  provider_model_id: llama3.1:70b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.1-70B-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.1:70b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.1:70b
-  provider_id: ollama
-  provider_model_id: llama3.1:70b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.1:405b-instruct-fp16
-  provider_id: ollama
-  provider_model_id: llama3.1:405b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.1-405B-Instruct-FP8
-  provider_id: ollama
-  provider_model_id: llama3.1:405b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.1:405b
-  provider_id: ollama
-  provider_model_id: llama3.1:405b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2:1b-instruct-fp16
-  provider_id: ollama
-  provider_model_id: llama3.2:1b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.2-1B-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.2:1b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2:1b
-  provider_id: ollama
-  provider_model_id: llama3.2:1b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2:3b-instruct-fp16
-  provider_id: ollama
-  provider_model_id: llama3.2:3b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.2-3B-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.2:3b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2:3b
-  provider_id: ollama
-  provider_model_id: llama3.2:3b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2-vision:11b-instruct-fp16
-  provider_id: ollama
-  provider_model_id: llama3.2-vision:11b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.2-11B-Vision-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.2-vision:11b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2-vision:latest
-  provider_id: ollama
-  provider_model_id: llama3.2-vision:latest
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2-vision:90b-instruct-fp16
-  provider_id: ollama
-  provider_model_id: llama3.2-vision:90b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.2-90B-Vision-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.2-vision:90b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2-vision:90b
-  provider_id: ollama
-  provider_model_id: llama3.2-vision:90b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.3:70b
-  provider_id: ollama
-  provider_model_id: llama3.3:70b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.3-70B-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.3:70b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama-guard3:8b
-  provider_id: ollama
-  provider_model_id: llama-guard3:8b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-Guard-3-8B
-  provider_id: ollama
-  provider_model_id: llama-guard3:8b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama-guard3:1b
-  provider_id: ollama
-  provider_model_id: llama-guard3:1b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-Guard-3-1B
-  provider_id: ollama
-  provider_model_id: llama-guard3:1b
+  provider_model_id: ${env.OLLAMA_INFERENCE_MODEL:__disabled__}
   model_type: llm
 - metadata:
-    embedding_dimension: 384
-    context_length: 512
-  model_id: ollama/all-minilm:latest
+    embedding_dimension: ${env.OLLAMA_EMBEDDING_DIMENSION:384}
+  model_id: ollama/${env.OLLAMA_EMBEDDING_MODEL:__disabled__}
   provider_id: ollama
-  provider_model_id: all-minilm:latest
-  model_type: embedding
-- metadata:
-    embedding_dimension: 384
-    context_length: 512
-  model_id: ollama/all-minilm
-  provider_id: ollama
-  provider_model_id: all-minilm:latest
-  model_type: embedding
-- metadata:
-    embedding_dimension: 768
-    context_length: 8192
-  model_id: ollama/nomic-embed-text
-  provider_id: ollama
-  provider_model_id: nomic-embed-text
+  provider_model_id: ${env.OLLAMA_EMBEDDING_MODEL:__disabled__}
   model_type: embedding
 - metadata: {}
   model_id: anthropic/claude-3-5-sonnet-latest
@@ -938,6 +801,11 @@ models:
   provider_id: sambanova
   provider_model_id: sambanova/Meta-Llama-Guard-3-8B
   model_type: llm
+- metadata: {}
+  model_id: vllm/${env.VLLM_INFERENCE_MODEL:__disabled__}
+  provider_id: vllm
+  provider_model_id: ${env.VLLM_INFERENCE_MODEL:__disabled__}
+  model_type: llm
 - metadata:
     embedding_dimension: 384
   model_id: all-MiniLM-L6-v2
diff --git a/llama_stack/templates/starter/starter.py b/llama_stack/templates/starter/starter.py
index 2a44a0a37..ec01d08e9 100644
--- a/llama_stack/templates/starter/starter.py
+++ b/llama_stack/templates/starter/starter.py
@@ -16,6 +16,7 @@ from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplCo
 from llama_stack.providers.inline.inference.sentence_transformers import (
     SentenceTransformersInferenceConfig,
 )
+from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
 from llama_stack.providers.inline.vector_io.sqlite_vec.config import (
     SQLiteVectorIOConfig,
 )
@@ -36,9 +37,6 @@ from llama_stack.providers.remote.inference.groq.models import (
     MODEL_ENTRIES as GROQ_MODEL_ENTRIES,
 )
 from llama_stack.providers.remote.inference.ollama.config import OllamaImplConfig
-from llama_stack.providers.remote.inference.ollama.models import (
-    MODEL_ENTRIES as OLLAMA_MODEL_ENTRIES,
-)
 from llama_stack.providers.remote.inference.openai.config import OpenAIConfig
 from llama_stack.providers.remote.inference.openai.models import (
     MODEL_ENTRIES as OPENAI_MODEL_ENTRIES,
 )
@@ -85,8 +83,22 @@ def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderMo
         ),
         (
             "ollama",
-            OLLAMA_MODEL_ENTRIES,
-            OllamaImplConfig.sample_run_config(),
+            [
+                ProviderModelEntry(
+                    provider_model_id="${env.OLLAMA_INFERENCE_MODEL:__disabled__}",
+                    model_type=ModelType.llm,
+                ),
+                ProviderModelEntry(
+                    provider_model_id="${env.OLLAMA_EMBEDDING_MODEL:__disabled__}",
+                    model_type=ModelType.embedding,
+                    metadata={
+                        "embedding_dimension": "${env.OLLAMA_EMBEDDING_DIMENSION:384}",
+                    },
+                ),
+            ],
+            OllamaImplConfig.sample_run_config(
+                url="${env.OLLAMA_URL:http://localhost:11434}", raise_on_connect_error=False
+            ),
         ),
         (
             "anthropic",
@@ -110,7 +122,12 @@ def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderMo
         ),
         (
             "vllm",
-            [],
+            [
+                ProviderModelEntry(
+                    provider_model_id="${env.VLLM_INFERENCE_MODEL:__disabled__}",
+                    model_type=ModelType.llm,
+                ),
+            ],
             VLLMInferenceAdapterConfig.sample_run_config(
                 url="${env.VLLM_URL:http://localhost:8000/v1}",
             ),
@@ -153,7 +170,12 @@ def get_distribution_template() -> DistributionTemplate:
 
     vector_io_providers = [
         Provider(
-            provider_id="sqlite-vec",
+            provider_id="faiss",
+            provider_type="inline::faiss",
+            config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
+        ),
+        Provider(
+            provider_id="${env.ENABLE_SQLITE_VEC+sqlite-vec}",
             provider_type="inline::sqlite-vec",
             config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
         ),
@@ -257,7 +279,27 @@ def get_distribution_template() -> DistributionTemplate:
             ),
             "VLLM_URL": (
                 "http://localhost:8000/v1",
-                "VLLM URL",
+                "vLLM URL",
+            ),
+            "VLLM_INFERENCE_MODEL": (
+                "",
+                "Optional vLLM Inference Model to register on startup",
+            ),
+            "OLLAMA_URL": (
+                "http://localhost:11434",
+                "Ollama URL",
+            ),
+            "OLLAMA_INFERENCE_MODEL": (
+                "",
+                "Optional Ollama Inference Model to register on startup",
+            ),
+            "OLLAMA_EMBEDDING_MODEL": (
+                "",
+                "Optional Ollama Embedding Model to register on startup",
+            ),
+            "OLLAMA_EMBEDDING_DIMENSION": (
+                "384",
+                "Ollama Embedding Dimension",
             ),
         },
     )
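A note on `raise_on_connect_error` above: the single-provider ollama templates keep it strict (`true`), while the multi-provider starter template sets it to `false` so the stack can still start when Ollama is not running. The sketch below illustrates that gating in isolation, using a plain TCP reachability probe in place of the adapter's real `client.ps()` call; the class and helper names are illustrative assumptions, not the adapter code itself.

```python
import asyncio
import logging
from urllib.parse import urlparse

logger = logging.getLogger("ollama-connect-sketch")
logging.basicConfig(level=logging.INFO)


class OllamaConnectSketch:
    """Illustrative stand-in for the adapter's initialize()/health() pair."""

    def __init__(self, url: str, raise_on_connect_error: bool) -> None:
        self.url = url
        self.raise_on_connect_error = raise_on_connect_error

    async def health(self) -> dict:
        """Report OK/Error instead of raising, mirroring the new health() contract."""
        parsed = urlparse(self.url)
        try:
            # The real adapter calls `await self.client.ps()`; a TCP connect is a rough proxy.
            _, writer = await asyncio.open_connection(parsed.hostname, parsed.port or 11434)
            writer.close()
            await writer.wait_closed()
            return {"status": "OK"}
        except Exception as e:
            return {"status": "Error", "message": f"Health check failed: {e}"}

    async def initialize(self) -> None:
        health = await self.health()
        if health["status"] == "Error":
            msg = "Ollama Server is not running, start it using `ollama serve` in a separate terminal"
            if self.raise_on_connect_error:
                raise RuntimeError(msg)  # ollama templates: fail fast
            logger.warning(msg)  # starter template: warn and continue


# With raise_on_connect_error=False this only logs a warning when nothing listens on the port.
asyncio.run(OllamaConnectSketch("http://localhost:11434", raise_on_connect_error=False).initialize())
```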