diff --git a/llama_stack/distribution/stack.py b/llama_stack/distribution/stack.py
index 5a9708497..b33b0d3f7 100644
--- a/llama_stack/distribution/stack.py
+++ b/llama_stack/distribution/stack.py
@@ -98,6 +98,15 @@ async def register_resources(run_config: StackRunConfig, impls: dict[Api, Any]):
 
         method = getattr(impls[api], register_method)
         for obj in objects:
+            # In complex templates, like our starter template, we may have dynamic model ids
+            # given by environment variables. This allows those environment variables to have
+            # a default value of __disabled__ to skip registration of the model if not set.
+            if (
+                hasattr(obj, "provider_model_id")
+                and obj.provider_model_id is not None
+                and "__disabled__" in obj.provider_model_id
+            ):
+                continue
             # we want to maintain the type information in arguments to method.
             # instead of method(**obj.model_dump()), which may convert a typed attr to a dict,
             # we use model_dump() to find all the attrs and then getattr to get the still typed value.
diff --git a/llama_stack/providers/remote/inference/ollama/__init__.py b/llama_stack/providers/remote/inference/ollama/__init__.py
index 073c31cde..9f4adc75f 100644
--- a/llama_stack/providers/remote/inference/ollama/__init__.py
+++ b/llama_stack/providers/remote/inference/ollama/__init__.py
@@ -10,6 +10,6 @@ from .config import OllamaImplConfig
 async def get_adapter_impl(config: OllamaImplConfig, _deps):
     from .ollama import OllamaInferenceAdapter
 
-    impl = OllamaInferenceAdapter(config.url)
+    impl = OllamaInferenceAdapter(config.url, raise_on_connect_error=config.raise_on_connect_error)
     await impl.initialize()
     return impl
diff --git a/llama_stack/providers/remote/inference/ollama/config.py b/llama_stack/providers/remote/inference/ollama/config.py
index 0e4aef0e1..37b827f4f 100644
--- a/llama_stack/providers/remote/inference/ollama/config.py
+++ b/llama_stack/providers/remote/inference/ollama/config.py
@@ -13,7 +13,13 @@ DEFAULT_OLLAMA_URL = "http://localhost:11434"
 
 class OllamaImplConfig(BaseModel):
     url: str = DEFAULT_OLLAMA_URL
+    raise_on_connect_error: bool = True
 
     @classmethod
-    def sample_run_config(cls, url: str = "${env.OLLAMA_URL:http://localhost:11434}", **kwargs) -> dict[str, Any]:
-        return {"url": url}
+    def sample_run_config(
+        cls, url: str = "${env.OLLAMA_URL:http://localhost:11434}", raise_on_connect_error: bool = True, **kwargs
+    ) -> dict[str, Any]:
+        return {
+            "url": url,
+            "raise_on_connect_error": raise_on_connect_error,
+        }
diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py
index d51072fbf..d81d21dac 100644
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@@ -9,7 +9,6 @@ import uuid
 from collections.abc import AsyncGenerator, AsyncIterator
 from typing import Any
 
-import httpx
 from ollama import AsyncClient  # type: ignore[attr-defined]
 from openai import AsyncOpenAI
 
@@ -90,9 +89,10 @@ class OllamaInferenceAdapter(
     InferenceProvider,
     ModelsProtocolPrivate,
 ):
-    def __init__(self, url: str) -> None:
+    def __init__(self, url: str, raise_on_connect_error: bool = True) -> None:
         self.register_helper = ModelRegistryHelper(MODEL_ENTRIES)
         self.url = url
+        self.raise_on_connect_error = raise_on_connect_error
 
     @property
     def client(self) -> AsyncClient:
@@ -103,8 +103,13 @@ class OllamaInferenceAdapter(
         return AsyncOpenAI(base_url=f"{self.url}/v1", api_key="ollama")
 
     async def initialize(self) -> None:
-        logger.info(f"checking connectivity to Ollama at `{self.url}`...")
-        await self.health()
+        logger.debug(f"checking connectivity to Ollama at `{self.url}`...")
+        health_response = await self.health()
+        if health_response["status"] == HealthStatus.ERROR:
+            if self.raise_on_connect_error:
+                raise RuntimeError("Ollama Server is not running, start it using `ollama serve` in a separate terminal")
+            else:
+                logger.warning("Ollama Server is not running, start it using `ollama serve` in a separate terminal")
 
     async def health(self) -> HealthResponse:
         """
@@ -117,10 +122,8 @@ class OllamaInferenceAdapter(
         try:
             await self.client.ps()
             return HealthResponse(status=HealthStatus.OK)
-        except httpx.ConnectError as e:
-            raise RuntimeError(
-                "Ollama Server is not running, start it using `ollama serve` in a separate terminal"
-            ) from e
+        except Exception as e:
+            return HealthResponse(status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}")
 
     async def shutdown(self) -> None:
         pass
diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index 3424be6b4..ae04f206a 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -9,7 +9,7 @@ from collections.abc import AsyncGenerator, AsyncIterator
 from typing import Any
 
 import httpx
-from openai import AsyncOpenAI
+from openai import APIConnectionError, AsyncOpenAI
 from openai.types.chat.chat_completion_chunk import (
     ChatCompletionChunk as OpenAIChatCompletionChunk,
 )
@@ -461,7 +461,12 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
             model = await self.register_helper.register_model(model)
         except ValueError:
             pass  # Ignore statically unknown model, will check live listing
-        res = await client.models.list()
+        try:
+            res = await client.models.list()
+        except APIConnectionError as e:
+            raise ValueError(
+                f"Failed to connect to vLLM at {self.config.url}. Please check if vLLM is running and accessible at that URL."
+            ) from e
         available_models = [m.id async for m in res]
         if model.provider_resource_id not in available_models:
             raise ValueError(
diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml
index 85d5c813b..2e1b7fdcc 100644
--- a/llama_stack/templates/ollama/run-with-safety.yaml
+++ b/llama_stack/templates/ollama/run-with-safety.yaml
@@ -18,6 +18,7 @@ providers:
     provider_type: remote::ollama
     config:
       url: ${env.OLLAMA_URL:http://localhost:11434}
+      raise_on_connect_error: true
   vector_io:
   - provider_id: faiss
     provider_type: inline::faiss
diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml
index 2d10a99a4..8c2b17ef1 100644
--- a/llama_stack/templates/ollama/run.yaml
+++ b/llama_stack/templates/ollama/run.yaml
@@ -18,6 +18,7 @@ providers:
     provider_type: remote::ollama
     config:
       url: ${env.OLLAMA_URL:http://localhost:11434}
+      raise_on_connect_error: true
   vector_io:
   - provider_id: faiss
     provider_type: inline::faiss
diff --git a/llama_stack/templates/starter/run.yaml b/llama_stack/templates/starter/run.yaml
index 960e96d01..de0c12d90 100644
--- a/llama_stack/templates/starter/run.yaml
+++ b/llama_stack/templates/starter/run.yaml
@@ -31,6 +31,7 @@ providers:
     provider_type: remote::ollama
     config:
       url: ${env.OLLAMA_URL:http://localhost:11434}
+      raise_on_connect_error: false
   - provider_id: anthropic
     provider_type: remote::anthropic
     config:
@@ -60,7 +61,14 @@ providers:
     provider_type: inline::sentence-transformers
     config: {}
   vector_io:
-  - provider_id: sqlite-vec
+  - provider_id: faiss
+    provider_type: inline::faiss
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/faiss_store.db
+  - provider_id: ${env.ENABLE_SQLITE_VEC+sqlite-vec}
     provider_type: inline::sqlite-vec
     config:
       db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/sqlite_vec.db
@@ -530,161 +538,10 @@ models:
   provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
   model_type: llm
 - metadata: {}
-  model_id: ollama/llama3.1:8b-instruct-fp16
+  model_id: ollama/${env.OLLAMA_INFERENCE_MODEL:__disabled__}
   provider_id: ollama
-  provider_model_id: llama3.1:8b-instruct-fp16
+  provider_model_id: ${env.OLLAMA_INFERENCE_MODEL:__disabled__}
   model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.1-8B-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.1:8b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.1:8b
-  provider_id: ollama
-  provider_model_id: llama3.1:8b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.1:70b-instruct-fp16
-  provider_id: ollama
-  provider_model_id: llama3.1:70b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.1-70B-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.1:70b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.1:70b
-  provider_id: ollama
-  provider_model_id: llama3.1:70b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.1:405b-instruct-fp16
-  provider_id: ollama
-  provider_model_id: llama3.1:405b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.1-405B-Instruct-FP8
-  provider_id: ollama
-  provider_model_id: llama3.1:405b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.1:405b
-  provider_id: ollama
-  provider_model_id: llama3.1:405b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2:1b-instruct-fp16
-  provider_id: ollama
-  provider_model_id: llama3.2:1b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.2-1B-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.2:1b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2:1b
-  provider_id: ollama
-  provider_model_id: llama3.2:1b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2:3b-instruct-fp16
-  provider_id: ollama
-  provider_model_id: llama3.2:3b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.2-3B-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.2:3b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2:3b
-  provider_id: ollama
-  provider_model_id: llama3.2:3b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2-vision:11b-instruct-fp16
-  provider_id: ollama
-  provider_model_id: llama3.2-vision:11b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.2-11B-Vision-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.2-vision:11b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2-vision:latest
-  provider_id: ollama
-  provider_model_id: llama3.2-vision:latest
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2-vision:90b-instruct-fp16
-  provider_id: ollama
-  provider_model_id: llama3.2-vision:90b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.2-90B-Vision-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.2-vision:90b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2-vision:90b
-  provider_id: ollama
-  provider_model_id: llama3.2-vision:90b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.3:70b
-  provider_id: ollama
-  provider_model_id: llama3.3:70b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.3-70B-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.3:70b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama-guard3:8b
-  provider_id: ollama
-  provider_model_id: llama-guard3:8b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-Guard-3-8B
-  provider_id: ollama
-  provider_model_id: llama-guard3:8b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama-guard3:1b
-  provider_id: ollama
-  provider_model_id: llama-guard3:1b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-Guard-3-1B
-  provider_id: ollama
-  provider_model_id: llama-guard3:1b
-  model_type: llm
-- metadata:
-    embedding_dimension: 384
-    context_length: 512
-  model_id: ollama/all-minilm:latest
-  provider_id: ollama
-  provider_model_id: all-minilm:latest
-  model_type: embedding
-- metadata:
-    embedding_dimension: 384
-    context_length: 512
-  model_id: ollama/all-minilm
-  provider_id: ollama
-  provider_model_id: all-minilm:latest
-  model_type: embedding
-- metadata:
-    embedding_dimension: 768
-    context_length: 8192
-  model_id: ollama/nomic-embed-text
-  provider_id: ollama
-  provider_model_id: nomic-embed-text
-  model_type: embedding
 - metadata: {}
   model_id: anthropic/claude-3-5-sonnet-latest
   provider_id: anthropic
@@ -938,6 +795,11 @@ models:
   provider_id: sambanova
   provider_model_id: sambanova/Meta-Llama-Guard-3-8B
   model_type: llm
+- metadata: {}
+  model_id: vllm/${env.VLLM_INFERENCE_MODEL:__disabled__}
+  provider_id: vllm
+  provider_model_id: ${env.VLLM_INFERENCE_MODEL:__disabled__}
+  model_type: llm
 - metadata:
     embedding_dimension: 384
   model_id: all-MiniLM-L6-v2
diff --git a/llama_stack/templates/starter/starter.py b/llama_stack/templates/starter/starter.py
index 2a44a0a37..357f1aec1 100644
--- a/llama_stack/templates/starter/starter.py
+++ b/llama_stack/templates/starter/starter.py
@@ -16,6 +16,7 @@ from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplCo
 from llama_stack.providers.inline.inference.sentence_transformers import (
     SentenceTransformersInferenceConfig,
 )
+from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
 from llama_stack.providers.inline.vector_io.sqlite_vec.config import (
     SQLiteVectorIOConfig,
 )
@@ -36,9 +37,6 @@ from llama_stack.providers.remote.inference.groq.models import (
     MODEL_ENTRIES as GROQ_MODEL_ENTRIES,
 )
 from llama_stack.providers.remote.inference.ollama.config import OllamaImplConfig
-from llama_stack.providers.remote.inference.ollama.models import (
-    MODEL_ENTRIES as OLLAMA_MODEL_ENTRIES,
-)
 from llama_stack.providers.remote.inference.openai.config import OpenAIConfig
 from llama_stack.providers.remote.inference.openai.models import (
     MODEL_ENTRIES as OPENAI_MODEL_ENTRIES,
@@ -85,8 +83,15 @@ def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderMo
         ),
         (
             "ollama",
-            OLLAMA_MODEL_ENTRIES,
-            OllamaImplConfig.sample_run_config(),
+            [
+                ProviderModelEntry(
+                    provider_model_id="${env.OLLAMA_INFERENCE_MODEL:__disabled__}",
+                    model_type=ModelType.llm,
+                ),
+            ],
+            OllamaImplConfig.sample_run_config(
+                url="${env.OLLAMA_URL:http://localhost:11434}", raise_on_connect_error=False
+            ),
         ),
         (
             "anthropic",
@@ -110,7 +115,12 @@ def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderMo
         ),
         (
             "vllm",
-            [],
+            [
+                ProviderModelEntry(
+                    provider_model_id="${env.VLLM_INFERENCE_MODEL:__disabled__}",
+                    model_type=ModelType.llm,
+                ),
+            ],
             VLLMInferenceAdapterConfig.sample_run_config(
                 url="${env.VLLM_URL:http://localhost:8000/v1}",
             ),
@@ -153,7 +163,12 @@ def get_distribution_template() -> DistributionTemplate:
 
     vector_io_providers = [
         Provider(
-            provider_id="sqlite-vec",
+            provider_id="faiss",
+            provider_type="inline::faiss",
+            config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
+        ),
+        Provider(
+            provider_id="${env.ENABLE_SQLITE_VEC+sqlite-vec}",
             provider_type="inline::sqlite-vec",
             config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
         ),
@@ -257,7 +272,19 @@ def get_distribution_template() -> DistributionTemplate:
             ),
             "VLLM_URL": (
                 "http://localhost:8000/v1",
-                "VLLM URL",
+                "vLLM URL",
+            ),
+            "VLLM_INFERENCE_MODEL": (
+                "",
+                "Optional vLLM Inference Model to register on startup",
+            ),
+            "OLLAMA_URL": (
+                "http://localhost:11434",
+                "Ollama URL",
+            ),
+            "OLLAMA_INFERENCE_MODEL": (
+                "",
+                "Optional Ollama Inference Model to register on startup",
             ),
         },
     )
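For reviewers, here is a minimal standalone sketch of how the `__disabled__` sentinel added to `register_resources()` behaves once template defaults such as `${env.OLLAMA_INFERENCE_MODEL:__disabled__}` have been substituted. The `Model` dataclass and `should_register()` helper below are illustrative stand-ins and are not part of this patch; only the condition inside `should_register()` mirrors the new check in stack.py.

# Standalone sketch (not part of the patch). The provider_model_id values shown are
# hypothetical results of env substitution, e.g. of ${env.OLLAMA_INFERENCE_MODEL:__disabled__}.
from dataclasses import dataclass


@dataclass
class Model:
    model_id: str
    provider_model_id: str | None = None


def should_register(obj) -> bool:
    # Same condition as the new check in register_resources(): skip any resource whose
    # provider_model_id still carries the __disabled__ default.
    return not (
        hasattr(obj, "provider_model_id")
        and obj.provider_model_id is not None
        and "__disabled__" in obj.provider_model_id
    )


# With OLLAMA_INFERENCE_MODEL unset, the template default leaves the sentinel in place,
# so the model is silently skipped instead of failing registration at startup.
unset_model = Model("ollama/__disabled__", "__disabled__")
# With OLLAMA_INFERENCE_MODEL set (for example to llama3.2:3b), substitution yields a real id.
set_model = Model("ollama/llama3.2:3b", "llama3.2:3b")

assert should_register(unset_model) is False
assert should_register(set_model) is True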