Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-06-27 18:50:41 +00:00)
fix: Ollama should be optional in starter distro (#2482)
Some checks failed
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 2s
Integration Tests / test-matrix (http, 3.12, agents) (push) Failing after 11s
Integration Tests / test-matrix (http, 3.13, datasets) (push) Failing after 11s
Integration Tests / test-matrix (http, 3.12, providers) (push) Failing after 13s
Integration Tests / test-matrix (http, 3.13, providers) (push) Failing after 12s
Integration Tests / test-matrix (http, 3.12, inspect) (push) Failing after 17s
Integration Tests / test-matrix (http, 3.12, scoring) (push) Failing after 18s
Integration Tests / test-matrix (http, 3.13, inference) (push) Failing after 16s
Integration Tests / test-matrix (library, 3.12, datasets) (push) Failing after 11s
Integration Tests / test-matrix (http, 3.12, datasets) (push) Failing after 20s
Integration Tests / test-matrix (library, 3.12, providers) (push) Failing after 10s
Integration Tests / test-matrix (http, 3.12, vector_io) (push) Failing after 15s
Integration Tests / test-matrix (http, 3.12, inference) (push) Failing after 20s
Integration Tests / test-matrix (http, 3.12, post_training) (push) Failing after 15s
Integration Tests / test-matrix (http, 3.13, agents) (push) Failing after 14s
Integration Tests / test-matrix (library, 3.12, agents) (push) Failing after 11s
Integration Tests / test-matrix (http, 3.13, post_training) (push) Failing after 13s
Integration Tests / test-matrix (library, 3.12, inference) (push) Failing after 9s
Integration Tests / test-matrix (library, 3.12, post_training) (push) Failing after 13s
Integration Tests / test-matrix (http, 3.13, vector_io) (push) Failing after 14s
Integration Tests / test-matrix (http, 3.13, scoring) (push) Failing after 11s
Integration Tests / test-matrix (library, 3.12, scoring) (push) Failing after 9s
Integration Tests / test-matrix (library, 3.12, vector_io) (push) Failing after 9s
Integration Tests / test-matrix (http, 3.12, tool_runtime) (push) Failing after 18s
Integration Tests / test-matrix (library, 3.12, tool_runtime) (push) Failing after 7s
Integration Tests / test-matrix (http, 3.13, inspect) (push) Failing after 16s
Integration Tests / test-matrix (library, 3.12, inspect) (push) Failing after 12s
Integration Tests / test-matrix (http, 3.13, tool_runtime) (push) Failing after 14s
Integration Tests / test-matrix (library, 3.13, inspect) (push) Failing after 8s
Integration Tests / test-matrix (library, 3.13, agents) (push) Failing after 9s
Integration Tests / test-matrix (library, 3.13, inference) (push) Failing after 10s
Integration Tests / test-matrix (library, 3.13, datasets) (push) Failing after 10s
Test Llama Stack Build / generate-matrix (push) Successful in 7s
Python Package Build Test / build (3.12) (push) Failing after 4s
Test Llama Stack Build / build-custom-container-distribution (push) Failing after 5s
Integration Tests / test-matrix (library, 3.13, post_training) (push) Failing after 9s
Integration Tests / test-matrix (library, 3.13, tool_runtime) (push) Failing after 7s
Integration Tests / test-matrix (library, 3.13, providers) (push) Failing after 8s
Integration Tests / test-matrix (library, 3.13, scoring) (push) Failing after 10s
Integration Tests / test-matrix (library, 3.13, vector_io) (push) Failing after 8s
Update ReadTheDocs / update-readthedocs (push) Failing after 4s
Unit Tests / unit-tests (3.12) (push) Failing after 6s
Unit Tests / unit-tests (3.13) (push) Failing after 5s
Test Llama Stack Build / build (push) Failing after 6s
Test Llama Stack Build / build-single-provider (push) Failing after 1m10s
Test Llama Stack Build / build-ubi9-container-distribution (push) Failing after 1m8s
Python Package Build Test / build (3.13) (push) Failing after 1m6s
Test External Providers / test-external-providers (venv) (push) Failing after 1m4s
Pre-commit / pre-commit (push) Successful in 2m33s
# What does this PR do?

Our starter distro required Ollama to be running (and a large list of models available in that Ollama) to successfully start. This adjusts things so that Ollama does not have to be running to use the starter template / distro.

To accomplish this, a few changes were needed:

* The Ollama provider is now configurable as to whether it raises an exception or just logs a warning when it cannot reach the Ollama server on startup. The default is to raise an exception (same as previous behavior), but in the starter template we adjust this to just log a warning so that we can bring the stack up without needing a running Ollama server.
* The starter template no longer specifies a default list of models for Ollama, as any models specified there need to actually be pulled and available in Ollama. Instead, it adds a new `OLLAMA_INFERENCE_MODEL` environment variable where users can provide an optional model to register with the Ollama provider on startup. Additional models can also be registered via the typical `models.register(...)` at runtime.
* The vLLM template was adjusted to also allow an optional `VLLM_INFERENCE_MODEL` specified on startup, so that the behavior between vLLM and Ollama is consistent here, making it easy to get up and running quickly.
* The default vector store was changed from sqlite-vec to faiss. sqlite-vec can be enabled by setting the `ENABLE_SQLITE_VEC` environment variable, like we do for chromadb and pgvector. This is due to sqlite-vec not shipping proper arm64 binaries, like we previously fixed in #1530 for the ollama distribution.

## Test Plan

With this change, the following scenarios now work with the starter template that did not before:

* no Ollama running
* Ollama running but not all of the Llama models pulled locally
* Ollama running with a custom model registered on startup
* vLLM running with a custom model registered on startup
* running the starter template on linux/arm64, like when running containers on Mac without rosetta emulation

---------

Signed-off-by: Ben Browning <bbrownin@redhat.com>
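As a quick illustration of the runtime registration mentioned above, the following is a minimal sketch using the standard `llama-stack-client` Python package; the base URL, port, and model names are illustrative assumptions and not part of this change:

```python
# Sketch: register an Ollama model at runtime instead of at startup.
# Assumes a running starter stack and the llama-stack-client Python package;
# the model below must already be pulled in your Ollama instance.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed default server port

client.models.register(
    model_id="my-ollama-model",        # name used in Llama Stack interactions
    provider_id="ollama",
    provider_model_id="llama3.2:3b",   # name in Ollama's model catalog
    model_type="llm",
)
```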
parent cfee63bd0d
commit fa0b0c13d4
10 changed files with 121 additions and 173 deletions
````diff
@@ -109,6 +109,18 @@ A Model is an instance of a "Resource" (see [Concepts](../concepts/index)) and i
 
 What's with the `provider_model_id` field? This is an identifier for the model inside the provider's model catalog. Contrast it with `model_id` which is the identifier for the same model for Llama Stack's purposes. For example, you may want to name "llama3.2:vision-11b" as "image_captioning_model" when you use it in your Stack interactions. When omitted, the server will set `provider_model_id` to be the same as `model_id`.
 
+If you need to conditionally register a model in the configuration, such as only when specific environment variable(s) are set, this can be accomplished by utilizing a special `__disabled__` string as the default value of an environment variable substitution, as shown below:
+
+```yaml
+models:
+  - metadata: {}
+    model_id: ${env.INFERENCE_MODEL:__disabled__}
+    provider_id: ollama
+    provider_model_id: ${env.INFERENCE_MODEL:__disabled__}
+```
+
+The snippet above will only register this model if the environment variable `INFERENCE_MODEL` is set and non-empty. If the environment variable is not set, the model will not get registered at all.
+
 ## Server Configuration
 
 The `server` section configures the HTTP server that serves the Llama Stack APIs:
````
```diff
@@ -98,6 +98,15 @@ async def register_resources(run_config: StackRunConfig, impls: dict[Api, Any]):
 
         method = getattr(impls[api], register_method)
         for obj in objects:
+            # In complex templates, like our starter template, we may have dynamic model ids
+            # given by environment variables. This allows those environment variables to have
+            # a default value of __disabled__ to skip registration of the model if not set.
+            if (
+                hasattr(obj, "provider_model_id")
+                and obj.provider_model_id is not None
+                and "__disabled__" in obj.provider_model_id
+            ):
+                continue
             # we want to maintain the type information in arguments to method.
             # instead of method(**obj.model_dump()), which may convert a typed attr to a dict,
             # we use model_dump() to find all the attrs and then getattr to get the still typed value.
```
```diff
@@ -10,6 +10,6 @@ from .config import OllamaImplConfig
 async def get_adapter_impl(config: OllamaImplConfig, _deps):
     from .ollama import OllamaInferenceAdapter
 
-    impl = OllamaInferenceAdapter(config.url)
+    impl = OllamaInferenceAdapter(config)
     await impl.initialize()
     return impl
```
```diff
@@ -13,7 +13,13 @@ DEFAULT_OLLAMA_URL = "http://localhost:11434"
 
 class OllamaImplConfig(BaseModel):
     url: str = DEFAULT_OLLAMA_URL
+    raise_on_connect_error: bool = True
 
     @classmethod
-    def sample_run_config(cls, url: str = "${env.OLLAMA_URL:http://localhost:11434}", **kwargs) -> dict[str, Any]:
-        return {"url": url}
+    def sample_run_config(
+        cls, url: str = "${env.OLLAMA_URL:http://localhost:11434}", raise_on_connect_error: bool = True, **kwargs
+    ) -> dict[str, Any]:
+        return {
+            "url": url,
+            "raise_on_connect_error": raise_on_connect_error,
+        }
```
```diff
@@ -9,7 +9,6 @@ import uuid
 from collections.abc import AsyncGenerator, AsyncIterator
 from typing import Any
 
-import httpx
 from ollama import AsyncClient  # type: ignore[attr-defined]
 from openai import AsyncOpenAI
 
@@ -57,6 +56,7 @@ from llama_stack.providers.datatypes import (
     HealthStatus,
     ModelsProtocolPrivate,
 )
+from llama_stack.providers.remote.inference.ollama.config import OllamaImplConfig
 from llama_stack.providers.utils.inference.model_registry import (
     ModelRegistryHelper,
 )
@@ -90,9 +90,10 @@ class OllamaInferenceAdapter(
     InferenceProvider,
     ModelsProtocolPrivate,
 ):
-    def __init__(self, url: str) -> None:
+    def __init__(self, config: OllamaImplConfig) -> None:
         self.register_helper = ModelRegistryHelper(MODEL_ENTRIES)
-        self.url = url
+        self.url = config.url
+        self.raise_on_connect_error = config.raise_on_connect_error
 
     @property
     def client(self) -> AsyncClient:
@@ -103,8 +104,13 @@ class OllamaInferenceAdapter(
         return AsyncOpenAI(base_url=f"{self.url}/v1", api_key="ollama")
 
     async def initialize(self) -> None:
-        logger.info(f"checking connectivity to Ollama at `{self.url}`...")
-        await self.health()
+        logger.debug(f"checking connectivity to Ollama at `{self.url}`...")
+        health_response = await self.health()
+        if health_response["status"] == HealthStatus.ERROR:
+            if self.raise_on_connect_error:
+                raise RuntimeError("Ollama Server is not running, start it using `ollama serve` in a separate terminal")
+            else:
+                logger.warning("Ollama Server is not running, start it using `ollama serve` in a separate terminal")
 
     async def health(self) -> HealthResponse:
         """
@@ -117,10 +123,8 @@ class OllamaInferenceAdapter(
         try:
             await self.client.ps()
             return HealthResponse(status=HealthStatus.OK)
-        except httpx.ConnectError as e:
-            raise RuntimeError(
-                "Ollama Server is not running, start it using `ollama serve` in a separate terminal"
-            ) from e
+        except Exception as e:
+            return HealthResponse(status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}")
 
     async def shutdown(self) -> None:
         pass
```
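The net effect of the adapter changes above is that connection failures surface through the health check rather than as an `httpx` exception, and `raise_on_connect_error` decides whether that becomes a hard failure. A rough illustration of the non-fatal path (import paths taken from the diff above; this is a sketch of the behavior, not how the stack itself wires the provider up):

```python
import asyncio

from llama_stack.providers.remote.inference.ollama.config import OllamaImplConfig
from llama_stack.providers.remote.inference.ollama.ollama import OllamaInferenceAdapter


async def main() -> None:
    # With raise_on_connect_error=False (the starter template's setting), initialize()
    # only logs a warning when the health check reports an error, so startup proceeds
    # even if no Ollama server is listening at the configured URL.
    config = OllamaImplConfig(url="http://localhost:11434", raise_on_connect_error=False)
    adapter = OllamaInferenceAdapter(config)
    await adapter.initialize()


asyncio.run(main())
```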
```diff
@@ -9,7 +9,7 @@ from collections.abc import AsyncGenerator, AsyncIterator
 from typing import Any
 
 import httpx
-from openai import AsyncOpenAI
+from openai import APIConnectionError, AsyncOpenAI
 from openai.types.chat.chat_completion_chunk import (
     ChatCompletionChunk as OpenAIChatCompletionChunk,
 )
@@ -461,7 +461,12 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
             model = await self.register_helper.register_model(model)
         except ValueError:
             pass  # Ignore statically unknown model, will check live listing
-        res = await client.models.list()
+        try:
+            res = await client.models.list()
+        except APIConnectionError as e:
+            raise ValueError(
+                f"Failed to connect to vLLM at {self.config.url}. Please check if vLLM is running and accessible at that URL."
+            ) from e
         available_models = [m.id async for m in res]
         if model.provider_resource_id not in available_models:
             raise ValueError(
```
```diff
@@ -18,6 +18,7 @@ providers:
     provider_type: remote::ollama
     config:
       url: ${env.OLLAMA_URL:http://localhost:11434}
+      raise_on_connect_error: true
   vector_io:
   - provider_id: faiss
     provider_type: inline::faiss
```

The same change is applied in a second run configuration:

```diff
@@ -18,6 +18,7 @@ providers:
     provider_type: remote::ollama
     config:
       url: ${env.OLLAMA_URL:http://localhost:11434}
+      raise_on_connect_error: true
   vector_io:
   - provider_id: faiss
     provider_type: inline::faiss
```
```diff
@@ -31,6 +31,7 @@ providers:
     provider_type: remote::ollama
     config:
       url: ${env.OLLAMA_URL:http://localhost:11434}
+      raise_on_connect_error: false
   - provider_id: anthropic
     provider_type: remote::anthropic
     config:
@@ -60,7 +61,14 @@ providers:
     provider_type: inline::sentence-transformers
     config: {}
   vector_io:
-  - provider_id: sqlite-vec
+  - provider_id: faiss
+    provider_type: inline::faiss
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/faiss_store.db
+  - provider_id: ${env.ENABLE_SQLITE_VEC+sqlite-vec}
     provider_type: inline::sqlite-vec
     config:
       db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/sqlite_vec.db
@@ -530,160 +538,15 @@ models:
   provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
   model_type: llm
 - metadata: {}
-  model_id: ollama/llama3.1:8b-instruct-fp16
+  model_id: ollama/${env.OLLAMA_INFERENCE_MODEL:__disabled__}
   provider_id: ollama
-  provider_model_id: llama3.1:8b-instruct-fp16
+  provider_model_id: ${env.OLLAMA_INFERENCE_MODEL:__disabled__}
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.1-8B-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.1:8b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.1:8b
-  provider_id: ollama
-  provider_model_id: llama3.1:8b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.1:70b-instruct-fp16
-  provider_id: ollama
-  provider_model_id: llama3.1:70b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.1-70B-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.1:70b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.1:70b
-  provider_id: ollama
-  provider_model_id: llama3.1:70b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.1:405b-instruct-fp16
-  provider_id: ollama
-  provider_model_id: llama3.1:405b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.1-405B-Instruct-FP8
-  provider_id: ollama
-  provider_model_id: llama3.1:405b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.1:405b
-  provider_id: ollama
-  provider_model_id: llama3.1:405b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2:1b-instruct-fp16
-  provider_id: ollama
-  provider_model_id: llama3.2:1b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.2-1B-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.2:1b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2:1b
-  provider_id: ollama
-  provider_model_id: llama3.2:1b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2:3b-instruct-fp16
-  provider_id: ollama
-  provider_model_id: llama3.2:3b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.2-3B-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.2:3b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2:3b
-  provider_id: ollama
-  provider_model_id: llama3.2:3b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2-vision:11b-instruct-fp16
-  provider_id: ollama
-  provider_model_id: llama3.2-vision:11b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.2-11B-Vision-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.2-vision:11b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2-vision:latest
-  provider_id: ollama
-  provider_model_id: llama3.2-vision:latest
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2-vision:90b-instruct-fp16
-  provider_id: ollama
-  provider_model_id: llama3.2-vision:90b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.2-90B-Vision-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.2-vision:90b-instruct-fp16
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.2-vision:90b
-  provider_id: ollama
-  provider_model_id: llama3.2-vision:90b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama3.3:70b
-  provider_id: ollama
-  provider_model_id: llama3.3:70b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-3.3-70B-Instruct
-  provider_id: ollama
-  provider_model_id: llama3.3:70b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama-guard3:8b
-  provider_id: ollama
-  provider_model_id: llama-guard3:8b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-Guard-3-8B
-  provider_id: ollama
-  provider_model_id: llama-guard3:8b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/llama-guard3:1b
-  provider_id: ollama
-  provider_model_id: llama-guard3:1b
-  model_type: llm
-- metadata: {}
-  model_id: ollama/meta-llama/Llama-Guard-3-1B
-  provider_id: ollama
-  provider_model_id: llama-guard3:1b
   model_type: llm
 - metadata:
-    embedding_dimension: 384
+    embedding_dimension: ${env.OLLAMA_EMBEDDING_DIMENSION:384}
-    context_length: 512
-  model_id: ollama/all-minilm:latest
+  model_id: ollama/${env.OLLAMA_EMBEDDING_MODEL:__disabled__}
   provider_id: ollama
-  provider_model_id: all-minilm:latest
+  provider_model_id: ${env.OLLAMA_EMBEDDING_MODEL:__disabled__}
-  model_type: embedding
-- metadata:
-    embedding_dimension: 384
-    context_length: 512
-  model_id: ollama/all-minilm
-  provider_id: ollama
-  provider_model_id: all-minilm:latest
-  model_type: embedding
-- metadata:
-    embedding_dimension: 768
-    context_length: 8192
-  model_id: ollama/nomic-embed-text
-  provider_id: ollama
-  provider_model_id: nomic-embed-text
   model_type: embedding
 - metadata: {}
   model_id: anthropic/claude-3-5-sonnet-latest
@@ -938,6 +801,11 @@ models:
   provider_id: sambanova
   provider_model_id: sambanova/Meta-Llama-Guard-3-8B
   model_type: llm
+- metadata: {}
+  model_id: vllm/${env.VLLM_INFERENCE_MODEL:__disabled__}
+  provider_id: vllm
+  provider_model_id: ${env.VLLM_INFERENCE_MODEL:__disabled__}
+  model_type: llm
 - metadata:
     embedding_dimension: 384
   model_id: all-MiniLM-L6-v2
```
```diff
@@ -16,6 +16,7 @@ from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplCo
 from llama_stack.providers.inline.inference.sentence_transformers import (
     SentenceTransformersInferenceConfig,
 )
+from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
 from llama_stack.providers.inline.vector_io.sqlite_vec.config import (
     SQLiteVectorIOConfig,
 )
@@ -36,9 +37,6 @@ from llama_stack.providers.remote.inference.groq.models import (
     MODEL_ENTRIES as GROQ_MODEL_ENTRIES,
 )
 from llama_stack.providers.remote.inference.ollama.config import OllamaImplConfig
-from llama_stack.providers.remote.inference.ollama.models import (
-    MODEL_ENTRIES as OLLAMA_MODEL_ENTRIES,
-)
 from llama_stack.providers.remote.inference.openai.config import OpenAIConfig
 from llama_stack.providers.remote.inference.openai.models import (
     MODEL_ENTRIES as OPENAI_MODEL_ENTRIES,
@@ -85,8 +83,22 @@ def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderMo
         ),
         (
             "ollama",
-            OLLAMA_MODEL_ENTRIES,
-            OllamaImplConfig.sample_run_config(),
+            [
+                ProviderModelEntry(
+                    provider_model_id="${env.OLLAMA_INFERENCE_MODEL:__disabled__}",
+                    model_type=ModelType.llm,
+                ),
+                ProviderModelEntry(
+                    provider_model_id="${env.OLLAMA_EMBEDDING_MODEL:__disabled__}",
+                    model_type=ModelType.embedding,
+                    metadata={
+                        "embedding_dimension": "${env.OLLAMA_EMBEDDING_DIMENSION:384}",
+                    },
+                ),
+            ],
+            OllamaImplConfig.sample_run_config(
+                url="${env.OLLAMA_URL:http://localhost:11434}", raise_on_connect_error=False
+            ),
         ),
         (
             "anthropic",
@@ -110,7 +122,12 @@ def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderMo
         ),
         (
             "vllm",
-            [],
+            [
+                ProviderModelEntry(
+                    provider_model_id="${env.VLLM_INFERENCE_MODEL:__disabled__}",
+                    model_type=ModelType.llm,
+                ),
+            ],
             VLLMInferenceAdapterConfig.sample_run_config(
                 url="${env.VLLM_URL:http://localhost:8000/v1}",
             ),
@@ -153,7 +170,12 @@ def get_distribution_template() -> DistributionTemplate:
 
     vector_io_providers = [
         Provider(
-            provider_id="sqlite-vec",
+            provider_id="faiss",
+            provider_type="inline::faiss",
+            config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
+        ),
+        Provider(
+            provider_id="${env.ENABLE_SQLITE_VEC+sqlite-vec}",
             provider_type="inline::sqlite-vec",
             config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
         ),
@@ -257,7 +279,27 @@ def get_distribution_template() -> DistributionTemplate:
         ),
         "VLLM_URL": (
             "http://localhost:8000/v1",
-            "VLLM URL",
+            "vLLM URL",
+        ),
+        "VLLM_INFERENCE_MODEL": (
+            "",
+            "Optional vLLM Inference Model to register on startup",
+        ),
+        "OLLAMA_URL": (
+            "http://localhost:11434",
+            "Ollama URL",
+        ),
+        "OLLAMA_INFERENCE_MODEL": (
+            "",
+            "Optional Ollama Inference Model to register on startup",
+        ),
+        "OLLAMA_EMBEDDING_MODEL": (
+            "",
+            "Optional Ollama Embedding Model to register on startup",
+        ),
+        "OLLAMA_EMBEDDING_DIMENSION": (
+            "384",
+            "Ollama Embedding Dimension",
+        ),
     },
 )
```