chore: move embedding deps to RAG tool where they are needed (#1210)

`EMBEDDING_DEPS` were wrongly associated with `vector_io` providers. They are needed by
https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/utils/memory/vector_store.py#L142
and related code, which is used by the RAG tool; as such, they should only be
required by the `inline::rag-runtime` provider.
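For illustration, this is the shape of the move. The dependency list is copied from the diff below; plain dicts stand in for the real spec types (the actual `InlineProviderSpec` constructor lives in `llama_stack.providers.datatypes`):

```python
# Sketch only: plain dicts stand in for the real provider spec types.
RAG_TOOL_DEPS = [
    "blobfile", "chardet", "pypdf", "tqdm", "numpy",
    "scikit-learn", "scipy", "nltk", "sentencepiece", "transformers",
]

# Before: every vector_io provider spec carried EMBEDDING_DEPS.
# After: only the RAG tool runtime declares these packages; vector_io
# providers keep just their store-specific deps (faiss-cpu, sqlite-vec, ...).
rag_runtime_spec = {
    "api": "tool_runtime",
    "provider_type": "inline::rag-runtime",
    "pip_packages": RAG_TOOL_DEPS,
    "api_dependencies": ["vector_io", "inference"],
}
```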
Ashwin Bharambe · 2025-02-21 11:33:41 -08:00 · commit 992f865b2e (parent 11697f85c5)
34 changed files with 85 additions and 132 deletions


@@ -30,9 +30,7 @@
     "sentencepiece",
     "tqdm",
     "transformers",
-    "uvicorn",
-    "sentence-transformers --no-deps",
-    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
+    "uvicorn"
   ],
   "cerebras": [
     "aiosqlite",
@@ -170,9 +168,7 @@
     "sentencepiece",
     "tqdm",
     "transformers",
-    "uvicorn",
-    "sentence-transformers --no-deps",
-    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
+    "uvicorn"
   ],
   "hf-serverless": [
     "aiohttp",
@@ -247,9 +243,7 @@
     "tqdm",
     "transformers",
     "uvicorn",
-    "zmq",
-    "sentence-transformers --no-deps",
-    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
+    "zmq"
   ],
   "meta-reference-quantized-gpu": [
     "accelerate",
@@ -290,9 +284,7 @@
     "tqdm",
     "transformers",
     "uvicorn",
-    "zmq",
-    "sentence-transformers --no-deps",
-    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
+    "zmq"
   ],
   "nvidia": [
     "aiosqlite",
@@ -323,9 +315,7 @@
     "sentencepiece",
     "tqdm",
     "transformers",
-    "uvicorn",
-    "sentence-transformers --no-deps",
-    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
+    "uvicorn"
   ],
   "ollama": [
     "aiohttp",
@@ -335,7 +325,6 @@
     "chardet",
     "chromadb-client",
     "datasets",
-    "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
@@ -359,9 +348,7 @@
     "sqlite-vec",
     "tqdm",
     "transformers",
-    "uvicorn",
-    "sentence-transformers --no-deps",
-    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
+    "uvicorn"
   ],
   "remote-vllm": [
     "aiosqlite",
@@ -424,9 +411,7 @@
     "sentencepiece",
     "tqdm",
     "transformers",
-    "uvicorn",
-    "sentence-transformers --no-deps",
-    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
+    "uvicorn"
   ],
   "tgi": [
     "aiohttp",


@@ -8,7 +8,7 @@ The `llamastack/distribution-cerebras` distribution consists of the following providers:
 | agents | `inline::meta-reference` |
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
-| inference | `remote::cerebras` |
+| inference | `remote::cerebras`, `inline::sentence-transformers` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |


@@ -19,7 +19,7 @@ The `llamastack/distribution-dell` distribution consists of the following providers:
 | agents | `inline::meta-reference` |
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
-| inference | `remote::tgi` |
+| inference | `remote::tgi`, `inline::sentence-transformers` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |


@@ -18,7 +18,7 @@ The `llamastack/distribution-fireworks` distribution consists of the following providers:
 | agents | `inline::meta-reference` |
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
-| inference | `remote::fireworks` |
+| inference | `remote::fireworks`, `inline::sentence-transformers` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |


@@ -23,7 +23,7 @@ The `llamastack/distribution-ollama` distribution consists of the following providers:
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
 | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` |
-| vector_io | `inline::faiss`, `inline::sqlite_vec`, `remote::chromadb`, `remote::pgvector` |
+| vector_io | `inline::sqlite-vec`, `remote::chromadb`, `remote::pgvector` |
 
 You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.


@@ -17,7 +17,7 @@ The `llamastack/distribution-remote-vllm` distribution consists of the following providers:
 | agents | `inline::meta-reference` |
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
-| inference | `remote::vllm` |
+| inference | `remote::vllm`, `inline::sentence-transformers` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |


@@ -19,7 +19,7 @@ The `llamastack/distribution-tgi` distribution consists of the following providers:
 | agents | `inline::meta-reference` |
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
-| inference | `remote::tgi` |
+| inference | `remote::tgi`, `inline::sentence-transformers` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |


@@ -18,7 +18,7 @@ The `llamastack/distribution-together` distribution consists of the following providers:
 | agents | `inline::meta-reference` |
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
-| inference | `remote::together` |
+| inference | `remote::together`, `inline::sentence-transformers` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |


@@ -178,6 +178,12 @@ class StackRun(Subcommand):
             # else must be venv since that is the only valid option left.
             current_venv = os.environ.get("VIRTUAL_ENV")
             venv = args.image_name or current_venv
+            if not venv:
+                cprint(
+                    "No current virtual environment detected, please specify a virtual environment name with --image-name",
+                    color="red",
+                )
+                return
             script = importlib.resources.files("llama_stack") / "distribution/start_venv.sh"
             run_args = [
                 script,
@@ -206,5 +212,4 @@ class StackRun(Subcommand):
         if args.tls_keyfile and args.tls_certfile:
             run_args.extend(["--tls-keyfile", args.tls_keyfile, "--tls-certfile", args.tls_certfile])
         run_with_pty(run_args)
-
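The new guard fails fast instead of launching the start script with an empty image name. A standalone sketch of the same logic (plain `print` stands in for `cprint`, to keep it self-contained):

```python
import os

def resolve_venv(image_name: str | None) -> str | None:
    # Prefer an explicit --image-name; otherwise fall back to the active venv.
    venv = image_name or os.environ.get("VIRTUAL_ENV")
    if not venv:
        print(
            "No current virtual environment detected, "
            "please specify a virtual environment name with --image-name"
        )
        return None
    return venv
```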


@@ -44,7 +44,6 @@ class SentenceTransformersInferenceImpl(
         pass
 
     async def register_model(self, model: Model) -> None:
         _ = self._load_sentence_transformer_model(model.provider_resource_id)
-        return model
 
     async def unregister_model(self, model_id: str) -> None:


@@ -61,7 +61,10 @@ def available_providers() -> List[ProviderSpec]:
         InlineProviderSpec(
             api=Api.inference,
             provider_type="inline::sentence-transformers",
-            pip_packages=["sentence-transformers"],
+            pip_packages=[
+                "torch torchvision --index-url https://download.pytorch.org/whl/cpu",
+                "sentence-transformers --no-deps",
+            ],
             module="llama_stack.providers.inline.inference.sentence_transformers",
             config_class="llama_stack.providers.inline.inference.sentence_transformers.config.SentenceTransformersInferenceConfig",
         ),
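Entries like `torch torchvision --index-url ...` are whole pip argument lists, not single requirement names; per the `EMBEDDING_DEPS` comment further down, they work today only because such "special" entries are installed last, so a previously installed regular torch satisfies the requirement and the CPU wheel is skipped. A minimal sketch of how such entries could be expanded at install time — the real build scripts may do this differently:

```python
import shlex
import subprocess
import sys

def install_entry(entry: str) -> None:
    # Split e.g. "sentence-transformers --no-deps" into pip arguments.
    subprocess.check_call([sys.executable, "-m", "pip", "install", *shlex.split(entry)])

for entry in [
    "torch torchvision --index-url https://download.pytorch.org/whl/cpu",
    "sentence-transformers --no-deps",
]:
    install_entry(entry)
```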


@@ -20,7 +20,18 @@ def available_providers() -> List[ProviderSpec]:
         InlineProviderSpec(
             api=Api.tool_runtime,
             provider_type="inline::rag-runtime",
-            pip_packages=[],
+            pip_packages=[
+                "blobfile",
+                "chardet",
+                "pypdf",
+                "tqdm",
+                "numpy",
+                "scikit-learn",
+                "scipy",
+                "nltk",
+                "sentencepiece",
+                "transformers",
+            ],
             module="llama_stack.providers.inline.tool_runtime.rag",
             config_class="llama_stack.providers.inline.tool_runtime.rag.config.RagToolRuntimeConfig",
             api_dependencies=[Api.vector_io, Api.inference],
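This is also why `distributions/dependencies.json` changes above: a distribution's package list is, roughly (an assumption about the build tooling), the union of `pip_packages` over its selected providers, so moving packages onto `inline::rag-runtime` shifts them in every distribution that bundles the RAG tool. A toy sketch:

```python
from dataclasses import dataclass, field

@dataclass
class Spec:  # hypothetical stand-in for ProviderSpec
    provider_type: str
    pip_packages: list[str] = field(default_factory=list)

def collect_deps(specs: list[Spec]) -> list[str]:
    seen: dict[str, None] = {}  # insertion-ordered de-duplication
    for spec in specs:
        for pkg in spec.pip_packages:
            seen.setdefault(pkg, None)
    return list(seen)

print(collect_deps([
    Spec("inline::rag-runtime", ["blobfile", "chardet", "pypdf"]),
    Spec("inline::faiss", ["faiss-cpu"]),
]))
```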


@@ -14,33 +14,13 @@ from llama_stack.providers.datatypes import (
     remote_provider_spec,
 )
 
-EMBEDDING_DEPS = [
-    "blobfile",
-    "chardet",
-    "pypdf",
-    "tqdm",
-    "numpy",
-    "scikit-learn",
-    "scipy",
-    "nltk",
-    "sentencepiece",
-    "transformers",
-    # this happens to work because special dependencies are always installed last
-    # so if there was a regular torch installed first, this would be ignored
-    # we need a better way to do this to identify potential conflicts, etc.
-    # for now, this lets us significantly reduce the size of the container which
-    # does not have any "local" inference code (and hence does not need GPU-enabled torch)
-    "torch torchvision --index-url https://download.pytorch.org/whl/cpu",
-    "sentence-transformers --no-deps",
-]
-
 def available_providers() -> List[ProviderSpec]:
     return [
         InlineProviderSpec(
             api=Api.vector_io,
             provider_type="inline::meta-reference",
-            pip_packages=EMBEDDING_DEPS + ["faiss-cpu"],
+            pip_packages=["faiss-cpu"],
             module="llama_stack.providers.inline.vector_io.faiss",
             config_class="llama_stack.providers.inline.vector_io.faiss.FaissVectorIOConfig",
             deprecation_warning="Please use the `inline::faiss` provider instead.",
@@ -49,24 +29,33 @@ def available_providers() -> List[ProviderSpec]:
         InlineProviderSpec(
             api=Api.vector_io,
             provider_type="inline::faiss",
-            pip_packages=EMBEDDING_DEPS + ["faiss-cpu"],
+            pip_packages=["faiss-cpu"],
             module="llama_stack.providers.inline.vector_io.faiss",
             config_class="llama_stack.providers.inline.vector_io.faiss.FaissVectorIOConfig",
             api_dependencies=[Api.inference],
         ),
         InlineProviderSpec(
             api=Api.vector_io,
-            provider_type="inline::sqlite_vec",
-            pip_packages=EMBEDDING_DEPS + ["sqlite-vec"],
+            provider_type="inline::sqlite-vec",
+            pip_packages=["sqlite-vec"],
             module="llama_stack.providers.inline.vector_io.sqlite_vec",
             config_class="llama_stack.providers.inline.vector_io.sqlite_vec.SQLiteVectorIOConfig",
             api_dependencies=[Api.inference],
         ),
+        InlineProviderSpec(
+            api=Api.vector_io,
+            provider_type="inline::sqlite_vec",
+            pip_packages=["sqlite-vec"],
+            module="llama_stack.providers.inline.vector_io.sqlite_vec",
+            config_class="llama_stack.providers.inline.vector_io.sqlite_vec.SQLiteVectorIOConfig",
+            deprecation_warning="Please use the `inline::sqlite-vec` provider (notice the hyphen instead of underscore) instead.",
+            api_dependencies=[Api.inference],
+        ),
         remote_provider_spec(
             Api.vector_io,
             AdapterSpec(
                 adapter_type="chromadb",
-                pip_packages=EMBEDDING_DEPS + ["chromadb-client"],
+                pip_packages=["chromadb-client"],
                 module="llama_stack.providers.remote.vector_io.chroma",
                 config_class="llama_stack.providers.remote.vector_io.chroma.ChromaVectorIOConfig",
             ),
@@ -75,7 +64,7 @@ def available_providers() -> List[ProviderSpec]:
         InlineProviderSpec(
             api=Api.vector_io,
             provider_type="inline::chromadb",
-            pip_packages=EMBEDDING_DEPS + ["chromadb"],
+            pip_packages=["chromadb"],
             module="llama_stack.providers.inline.vector_io.chroma",
             config_class="llama_stack.providers.inline.vector_io.chroma.ChromaVectorIOConfig",
             api_dependencies=[Api.inference],
@@ -84,7 +73,7 @@ def available_providers() -> List[ProviderSpec]:
             Api.vector_io,
             AdapterSpec(
                 adapter_type="pgvector",
-                pip_packages=EMBEDDING_DEPS + ["psycopg2-binary"],
+                pip_packages=["psycopg2-binary"],
                 module="llama_stack.providers.remote.vector_io.pgvector",
                 config_class="llama_stack.providers.remote.vector_io.pgvector.PGVectorVectorIOConfig",
             ),
@@ -94,7 +83,7 @@ def available_providers() -> List[ProviderSpec]:
             Api.vector_io,
             AdapterSpec(
                 adapter_type="weaviate",
-                pip_packages=EMBEDDING_DEPS + ["weaviate-client"],
+                pip_packages=["weaviate-client"],
                 module="llama_stack.providers.remote.vector_io.weaviate",
                 config_class="llama_stack.providers.remote.vector_io.weaviate.WeaviateVectorIOConfig",
                 provider_data_validator="llama_stack.providers.remote.vector_io.weaviate.WeaviateRequestProviderData",
@@ -115,7 +104,7 @@ def available_providers() -> List[ProviderSpec]:
             Api.vector_io,
             AdapterSpec(
                 adapter_type="qdrant",
-                pip_packages=EMBEDDING_DEPS + ["qdrant-client"],
+                pip_packages=["qdrant-client"],
                 module="llama_stack.providers.remote.vector_io.qdrant",
                 config_class="llama_stack.providers.remote.vector_io.qdrant.QdrantVectorIOConfig",
             ),
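With `EMBEDDING_DEPS` gone, vector_io providers lean on their `Api.inference` dependency for embeddings rather than importing sentence-transformers themselves. A sketch of that flow — the method name and response shape here are assumptions, not taken from this diff:

```python
class VectorIndexSketch:
    """Hypothetical vector_io-side consumer of the inference API."""

    def __init__(self, inference_api):
        self.inference_api = inference_api

    async def embed_chunks(self, model_id: str, chunks: list[str]) -> list[list[float]]:
        # Delegate to whichever inference provider is configured,
        # e.g. inline::sentence-transformers in the updated templates.
        response = await self.inference_api.embeddings(model_id=model_id, contents=chunks)
        return response.embeddings
```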


@@ -61,7 +61,7 @@ def vector_io_sqlite_vec() -> ProviderFixture:
         providers=[
             Provider(
                 provider_id="sqlite_vec",
-                provider_type="inline::sqlite_vec",
+                provider_type="inline::sqlite-vec",
                 config=SQLiteVectorIOConfig(
                     kvstore=SqliteKVStoreConfig(db_path=temp_file.name).model_dump(),
                 ).model_dump(),


@@ -4,6 +4,7 @@ distribution_spec:
   providers:
     inference:
     - remote::cerebras
+    - inline::sentence-transformers
     safety:
     - inline::llama-guard
     vector_io:


@@ -20,7 +20,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
 
 def get_distribution_template() -> DistributionTemplate:
     providers = {
-        "inference": ["remote::cerebras"],
+        "inference": ["remote::cerebras", "inline::sentence-transformers"],
         "safety": ["inline::llama-guard"],
         "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
         "agents": ["inline::meta-reference"],


@@ -5,6 +5,7 @@ distribution_spec:
   providers:
     inference:
     - remote::tgi
+    - inline::sentence-transformers
     vector_io:
     - inline::faiss
     - remote::chromadb


@@ -20,7 +20,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
 
 def get_distribution_template() -> DistributionTemplate:
     providers = {
-        "inference": ["remote::tgi"],
+        "inference": ["remote::tgi", "inline::sentence-transformers"],
         "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
         "safety": ["inline::llama-guard"],
         "agents": ["inline::meta-reference"],


@@ -4,6 +4,7 @@ distribution_spec:
   providers:
     inference:
     - remote::fireworks
+    - inline::sentence-transformers
     vector_io:
     - inline::faiss
     - remote::chromadb


@@ -25,7 +25,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
 
 def get_distribution_template() -> DistributionTemplate:
     providers = {
-        "inference": ["remote::fireworks"],
+        "inference": ["remote::fireworks", "inline::sentence-transformers"],
         "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
         "safety": ["inline::llama-guard"],
         "agents": ["inline::meta-reference"],


@@ -4,6 +4,7 @@ distribution_spec:
   providers:
     inference:
     - remote::hf::serverless
+    - inline::sentence-transformers
     vector_io:
     - inline::faiss
     - remote::chromadb


@@ -21,7 +21,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
 
 def get_distribution_template() -> DistributionTemplate:
     providers = {
-        "inference": ["remote::hf::serverless"],
+        "inference": ["remote::hf::serverless", "inline::sentence-transformers"],
         "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
         "safety": ["inline::llama-guard"],
         "agents": ["inline::meta-reference"],


@@ -5,8 +5,7 @@ distribution_spec:
     inference:
     - remote::ollama
     vector_io:
-    - inline::faiss
-    - inline::sqlite_vec
+    - inline::sqlite-vec
     - remote::chromadb
     - remote::pgvector
     safety:


@@ -13,10 +13,6 @@ from llama_stack.distribution.datatypes import (
     ShieldInput,
     ToolGroupInput,
 )
-from llama_stack.providers.inline.inference.sentence_transformers import (
-    SentenceTransformersInferenceConfig,
-)
-from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
 from llama_stack.providers.inline.vector_io.sqlite_vec.config import SQLiteVectorIOConfig
 from llama_stack.providers.remote.inference.ollama import OllamaImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -25,7 +21,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
 def get_distribution_template() -> DistributionTemplate:
     providers = {
         "inference": ["remote::ollama"],
-        "vector_io": ["inline::faiss", "inline::sqlite_vec", "remote::chromadb", "remote::pgvector"],
+        "vector_io": ["inline::sqlite-vec", "remote::chromadb", "remote::pgvector"],
         "safety": ["inline::llama-guard"],
         "agents": ["inline::meta-reference"],
         "telemetry": ["inline::meta-reference"],
@@ -45,19 +41,9 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="remote::ollama",
         config=OllamaImplConfig.sample_run_config(),
     )
-    embedding_provider = Provider(
-        provider_id="sentence-transformers",
-        provider_type="inline::sentence-transformers",
-        config=SentenceTransformersInferenceConfig.sample_run_config(),
-    )
-    vector_io_provider_faiss = Provider(
-        provider_id="faiss",
-        provider_type="inline::faiss",
-        config=FaissVectorIOConfig.sample_run_config(f"distributions/{name}"),
-    )
     vector_io_provider_sqlite = Provider(
-        provider_id="sqlite_vec",
-        provider_type="inline::sqlite_vec",
+        provider_id="sqlite-vec",
+        provider_type="inline::sqlite-vec",
         config=SQLiteVectorIOConfig.sample_run_config(f"distributions/{name}"),
     )
@@ -104,19 +90,16 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider, embedding_provider],
-                    "vector_io": [vector_io_provider_faiss, vector_io_provider_sqlite],
+                    "inference": [inference_provider],
+                    "vector_io": [vector_io_provider_sqlite],
                 },
-                default_models=[inference_model, embedding_model],
+                default_models=[inference_model],
                 default_tool_groups=default_tool_groups,
             ),
             "run-with-safety.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [
-                        inference_provider,
-                        embedding_provider,
-                    ],
-                    "vector_io": [vector_io_provider_faiss, vector_io_provider_faiss],
+                    "inference": [inference_provider],
+                    "vector_io": [vector_io_provider_sqlite],
                     "safety": [
                         Provider(
                             provider_id="llama-guard",

@@ -16,24 +16,11 @@ providers:
     provider_type: remote::ollama
     config:
       url: ${env.OLLAMA_URL:http://localhost:11434}
-  - provider_id: sentence-transformers
-    provider_type: inline::sentence-transformers
-    config: {}
   vector_io:
-  - provider_id: faiss
-    provider_type: inline::faiss
+  - provider_id: sqlite-vec
+    provider_type: inline::sqlite-vec
     config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/faiss_store.db
-  - provider_id: faiss
-    provider_type: inline::faiss
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/faiss_store.db
+      db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/sqlite_vec.db
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard


@@ -16,19 +16,9 @@ providers:
     provider_type: remote::ollama
     config:
       url: ${env.OLLAMA_URL:http://localhost:11434}
-  - provider_id: sentence-transformers
-    provider_type: inline::sentence-transformers
-    config: {}
   vector_io:
-  - provider_id: faiss
-    provider_type: inline::faiss
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/faiss_store.db
-  - provider_id: sqlite_vec
-    provider_type: inline::sqlite_vec
+  - provider_id: sqlite-vec
+    provider_type: inline::sqlite-vec
     config:
       db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/sqlite_vec.db
   safety:
@@ -97,12 +87,6 @@ models:
   model_id: ${env.INFERENCE_MODEL}
   provider_id: ollama
   model_type: llm
-- metadata:
-    embedding_dimension: 384
-  model_id: all-MiniLM-L6-v2
-  provider_id: ollama
-  provider_model_id: all-minilm:latest
-  model_type: embedding
 shields: []
 vector_dbs: []
 datasets: []

@@ -4,6 +4,7 @@ distribution_spec:
   providers:
     inference:
     - remote::vllm
+    - inline::sentence-transformers
     vector_io:
     - inline::faiss
     - remote::chromadb


@@ -23,7 +23,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
 
 def get_distribution_template() -> DistributionTemplate:
     providers = {
-        "inference": ["remote::vllm"],
+        "inference": ["remote::vllm", "inline::sentence-transformers"],
         "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
         "safety": ["inline::llama-guard"],
         "agents": ["inline::meta-reference"],


@@ -4,6 +4,7 @@ distribution_spec:
   providers:
     inference:
     - remote::tgi
+    - inline::sentence-transformers
     vector_io:
     - inline::faiss
     - remote::chromadb


@@ -23,7 +23,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
 
 def get_distribution_template() -> DistributionTemplate:
     providers = {
-        "inference": ["remote::tgi"],
+        "inference": ["remote::tgi", "inline::sentence-transformers"],
         "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
         "safety": ["inline::llama-guard"],
         "agents": ["inline::meta-reference"],


@@ -4,6 +4,7 @@ distribution_spec:
   providers:
     inference:
     - remote::together
+    - inline::sentence-transformers
     vector_io:
     - inline::faiss
     - remote::chromadb


@@ -25,7 +25,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
 
 def get_distribution_template() -> DistributionTemplate:
     providers = {
-        "inference": ["remote::together"],
+        "inference": ["remote::together", "inline::sentence-transformers"],
         "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
         "safety": ["inline::llama-guard"],
         "agents": ["inline::meta-reference"],


@@ -4,6 +4,7 @@ distribution_spec:
   providers:
     inference:
     - inline::vllm
+    - inline::sentence-transformers
     vector_io:
     - inline::faiss
     - remote::chromadb


@@ -20,7 +20,7 @@ from llama_stack.templates.template import (
 
 def get_distribution_template() -> DistributionTemplate:
     providers = {
-        "inference": ["inline::vllm"],
+        "inference": ["inline::vllm", "inline::sentence-transformers"],
         "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
         "safety": ["inline::llama-guard"],
         "agents": ["inline::meta-reference"],