From c9e5578151ae612fce300aecc4c4d9f830107dae Mon Sep 17 00:00:00 2001
From: Ashwin Bharambe
Date: Wed, 22 Jan 2025 10:17:59 -0800
Subject: [PATCH] [memory refactor][5/n] Migrate all vector_io providers (#835)

See https://github.com/meta-llama/llama-stack/issues/827 for the broader
design.

This PR finishes off all the stragglers and migrates everything to the
new naming.

---
 .../remote_hosted_distro/nvidia.md | 2 +-
 .../self_hosted_distro/bedrock.md | 2 +-
 .../self_hosted_distro/cerebras.md | 2 +-
 .../self_hosted_distro/fireworks.md | 2 +-
 .../self_hosted_distro/meta-reference-gpu.md | 2 +-
 .../meta-reference-quantized-gpu.md | 2 +-
 .../self_hosted_distro/ollama.md | 2 +-
 .../self_hosted_distro/remote-vllm.md | 2 +-
 .../distributions/self_hosted_distro/tgi.md | 2 +-
 .../self_hosted_distro/together.md | 2 +-
 llama_stack/apis/agents/agents.py | 2 +-
 llama_stack/apis/agents/event_logger.py | 2 +-
 llama_stack/apis/resource.py | 2 +-
 .../distribution/store/tests/test_registry.py | 140 +++++++++---------
 .../ui/page/distribution/memory_banks.py | 23 ---
 .../ui/page/distribution/resources.py | 8 +-
 .../ui/page/distribution/vector_dbs.py | 23 +++
 .../distribution/ui/page/playground/rag.py | 54 ++++---
 .../agents/meta_reference/agent_instance.py | 26 ++--
 .../agents/meta_reference/persistence.py | 6 +-
 .../meta_reference/tests/test_chat_agent.py | 116 +++------------
 .../inline/vector_io/chroma/__init__.py | 6 +-
 .../remote/vector_io/chroma/__init__.py | 4 +-
 .../remote/vector_io/chroma/chroma.py | 93 ++++++------
 .../remote/vector_io/pgvector/pgvector.py | 90 ++++++-----
 .../remote/vector_io/qdrant/qdrant.py | 83 +++++------
 .../remote/vector_io/sample/sample.py | 13 +-
 .../remote/vector_io/weaviate/weaviate.py | 91 ++++++------
 llama_stack/providers/tests/eval/fixtures.py | 4 +-
 llama_stack/providers/tests/tools/fixtures.py | 9 +-
 .../providers/tests/tools/test_tools.py | 47 +++---
 llama_stack/templates/bedrock/bedrock.py | 8 +-
 llama_stack/templates/bedrock/build.yaml | 2 +-
 llama_stack/templates/bedrock/run.yaml | 6 +-
 llama_stack/templates/cerebras/build.yaml | 2 +-
 llama_stack/templates/cerebras/cerebras.py | 8 +-
 llama_stack/templates/cerebras/run.yaml | 6 +-
 .../experimental-post-training/run.yaml | 4 +-
 llama_stack/templates/fireworks/build.yaml | 2 +-
 llama_stack/templates/fireworks/fireworks.py | 10 +-
 .../templates/fireworks/run-with-safety.yaml | 6 +-
 llama_stack/templates/fireworks/run.yaml | 6 +-
 llama_stack/templates/hf-endpoint/build.yaml | 2 +-
 .../templates/hf-endpoint/hf_endpoint.py | 10 +-
 .../hf-endpoint/run-with-safety.yaml | 6 +-
 llama_stack/templates/hf-endpoint/run.yaml | 6 +-
 .../templates/hf-serverless/build.yaml | 2 +-
 .../templates/hf-serverless/hf_serverless.py | 10 +-
 .../hf-serverless/run-with-safety.yaml | 6 +-
 llama_stack/templates/hf-serverless/run.yaml | 6 +-
 .../templates/meta-reference-gpu/build.yaml | 2 +-
 .../meta-reference-gpu/meta_reference.py | 10 +-
 .../meta-reference-gpu/run-with-safety.yaml | 6 +-
 .../templates/meta-reference-gpu/run.yaml | 6 +-
 .../meta-reference-quantized-gpu/build.yaml | 2 +-
 .../meta_reference.py | 8 +-
 .../meta-reference-quantized-gpu/run.yaml | 6 +-
 llama_stack/templates/nvidia/build.yaml | 2 +-
 llama_stack/templates/nvidia/nvidia.py | 2 +-
 llama_stack/templates/nvidia/run.yaml | 6 +-
 llama_stack/templates/ollama/build.yaml | 2 +-
 llama_stack/templates/ollama/ollama.py | 10 +-
 .../templates/ollama/run-with-safety.yaml | 6 +-
 llama_stack/templates/ollama/run.yaml | 6 +-
 llama_stack/templates/remote-vllm/build.yaml | 2 +-
.../remote-vllm/run-with-safety.yaml | 6 +- llama_stack/templates/remote-vllm/run.yaml | 6 +- llama_stack/templates/remote-vllm/vllm.py | 10 +- llama_stack/templates/tgi/build.yaml | 2 +- .../templates/tgi/run-with-safety.yaml | 6 +- llama_stack/templates/tgi/run.yaml | 6 +- llama_stack/templates/tgi/tgi.py | 10 +- .../templates/together/run-with-safety.yaml | 6 +- llama_stack/templates/together/run.yaml | 3 +- llama_stack/templates/together/together.py | 10 +- llama_stack/templates/vllm-gpu/build.yaml | 2 +- llama_stack/templates/vllm-gpu/run.yaml | 6 +- llama_stack/templates/vllm-gpu/vllm.py | 8 +- 78 files changed, 504 insertions(+), 623 deletions(-) delete mode 100644 llama_stack/distribution/ui/page/distribution/memory_banks.py create mode 100644 llama_stack/distribution/ui/page/distribution/vector_dbs.py diff --git a/docs/source/distributions/remote_hosted_distro/nvidia.md b/docs/source/distributions/remote_hosted_distro/nvidia.md index 4028ed384..e4c3a155f 100644 --- a/docs/source/distributions/remote_hosted_distro/nvidia.md +++ b/docs/source/distributions/remote_hosted_distro/nvidia.md @@ -8,11 +8,11 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov | datasetio | `remote::huggingface`, `inline::localfs` | | eval | `inline::meta-reference` | | inference | `remote::nvidia` | -| memory | `inline::faiss` | | safety | `inline::llama-guard` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime`, `remote::model-context-protocol` | +| vector_io | `inline::faiss` | ### Environment Variables diff --git a/docs/source/distributions/self_hosted_distro/bedrock.md b/docs/source/distributions/self_hosted_distro/bedrock.md index dd4e51264..a66325560 100644 --- a/docs/source/distributions/self_hosted_distro/bedrock.md +++ b/docs/source/distributions/self_hosted_distro/bedrock.md @@ -15,11 +15,11 @@ The `llamastack/distribution-bedrock` distribution consists of the following pro | datasetio | `remote::huggingface`, `inline::localfs` | | eval | `inline::meta-reference` | | inference | `remote::bedrock` | -| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | safety | `remote::bedrock` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime`, `remote::model-context-protocol` | +| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | diff --git a/docs/source/distributions/self_hosted_distro/cerebras.md b/docs/source/distributions/self_hosted_distro/cerebras.md index 22e4125bd..211082b7a 100644 --- a/docs/source/distributions/self_hosted_distro/cerebras.md +++ b/docs/source/distributions/self_hosted_distro/cerebras.md @@ -8,11 +8,11 @@ The `llamastack/distribution-cerebras` distribution consists of the following pr | datasetio | `remote::huggingface`, `inline::localfs` | | eval | `inline::meta-reference` | | inference | `remote::cerebras` | -| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | safety | `inline::llama-guard` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime` | +| 
vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | ### Environment Variables diff --git a/docs/source/distributions/self_hosted_distro/fireworks.md b/docs/source/distributions/self_hosted_distro/fireworks.md index 7ed174984..39043b1c1 100644 --- a/docs/source/distributions/self_hosted_distro/fireworks.md +++ b/docs/source/distributions/self_hosted_distro/fireworks.md @@ -18,11 +18,11 @@ The `llamastack/distribution-fireworks` distribution consists of the following p | datasetio | `remote::huggingface`, `inline::localfs` | | eval | `inline::meta-reference` | | inference | `remote::fireworks` | -| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | safety | `inline::llama-guard` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime`, `remote::model-context-protocol` | +| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | ### Environment Variables diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md index 269354e98..8475aab3a 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md @@ -18,11 +18,11 @@ The `llamastack/distribution-meta-reference-gpu` distribution consists of the fo | datasetio | `remote::huggingface`, `inline::localfs` | | eval | `inline::meta-reference` | | inference | `inline::meta-reference` | -| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | safety | `inline::llama-guard` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime`, `remote::model-context-protocol` | +| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | Note that you need access to nvidia GPUs to run this distribution. This distribution is not compatible with CPU-only machines or machines with AMD GPUs. diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md index 937dbbdbd..6f1adb5a9 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md @@ -18,11 +18,11 @@ The `llamastack/distribution-meta-reference-quantized-gpu` distribution consists | datasetio | `remote::huggingface`, `inline::localfs` | | eval | `inline::meta-reference` | | inference | `inline::meta-reference-quantized` | -| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | safety | `inline::llama-guard` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime`, `remote::model-context-protocol` | +| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | The only difference vs. the `meta-reference-gpu` distribution is that it has support for more efficient inference -- with fp8, int4 quantization, etc. 
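
The documentation diffs above (and the ones that follow) are the same mechanical rename: the `memory` row of each distribution's provider table becomes `vector_io`, with the provider list itself unchanged. For client code, the matching change is to filter `providers.list()` on the new API name, as the playground change later in this patch does. A minimal sketch of that lookup; the `LlamaStackClient` import and base URL are illustrative assumptions, not part of the diff:

    # Sketch only -- "vector_io" replaces "memory" as the provider category.
    from llama_stack_client import LlamaStackClient

    client = LlamaStackClient(base_url="http://localhost:5001")  # illustrative URL

    vector_io_provider = None
    for p in client.providers.list():
        if p.api == "vector_io":  # was: p.api == "memory"
            vector_io_provider = p.provider_id
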
diff --git a/docs/source/distributions/self_hosted_distro/ollama.md b/docs/source/distributions/self_hosted_distro/ollama.md index e8e5dd397..f5ba31feb 100644 --- a/docs/source/distributions/self_hosted_distro/ollama.md +++ b/docs/source/distributions/self_hosted_distro/ollama.md @@ -18,11 +18,11 @@ The `llamastack/distribution-ollama` distribution consists of the following prov | datasetio | `remote::huggingface`, `inline::localfs` | | eval | `inline::meta-reference` | | inference | `remote::ollama` | -| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | safety | `inline::llama-guard` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime` | +| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.### Environment Variables diff --git a/docs/source/distributions/self_hosted_distro/remote-vllm.md b/docs/source/distributions/self_hosted_distro/remote-vllm.md index 2bb5329b9..c2b3544d3 100644 --- a/docs/source/distributions/self_hosted_distro/remote-vllm.md +++ b/docs/source/distributions/self_hosted_distro/remote-vllm.md @@ -17,11 +17,11 @@ The `llamastack/distribution-remote-vllm` distribution consists of the following | datasetio | `remote::huggingface`, `inline::localfs` | | eval | `inline::meta-reference` | | inference | `remote::vllm` | -| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | safety | `inline::llama-guard` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime`, `remote::model-context-protocol` | +| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference. diff --git a/docs/source/distributions/self_hosted_distro/tgi.md b/docs/source/distributions/self_hosted_distro/tgi.md index 0fd6a693c..c21a6a586 100644 --- a/docs/source/distributions/self_hosted_distro/tgi.md +++ b/docs/source/distributions/self_hosted_distro/tgi.md @@ -19,11 +19,11 @@ The `llamastack/distribution-tgi` distribution consists of the following provide | datasetio | `remote::huggingface`, `inline::localfs` | | eval | `inline::meta-reference` | | inference | `remote::tgi` | -| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | safety | `inline::llama-guard` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime`, `remote::model-context-protocol` | +| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | You can use this distribution if you have GPUs and want to run an independent TGI server container for running inference. 
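
For the full client-side shape of the rename, `memory_banks.register` plus `memory.insert` becomes `vector_dbs.register` plus `tool_runtime.rag_tool.insert`, exactly as in the playground `rag.py` hunk further down. A hedged sketch under the assumption that `Document` is importable from `llama_stack_client.types`; the ids and content are placeholders:

    # Sketch only -- registration now takes an embedding model/dimension pair
    # instead of the old BankParams, and document ingestion moved to the RAG tool.
    from llama_stack_client.types import Document  # assumed import path

    client.vector_dbs.register(
        vector_db_id="rag_vector_db",
        embedding_model="all-MiniLM-L6-v2",
        embedding_dimension=384,  # dimension of all-MiniLM-L6-v2 embeddings
        provider_id=vector_io_provider,
    )
    client.tool_runtime.rag_tool.insert(
        vector_db_id="rag_vector_db",
        documents=[Document(document_id="example.txt", content="hello world")],
    )
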
diff --git a/docs/source/distributions/self_hosted_distro/together.md b/docs/source/distributions/self_hosted_distro/together.md index e990e273f..65a711522 100644 --- a/docs/source/distributions/self_hosted_distro/together.md +++ b/docs/source/distributions/self_hosted_distro/together.md @@ -18,11 +18,11 @@ The `llamastack/distribution-together` distribution consists of the following pr | datasetio | `remote::huggingface`, `inline::localfs` | | eval | `inline::meta-reference` | | inference | `remote::together` | -| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | safety | `inline::llama-guard` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime`, `remote::model-context-protocol` | +| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | ### Environment Variables diff --git a/llama_stack/apis/agents/agents.py b/llama_stack/apis/agents/agents.py index 20cb8f828..c19f28054 100644 --- a/llama_stack/apis/agents/agents.py +++ b/llama_stack/apis/agents/agents.py @@ -88,7 +88,7 @@ class MemoryRetrievalStep(StepCommon): step_type: Literal[StepType.memory_retrieval.value] = ( StepType.memory_retrieval.value ) - memory_bank_ids: List[str] + vector_db_ids: str inserted_context: InterleavedContent diff --git a/llama_stack/apis/agents/event_logger.py b/llama_stack/apis/agents/event_logger.py index 9e2f14805..ddb2a7cf4 100644 --- a/llama_stack/apis/agents/event_logger.py +++ b/llama_stack/apis/agents/event_logger.py @@ -208,7 +208,7 @@ class EventLogger: ): details = event.payload.step_details inserted_context = interleaved_content_as_str(details.inserted_context) - content = f"fetched {len(inserted_context)} bytes from {details.memory_bank_ids}" + content = f"fetched {len(inserted_context)} bytes from {details.vector_db_ids}" yield ( event, diff --git a/llama_stack/apis/resource.py b/llama_stack/apis/resource.py index dfe3ddb24..d0ce72644 100644 --- a/llama_stack/apis/resource.py +++ b/llama_stack/apis/resource.py @@ -37,5 +37,5 @@ class Resource(BaseModel): provider_id: str = Field(description="ID of the provider that owns this resource") type: ResourceType = Field( - description="Type of resource (e.g. 'model', 'shield', 'memory_bank', etc.)" + description="Type of resource (e.g. 
'model', 'shield', 'vector_db', etc.)" ) diff --git a/llama_stack/distribution/store/tests/test_registry.py b/llama_stack/distribution/store/tests/test_registry.py index 9c5b72f93..78d59a088 100644 --- a/llama_stack/distribution/store/tests/test_registry.py +++ b/llama_stack/distribution/store/tests/test_registry.py @@ -9,7 +9,7 @@ import os import pytest import pytest_asyncio from llama_stack.apis.inference import Model -from llama_stack.apis.memory_banks import VectorMemoryBank +from llama_stack.apis.vector_dbs import VectorDB from llama_stack.distribution.store.registry import ( CachedDiskDistributionRegistry, @@ -42,13 +42,12 @@ async def cached_registry(config): @pytest.fixture -def sample_bank(): - return VectorMemoryBank( - identifier="test_bank", +def sample_vector_db(): + return VectorDB( + identifier="test_vector_db", embedding_model="all-MiniLM-L6-v2", - chunk_size_in_tokens=512, - overlap_size_in_tokens=64, - provider_resource_id="test_bank", + embedding_dimension=384, + provider_resource_id="test_vector_db", provider_id="test-provider", ) @@ -70,19 +69,17 @@ async def test_registry_initialization(registry): @pytest.mark.asyncio -async def test_basic_registration(registry, sample_bank, sample_model): - print(f"Registering {sample_bank}") - await registry.register(sample_bank) +async def test_basic_registration(registry, sample_vector_db, sample_model): + print(f"Registering {sample_vector_db}") + await registry.register(sample_vector_db) print(f"Registering {sample_model}") await registry.register(sample_model) - print("Getting bank") - result_bank = await registry.get("memory_bank", "test_bank") - assert result_bank is not None - assert result_bank.identifier == sample_bank.identifier - assert result_bank.embedding_model == sample_bank.embedding_model - assert result_bank.chunk_size_in_tokens == sample_bank.chunk_size_in_tokens - assert result_bank.overlap_size_in_tokens == sample_bank.overlap_size_in_tokens - assert result_bank.provider_id == sample_bank.provider_id + print("Getting vector_db") + result_vector_db = await registry.get("vector_db", "test_vector_db") + assert result_vector_db is not None + assert result_vector_db.identifier == sample_vector_db.identifier + assert result_vector_db.embedding_model == sample_vector_db.embedding_model + assert result_vector_db.provider_id == sample_vector_db.provider_id result_model = await registry.get("model", "test_model") assert result_model is not None @@ -91,24 +88,23 @@ async def test_basic_registration(registry, sample_bank, sample_model): @pytest.mark.asyncio -async def test_cached_registry_initialization(config, sample_bank, sample_model): +async def test_cached_registry_initialization(config, sample_vector_db, sample_model): # First populate the disk registry disk_registry = DiskDistributionRegistry(await kvstore_impl(config)) await disk_registry.initialize() - await disk_registry.register(sample_bank) + await disk_registry.register(sample_vector_db) await disk_registry.register(sample_model) # Test cached version loads from disk cached_registry = CachedDiskDistributionRegistry(await kvstore_impl(config)) await cached_registry.initialize() - result_bank = await cached_registry.get("memory_bank", "test_bank") - assert result_bank is not None - assert result_bank.identifier == sample_bank.identifier - assert result_bank.embedding_model == sample_bank.embedding_model - assert result_bank.chunk_size_in_tokens == sample_bank.chunk_size_in_tokens - assert result_bank.overlap_size_in_tokens == sample_bank.overlap_size_in_tokens 
- assert result_bank.provider_id == sample_bank.provider_id + result_vector_db = await cached_registry.get("vector_db", "test_vector_db") + assert result_vector_db is not None + assert result_vector_db.identifier == sample_vector_db.identifier + assert result_vector_db.embedding_model == sample_vector_db.embedding_model + assert result_vector_db.embedding_dimension == sample_vector_db.embedding_dimension + assert result_vector_db.provider_id == sample_vector_db.provider_id @pytest.mark.asyncio @@ -116,29 +112,28 @@ async def test_cached_registry_updates(config): cached_registry = CachedDiskDistributionRegistry(await kvstore_impl(config)) await cached_registry.initialize() - new_bank = VectorMemoryBank( - identifier="test_bank_2", + new_vector_db = VectorDB( + identifier="test_vector_db_2", embedding_model="all-MiniLM-L6-v2", - chunk_size_in_tokens=256, - overlap_size_in_tokens=32, - provider_resource_id="test_bank_2", + embedding_dimension=384, + provider_resource_id="test_vector_db_2", provider_id="baz", ) - await cached_registry.register(new_bank) + await cached_registry.register(new_vector_db) # Verify in cache - result_bank = await cached_registry.get("memory_bank", "test_bank_2") - assert result_bank is not None - assert result_bank.identifier == new_bank.identifier - assert result_bank.provider_id == new_bank.provider_id + result_vector_db = await cached_registry.get("vector_db", "test_vector_db_2") + assert result_vector_db is not None + assert result_vector_db.identifier == new_vector_db.identifier + assert result_vector_db.provider_id == new_vector_db.provider_id # Verify persisted to disk new_registry = DiskDistributionRegistry(await kvstore_impl(config)) await new_registry.initialize() - result_bank = await new_registry.get("memory_bank", "test_bank_2") - assert result_bank is not None - assert result_bank.identifier == new_bank.identifier - assert result_bank.provider_id == new_bank.provider_id + result_vector_db = await new_registry.get("vector_db", "test_vector_db_2") + assert result_vector_db is not None + assert result_vector_db.identifier == new_vector_db.identifier + assert result_vector_db.provider_id == new_vector_db.provider_id @pytest.mark.asyncio @@ -146,30 +141,28 @@ async def test_duplicate_provider_registration(config): cached_registry = CachedDiskDistributionRegistry(await kvstore_impl(config)) await cached_registry.initialize() - original_bank = VectorMemoryBank( - identifier="test_bank_2", + original_vector_db = VectorDB( + identifier="test_vector_db_2", embedding_model="all-MiniLM-L6-v2", - chunk_size_in_tokens=256, - overlap_size_in_tokens=32, - provider_resource_id="test_bank_2", + embedding_dimension=384, + provider_resource_id="test_vector_db_2", provider_id="baz", ) - await cached_registry.register(original_bank) + await cached_registry.register(original_vector_db) - duplicate_bank = VectorMemoryBank( - identifier="test_bank_2", + duplicate_vector_db = VectorDB( + identifier="test_vector_db_2", embedding_model="different-model", - chunk_size_in_tokens=128, - overlap_size_in_tokens=16, - provider_resource_id="test_bank_2", + embedding_dimension=384, + provider_resource_id="test_vector_db_2", provider_id="baz", # Same provider_id ) - await cached_registry.register(duplicate_bank) + await cached_registry.register(duplicate_vector_db) - result = await cached_registry.get("memory_bank", "test_bank_2") + result = await cached_registry.get("vector_db", "test_vector_db_2") assert result is not None assert ( - result.embedding_model == 
original_bank.embedding_model + result.embedding_model == original_vector_db.embedding_model ) # Original values preserved @@ -179,36 +172,35 @@ async def test_get_all_objects(config): await cached_registry.initialize() # Create multiple test banks - test_banks = [ - VectorMemoryBank( - identifier=f"test_bank_{i}", + test_vector_dbs = [ + VectorDB( + identifier=f"test_vector_db_{i}", embedding_model="all-MiniLM-L6-v2", - chunk_size_in_tokens=256, - overlap_size_in_tokens=32, - provider_resource_id=f"test_bank_{i}", + embedding_dimension=384, + provider_resource_id=f"test_vector_db_{i}", provider_id=f"provider_{i}", ) for i in range(3) ] - # Register all banks - for bank in test_banks: - await cached_registry.register(bank) + # Register all vector_dbs + for vector_db in test_vector_dbs: + await cached_registry.register(vector_db) # Test get_all retrieval all_results = await cached_registry.get_all() assert len(all_results) == 3 - # Verify each bank was stored correctly - for original_bank in test_banks: - matching_banks = [ - b for b in all_results if b.identifier == original_bank.identifier + # Verify each vector_db was stored correctly + for original_vector_db in test_vector_dbs: + matching_vector_dbs = [ + v for v in all_results if v.identifier == original_vector_db.identifier ] - assert len(matching_banks) == 1 - stored_bank = matching_banks[0] - assert stored_bank.embedding_model == original_bank.embedding_model - assert stored_bank.provider_id == original_bank.provider_id - assert stored_bank.chunk_size_in_tokens == original_bank.chunk_size_in_tokens + assert len(matching_vector_dbs) == 1 + stored_vector_db = matching_vector_dbs[0] + assert stored_vector_db.embedding_model == original_vector_db.embedding_model + assert stored_vector_db.provider_id == original_vector_db.provider_id assert ( - stored_bank.overlap_size_in_tokens == original_bank.overlap_size_in_tokens + stored_vector_db.embedding_dimension + == original_vector_db.embedding_dimension ) diff --git a/llama_stack/distribution/ui/page/distribution/memory_banks.py b/llama_stack/distribution/ui/page/distribution/memory_banks.py deleted file mode 100644 index f28010bf2..000000000 --- a/llama_stack/distribution/ui/page/distribution/memory_banks.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -import streamlit as st -from modules.api import llama_stack_api - - -def memory_banks(): - st.header("Memory Banks") - memory_banks_info = { - m.identifier: m.to_dict() for m in llama_stack_api.client.memory_banks.list() - } - - if len(memory_banks_info) > 0: - selected_memory_bank = st.selectbox( - "Select a memory bank", list(memory_banks_info.keys()) - ) - st.json(memory_banks_info[selected_memory_bank]) - else: - st.info("No memory banks found") diff --git a/llama_stack/distribution/ui/page/distribution/resources.py b/llama_stack/distribution/ui/page/distribution/resources.py index 6b3ea0e3a..38d494570 100644 --- a/llama_stack/distribution/ui/page/distribution/resources.py +++ b/llama_stack/distribution/ui/page/distribution/resources.py @@ -6,10 +6,10 @@ from page.distribution.datasets import datasets from page.distribution.eval_tasks import eval_tasks -from page.distribution.memory_banks import memory_banks from page.distribution.models import models from page.distribution.scoring_functions import scoring_functions from page.distribution.shields import shields +from page.distribution.vector_dbs import vector_dbs from streamlit_option_menu import option_menu @@ -17,7 +17,7 @@ from streamlit_option_menu import option_menu def resources_page(): options = [ "Models", - "Memory Banks", + "Vector Databases", "Shields", "Scoring Functions", "Datasets", @@ -37,8 +37,8 @@ def resources_page(): ) if selected_resource == "Eval Tasks": eval_tasks() - elif selected_resource == "Memory Banks": - memory_banks() + elif selected_resource == "Vector Databases": + vector_dbs() elif selected_resource == "Datasets": datasets() elif selected_resource == "Models": diff --git a/llama_stack/distribution/ui/page/distribution/vector_dbs.py b/llama_stack/distribution/ui/page/distribution/vector_dbs.py new file mode 100644 index 000000000..9afa6de1f --- /dev/null +++ b/llama_stack/distribution/ui/page/distribution/vector_dbs.py @@ -0,0 +1,23 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +import streamlit as st +from modules.api import llama_stack_api + + +def vector_dbs(): + st.header("Vector Databases") + vector_dbs_info = { + v.identifier: v.to_dict() for v in llama_stack_api.client.vector_dbs.list() + } + + if len(vector_dbs_info) > 0: + selected_vector_db = st.selectbox( + "Select a vector database", list(vector_dbs_info.keys()) + ) + st.json(vector_dbs_info[selected_vector_db]) + else: + st.info("No vector databases found") diff --git a/llama_stack/distribution/ui/page/playground/rag.py b/llama_stack/distribution/ui/page/playground/rag.py index 11b05718d..465e11560 100644 --- a/llama_stack/distribution/ui/page/playground/rag.py +++ b/llama_stack/distribution/ui/page/playground/rag.py @@ -29,12 +29,12 @@ def rag_chat_page(): if uploaded_files: st.success(f"Successfully uploaded {len(uploaded_files)} files") # Add memory bank name input field - memory_bank_name = st.text_input( - "Memory Bank Name", - value="rag_bank", - help="Enter a unique identifier for this memory bank", + vector_db_name = st.text_input( + "Vector Database Name", + value="rag_vector_db", + help="Enter a unique identifier for this vector database", ) - if st.button("Create Memory Bank"): + if st.button("Create Vector Database"): documents = [ Document( document_id=uploaded_file.name, @@ -44,37 +44,33 @@ def rag_chat_page(): ] providers = llama_stack_api.client.providers.list() - memory_provider = None + vector_io_provider = None for x in providers: - if x.api == "memory": - memory_provider = x.provider_id + if x.api == "vector_io": + vector_io_provider = x.provider_id - llama_stack_api.client.memory_banks.register( - memory_bank_id=memory_bank_name, # Use the user-provided name - params={ - "memory_bank_type": "vector", - "embedding_model": "all-MiniLM-L6-v2", - "chunk_size_in_tokens": 512, - "overlap_size_in_tokens": 64, - }, - provider_id=memory_provider, + llama_stack_api.client.vector_dbs.register( + vector_db_id=vector_db_name, # Use the user-provided name + embedding_dimension=384, + embedding_model="all-MiniLM-L6-v2", + provider_id=vector_io_provider, ) - # insert documents using the custom bank name - llama_stack_api.client.memory.insert( - bank_id=memory_bank_name, # Use the user-provided name + # insert documents using the custom vector db name + llama_stack_api.client.tool_runtime.rag_tool.insert( + vector_db_id=vector_db_name, # Use the user-provided name documents=documents, ) - st.success("Memory bank created successfully!") + st.success("Vector database created successfully!") st.subheader("Configure Agent") # select memory banks - memory_banks = llama_stack_api.client.memory_banks.list() - memory_banks = [bank.identifier for bank in memory_banks] - selected_memory_banks = st.multiselect( - "Select Memory Banks", - memory_banks, + vector_dbs = llama_stack_api.client.vector_dbs.list() + vector_dbs = [vector_db.identifier for vector_db in vector_dbs] + selected_vector_dbs = st.multiselect( + "Select Vector Databases", + vector_dbs, ) available_models = llama_stack_api.client.models.list() @@ -141,14 +137,14 @@ def rag_chat_page(): dict( name="builtin::memory", args={ - "memory_bank_ids": [bank_id for bank_id in selected_memory_banks], + "vector_db_ids": [ + vector_db_id for vector_db_id in selected_vector_dbs + ], }, ) ], tool_choice="auto", tool_prompt_format="json", - input_shields=[], - output_shields=[], enable_session_persistence=False, ) diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py 
b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 5b5175cee..2d0ad137b 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -413,8 +413,8 @@ class ChatAgent(ShieldRunnerMixin): session_info = await self.storage.get_session_info(session_id) # if the session has a memory bank id, let the memory tool use it - if session_info.memory_bank_id: - vector_db_ids.append(session_info.memory_bank_id) + if session_info.vector_db_id: + vector_db_ids.append(session_info.vector_db_id) yield AgentTurnResponseStreamChunk( event=AgentTurnResponseEvent( @@ -829,7 +829,7 @@ class ChatAgent(ShieldRunnerMixin): msg = await attachment_message(self.tempdir, url_items) input_messages.append(msg) # Since memory is present, add all the data to the memory bank - await self.add_to_session_memory_bank(session_id, documents) + await self.add_to_session_vector_db(session_id, documents) elif code_interpreter_tool: # if only code_interpreter is available, we download the URLs to a tempdir # and attach the path to them as a message to inference with the @@ -838,7 +838,7 @@ class ChatAgent(ShieldRunnerMixin): input_messages.append(msg) elif memory_tool: # if only memory is available, we load the data from the URLs and content items to the memory bank - await self.add_to_session_memory_bank(session_id, documents) + await self.add_to_session_vector_db(session_id, documents) else: # if no memory or code_interpreter tool is available, # we try to load the data from the URLs and content items as a message to inference @@ -848,31 +848,31 @@ class ChatAgent(ShieldRunnerMixin): + await load_data_from_urls(url_items) ) - async def _ensure_memory_bank(self, session_id: str) -> str: + async def _ensure_vector_db(self, session_id: str) -> str: session_info = await self.storage.get_session_info(session_id) if session_info is None: raise ValueError(f"Session {session_id} not found") - if session_info.memory_bank_id is None: - bank_id = f"memory_bank_{session_id}" + if session_info.vector_db_id is None: + vector_db_id = f"vector_db_{session_id}" # TODO: the semantic for registration is definitely not "creation" # so we need to fix it if we expect the agent to create a new vector db # for each session await self.vector_io_api.register_vector_db( - vector_db_id=bank_id, + vector_db_id=vector_db_id, embedding_model="all-MiniLM-L6-v2", ) - await self.storage.add_memory_bank_to_session(session_id, bank_id) + await self.storage.add_vector_db_to_session(session_id, vector_db_id) else: - bank_id = session_info.memory_bank_id + vector_db_id = session_info.vector_db_id - return bank_id + return vector_db_id - async def add_to_session_memory_bank( + async def add_to_session_vector_db( self, session_id: str, data: List[Document] ) -> None: - vector_db_id = await self._ensure_memory_bank(session_id) + vector_db_id = await self._ensure_vector_db(session_id) documents = [ RAGDocument( document_id=str(uuid.uuid4()), diff --git a/llama_stack/providers/inline/agents/meta_reference/persistence.py b/llama_stack/providers/inline/agents/meta_reference/persistence.py index 58b69858b..4b8ad6d4a 100644 --- a/llama_stack/providers/inline/agents/meta_reference/persistence.py +++ b/llama_stack/providers/inline/agents/meta_reference/persistence.py @@ -21,7 +21,7 @@ log = logging.getLogger(__name__) class AgentSessionInfo(BaseModel): session_id: str session_name: str - memory_bank_id: Optional[str] = None + vector_db_id: Optional[str] = 
None started_at: datetime @@ -52,12 +52,12 @@ class AgentPersistence: return AgentSessionInfo(**json.loads(value)) - async def add_memory_bank_to_session(self, session_id: str, bank_id: str): + async def add_vector_db_to_session(self, session_id: str, vector_db_id: str): session_info = await self.get_session_info(session_id) if session_info is None: raise ValueError(f"Session {session_id} not found") - session_info.memory_bank_id = bank_id + session_info.vector_db_id = vector_db_id await self.kvstore.set( key=f"session:{self.agent_id}:{session_id}", value=session_info.model_dump_json(), diff --git a/llama_stack/providers/inline/agents/meta_reference/tests/test_chat_agent.py b/llama_stack/providers/inline/agents/meta_reference/tests/test_chat_agent.py index a7e6efc8c..205868279 100644 --- a/llama_stack/providers/inline/agents/meta_reference/tests/test_chat_agent.py +++ b/llama_stack/providers/inline/agents/meta_reference/tests/test_chat_agent.py @@ -29,10 +29,9 @@ from llama_stack.apis.inference import ( SamplingParams, ToolChoice, ToolDefinition, + ToolPromptFormat, UserMessage, ) -from llama_stack.apis.memory import MemoryBank -from llama_stack.apis.memory_banks import BankParams, VectorMemoryBank from llama_stack.apis.safety import RunShieldResponse from llama_stack.apis.tools import ( Tool, @@ -40,8 +39,9 @@ from llama_stack.apis.tools import ( ToolGroup, ToolHost, ToolInvocationResult, - ToolPromptFormat, ) +from llama_stack.apis.vector_io import QueryChunksResponse + from llama_stack.providers.inline.agents.meta_reference.agent_instance import ( MEMORY_QUERY_TOOL, ) @@ -110,68 +110,22 @@ class MockSafetyAPI: return RunShieldResponse(violation=None) -class MockMemoryAPI: +class MockVectorIOAPI: def __init__(self): - self.memory_banks = {} - self.documents = {} + self.chunks = {} - async def create_memory_bank(self, name, config, url=None): - bank_id = f"bank_{len(self.memory_banks)}" - bank = MemoryBank(bank_id, name, config, url) - self.memory_banks[bank_id] = bank - self.documents[bank_id] = {} - return bank + async def insert_chunks(self, vector_db_id, chunks, ttl_seconds=None): + for chunk in chunks: + metadata = chunk.metadata + self.chunks[vector_db_id][metadata["document_id"]] = chunk - async def list_memory_banks(self): - return list(self.memory_banks.values()) + async def query_chunks(self, vector_db_id, query, params=None): + if vector_db_id not in self.chunks: + raise ValueError(f"Bank {vector_db_id} not found") - async def get_memory_bank(self, bank_id): - return self.memory_banks.get(bank_id) - - async def drop_memory_bank(self, bank_id): - if bank_id in self.memory_banks: - del self.memory_banks[bank_id] - del self.documents[bank_id] - return bank_id - - async def insert_documents(self, bank_id, documents, ttl_seconds=None): - if bank_id not in self.documents: - raise ValueError(f"Bank {bank_id} not found") - for doc in documents: - self.documents[bank_id][doc.document_id] = doc - - async def update_documents(self, bank_id, documents): - if bank_id not in self.documents: - raise ValueError(f"Bank {bank_id} not found") - for doc in documents: - if doc.document_id in self.documents[bank_id]: - self.documents[bank_id][doc.document_id] = doc - - async def query_documents(self, bank_id, query, params=None): - if bank_id not in self.documents: - raise ValueError(f"Bank {bank_id} not found") - # Simple mock implementation: return all documents - chunks = [ - {"content": doc.content, "token_count": 10, "document_id": doc.document_id} - for doc in self.documents[bank_id].values() 
- ] + chunks = list(self.chunks[vector_db_id].values()) scores = [1.0] * len(chunks) - return {"chunks": chunks, "scores": scores} - - async def get_documents(self, bank_id, document_ids): - if bank_id not in self.documents: - raise ValueError(f"Bank {bank_id} not found") - return [ - self.documents[bank_id][doc_id] - for doc_id in document_ids - if doc_id in self.documents[bank_id] - ] - - async def delete_documents(self, bank_id, document_ids): - if bank_id not in self.documents: - raise ValueError(f"Bank {bank_id} not found") - for doc_id in document_ids: - self.documents[bank_id].pop(doc_id, None) + return QueryChunksResponse(chunks=chunks, scores=scores) class MockToolGroupsAPI: @@ -241,31 +195,6 @@ class MockToolRuntimeAPI: return ToolInvocationResult(content={"result": "Mock tool result"}) -class MockMemoryBanksAPI: - async def list_memory_banks(self) -> List[MemoryBank]: - return [] - - async def get_memory_bank(self, memory_bank_id: str) -> Optional[MemoryBank]: - return None - - async def register_memory_bank( - self, - memory_bank_id: str, - params: BankParams, - provider_id: Optional[str] = None, - provider_memory_bank_id: Optional[str] = None, - ) -> MemoryBank: - return VectorMemoryBank( - identifier=memory_bank_id, - provider_resource_id=provider_memory_bank_id or memory_bank_id, - embedding_model="mock_model", - chunk_size_in_tokens=512, - ) - - async def unregister_memory_bank(self, memory_bank_id: str) -> None: - pass - - @pytest.fixture def mock_inference_api(): return MockInferenceAPI() @@ -277,8 +206,8 @@ def mock_safety_api(): @pytest.fixture -def mock_memory_api(): - return MockMemoryAPI() +def mock_vector_io_api(): + return MockVectorIOAPI() @pytest.fixture @@ -291,17 +220,11 @@ def mock_tool_runtime_api(): return MockToolRuntimeAPI() -@pytest.fixture -def mock_memory_banks_api(): - return MockMemoryBanksAPI() - - @pytest.fixture async def get_agents_impl( mock_inference_api, mock_safety_api, - mock_memory_api, - mock_memory_banks_api, + mock_vector_io_api, mock_tool_runtime_api, mock_tool_groups_api, ): @@ -314,8 +237,7 @@ async def get_agents_impl( ), inference_api=mock_inference_api, safety_api=mock_safety_api, - memory_api=mock_memory_api, - memory_banks_api=mock_memory_banks_api, + vector_io_api=mock_vector_io_api, tool_runtime_api=mock_tool_runtime_api, tool_groups_api=mock_tool_groups_api, ) @@ -484,7 +406,7 @@ async def test_chat_agent_tools( toolgroups_for_turn=[ AgentToolGroupWithArgs( name=MEMORY_TOOLGROUP, - args={"memory_banks": ["test_memory_bank"]}, + args={"vector_dbs": ["test_vector_db"]}, ) ] ) diff --git a/llama_stack/providers/inline/vector_io/chroma/__init__.py b/llama_stack/providers/inline/vector_io/chroma/__init__.py index 80620c780..68e28da63 100644 --- a/llama_stack/providers/inline/vector_io/chroma/__init__.py +++ b/llama_stack/providers/inline/vector_io/chroma/__init__.py @@ -14,8 +14,10 @@ from .config import ChromaInlineImplConfig async def get_provider_impl( config: ChromaInlineImplConfig, deps: Dict[Api, ProviderSpec] ): - from llama_stack.providers.remote.memory.chroma.chroma import ChromaMemoryAdapter + from llama_stack.providers.remote.vector_io.chroma.chroma import ( + ChromaVectorIOAdapter, + ) - impl = ChromaMemoryAdapter(config, deps[Api.inference]) + impl = ChromaVectorIOAdapter(config, deps[Api.inference]) await impl.initialize() return impl diff --git a/llama_stack/providers/remote/vector_io/chroma/__init__.py b/llama_stack/providers/remote/vector_io/chroma/__init__.py index 581d60e75..d66a93ac7 100644 --- 
a/llama_stack/providers/remote/vector_io/chroma/__init__.py +++ b/llama_stack/providers/remote/vector_io/chroma/__init__.py @@ -14,8 +14,8 @@ from .config import ChromaRemoteImplConfig async def get_adapter_impl( config: ChromaRemoteImplConfig, deps: Dict[Api, ProviderSpec] ): - from .chroma import ChromaMemoryAdapter + from .chroma import ChromaVectorIOAdapter - impl = ChromaMemoryAdapter(config, deps[Api.inference]) + impl = ChromaVectorIOAdapter(config, deps[Api.inference]) await impl.initialize() return impl diff --git a/llama_stack/providers/remote/vector_io/chroma/chroma.py b/llama_stack/providers/remote/vector_io/chroma/chroma.py index c04d775ca..724dc3f51 100644 --- a/llama_stack/providers/remote/vector_io/chroma/chroma.py +++ b/llama_stack/providers/remote/vector_io/chroma/chroma.py @@ -6,25 +6,20 @@ import asyncio import json import logging -from typing import List, Optional, Union +from typing import Any, Dict, List, Optional, Union from urllib.parse import urlparse import chromadb from numpy.typing import NDArray from llama_stack.apis.inference import InterleavedContent -from llama_stack.apis.memory import ( - Chunk, - Memory, - MemoryBankDocument, - QueryDocumentsResponse, -) -from llama_stack.apis.memory_banks import MemoryBank, MemoryBankType -from llama_stack.providers.datatypes import Api, MemoryBanksProtocolPrivate -from llama_stack.providers.inline.memory.chroma import ChromaInlineImplConfig +from llama_stack.apis.vector_dbs import VectorDB +from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO +from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate +from llama_stack.providers.inline.vector_io.chroma import ChromaInlineImplConfig from llama_stack.providers.utils.memory.vector_store import ( - BankWithIndex, EmbeddingIndex, + VectorDBWithIndex, ) from .config import ChromaRemoteImplConfig @@ -61,7 +56,7 @@ class ChromaIndex(EmbeddingIndex): async def query( self, embedding: NDArray, k: int, score_threshold: float - ) -> QueryDocumentsResponse: + ) -> QueryChunksResponse: results = await maybe_await( self.collection.query( query_embeddings=[embedding.tolist()], @@ -85,19 +80,19 @@ class ChromaIndex(EmbeddingIndex): chunks.append(chunk) scores.append(1.0 / float(dist)) - return QueryDocumentsResponse(chunks=chunks, scores=scores) + return QueryChunksResponse(chunks=chunks, scores=scores) async def delete(self): await maybe_await(self.client.delete_collection(self.collection.name)) -class ChromaMemoryAdapter(Memory, MemoryBanksProtocolPrivate): +class ChromaVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate): def __init__( self, config: Union[ChromaRemoteImplConfig, ChromaInlineImplConfig], inference_api: Api.inference, ) -> None: - log.info(f"Initializing ChromaMemoryAdapter with url: {config}") + log.info(f"Initializing ChromaVectorIOAdapter with url: {config}") self.config = config self.inference_api = inference_api @@ -123,60 +118,58 @@ class ChromaMemoryAdapter(Memory, MemoryBanksProtocolPrivate): async def shutdown(self) -> None: pass - async def register_memory_bank( + async def register_vector_db( self, - memory_bank: MemoryBank, + vector_db: VectorDB, ) -> None: - assert ( - memory_bank.memory_bank_type == MemoryBankType.vector.value - ), f"Only vector banks are supported {memory_bank.memory_bank_type}" - collection = await maybe_await( self.client.get_or_create_collection( - name=memory_bank.identifier, - metadata={"bank": memory_bank.model_dump_json()}, + name=vector_db.identifier, + metadata={"vector_db": 
vector_db.model_dump_json()}, ) ) - self.cache[memory_bank.identifier] = BankWithIndex( - memory_bank, ChromaIndex(self.client, collection), self.inference_api + self.cache[vector_db.identifier] = VectorDBWithIndex( + vector_db, ChromaIndex(self.client, collection), self.inference_api ) - async def unregister_memory_bank(self, memory_bank_id: str) -> None: - await self.cache[memory_bank_id].index.delete() - del self.cache[memory_bank_id] + async def unregister_vector_db(self, vector_db_id: str) -> None: + await self.cache[vector_db_id].index.delete() + del self.cache[vector_db_id] - async def insert_documents( + async def insert_chunks( self, - bank_id: str, - documents: List[MemoryBankDocument], - ttl_seconds: Optional[int] = None, + vector_db_id: str, + chunks: List[Chunk], + embeddings: NDArray, ) -> None: - index = await self._get_and_cache_bank_index(bank_id) + index = await self._get_and_cache_vector_db_index(vector_db_id) - await index.insert_documents(documents) + await index.insert_chunks(chunks, embeddings) - async def query_documents( + async def query_chunks( self, - bank_id: str, + vector_db_id: str, query: InterleavedContent, params: Optional[Dict[str, Any]] = None, - ) -> QueryDocumentsResponse: - index = await self._get_and_cache_bank_index(bank_id) + ) -> QueryChunksResponse: + index = await self._get_and_cache_vector_db_index(vector_db_id) - return await index.query_documents(query, params) + return await index.query_chunks(query, params) - async def _get_and_cache_bank_index(self, bank_id: str) -> BankWithIndex: - if bank_id in self.cache: - return self.cache[bank_id] + async def _get_and_cache_vector_db_index( + self, vector_db_id: str + ) -> VectorDBWithIndex: + if vector_db_id in self.cache: + return self.cache[vector_db_id] - bank = await self.memory_bank_store.get_memory_bank(bank_id) - if not bank: - raise ValueError(f"Bank {bank_id} not found in Llama Stack") - collection = await maybe_await(self.client.get_collection(bank_id)) + vector_db = await self.vector_db_store.get_vector_db(vector_db_id) + if not vector_db: + raise ValueError(f"Vector DB {vector_db_id} not found in Llama Stack") + collection = await maybe_await(self.client.get_collection(vector_db_id)) if not collection: - raise ValueError(f"Bank {bank_id} not found in Chroma") - index = BankWithIndex( - bank, ChromaIndex(self.client, collection), self.inference_api + raise ValueError(f"Vector DB {vector_db_id} not found in Chroma") + index = VectorDBWithIndex( + vector_db, ChromaIndex(self.client, collection), self.inference_api ) - self.cache[bank_id] = index + self.cache[vector_db_id] = index return index diff --git a/llama_stack/providers/remote/vector_io/pgvector/pgvector.py b/llama_stack/providers/remote/vector_io/pgvector/pgvector.py index b2c720b2c..3605f038c 100644 --- a/llama_stack/providers/remote/vector_io/pgvector/pgvector.py +++ b/llama_stack/providers/remote/vector_io/pgvector/pgvector.py @@ -12,21 +12,16 @@ from numpy.typing import NDArray from psycopg2 import sql from psycopg2.extras import execute_values, Json -from pydantic import BaseModel, parse_obj_as +from pydantic import BaseModel, TypeAdapter from llama_stack.apis.inference import InterleavedContent -from llama_stack.apis.memory import ( - Chunk, - Memory, - MemoryBankDocument, - QueryDocumentsResponse, -) -from llama_stack.apis.memory_banks import MemoryBank, MemoryBankType, VectorMemoryBank -from llama_stack.providers.datatypes import Api, MemoryBanksProtocolPrivate +from llama_stack.apis.vector_dbs import VectorDB +from 
llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO +from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate from llama_stack.providers.utils.memory.vector_store import ( - BankWithIndex, EmbeddingIndex, + VectorDBWithIndex, ) from .config import PGVectorConfig @@ -50,20 +45,20 @@ def upsert_models(cur, keys_models: List[Tuple[str, BaseModel]]): """ ) - values = [(key, Json(model.dict())) for key, model in keys_models] + values = [(key, Json(model.model_dump())) for key, model in keys_models] execute_values(cur, query, values, template="(%s, %s)") def load_models(cur, cls): cur.execute("SELECT key, data FROM metadata_store") rows = cur.fetchall() - return [parse_obj_as(cls, row["data"]) for row in rows] + return [TypeAdapter(cls).validate_python(row["data"]) for row in rows] class PGVectorIndex(EmbeddingIndex): - def __init__(self, bank: VectorMemoryBank, dimension: int, cursor): + def __init__(self, vector_db: VectorDB, dimension: int, cursor): self.cursor = cursor - self.table_name = f"vector_store_{bank.identifier}" + self.table_name = f"vector_store_{vector_db.identifier}" self.cursor.execute( f""" @@ -85,7 +80,7 @@ class PGVectorIndex(EmbeddingIndex): values.append( ( f"{chunk.document_id}:chunk-{i}", - Json(chunk.dict()), + Json(chunk.model_dump()), embeddings[i].tolist(), ) ) @@ -101,7 +96,7 @@ class PGVectorIndex(EmbeddingIndex): async def query( self, embedding: NDArray, k: int, score_threshold: float - ) -> QueryDocumentsResponse: + ) -> QueryChunksResponse: self.cursor.execute( f""" SELECT document, embedding <-> %s::vector AS distance @@ -119,13 +114,13 @@ class PGVectorIndex(EmbeddingIndex): chunks.append(Chunk(**doc)) scores.append(1.0 / float(dist)) - return QueryDocumentsResponse(chunks=chunks, scores=scores) + return QueryChunksResponse(chunks=chunks, scores=scores) async def delete(self): self.cursor.execute(f"DROP TABLE IF EXISTS {self.table_name}") -class PGVectorMemoryAdapter(Memory, MemoryBanksProtocolPrivate): +class PGVectorVectorDBAdapter(VectorIO, VectorDBsProtocolPrivate): def __init__(self, config: PGVectorConfig, inference_api: Api.inference) -> None: self.config = config self.inference_api = inference_api @@ -167,46 +162,45 @@ class PGVectorMemoryAdapter(Memory, MemoryBanksProtocolPrivate): async def shutdown(self) -> None: pass - async def register_memory_bank(self, memory_bank: MemoryBank) -> None: - assert ( - memory_bank.memory_bank_type == MemoryBankType.vector.value - ), f"Only vector banks are supported {memory_bank.memory_bank_type}" + async def register_vector_db(self, vector_db: VectorDB) -> None: + upsert_models(self.cursor, [(vector_db.identifier, vector_db)]) - upsert_models(self.cursor, [(memory_bank.identifier, memory_bank)]) - index = PGVectorIndex(memory_bank, memory_bank.embedding_dimension, self.cursor) - self.cache[memory_bank.identifier] = BankWithIndex( - memory_bank, index, self.inference_api + index = PGVectorIndex(vector_db, vector_db.embedding_dimension, self.cursor) + self.cache[vector_db.identifier] = VectorDBWithIndex( + vector_db, index, self.inference_api ) - async def unregister_memory_bank(self, memory_bank_id: str) -> None: - await self.cache[memory_bank_id].index.delete() - del self.cache[memory_bank_id] + async def unregister_vector_db(self, vector_db_id: str) -> None: + await self.cache[vector_db_id].index.delete() + del self.cache[vector_db_id] - async def insert_documents( + async def insert_chunks( self, - bank_id: str, - documents: List[MemoryBankDocument], + vector_db_id: str, + 
chunks: List[Chunk], ttl_seconds: Optional[int] = None, ) -> None: - index = await self._get_and_cache_bank_index(bank_id) - await index.insert_documents(documents) + index = await self._get_and_cache_vector_db_index(vector_db_id) + await index.insert_chunks(chunks) - async def query_documents( + async def query_chunks( self, - bank_id: str, + vector_db_id: str, query: InterleavedContent, params: Optional[Dict[str, Any]] = None, - ) -> QueryDocumentsResponse: - index = await self._get_and_cache_bank_index(bank_id) - return await index.query_documents(query, params) + ) -> QueryChunksResponse: + index = await self._get_and_cache_vector_db_index(vector_db_id) + return await index.query_chunks(query, params) - self.inference_api = inference_api + async def _get_and_cache_vector_db_index( + self, vector_db_id: str + ) -> VectorDBWithIndex: + if vector_db_id in self.cache: + return self.cache[vector_db_id] - async def _get_and_cache_bank_index(self, bank_id: str) -> BankWithIndex: - if bank_id in self.cache: - return self.cache[bank_id] - - bank = await self.memory_bank_store.get_memory_bank(bank_id) - index = PGVectorIndex(bank, bank.embedding_dimension, self.cursor) - self.cache[bank_id] = BankWithIndex(bank, index, self.inference_api) - return self.cache[bank_id] + vector_db = await self.vector_db_store.get_vector_db(vector_db_id) + index = PGVectorIndex(vector_db, vector_db.embedding_dimension, self.cursor) + self.cache[vector_db_id] = VectorDBWithIndex( + vector_db, index, self.inference_api + ) + return self.cache[vector_db_id] diff --git a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py index b1d5bd7fa..d3257b4c9 100644 --- a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py +++ b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py @@ -13,19 +13,14 @@ from qdrant_client import AsyncQdrantClient, models from qdrant_client.models import PointStruct from llama_stack.apis.inference import InterleavedContent -from llama_stack.apis.memory import ( - Chunk, - Memory, - MemoryBankDocument, - QueryDocumentsResponse, -) -from llama_stack.apis.memory_banks import MemoryBank, MemoryBankType -from llama_stack.providers.datatypes import Api, MemoryBanksProtocolPrivate -from llama_stack.providers.remote.memory.qdrant.config import QdrantConfig +from llama_stack.apis.vector_dbs import VectorDB +from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO +from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate from llama_stack.providers.utils.memory.vector_store import ( - BankWithIndex, EmbeddingIndex, + VectorDBWithIndex, ) +from .config import QdrantConfig log = logging.getLogger(__name__) CHUNK_ID_KEY = "_chunk_id" @@ -76,7 +71,7 @@ class QdrantIndex(EmbeddingIndex): async def query( self, embedding: NDArray, k: int, score_threshold: float - ) -> QueryDocumentsResponse: + ) -> QueryChunksResponse: results = ( await self.client.query_points( collection_name=self.collection_name, @@ -101,10 +96,10 @@ class QdrantIndex(EmbeddingIndex): chunks.append(chunk) scores.append(point.score) - return QueryDocumentsResponse(chunks=chunks, scores=scores) + return QueryChunksResponse(chunks=chunks, scores=scores) -class QdrantVectorMemoryAdapter(Memory, MemoryBanksProtocolPrivate): +class QdrantVectorDBAdapter(VectorIO, VectorDBsProtocolPrivate): def __init__(self, config: QdrantConfig, inference_api: Api.inference) -> None: self.config = config self.client = 
AsyncQdrantClient(**self.config.model_dump(exclude_none=True)) @@ -117,58 +112,56 @@ class QdrantVectorMemoryAdapter(Memory, MemoryBanksProtocolPrivate): async def shutdown(self) -> None: self.client.close() - async def register_memory_bank( + async def register_vector_db( self, - memory_bank: MemoryBank, + vector_db: VectorDB, ) -> None: - assert ( - memory_bank.memory_bank_type == MemoryBankType.vector - ), f"Only vector banks are supported {memory_bank.memory_bank_type}" - - index = BankWithIndex( - bank=memory_bank, - index=QdrantIndex(self.client, memory_bank.identifier), + index = VectorDBWithIndex( + vector_db=vector_db, + index=QdrantIndex(self.client, vector_db.identifier), inference_api=self.inference_api, ) - self.cache[memory_bank.identifier] = index + self.cache[vector_db.identifier] = index - async def _get_and_cache_bank_index(self, bank_id: str) -> Optional[BankWithIndex]: - if bank_id in self.cache: - return self.cache[bank_id] + async def _get_and_cache_vector_db_index( + self, vector_db_id: str + ) -> Optional[VectorDBWithIndex]: + if vector_db_id in self.cache: + return self.cache[vector_db_id] - bank = await self.memory_bank_store.get_memory_bank(bank_id) - if not bank: - raise ValueError(f"Bank {bank_id} not found") + vector_db = await self.vector_db_store.get_vector_db(vector_db_id) + if not vector_db: + raise ValueError(f"Vector DB {vector_db_id} not found") - index = BankWithIndex( - bank=bank, - index=QdrantIndex(client=self.client, collection_name=bank_id), + index = VectorDBWithIndex( + vector_db=vector_db, + index=QdrantIndex(client=self.client, collection_name=vector_db.identifier), inference_api=self.inference_api, ) - self.cache[bank_id] = index + self.cache[vector_db_id] = index return index - async def insert_documents( + async def insert_chunks( self, - bank_id: str, - documents: List[MemoryBankDocument], + vector_db_id: str, + chunks: List[Chunk], ttl_seconds: Optional[int] = None, ) -> None: - index = await self._get_and_cache_bank_index(bank_id) + index = await self._get_and_cache_vector_db_index(vector_db_id) if not index: - raise ValueError(f"Bank {bank_id} not found") + raise ValueError(f"Vector DB {vector_db_id} not found") - await index.insert_documents(documents) + await index.insert_chunks(chunks) - async def query_documents( + async def query_chunks( self, - bank_id: str, + vector_db_id: str, query: InterleavedContent, params: Optional[Dict[str, Any]] = None, - ) -> QueryDocumentsResponse: - index = await self._get_and_cache_bank_index(bank_id) + ) -> QueryChunksResponse: + index = await self._get_and_cache_vector_db_index(vector_db_id) if not index: - raise ValueError(f"Bank {bank_id} not found") + raise ValueError(f"Vector DB {vector_db_id} not found") - return await index.query_documents(query, params) + return await index.query_chunks(query, params) diff --git a/llama_stack/providers/remote/vector_io/sample/sample.py b/llama_stack/providers/remote/vector_io/sample/sample.py index b051eb544..e311be39d 100644 --- a/llama_stack/providers/remote/vector_io/sample/sample.py +++ b/llama_stack/providers/remote/vector_io/sample/sample.py @@ -4,19 +4,22 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from llama_stack.apis.memory import Memory -from llama_stack.apis.memory_banks import MemoryBank +from llama_stack.apis.vector_dbs import VectorDB +from llama_stack.apis.vector_io import VectorIO from .config import SampleConfig -class SampleMemoryImpl(Memory): +class SampleMemoryImpl(VectorIO): def __init__(self, config: SampleConfig): self.config = config - async def register_memory_bank(self, memory_bank: MemoryBank) -> None: - # these are the memory banks the Llama Stack will use to route requests to this provider + async def register_vector_db(self, vector_db: VectorDB) -> None: + # these are the vector dbs the Llama Stack will use to route requests to this provider # perform validation here if necessary pass async def initialize(self): pass + + async def shutdown(self): + pass diff --git a/llama_stack/providers/remote/vector_io/weaviate/weaviate.py b/llama_stack/providers/remote/vector_io/weaviate/weaviate.py index f1433090d..ea9ce5185 100644 --- a/llama_stack/providers/remote/vector_io/weaviate/weaviate.py +++ b/llama_stack/providers/remote/vector_io/weaviate/weaviate.py @@ -15,18 +15,13 @@ from weaviate.classes.init import Auth from weaviate.classes.query import Filter from llama_stack.apis.common.content_types import InterleavedContent -from llama_stack.apis.memory import ( - Chunk, - Memory, - MemoryBankDocument, - QueryDocumentsResponse, -) -from llama_stack.apis.memory_banks import MemoryBank, MemoryBankType +from llama_stack.apis.vector_dbs import VectorDB +from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO from llama_stack.distribution.request_headers import NeedsRequestProviderData -from llama_stack.providers.datatypes import Api, MemoryBanksProtocolPrivate +from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate from llama_stack.providers.utils.memory.vector_store import ( - BankWithIndex, EmbeddingIndex, + VectorDBWithIndex, ) from .config import WeaviateConfig, WeaviateRequestProviderData @@ -49,7 +44,7 @@ class WeaviateIndex(EmbeddingIndex): data_objects.append( wvc.data.DataObject( properties={ - "chunk_content": chunk.json(), + "chunk_content": chunk.model_dump_json(), }, vector=embeddings[i].tolist(), ) @@ -63,7 +58,7 @@ class WeaviateIndex(EmbeddingIndex): async def query( self, embedding: NDArray, k: int, score_threshold: float - ) -> QueryDocumentsResponse: + ) -> QueryChunksResponse: collection = self.client.collections.get(self.collection_name) results = collection.query.near_vector( @@ -86,7 +81,7 @@ class WeaviateIndex(EmbeddingIndex): chunks.append(chunk) scores.append(1.0 / doc.metadata.distance) - return QueryDocumentsResponse(chunks=chunks, scores=scores) + return QueryChunksResponse(chunks=chunks, scores=scores) async def delete(self, chunk_ids: List[str]) -> None: collection = self.client.collections.get(self.collection_name) @@ -96,9 +91,9 @@ class WeaviateIndex(EmbeddingIndex): class WeaviateMemoryAdapter( - Memory, + VectorIO, NeedsRequestProviderData, - MemoryBanksProtocolPrivate, + VectorDBsProtocolPrivate, ): def __init__(self, config: WeaviateConfig, inference_api: Api.inference) -> None: self.config = config @@ -129,20 +124,16 @@ class WeaviateMemoryAdapter( for client in self.client_cache.values(): client.close() - async def register_memory_bank( + async def register_vector_db( self, - memory_bank: MemoryBank, + vector_db: VectorDB, ) -> None: - assert ( - memory_bank.memory_bank_type == MemoryBankType.vector.value - ), f"Only vector banks are supported {memory_bank.memory_bank_type}" - client = 
self._get_client() # Create collection if it doesn't exist - if not client.collections.exists(memory_bank.identifier): + if not client.collections.exists(vector_db.identifier): client.collections.create( - name=memory_bank.identifier, + name=vector_db.identifier, vectorizer_config=wvc.config.Configure.Vectorizer.none(), properties=[ wvc.config.Property( @@ -152,52 +143,54 @@ class WeaviateMemoryAdapter( ], ) - self.cache[memory_bank.identifier] = BankWithIndex( - memory_bank, - WeaviateIndex(client=client, collection_name=memory_bank.identifier), + self.cache[vector_db.identifier] = VectorDBWithIndex( + vector_db, + WeaviateIndex(client=client, collection_name=vector_db.identifier), self.inference_api, ) - async def _get_and_cache_bank_index(self, bank_id: str) -> Optional[BankWithIndex]: - if bank_id in self.cache: - return self.cache[bank_id] + async def _get_and_cache_vector_db_index( + self, vector_db_id: str + ) -> Optional[VectorDBWithIndex]: + if vector_db_id in self.cache: + return self.cache[vector_db_id] - bank = await self.memory_bank_store.get_memory_bank(bank_id) - if not bank: - raise ValueError(f"Bank {bank_id} not found") + vector_db = await self.vector_db_store.get_vector_db(vector_db_id) + if not vector_db: + raise ValueError(f"Vector DB {vector_db_id} not found") client = self._get_client() - if not client.collections.exists(bank.identifier): - raise ValueError(f"Collection with name `{bank.identifier}` not found") + if not client.collections.exists(vector_db.identifier): + raise ValueError(f"Collection with name `{vector_db.identifier}` not found") - index = BankWithIndex( - bank=bank, - index=WeaviateIndex(client=client, collection_name=bank_id), + index = VectorDBWithIndex( + vector_db=vector_db, + index=WeaviateIndex(client=client, collection_name=vector_db.identifier), inference_api=self.inference_api, ) - self.cache[bank_id] = index + self.cache[vector_db_id] = index return index - async def insert_documents( + async def insert_chunks( self, - bank_id: str, - documents: List[MemoryBankDocument], + vector_db_id: str, + chunks: List[Chunk], ttl_seconds: Optional[int] = None, ) -> None: - index = await self._get_and_cache_bank_index(bank_id) + index = await self._get_and_cache_vector_db_index(vector_db_id) if not index: - raise ValueError(f"Bank {bank_id} not found") + raise ValueError(f"Vector DB {vector_db_id} not found") - await index.insert_documents(documents) + await index.insert_chunks(chunks) - async def query_documents( + async def query_chunks( self, - bank_id: str, + vector_db_id: str, query: InterleavedContent, params: Optional[Dict[str, Any]] = None, - ) -> QueryDocumentsResponse: - index = await self._get_and_cache_bank_index(bank_id) + ) -> QueryChunksResponse: + index = await self._get_and_cache_vector_db_index(vector_db_id) if not index: - raise ValueError(f"Bank {bank_id} not found") + raise ValueError(f"Vector DB {vector_db_id} not found") - return await index.query_documents(query, params) + return await index.query_chunks(query, params) diff --git a/llama_stack/providers/tests/eval/fixtures.py b/llama_stack/providers/tests/eval/fixtures.py index 37bb0527a..009e65fb3 100644 --- a/llama_stack/providers/tests/eval/fixtures.py +++ b/llama_stack/providers/tests/eval/fixtures.py @@ -53,7 +53,7 @@ async def eval_stack( "inference", "agents", "safety", - "memory", + "vector_io", "tool_runtime", ]: fixture = request.getfixturevalue(f"{key}_{fixture_dict[key]}") @@ -69,7 +69,7 @@ async def eval_stack( Api.scoring, Api.agents, Api.safety, - Api.memory, + 
Api.vector_io, Api.tool_runtime, ], providers, diff --git a/llama_stack/providers/tests/tools/fixtures.py b/llama_stack/providers/tests/tools/fixtures.py index a559dbf8c..03752881a 100644 --- a/llama_stack/providers/tests/tools/fixtures.py +++ b/llama_stack/providers/tests/tools/fixtures.py @@ -83,7 +83,7 @@ async def tools_stack( providers = {} provider_data = {} - for key in ["inference", "memory", "tool_runtime"]: + for key in ["inference", "vector_io", "tool_runtime"]: fixture = request.getfixturevalue(f"{key}_{fixture_dict[key]}") providers[key] = fixture.providers if key == "inference": @@ -117,7 +117,12 @@ async def tools_stack( ) test_stack = await construct_stack_for_test( - [Api.tool_groups, Api.inference, Api.memory, Api.tool_runtime], + [ + Api.tool_groups, + Api.inference, + Api.vector_io, + Api.tool_runtime, + ], providers, provider_data, models=models, diff --git a/llama_stack/providers/tests/tools/test_tools.py b/llama_stack/providers/tests/tools/test_tools.py index 16081b939..62b18ea66 100644 --- a/llama_stack/providers/tests/tools/test_tools.py +++ b/llama_stack/providers/tests/tools/test_tools.py @@ -8,10 +8,7 @@ import os import pytest -from llama_stack.apis.inference import UserMessage -from llama_stack.apis.memory import MemoryBankDocument -from llama_stack.apis.memory_banks import VectorMemoryBankParams -from llama_stack.apis.tools import ToolInvocationResult +from llama_stack.apis.tools import RAGDocument, RAGQueryResult, ToolInvocationResult from llama_stack.providers.datatypes import Api @@ -36,7 +33,7 @@ def sample_documents(): "lora_finetune.rst", ] return [ - MemoryBankDocument( + RAGDocument( document_id=f"num-{i}", content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}", mime_type="text/plain", @@ -57,7 +54,7 @@ class TestTools: # Execute the tool response = await tools_impl.invoke_tool( - tool_name="web_search", args={"query": sample_search_query} + tool_name="web_search", kwargs={"query": sample_search_query} ) # Verify the response @@ -75,7 +72,7 @@ class TestTools: tools_impl = tools_stack.impls[Api.tool_runtime] response = await tools_impl.invoke_tool( - tool_name="wolfram_alpha", args={"query": sample_wolfram_alpha_query} + tool_name="wolfram_alpha", kwargs={"query": sample_wolfram_alpha_query} ) # Verify the response @@ -85,43 +82,33 @@ class TestTools: assert isinstance(response.content, str) @pytest.mark.asyncio - async def test_memory_tool(self, tools_stack, sample_documents): + async def test_rag_tool(self, tools_stack, sample_documents): """Test the memory tool functionality.""" - memory_banks_impl = tools_stack.impls[Api.memory_banks] - memory_impl = tools_stack.impls[Api.memory] + vector_dbs_impl = tools_stack.impls[Api.vector_dbs] tools_impl = tools_stack.impls[Api.tool_runtime] # Register memory bank - await memory_banks_impl.register_memory_bank( - memory_bank_id="test_bank", - params=VectorMemoryBankParams( - embedding_model="all-MiniLM-L6-v2", - chunk_size_in_tokens=512, - overlap_size_in_tokens=64, - ), + await vector_dbs_impl.register( + vector_db_id="test_bank", + embedding_model="all-MiniLM-L6-v2", + embedding_dimension=384, provider_id="faiss", ) # Insert documents into memory - await memory_impl.insert_documents( - bank_id="test_bank", + await tools_impl.rag_tool.insert_documents( documents=sample_documents, + vector_db_id="test_bank", + chunk_size_in_tokens=512, ) # Execute the memory tool - response = await tools_impl.invoke_tool( - tool_name="memory", - args={ - "messages": [ - UserMessage( - 
content="What are the main topics covered in the documentation?", - ) - ], - "memory_bank_ids": ["test_bank"], - }, + response = await tools_impl.rag_tool.query_context( + content="What are the main topics covered in the documentation?", + vector_db_ids=["test_bank"], ) # Verify the response - assert isinstance(response, ToolInvocationResult) + assert isinstance(response, RAGQueryResult) assert response.content is not None assert len(response.content) > 0 diff --git a/llama_stack/templates/bedrock/bedrock.py b/llama_stack/templates/bedrock/bedrock.py index 668134be8..20f670891 100644 --- a/llama_stack/templates/bedrock/bedrock.py +++ b/llama_stack/templates/bedrock/bedrock.py @@ -10,7 +10,7 @@ from llama_models.sku_list import all_registered_models from llama_stack.apis.models import ModelInput from llama_stack.distribution.datatypes import Provider, ToolGroupInput -from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig +from llama_stack.providers.inline.vector_io.faiss.config import FaissImplConfig from llama_stack.providers.remote.inference.bedrock.bedrock import MODEL_ALIASES from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -18,7 +18,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin def get_distribution_template() -> DistributionTemplate: providers = { "inference": ["remote::bedrock"], - "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], "safety": ["remote::bedrock"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], @@ -34,7 +34,7 @@ def get_distribution_template() -> DistributionTemplate: ], } name = "bedrock" - memory_provider = Provider( + vector_io_provider = Provider( provider_id="faiss", provider_type="inline::faiss", config=FaissImplConfig.sample_run_config(f"distributions/{name}"), @@ -78,7 +78,7 @@ def get_distribution_template() -> DistributionTemplate: run_configs={ "run.yaml": RunConfigSettings( provider_overrides={ - "memory": [memory_provider], + "vector_io": [vector_io_provider], }, default_models=default_models, default_tool_groups=default_tool_groups, diff --git a/llama_stack/templates/bedrock/build.yaml b/llama_stack/templates/bedrock/build.yaml index 95b8684e3..9ae11e9bb 100644 --- a/llama_stack/templates/bedrock/build.yaml +++ b/llama_stack/templates/bedrock/build.yaml @@ -4,7 +4,7 @@ distribution_spec: providers: inference: - remote::bedrock - memory: + vector_io: - inline::faiss - remote::chromadb - remote::pgvector diff --git a/llama_stack/templates/bedrock/run.yaml b/llama_stack/templates/bedrock/run.yaml index 118723bbc..577263bbf 100644 --- a/llama_stack/templates/bedrock/run.yaml +++ b/llama_stack/templates/bedrock/run.yaml @@ -5,17 +5,17 @@ apis: - datasetio - eval - inference -- memory - safety - scoring - telemetry - tool_runtime +- vector_io providers: inference: - provider_id: bedrock provider_type: remote::bedrock config: {} - memory: + vector_io: - provider_id: faiss provider_type: inline::faiss config: @@ -104,7 +104,7 @@ models: provider_model_id: meta.llama3-1-405b-instruct-v1:0 model_type: llm shields: [] -memory_banks: [] +vector_dbs: [] datasets: [] scoring_fns: [] eval_tasks: [] diff --git a/llama_stack/templates/cerebras/build.yaml b/llama_stack/templates/cerebras/build.yaml index 9f187d3c7..6d43ed0ca 100644 --- a/llama_stack/templates/cerebras/build.yaml +++ b/llama_stack/templates/cerebras/build.yaml @@ -6,7 +6,7 @@ 
distribution_spec: - remote::cerebras safety: - inline::llama-guard - memory: + vector_io: - inline::faiss - remote::chromadb - remote::pgvector diff --git a/llama_stack/templates/cerebras/cerebras.py b/llama_stack/templates/cerebras/cerebras.py index 8f6bd77af..be51e635d 100644 --- a/llama_stack/templates/cerebras/cerebras.py +++ b/llama_stack/templates/cerebras/cerebras.py @@ -13,7 +13,7 @@ from llama_stack.distribution.datatypes import ModelInput, Provider, ToolGroupIn from llama_stack.providers.inline.inference.sentence_transformers import ( SentenceTransformersInferenceConfig, ) -from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig +from llama_stack.providers.inline.vector_io.faiss.config import FaissImplConfig from llama_stack.providers.remote.inference.cerebras import CerebrasImplConfig from llama_stack.providers.remote.inference.cerebras.cerebras import model_aliases from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -23,7 +23,7 @@ def get_distribution_template() -> DistributionTemplate: providers = { "inference": ["remote::cerebras"], "safety": ["inline::llama-guard"], - "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], "agents": ["inline::meta-reference"], "eval": ["inline::meta-reference"], "datasetio": ["remote::huggingface", "inline::localfs"], @@ -68,7 +68,7 @@ def get_distribution_template() -> DistributionTemplate: "embedding_dimension": 384, }, ) - memory_provider = Provider( + vector_io_provider = Provider( provider_id="faiss", provider_type="inline::faiss", config=FaissImplConfig.sample_run_config(f"distributions/{name}"), @@ -100,7 +100,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider, embedding_provider], - "memory": [memory_provider], + "vector_io": [vector_io_provider], }, default_models=default_models + [embedding_model], default_shields=[], diff --git a/llama_stack/templates/cerebras/run.yaml b/llama_stack/templates/cerebras/run.yaml index bfc492bda..0553f0749 100644 --- a/llama_stack/templates/cerebras/run.yaml +++ b/llama_stack/templates/cerebras/run.yaml @@ -5,11 +5,11 @@ apis: - datasetio - eval - inference -- memory - safety - scoring - telemetry - tool_runtime +- vector_io providers: inference: - provider_id: cerebras @@ -24,7 +24,7 @@ providers: - provider_id: llama-guard provider_type: inline::llama-guard config: {} - memory: + vector_io: - provider_id: faiss provider_type: inline::faiss config: @@ -106,7 +106,7 @@ models: provider_id: sentence-transformers model_type: embedding shields: [] -memory_banks: [] +vector_dbs: [] datasets: [] scoring_fns: [] eval_tasks: [] diff --git a/llama_stack/templates/experimental-post-training/run.yaml b/llama_stack/templates/experimental-post-training/run.yaml index 87465137f..14323573c 100644 --- a/llama_stack/templates/experimental-post-training/run.yaml +++ b/llama_stack/templates/experimental-post-training/run.yaml @@ -60,7 +60,7 @@ providers: - provider_id: llama-guard provider_type: inline::llama-guard config: {} - memory: + vector_io: - provider_id: faiss provider_type: inline::faiss config: @@ -82,7 +82,7 @@ metadata_store: db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/registry.db models: [] shields: [] -memory_banks: [] +vector_dbs: [] datasets: [] scoring_fns: [] eval_tasks: [] diff --git a/llama_stack/templates/fireworks/build.yaml 
b/llama_stack/templates/fireworks/build.yaml index d8e1e27ee..7e19cd5e6 100644 --- a/llama_stack/templates/fireworks/build.yaml +++ b/llama_stack/templates/fireworks/build.yaml @@ -4,7 +4,7 @@ distribution_spec: providers: inference: - remote::fireworks - memory: + vector_io: - inline::faiss - remote::chromadb - remote::pgvector diff --git a/llama_stack/templates/fireworks/fireworks.py b/llama_stack/templates/fireworks/fireworks.py index 14fd392c4..5f1b9e8a0 100644 --- a/llama_stack/templates/fireworks/fireworks.py +++ b/llama_stack/templates/fireworks/fireworks.py @@ -18,7 +18,7 @@ from llama_stack.distribution.datatypes import ( from llama_stack.providers.inline.inference.sentence_transformers import ( SentenceTransformersInferenceConfig, ) -from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig +from llama_stack.providers.inline.vector_io.faiss.config import FaissImplConfig from llama_stack.providers.remote.inference.fireworks import FireworksImplConfig from llama_stack.providers.remote.inference.fireworks.fireworks import MODEL_ALIASES from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -27,7 +27,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin def get_distribution_template() -> DistributionTemplate: providers = { "inference": ["remote::fireworks"], - "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], @@ -55,7 +55,7 @@ def get_distribution_template() -> DistributionTemplate: provider_type="inline::sentence-transformers", config=SentenceTransformersInferenceConfig.sample_run_config(), ) - memory_provider = Provider( + vector_io_provider = Provider( provider_id="faiss", provider_type="inline::faiss", config=FaissImplConfig.sample_run_config(f"distributions/{name}"), @@ -107,7 +107,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider, embedding_provider], - "memory": [memory_provider], + "vector_io": [vector_io_provider], }, default_models=default_models + [embedding_model], default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")], @@ -119,7 +119,7 @@ def get_distribution_template() -> DistributionTemplate: inference_provider, embedding_provider, ], - "memory": [memory_provider], + "vector_io": [vector_io_provider], "safety": [ Provider( provider_id="llama-guard", diff --git a/llama_stack/templates/fireworks/run-with-safety.yaml b/llama_stack/templates/fireworks/run-with-safety.yaml index dd21120ed..659ec5191 100644 --- a/llama_stack/templates/fireworks/run-with-safety.yaml +++ b/llama_stack/templates/fireworks/run-with-safety.yaml @@ -5,11 +5,11 @@ apis: - datasetio - eval - inference -- memory - safety - scoring - telemetry - tool_runtime +- vector_io providers: inference: - provider_id: fireworks @@ -20,7 +20,7 @@ providers: - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} - memory: + vector_io: - provider_id: faiss provider_type: inline::faiss config: @@ -161,7 +161,7 @@ shields: provider_id: llama-guard-vision - shield_id: CodeScanner provider_id: code-scanner -memory_banks: [] +vector_dbs: [] datasets: [] scoring_fns: [] eval_tasks: [] diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index 
993417b50..9fb61f842 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -5,11 +5,11 @@ apis: - datasetio - eval - inference -- memory - safety - scoring - telemetry - tool_runtime +- vector_io providers: inference: - provider_id: fireworks @@ -20,7 +20,7 @@ providers: - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} - memory: + vector_io: - provider_id: faiss provider_type: inline::faiss config: @@ -150,7 +150,7 @@ models: model_type: embedding shields: - shield_id: meta-llama/Llama-Guard-3-8B -memory_banks: [] +vector_dbs: [] datasets: [] scoring_fns: [] eval_tasks: [] diff --git a/llama_stack/templates/hf-endpoint/build.yaml b/llama_stack/templates/hf-endpoint/build.yaml index f4fdc4a3d..82a460bd9 100644 --- a/llama_stack/templates/hf-endpoint/build.yaml +++ b/llama_stack/templates/hf-endpoint/build.yaml @@ -4,7 +4,7 @@ distribution_spec: providers: inference: - remote::hf::endpoint - memory: + vector_io: - inline::faiss - remote::chromadb - remote::pgvector diff --git a/llama_stack/templates/hf-endpoint/hf_endpoint.py b/llama_stack/templates/hf-endpoint/hf_endpoint.py index 1a5c23a42..f9bfe85f9 100644 --- a/llama_stack/templates/hf-endpoint/hf_endpoint.py +++ b/llama_stack/templates/hf-endpoint/hf_endpoint.py @@ -14,7 +14,7 @@ from llama_stack.distribution.datatypes import ( from llama_stack.providers.inline.inference.sentence_transformers import ( SentenceTransformersInferenceConfig, ) -from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig +from llama_stack.providers.inline.vector_io.faiss.config import FaissImplConfig from llama_stack.providers.remote.inference.tgi import InferenceEndpointImplConfig from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -22,7 +22,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin def get_distribution_template() -> DistributionTemplate: providers = { "inference": ["remote::hf::endpoint"], - "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], @@ -48,7 +48,7 @@ def get_distribution_template() -> DistributionTemplate: provider_type="inline::sentence-transformers", config=SentenceTransformersInferenceConfig.sample_run_config(), ) - memory_provider = Provider( + vector_io_provider = Provider( provider_id="faiss", provider_type="inline::faiss", config=FaissImplConfig.sample_run_config(f"distributions/{name}"), @@ -97,7 +97,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider, embedding_provider], - "memory": [memory_provider], + "vector_io": [vector_io_provider], }, default_models=[inference_model, embedding_model], default_tool_groups=default_tool_groups, @@ -115,7 +115,7 @@ def get_distribution_template() -> DistributionTemplate: ), ), ], - "memory": [memory_provider], + "vector_io": [vector_io_provider], }, default_models=[ inference_model, diff --git a/llama_stack/templates/hf-endpoint/run-with-safety.yaml b/llama_stack/templates/hf-endpoint/run-with-safety.yaml index 537e4024f..dfa094fe6 100644 --- a/llama_stack/templates/hf-endpoint/run-with-safety.yaml +++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml @@ -5,11 +5,11 @@ apis: - datasetio - eval - inference -- memory 
- safety - scoring - telemetry - tool_runtime +- vector_io providers: inference: - provider_id: hf-endpoint @@ -25,7 +25,7 @@ providers: config: endpoint_name: ${env.SAFETY_INFERENCE_ENDPOINT_NAME} api_token: ${env.HF_API_TOKEN} - memory: + vector_io: - provider_id: faiss provider_type: inline::faiss config: @@ -113,7 +113,7 @@ models: model_type: embedding shields: - shield_id: ${env.SAFETY_MODEL} -memory_banks: [] +vector_dbs: [] datasets: [] scoring_fns: [] eval_tasks: [] diff --git a/llama_stack/templates/hf-endpoint/run.yaml b/llama_stack/templates/hf-endpoint/run.yaml index b31f28434..fb5d7fa31 100644 --- a/llama_stack/templates/hf-endpoint/run.yaml +++ b/llama_stack/templates/hf-endpoint/run.yaml @@ -5,11 +5,11 @@ apis: - datasetio - eval - inference -- memory - safety - scoring - telemetry - tool_runtime +- vector_io providers: inference: - provider_id: hf-endpoint @@ -20,7 +20,7 @@ providers: - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} - memory: + vector_io: - provider_id: faiss provider_type: inline::faiss config: @@ -103,7 +103,7 @@ models: provider_id: sentence-transformers model_type: embedding shields: [] -memory_banks: [] +vector_dbs: [] datasets: [] scoring_fns: [] eval_tasks: [] diff --git a/llama_stack/templates/hf-serverless/build.yaml b/llama_stack/templates/hf-serverless/build.yaml index d075a7449..0eb4e0509 100644 --- a/llama_stack/templates/hf-serverless/build.yaml +++ b/llama_stack/templates/hf-serverless/build.yaml @@ -4,7 +4,7 @@ distribution_spec: providers: inference: - remote::hf::serverless - memory: + vector_io: - inline::faiss - remote::chromadb - remote::pgvector diff --git a/llama_stack/templates/hf-serverless/hf_serverless.py b/llama_stack/templates/hf-serverless/hf_serverless.py index 0292f13e2..4f3c29404 100644 --- a/llama_stack/templates/hf-serverless/hf_serverless.py +++ b/llama_stack/templates/hf-serverless/hf_serverless.py @@ -14,7 +14,7 @@ from llama_stack.distribution.datatypes import ( from llama_stack.providers.inline.inference.sentence_transformers import ( SentenceTransformersInferenceConfig, ) -from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig +from llama_stack.providers.inline.vector_io.faiss.config import FaissImplConfig from llama_stack.providers.remote.inference.tgi import InferenceAPIImplConfig from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -22,7 +22,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin def get_distribution_template() -> DistributionTemplate: providers = { "inference": ["remote::hf::serverless"], - "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], @@ -49,7 +49,7 @@ def get_distribution_template() -> DistributionTemplate: provider_type="inline::sentence-transformers", config=SentenceTransformersInferenceConfig.sample_run_config(), ) - memory_provider = Provider( + vector_io_provider = Provider( provider_id="faiss", provider_type="inline::faiss", config=FaissImplConfig.sample_run_config(f"distributions/{name}"), @@ -98,7 +98,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider, embedding_provider], - "memory": [memory_provider], + "vector_io": [vector_io_provider], }, 
default_models=[inference_model, embedding_model], default_tool_groups=default_tool_groups, @@ -116,7 +116,7 @@ def get_distribution_template() -> DistributionTemplate: ), ), ], - "memory": [memory_provider], + "vector_io": [vector_io_provider], }, default_models=[ inference_model, diff --git a/llama_stack/templates/hf-serverless/run-with-safety.yaml b/llama_stack/templates/hf-serverless/run-with-safety.yaml index 484b2d0bd..0575efaef 100644 --- a/llama_stack/templates/hf-serverless/run-with-safety.yaml +++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml @@ -5,11 +5,11 @@ apis: - datasetio - eval - inference -- memory - safety - scoring - telemetry - tool_runtime +- vector_io providers: inference: - provider_id: hf-serverless @@ -25,7 +25,7 @@ providers: config: huggingface_repo: ${env.SAFETY_MODEL} api_token: ${env.HF_API_TOKEN} - memory: + vector_io: - provider_id: faiss provider_type: inline::faiss config: @@ -113,7 +113,7 @@ models: model_type: embedding shields: - shield_id: ${env.SAFETY_MODEL} -memory_banks: [] +vector_dbs: [] datasets: [] scoring_fns: [] eval_tasks: [] diff --git a/llama_stack/templates/hf-serverless/run.yaml b/llama_stack/templates/hf-serverless/run.yaml index a75baf1f9..b87edd744 100644 --- a/llama_stack/templates/hf-serverless/run.yaml +++ b/llama_stack/templates/hf-serverless/run.yaml @@ -5,11 +5,11 @@ apis: - datasetio - eval - inference -- memory - safety - scoring - telemetry - tool_runtime +- vector_io providers: inference: - provider_id: hf-serverless @@ -20,7 +20,7 @@ providers: - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} - memory: + vector_io: - provider_id: faiss provider_type: inline::faiss config: @@ -103,7 +103,7 @@ models: provider_id: sentence-transformers model_type: embedding shields: [] -memory_banks: [] +vector_dbs: [] datasets: [] scoring_fns: [] eval_tasks: [] diff --git a/llama_stack/templates/meta-reference-gpu/build.yaml b/llama_stack/templates/meta-reference-gpu/build.yaml index a75d3604b..f5371f0d6 100644 --- a/llama_stack/templates/meta-reference-gpu/build.yaml +++ b/llama_stack/templates/meta-reference-gpu/build.yaml @@ -4,7 +4,7 @@ distribution_spec: providers: inference: - inline::meta-reference - memory: + vector_io: - inline::faiss - remote::chromadb - remote::pgvector diff --git a/llama_stack/templates/meta-reference-gpu/meta_reference.py b/llama_stack/templates/meta-reference-gpu/meta_reference.py index 584d38256..dae4f0218 100644 --- a/llama_stack/templates/meta-reference-gpu/meta_reference.py +++ b/llama_stack/templates/meta-reference-gpu/meta_reference.py @@ -19,14 +19,14 @@ from llama_stack.providers.inline.inference.meta_reference import ( from llama_stack.providers.inline.inference.sentence_transformers import ( SentenceTransformersInferenceConfig, ) -from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig +from llama_stack.providers.inline.vector_io.faiss.config import FaissImplConfig from llama_stack.templates.template import DistributionTemplate, RunConfigSettings def get_distribution_template() -> DistributionTemplate: providers = { "inference": ["inline::meta-reference"], - "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], @@ -55,7 +55,7 @@ def get_distribution_template() -> DistributionTemplate: 
provider_type="inline::sentence-transformers", config=SentenceTransformersInferenceConfig.sample_run_config(), ) - memory_provider = Provider( + vector_io_provider = Provider( provider_id="faiss", provider_type="inline::faiss", config=FaissImplConfig.sample_run_config(f"distributions/{name}"), @@ -103,7 +103,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider, embedding_provider], - "memory": [memory_provider], + "vector_io": [vector_io_provider], }, default_models=[inference_model, embedding_model], default_tool_groups=default_tool_groups, @@ -122,7 +122,7 @@ def get_distribution_template() -> DistributionTemplate: ), ), ], - "memory": [memory_provider], + "vector_io": [vector_io_provider], }, default_models=[ inference_model, diff --git a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml index 9dbdb6fa5..54ddef155 100644 --- a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml +++ b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml @@ -5,11 +5,11 @@ apis: - datasetio - eval - inference -- memory - safety - scoring - telemetry - tool_runtime +- vector_io providers: inference: - provider_id: meta-reference-inference @@ -27,7 +27,7 @@ providers: model: ${env.SAFETY_MODEL} max_seq_len: 4096 checkpoint_dir: ${env.SAFETY_CHECKPOINT_DIR:null} - memory: + vector_io: - provider_id: faiss provider_type: inline::faiss config: @@ -115,7 +115,7 @@ models: model_type: embedding shields: - shield_id: ${env.SAFETY_MODEL} -memory_banks: [] +vector_dbs: [] datasets: [] scoring_fns: [] eval_tasks: [] diff --git a/llama_stack/templates/meta-reference-gpu/run.yaml b/llama_stack/templates/meta-reference-gpu/run.yaml index 6465215f0..cde581d19 100644 --- a/llama_stack/templates/meta-reference-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-gpu/run.yaml @@ -5,11 +5,11 @@ apis: - datasetio - eval - inference -- memory - safety - scoring - telemetry - tool_runtime +- vector_io providers: inference: - provider_id: meta-reference-inference @@ -21,7 +21,7 @@ providers: - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} - memory: + vector_io: - provider_id: faiss provider_type: inline::faiss config: @@ -104,7 +104,7 @@ models: provider_id: sentence-transformers model_type: embedding shields: [] -memory_banks: [] +vector_dbs: [] datasets: [] scoring_fns: [] eval_tasks: [] diff --git a/llama_stack/templates/meta-reference-quantized-gpu/build.yaml b/llama_stack/templates/meta-reference-quantized-gpu/build.yaml index 4c3e2f492..aa23ad313 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/build.yaml +++ b/llama_stack/templates/meta-reference-quantized-gpu/build.yaml @@ -4,7 +4,7 @@ distribution_spec: providers: inference: - inline::meta-reference-quantized - memory: + vector_io: - inline::faiss - remote::chromadb - remote::pgvector diff --git a/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py b/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py index 56293f42c..4e9cbf1fe 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py +++ b/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py @@ -14,14 +14,14 @@ from llama_stack.providers.inline.inference.meta_reference import ( from llama_stack.providers.inline.inference.sentence_transformers import ( SentenceTransformersInferenceConfig, ) -from 
llama_stack.providers.inline.memory.faiss.config import FaissImplConfig +from llama_stack.providers.inline.vector_io.faiss.config import FaissImplConfig from llama_stack.templates.template import DistributionTemplate, RunConfigSettings def get_distribution_template() -> DistributionTemplate: providers = { "inference": ["inline::meta-reference-quantized"], - "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], @@ -64,7 +64,7 @@ def get_distribution_template() -> DistributionTemplate: provider_type="inline::sentence-transformers", config=SentenceTransformersInferenceConfig.sample_run_config(), ) - memory_provider = Provider( + vector_io_provider = Provider( provider_id="faiss", provider_type="inline::faiss", config=FaissImplConfig.sample_run_config(f"distributions/{name}"), @@ -93,7 +93,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider, embedding_provider], - "memory": [memory_provider], + "vector_io": [vector_io_provider], }, default_models=[inference_model, embedding_model], default_tool_groups=default_tool_groups, diff --git a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml index 059034741..cc5793f8f 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml @@ -5,11 +5,11 @@ apis: - datasetio - eval - inference -- memory - safety - scoring - telemetry - tool_runtime +- vector_io providers: inference: - provider_id: meta-reference-inference @@ -23,7 +23,7 @@ providers: - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} - memory: + vector_io: - provider_id: faiss provider_type: inline::faiss config: @@ -106,7 +106,7 @@ models: provider_id: sentence-transformers model_type: embedding shields: [] -memory_banks: [] +vector_dbs: [] datasets: [] scoring_fns: [] eval_tasks: [] diff --git a/llama_stack/templates/nvidia/build.yaml b/llama_stack/templates/nvidia/build.yaml index 7bd2a3865..d6a510e2e 100644 --- a/llama_stack/templates/nvidia/build.yaml +++ b/llama_stack/templates/nvidia/build.yaml @@ -4,7 +4,7 @@ distribution_spec: providers: inference: - remote::nvidia - memory: + vector_io: - inline::faiss safety: - inline::llama-guard diff --git a/llama_stack/templates/nvidia/nvidia.py b/llama_stack/templates/nvidia/nvidia.py index e72fe359f..5693ba12d 100644 --- a/llama_stack/templates/nvidia/nvidia.py +++ b/llama_stack/templates/nvidia/nvidia.py @@ -17,7 +17,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin def get_distribution_template() -> DistributionTemplate: providers = { "inference": ["remote::nvidia"], - "memory": ["inline::faiss"], + "vector_io": ["inline::faiss"], "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml index 07c901371..317aa1031 100644 --- a/llama_stack/templates/nvidia/run.yaml +++ b/llama_stack/templates/nvidia/run.yaml @@ -5,11 +5,11 @@ apis: - datasetio - eval - inference -- memory - safety - scoring - telemetry - tool_runtime +- vector_io providers: inference: - provider_id: nvidia @@ -17,7 
+17,7 @@ providers: config: url: https://integrate.api.nvidia.com api_key: ${env.NVIDIA_API_KEY} - memory: + vector_io: - provider_id: faiss provider_type: inline::faiss config: @@ -136,7 +136,7 @@ models: provider_model_id: meta/llama-3.2-90b-vision-instruct model_type: llm shields: [] -memory_banks: [] +vector_dbs: [] datasets: [] scoring_fns: [] eval_tasks: [] diff --git a/llama_stack/templates/ollama/build.yaml b/llama_stack/templates/ollama/build.yaml index 5f2e010ee..c3ed88fb8 100644 --- a/llama_stack/templates/ollama/build.yaml +++ b/llama_stack/templates/ollama/build.yaml @@ -4,7 +4,7 @@ distribution_spec: providers: inference: - remote::ollama - memory: + vector_io: - inline::faiss - remote::chromadb - remote::pgvector diff --git a/llama_stack/templates/ollama/ollama.py b/llama_stack/templates/ollama/ollama.py index 2288ea3a6..bdbd1e142 100644 --- a/llama_stack/templates/ollama/ollama.py +++ b/llama_stack/templates/ollama/ollama.py @@ -16,7 +16,7 @@ from llama_stack.distribution.datatypes import ( from llama_stack.providers.inline.inference.sentence_transformers import ( SentenceTransformersInferenceConfig, ) -from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig +from llama_stack.providers.inline.vector_io.faiss.config import FaissImplConfig from llama_stack.providers.remote.inference.ollama import OllamaImplConfig from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -24,7 +24,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin def get_distribution_template() -> DistributionTemplate: providers = { "inference": ["remote::ollama"], - "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], @@ -49,7 +49,7 @@ def get_distribution_template() -> DistributionTemplate: provider_type="inline::sentence-transformers", config=SentenceTransformersInferenceConfig.sample_run_config(), ) - memory_provider = Provider( + vector_io_provider = Provider( provider_id="faiss", provider_type="inline::faiss", config=FaissImplConfig.sample_run_config(f"distributions/{name}"), @@ -98,7 +98,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider, embedding_provider], - "memory": [memory_provider], + "vector_io": [vector_io_provider], }, default_models=[inference_model, embedding_model], default_tool_groups=default_tool_groups, @@ -109,7 +109,7 @@ def get_distribution_template() -> DistributionTemplate: inference_provider, embedding_provider, ], - "memory": [memory_provider], + "vector_io": [vector_io_provider], "safety": [ Provider( provider_id="llama-guard", diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml index a808590c3..afb0b1938 100644 --- a/llama_stack/templates/ollama/run-with-safety.yaml +++ b/llama_stack/templates/ollama/run-with-safety.yaml @@ -5,11 +5,11 @@ apis: - datasetio - eval - inference -- memory - safety - scoring - telemetry - tool_runtime +- vector_io providers: inference: - provider_id: ollama @@ -19,7 +19,7 @@ providers: - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} - memory: + vector_io: - provider_id: faiss provider_type: inline::faiss config: @@ -110,7 +110,7 @@ shields: provider_id: llama-guard - 
shield_id: CodeScanner provider_id: code-scanner -memory_banks: [] +vector_dbs: [] datasets: [] scoring_fns: [] eval_tasks: [] diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml index 2c69296fc..976068670 100644 --- a/llama_stack/templates/ollama/run.yaml +++ b/llama_stack/templates/ollama/run.yaml @@ -5,11 +5,11 @@ apis: - datasetio - eval - inference -- memory - safety - scoring - telemetry - tool_runtime +- vector_io providers: inference: - provider_id: ollama @@ -19,7 +19,7 @@ providers: - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} - memory: + vector_io: - provider_id: faiss provider_type: inline::faiss config: @@ -99,7 +99,7 @@ models: provider_id: sentence-transformers model_type: embedding shields: [] -memory_banks: [] +vector_dbs: [] datasets: [] scoring_fns: [] eval_tasks: [] diff --git a/llama_stack/templates/remote-vllm/build.yaml b/llama_stack/templates/remote-vllm/build.yaml index 6f301914c..409b2ba10 100644 --- a/llama_stack/templates/remote-vllm/build.yaml +++ b/llama_stack/templates/remote-vllm/build.yaml @@ -4,7 +4,7 @@ distribution_spec: providers: inference: - remote::vllm - memory: + vector_io: - inline::faiss - remote::chromadb - remote::pgvector diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml index 5e5bd6af6..e26d0f99f 100644 --- a/llama_stack/templates/remote-vllm/run-with-safety.yaml +++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml @@ -5,11 +5,11 @@ apis: - datasetio - eval - inference -- memory - safety - scoring - telemetry - tool_runtime +- vector_io providers: inference: - provider_id: vllm-inference @@ -27,7 +27,7 @@ providers: - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} - memory: + vector_io: - provider_id: faiss provider_type: inline::faiss config: @@ -115,7 +115,7 @@ models: model_type: embedding shields: - shield_id: ${env.SAFETY_MODEL} -memory_banks: [] +vector_dbs: [] datasets: [] scoring_fns: [] eval_tasks: [] diff --git a/llama_stack/templates/remote-vllm/run.yaml b/llama_stack/templates/remote-vllm/run.yaml index 4eac4dad7..dc54d216d 100644 --- a/llama_stack/templates/remote-vllm/run.yaml +++ b/llama_stack/templates/remote-vllm/run.yaml @@ -5,11 +5,11 @@ apis: - datasetio - eval - inference -- memory - safety - scoring - telemetry - tool_runtime +- vector_io providers: inference: - provider_id: vllm-inference @@ -21,7 +21,7 @@ providers: - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} - memory: + vector_io: - provider_id: faiss provider_type: inline::faiss config: @@ -104,7 +104,7 @@ models: provider_id: sentence-transformers model_type: embedding shields: [] -memory_banks: [] +vector_dbs: [] datasets: [] scoring_fns: [] eval_tasks: [] diff --git a/llama_stack/templates/remote-vllm/vllm.py b/llama_stack/templates/remote-vllm/vllm.py index 296e2b4f5..f91ad24a7 100644 --- a/llama_stack/templates/remote-vllm/vllm.py +++ b/llama_stack/templates/remote-vllm/vllm.py @@ -16,7 +16,7 @@ from llama_stack.distribution.datatypes import ( from llama_stack.providers.inline.inference.sentence_transformers import ( SentenceTransformersInferenceConfig, ) -from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig +from llama_stack.providers.inline.vector_io.faiss.config import FaissImplConfig from llama_stack.providers.remote.inference.vllm import VLLMInferenceAdapterConfig from 
llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -24,7 +24,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin def get_distribution_template() -> DistributionTemplate: providers = { "inference": ["remote::vllm"], - "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "eval": ["inline::meta-reference"], @@ -52,7 +52,7 @@ def get_distribution_template() -> DistributionTemplate: provider_type="inline::sentence-transformers", config=SentenceTransformersInferenceConfig.sample_run_config(), ) - memory_provider = Provider( + vector_io_provider = Provider( provider_id="faiss", provider_type="inline::faiss", config=FaissImplConfig.sample_run_config(f"distributions/{name}"), @@ -100,7 +100,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider, embedding_provider], - "memory": [memory_provider], + "vector_io": [vector_io_provider], }, default_models=[inference_model, embedding_model], default_tool_groups=default_tool_groups, @@ -118,7 +118,7 @@ def get_distribution_template() -> DistributionTemplate: ), embedding_provider, ], - "memory": [memory_provider], + "vector_io": [vector_io_provider], }, default_models=[ inference_model, diff --git a/llama_stack/templates/tgi/build.yaml b/llama_stack/templates/tgi/build.yaml index 4391ddd5d..bc31ef7e7 100644 --- a/llama_stack/templates/tgi/build.yaml +++ b/llama_stack/templates/tgi/build.yaml @@ -4,7 +4,7 @@ distribution_spec: providers: inference: - remote::tgi - memory: + vector_io: - inline::faiss - remote::chromadb - remote::pgvector diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml index 9bd06d650..ea8057137 100644 --- a/llama_stack/templates/tgi/run-with-safety.yaml +++ b/llama_stack/templates/tgi/run-with-safety.yaml @@ -5,11 +5,11 @@ apis: - datasetio - eval - inference -- memory - safety - scoring - telemetry - tool_runtime +- vector_io providers: inference: - provider_id: tgi-inference @@ -20,7 +20,7 @@ providers: provider_type: remote::tgi config: url: ${env.TGI_SAFETY_URL} - memory: + vector_io: - provider_id: faiss provider_type: inline::faiss config: @@ -103,7 +103,7 @@ models: model_type: llm shields: - shield_id: ${env.SAFETY_MODEL} -memory_banks: [] +vector_dbs: [] datasets: [] scoring_fns: [] eval_tasks: [] diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml index 2fc1b52d9..d537d0fce 100644 --- a/llama_stack/templates/tgi/run.yaml +++ b/llama_stack/templates/tgi/run.yaml @@ -5,11 +5,11 @@ apis: - datasetio - eval - inference -- memory - safety - scoring - telemetry - tool_runtime +- vector_io providers: inference: - provider_id: tgi-inference @@ -19,7 +19,7 @@ providers: - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} - memory: + vector_io: - provider_id: faiss provider_type: inline::faiss config: @@ -102,7 +102,7 @@ models: provider_id: sentence-transformers model_type: embedding shields: [] -memory_banks: [] +vector_dbs: [] datasets: [] scoring_fns: [] eval_tasks: [] diff --git a/llama_stack/templates/tgi/tgi.py b/llama_stack/templates/tgi/tgi.py index 8ad9725e3..230fcac2a 100644 --- a/llama_stack/templates/tgi/tgi.py +++ b/llama_stack/templates/tgi/tgi.py @@ -16,7 +16,7 @@ from 
llama_stack.distribution.datatypes import ( from llama_stack.providers.inline.inference.sentence_transformers import ( SentenceTransformersInferenceConfig, ) -from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig +from llama_stack.providers.inline.vector_io.faiss.config import FaissImplConfig from llama_stack.providers.remote.inference.tgi import TGIImplConfig from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -24,7 +24,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin def get_distribution_template() -> DistributionTemplate: providers = { "inference": ["remote::tgi"], - "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], @@ -52,7 +52,7 @@ def get_distribution_template() -> DistributionTemplate: provider_type="inline::sentence-transformers", config=SentenceTransformersInferenceConfig.sample_run_config(), ) - memory_provider = Provider( + vector_io_provider = Provider( provider_id="faiss", provider_type="inline::faiss", config=FaissImplConfig.sample_run_config(f"distributions/{name}"), @@ -101,7 +101,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider, embedding_provider], - "memory": [memory_provider], + "vector_io": [vector_io_provider], }, default_models=[inference_model, embedding_model], default_tool_groups=default_tool_groups, @@ -118,7 +118,7 @@ def get_distribution_template() -> DistributionTemplate: ), ), ], - "memory": [memory_provider], + "vector_io": [vector_io_provider], }, default_models=[ inference_model, diff --git a/llama_stack/templates/together/run-with-safety.yaml b/llama_stack/templates/together/run-with-safety.yaml index c1461d75d..54b918eea 100644 --- a/llama_stack/templates/together/run-with-safety.yaml +++ b/llama_stack/templates/together/run-with-safety.yaml @@ -5,11 +5,11 @@ apis: - datasetio - eval - inference -- memory - safety - scoring - telemetry - tool_runtime +- vector_io providers: inference: - provider_id: together @@ -20,7 +20,7 @@ providers: - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} - memory: + vector_io: - provider_id: faiss provider_type: inline::faiss config: @@ -156,7 +156,7 @@ shields: provider_id: llama-guard-vision - shield_id: CodeScanner provider_id: code-scanner -memory_banks: [] +vector_dbs: [] datasets: [] scoring_fns: [] eval_tasks: [] diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml index 135b124e4..2c0475796 100644 --- a/llama_stack/templates/together/run.yaml +++ b/llama_stack/templates/together/run.yaml @@ -5,11 +5,11 @@ apis: - datasetio - eval - inference -- vector_io - safety - scoring - telemetry - tool_runtime +- vector_io providers: inference: - provider_id: together @@ -145,6 +145,7 @@ models: model_type: embedding shields: - shield_id: meta-llama/Llama-Guard-3-8B +vector_dbs: [] datasets: [] scoring_fns: [] eval_tasks: [] diff --git a/llama_stack/templates/together/together.py b/llama_stack/templates/together/together.py index 1e2def3bd..ec64527d2 100644 --- a/llama_stack/templates/together/together.py +++ b/llama_stack/templates/together/together.py @@ -18,7 +18,7 @@ from llama_stack.distribution.datatypes import ( from 
llama_stack.providers.inline.inference.sentence_transformers import ( SentenceTransformersInferenceConfig, ) -from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig +from llama_stack.providers.inline.vector_io.faiss.config import FaissImplConfig from llama_stack.providers.remote.inference.together import TogetherImplConfig from llama_stack.providers.remote.inference.together.together import MODEL_ALIASES from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -27,7 +27,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin def get_distribution_template() -> DistributionTemplate: providers = { "inference": ["remote::together"], - "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], @@ -48,7 +48,7 @@ def get_distribution_template() -> DistributionTemplate: provider_type="remote::together", config=TogetherImplConfig.sample_run_config(), ) - memory_provider = Provider( + vector_io_provider = Provider( provider_id="faiss", provider_type="inline::faiss", config=FaissImplConfig.sample_run_config(f"distributions/{name}"), @@ -105,7 +105,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider, embedding_provider], - "memory": [memory_provider], + "vector_io": [vector_io_provider], }, default_models=default_models + [embedding_model], default_tool_groups=default_tool_groups, @@ -117,7 +117,7 @@ def get_distribution_template() -> DistributionTemplate: inference_provider, embedding_provider, ], - "memory": [memory_provider], + "vector_io": [vector_io_provider], "safety": [ Provider( provider_id="llama-guard", diff --git a/llama_stack/templates/vllm-gpu/build.yaml b/llama_stack/templates/vllm-gpu/build.yaml index e8a1693d0..45f543071 100644 --- a/llama_stack/templates/vllm-gpu/build.yaml +++ b/llama_stack/templates/vllm-gpu/build.yaml @@ -4,7 +4,7 @@ distribution_spec: providers: inference: - inline::vllm - memory: + vector_io: - inline::faiss - remote::chromadb - remote::pgvector diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml index cc0ff047f..2d9ec6a3f 100644 --- a/llama_stack/templates/vllm-gpu/run.yaml +++ b/llama_stack/templates/vllm-gpu/run.yaml @@ -5,11 +5,11 @@ apis: - datasetio - eval - inference -- memory - safety - scoring - telemetry - tool_runtime +- vector_io providers: inference: - provider_id: vllm @@ -23,7 +23,7 @@ providers: - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} - memory: + vector_io: - provider_id: faiss provider_type: inline::faiss config: @@ -106,7 +106,7 @@ models: provider_id: sentence-transformers model_type: embedding shields: [] -memory_banks: [] +vector_dbs: [] datasets: [] scoring_fns: [] eval_tasks: [] diff --git a/llama_stack/templates/vllm-gpu/vllm.py b/llama_stack/templates/vllm-gpu/vllm.py index 71b24482d..a8f13ce40 100644 --- a/llama_stack/templates/vllm-gpu/vllm.py +++ b/llama_stack/templates/vllm-gpu/vllm.py @@ -10,7 +10,7 @@ from llama_stack.providers.inline.inference.sentence_transformers import ( SentenceTransformersInferenceConfig, ) from llama_stack.providers.inline.inference.vllm import VLLMConfig -from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig +from 
llama_stack.providers.inline.vector_io.faiss.config import FaissImplConfig from llama_stack.templates.template import ( DistributionTemplate, RunConfigSettings, @@ -21,7 +21,7 @@ from llama_stack.templates.template import ( def get_distribution_template() -> DistributionTemplate: providers = { "inference": ["inline::vllm"], - "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], @@ -43,7 +43,7 @@ def get_distribution_template() -> DistributionTemplate: provider_type="inline::vllm", config=VLLMConfig.sample_run_config(), ) - memory_provider = Provider( + vector_io_provider = Provider( provider_id="faiss", provider_type="inline::faiss", config=FaissImplConfig.sample_run_config(f"distributions/{name}"), @@ -93,7 +93,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider, embedding_provider], - "memory": [memory_provider], + "vector_io": [vector_io_provider], }, default_models=[inference_model, embedding_model], default_tool_groups=default_tool_groups,
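
For orientation, the interface that every provider in this patch converges on can be sketched outside the diff. What follows is a minimal, self-contained Python sketch, not llama-stack code: Chunk, QueryChunksResponse, and the register_vector_db / insert_chunks / query_chunks methods mirror the names used in the hunks above, while the in-memory store and the naive substring "search" are stand-ins for a real embedding index such as QdrantIndex or WeaviateIndex.

import asyncio
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional


@dataclass
class Chunk:
    """Stand-in for llama_stack.apis.vector_io.Chunk (names assumed from the diff)."""
    content: str
    metadata: Dict[str, Any] = field(default_factory=dict)


@dataclass
class QueryChunksResponse:
    """Stand-in for llama_stack.apis.vector_io.QueryChunksResponse."""
    chunks: List[Chunk]
    scores: List[float]


class InMemoryVectorIOAdapter:
    """Toy adapter with the same register/insert/query surface the Qdrant,
    PGVector, and Weaviate adapters expose after this migration."""

    def __init__(self) -> None:
        self.store: Dict[str, List[Chunk]] = {}

    async def register_vector_db(self, vector_db_id: str) -> None:
        # Real providers validate the vector_db and create backing storage
        # here (e.g. a Qdrant or Weaviate collection).
        self.store.setdefault(vector_db_id, [])

    async def insert_chunks(
        self,
        vector_db_id: str,
        chunks: List[Chunk],
        ttl_seconds: Optional[int] = None,
    ) -> None:
        if vector_db_id not in self.store:
            raise ValueError(f"Vector DB {vector_db_id} not found")
        self.store[vector_db_id].extend(chunks)

    async def query_chunks(
        self,
        vector_db_id: str,
        query: str,
        params: Optional[Dict[str, Any]] = None,
    ) -> QueryChunksResponse:
        if vector_db_id not in self.store:
            raise ValueError(f"Vector DB {vector_db_id} not found")
        # Naive substring match stands in for embedding similarity search.
        hits = [
            c for c in self.store[vector_db_id]
            if query.lower() in c.content.lower()
        ]
        return QueryChunksResponse(chunks=hits, scores=[1.0] * len(hits))


async def _demo() -> None:
    io = InMemoryVectorIOAdapter()
    await io.register_vector_db("test_bank")
    await io.insert_chunks("test_bank", [Chunk(content="LoRA finetuning tutorial")])
    resp = await io.query_chunks("test_bank", "lora")
    assert resp.chunks and resp.scores == [1.0]


if __name__ == "__main__":
    asyncio.run(_demo())

Running the demo registers a vector DB, inserts a chunk, and queries it back: the same register/insert/query sequence the migrated test_rag_tool above exercises through the faiss provider.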