diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml
index ddf069e8d..ea188777f 100644
--- a/llama_stack/templates/vllm-gpu/run.yaml
+++ b/llama_stack/templates/vllm-gpu/run.yaml
@@ -29,12 +29,6 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/faiss_store.db
-  - provider_id: chromadb
-    provider_type: remote::chromadb
-    config: {}
-  - provider_id: pgvector
-    provider_type: remote::pgvector
-    config: {}
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
diff --git a/llama_stack/templates/vllm-gpu/vllm.py b/llama_stack/templates/vllm-gpu/vllm.py
index 27665efa3..10b448b5c 100644
--- a/llama_stack/templates/vllm-gpu/vllm.py
+++ b/llama_stack/templates/vllm-gpu/vllm.py
@@ -6,6 +6,7 @@
 
 from llama_stack.distribution.datatypes import ModelInput, Provider
 from llama_stack.providers.inline.inference.vllm import VLLMConfig
+from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
 
 
@@ -20,12 +21,17 @@ def get_distribution_template() -> DistributionTemplate:
         "datasetio": ["remote::huggingface", "inline::localfs"],
         "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
     }
-
+    name = "vllm-gpu"
     inference_provider = Provider(
         provider_id="vllm",
         provider_type="inline::vllm",
         config=VLLMConfig.sample_run_config(),
     )
+    memory_provider = Provider(
+        provider_id="faiss",
+        provider_type="inline::faiss",
+        config=FaissImplConfig.sample_run_config(f"distributions/{name}"),
+    )
 
     inference_model = ModelInput(
         model_id="${env.INFERENCE_MODEL}",
@@ -33,7 +39,7 @@ def get_distribution_template() -> DistributionTemplate:
     )
 
     return DistributionTemplate(
-        name="vllm-gpu",
+        name=name,
         distro_type="self_hosted",
         description="Use a built-in vLLM engine for running LLM inference",
         docker_image=None,
@@ -44,6 +50,7 @@ def get_distribution_template() -> DistributionTemplate:
             "run.yaml": RunConfigSettings(
                 provider_overrides={
                     "inference": [inference_provider],
+                    "memory": [memory_provider],
                 },
                 default_models=[inference_model],
             ),
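
For reference, a sketch of the memory provider block the regenerated run.yaml is expected to contain after this change, assuming FaissImplConfig.sample_run_config("distributions/vllm-gpu") keeps the sqlite kvstore layout shown in the context lines of the first hunk:

  memory:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/faiss_store.db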