Update more distribution docs to be simpler and partially codegen'ed

2025-12-07 10:50:56 +00:00 · 2024-11-20 14:44:04 -08:00 · 2024-11-20 14:44:04 -08:00 · 2411a44833
commit 2411a44833
parent e84d4436b5
51 changed files with 1188 additions and 291 deletions
--- a/llama_stack/templates/vllm-gpu/__init__.py
+++ b/llama_stack/templates/vllm-gpu/__init__.py
@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .vllm import get_distribution_template  # noqa: F401
--- a/llama_stack/templates/vllm-gpu/build.yaml
+++ b/llama_stack/templates/vllm-gpu/build.yaml
@ -0,0 +1,19 @@
+version: '2'
+name: vllm-gpu
+distribution_spec:
+  description: Use a built-in vLLM engine for running LLM inference
+  docker_image: null
+  providers:
+    inference:
+    - inline::vllm
+    memory:
+    - inline::faiss
+    - remote::chromadb
+    - remote::pgvector
+    safety:
+    - inline::llama-guard
+    agents:
+    - inline::meta-reference
+    telemetry:
+    - inline::meta-reference
+image_type: conda
--- a/llama_stack/templates/vllm-gpu/run.yaml
+++ b/llama_stack/templates/vllm-gpu/run.yaml
@ -0,0 +1,58 @@
+version: '2'
+image_name: vllm-gpu
+docker_image: null
+conda_env: vllm-gpu
+apis:
+- agents
+- inference
+- memory
+- safety
+- telemetry
+providers:
+  inference:
+  - provider_id: vllm
+    provider_type: inline::vllm
+    config:
+      model: ${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}
+      tensor_parallel_size: ${env.TENSOR_PARALLEL_SIZE:1}
+      max_tokens: ${env.MAX_TOKENS:4096}
+      enforce_eager: ${env.ENFORCE_EAGER:False}
+      gpu_memory_utilization: ${env.GPU_MEMORY_UTILIZATION:0.7}
+  memory:
+  - provider_id: faiss
+    provider_type: inline::faiss
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/faiss_store.db
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config: {}
+  agents:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/agents_store.db
+  telemetry:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config: {}
+metadata_store:
+  namespace: null
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/registry.db
+models:
+- metadata: {}
+  model_id: ${env.INFERENCE_MODEL}
+  provider_id: vllm
+  provider_model_id: null
+shields: []
+memory_banks: []
+datasets: []
+scoring_fns: []
+eval_tasks: []
--- a/llama_stack/templates/vllm-gpu/vllm.py
+++ b/llama_stack/templates/vllm-gpu/vllm.py
@ -0,0 +1,74 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.distribution.datatypes import ModelInput, Provider
+from llama_stack.providers.inline.inference.vllm import VLLMConfig
+from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
+
+
+def get_distribution_template() -> DistributionTemplate:
+    """Build the template describing the ``vllm-gpu`` distribution.
+
+    The template names the provider types offered for each API, overrides the
+    inference provider with an inline vLLM provider, registers a single
+    default model whose id is taken from the ``INFERENCE_MODEL`` environment
+    placeholder, and lists the environment variables associated with the
+    ``run.yaml`` run configuration.
+
+    Returns:
+        A ``DistributionTemplate`` named ``"vllm-gpu"`` with
+        ``distro_type="self_hosted"`` and no docker image or template path.
+    """
+    # Provider types available per API. The same lists appear in this
+    # distribution's build.yaml.
+    providers = {
+        "inference": ["inline::vllm"],
+        "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
+        "safety": ["inline::llama-guard"],
+        "agents": ["inline::meta-reference"],
+        "telemetry": ["inline::meta-reference"],
+    }
+
+    # Concrete inference provider used in run.yaml.
+    # NOTE(review): sample_run_config() is defined elsewhere; presumably it
+    # yields the ${env.*} placeholders seen in run.yaml -- confirm.
+    inference_provider = Provider(
+        provider_id="vllm",
+        provider_type="inline::vllm",
+        config=VLLMConfig.sample_run_config(),
+    )
+
+    # The model id is left as an environment placeholder rather than a fixed
+    # name; provider_id matches the inference provider declared above.
+    inference_model = ModelInput(
+        model_id="${env.INFERENCE_MODEL}",
+        provider_id="vllm",
+    )
+
+    return DistributionTemplate(
+        name="vllm-gpu",
+        distro_type="self_hosted",
+        description="Use a built-in vLLM engine for running LLM inference",
+        docker_image=None,
+        template_path=None,
+        providers=providers,
+        default_models=[inference_model],
+        run_configs={
+            # Only the inference API gets an explicit provider override here.
+            "run.yaml": RunConfigSettings(
+                provider_overrides={
+                    "inference": [inference_provider],
+                },
+                default_models=[inference_model],
+            ),
+        },
+        # Each entry maps an environment variable name to a pair of strings;
+        # the first element appears to be the documented default value and
+        # the second is a human-readable description.
+        run_config_env_vars={
+            "LLAMASTACK_PORT": (
+                "5001",
+                "Port for the Llama Stack distribution server",
+            ),
+            # NOTE(review): run.yaml in this change falls back to
+            # "Llama3.2-3B-Instruct" for INFERENCE_MODEL, which differs from
+            # the value documented here -- confirm which one is intended.
+            "INFERENCE_MODEL": (
+                "meta-llama/Llama-3.2-3B-Instruct",
+                "Inference model loaded into the vLLM engine",
+            ),
+            "TENSOR_PARALLEL_SIZE": (
+                "1",
+                "Number of tensor parallel replicas (number of GPUs to use).",
+            ),
+            "MAX_TOKENS": (
+                "4096",
+                "Maximum number of tokens to generate.",
+            ),
+            "ENFORCE_EAGER": (
+                "False",
+                "Whether to use eager mode for inference (otherwise cuda graphs are used).",
+            ),
+            "GPU_MEMORY_UTILIZATION": (
+                "0.7",
+                "GPU memory utilization for the vLLM engine.",
+            ),
+        },
+    )