Added distributions for inline and remote NVIDIA inference

Chantal D Gama Rose 2024-11-20 23:04:48 +00:00
parent a5d413045c
commit 6759744235
6 changed files with 136 additions and 0 deletions


@@ -0,0 +1 @@
../../llama_stack/templates/nvidia/build.yaml


@@ -0,0 +1,58 @@
services:
  nim:
    image: nvcr.io/nim/meta/llama-3.1-8b-instruct:latest
    network_mode: "host"
    volumes:
      - nim-llm-cache:/opt/nim/.cache
    ports:
      - "8000:8000"
    shm_size: 16G
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - NIM_HTTP_API_PORT=8000
      - NIM_TRITON_LOG_VERBOSE=1
      - NGC_API_KEY=${NIM_NGC_API_KEY:-${NGC_API_KEY:-ngcapikey}}
    command: []
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # that's the closest analogue to --gpus; provide
              # an integer amount of devices or 'all'
              count: 1
              # Devices are reserved using a list of capabilities, making
              # capabilities the only required field. A device MUST
              # satisfy all the requested capabilities for a successful
              # reservation.
              capabilities: [gpu]
    runtime: nvidia
    healthcheck:
      test: ["CMD", "curl", "http://localhost:8000/v1/health/ready"]
      interval: 5s
      timeout: 5s
      retries: 30
      start_period: 120s
  llamastack:
    depends_on:
      - nim
    image: distribution-nvidia:dev
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/llamastack-run-nvidia.yaml
    ports:
      - "5000:5000"
    environment:
      - INFERENCE_MODEL=${INFERENCE_MODEL:-Llama3.1-8B-Instruct}
      - NVIDIA_API_KEY=${NVIDIA_API_KEY:-}
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml-config /root/llamastack-run-nvidia.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s
volumes:
  nim-llm-cache:
    driver: local
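
The healthcheck above gates the llamastack container on NIM's readiness endpoint. The same check can be run from the host before pointing a client at the server; a minimal sketch using only the Python standard library, reusing the endpoint, interval, and retry count from the healthcheck (the helper name is illustrative):

import time
import urllib.request

def wait_for_nim(url: str = "http://localhost:8000", retries: int = 30, interval: float = 5.0) -> bool:
    # Poll the same readiness endpoint the compose healthcheck hits.
    for _ in range(retries):
        try:
            with urllib.request.urlopen(f"{url}/v1/health/ready", timeout=5) as resp:
                if resp.status == 200:
                    return True
        except OSError:
            pass  # connection refused or not ready yet; keep polling
        time.sleep(interval)
    return False

if __name__ == "__main__":
    print("NIM ready:", wait_for_nim())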


@@ -0,0 +1,56 @@
version: '2'
image_name: nvidia
docker_image: null
conda_env: nvidia
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
  inference:
  - provider_id: nvidia
    provider_type: remote::nvidia
    config:
      url: http://localhost:8000
      api_key: ${env.NVIDIA_API_KEY}
  memory:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/faiss_store.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config: {}
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/agents_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
metadata_store:
  namespace: null
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/registry.db
models:
- metadata: {}
  model_id: ${env.INFERENCE_MODEL}
  provider_id: nvidia
  provider_model_id: null
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []
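
Values like ${env.NVIDIA_API_KEY} and ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia} are placeholders resolved from the environment when the stack server loads this file, with the text after the colon serving as a default. A rough sketch of that substitution rule, as an illustrative re-implementation rather than llama_stack's actual resolver:

import os
import re

# Matches ${env.NAME} and ${env.NAME:default}, the pattern used in run.yaml above.
ENV_VAR = re.compile(r"\$\{env\.(?P<name>[A-Za-z_][A-Za-z0-9_]*)(?::(?P<default>[^}]*))?\}")

def resolve(value: str) -> str:
    def repl(m: re.Match) -> str:
        resolved = os.environ.get(m.group("name"), m.group("default"))
        if resolved is None:
            raise ValueError(f"{m.group('name')} is unset and has no default")
        return resolved
    return ENV_VAR.sub(repl, value)

# With SQLITE_STORE_DIR unset, the default after ':' is used:
print(resolve("${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/registry.db"))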


@@ -0,0 +1 @@
../../llama_stack/templates/nvidia/build.yaml


@@ -0,0 +1,19 @@
services:
  llamastack:
    image: distribution-nvidia:dev
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/llamastack-run-nvidia.yaml
    ports:
      - "5000:5000"
    environment:
      - INFERENCE_MODEL=${INFERENCE_MODEL:-Llama3.1-8B-Instruct}
      - NVIDIA_API_KEY=${NVIDIA_API_KEY:-}
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml-config /root/llamastack-run-nvidia.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s
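
Unlike the compose file above, this variant starts no local nim service: inference is expected to go to a hosted NVIDIA endpoint authenticated via NVIDIA_API_KEY, so only the stack server runs. Since the restart policy allows up to five attempts, a client should wait for the published port to accept connections first; a small sketch using a plain TCP probe (it assumes nothing about the server's HTTP routes):

import socket
import time

def wait_for_llamastack(host: str = "localhost", port: int = 5000, retries: int = 20, delay: float = 3.0) -> bool:
    # Probe the published port until the server accepts a TCP connection.
    for _ in range(retries):
        try:
            with socket.create_connection((host, port), timeout=3):
                return True
        except OSError:
            time.sleep(delay)  # server may still be restarting (see restart_policy above)
    return False

if __name__ == "__main__":
    print("llamastack up:", wait_for_llamastack())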


@@ -0,0 +1 @@
../../llama_stack/templates/nvidia/run.yaml