Added distributions for inline and remote

2025-12-16 21:22:38 +00:00 · 2024-11-20 23:04:48 +00:00 · 2024-11-20 23:04:48 +00:00 · 6759744235
commit 6759744235
parent a5d413045c
6 changed files with 136 additions and 0 deletions
--- a/distributions/inline-nvidia/build.yaml
+++ b/distributions/inline-nvidia/build.yaml
@ -0,0 +1 @@
 ../../llama_stack/templates/nvidia/build.yaml
--- a/distributions/inline-nvidia/compose.yaml
+++ b/distributions/inline-nvidia/compose.yaml
@ -0,0 +1,58 @@
 # Inline NVIDIA distribution: a local NIM inference container plus the
 # Llama Stack server that talks to it.
 services:
  # NIM container serving llama-3.1-8b-instruct on a single NVIDIA GPU.
  nim:
    image: nvcr.io/nim/meta/llama-3.1-8b-instruct:latest
    # Host networking: the container binds host ports directly, so there is
    # deliberately no `ports:` section (Compose does not allow port mappings
    # together with `network_mode: host`). The API port is selected through
    # NIM_HTTP_API_PORT below.
    network_mode: "host"
    volumes:
      - nim-llm-cache:/opt/nim/.cache
    shm_size: 16G
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - NIM_HTTP_API_PORT=8000
      - NIM_TRITON_LOG_VERBOSE=1
      # Prefer NIM_NGC_API_KEY, then NGC_API_KEY, then a placeholder value.
      - NGC_API_KEY=${NIM_NGC_API_KEY:-${NGC_API_KEY:-ngcapikey}}
    command: []
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # that's the closest analogue to --gpus; provide
              # an integer amount of devices or 'all'
              count: 1
              # Devices are reserved using a list of capabilities, making
              # capabilities the only required field. A device MUST
              # satisfy all the requested capabilities for a successful
              # reservation.
              capabilities: [gpu]
    runtime: nvidia
    healthcheck:
      # `-f` makes curl exit non-zero on HTTP error statuses, so the service
      # is only reported healthy once the readiness endpoint answers
      # successfully (without it any HTTP response counts as healthy).
      test: ["CMD", "curl", "-f", "http://localhost:8000/v1/health/ready"]
      interval: 5s
      timeout: 5s
      retries: 30
      start_period: 120s
  # Llama Stack server configured by the bind-mounted run.yaml.
  llamastack:
    # Wait until the nim service's healthcheck passes instead of merely
    # waiting for its container to be started.
    depends_on:
      nim:
        condition: service_healthy
    image: distribution-nvidia:dev
    # Host networking: the server is reachable on the host directly, so no
    # `ports:` mapping is declared (Compose does not allow port mappings
    # together with `network_mode: host`). The server presumably listens on
    # port 5000, the port the former mapping published - confirm.
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      # Mounted at the path passed to --yaml-config in the entrypoint.
      - ./run.yaml:/root/llamastack-run-nvidia.yaml
    environment:
      - INFERENCE_MODEL=${INFERENCE_MODEL:-Llama3.1-8B-Instruct}
      # Defaults to an empty string when NVIDIA_API_KEY is not set.
      - NVIDIA_API_KEY=${NVIDIA_API_KEY:-}
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml-config /root/llamastack-run-nvidia.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s
 # Named volume that the nim service mounts at /opt/nim/.cache.
 volumes:
  nim-llm-cache:
    driver: local
--- a/distributions/inline-nvidia/run.yaml
+++ b/distributions/inline-nvidia/run.yaml
@ -0,0 +1,56 @@
 # Llama Stack run configuration for the nvidia distribution.
 version: '2'
 image_name: nvidia
 docker_image: null
 conda_env: nvidia
 # APIs enabled for this stack; each one has a matching entry under
 # `providers`.
 apis:
 - agents
 - inference
 - memory
 - safety
 - telemetry
 # One provider per enabled API. Values written as `${env.NAME}` or
 # `${env.NAME:default}` look like environment substitutions (the latter with
 # a fallback value) - confirm against the stack's config loader.
 providers:
  inference:
  # Remote NVIDIA inference endpoint, addressed on localhost port 8000.
  - provider_id: nvidia
    provider_type: remote::nvidia
    config:
      url: http://localhost:8000
      api_key: ${env.NVIDIA_API_KEY}
  memory:
  # In-process FAISS memory provider with a sqlite-backed kvstore.
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/faiss_store.db
  safety:
  # Inline llama-guard provider with an empty config.
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config: {}
  agents:
  # Inline agents provider; its persistence store is a sqlite file.
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/agents_store.db
  telemetry:
  # Inline telemetry provider with an empty config.
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
 # Sqlite-backed metadata store, kept in registry.db under the same base
 # directory pattern as the provider stores.
 metadata_store:
  namespace: null
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/registry.db
 # Single model entry bound to the `nvidia` provider; its id is taken from
 # INFERENCE_MODEL.
 models:
 - metadata: {}
  model_id: ${env.INFERENCE_MODEL}
  provider_id: nvidia
  provider_model_id: null
 # The remaining resource lists are declared explicitly as empty.
 shields: []
 memory_banks: []
 datasets: []
 scoring_fns: []
 eval_tasks: []
--- a/distributions/remote-nvidia/build.yaml
+++ b/distributions/remote-nvidia/build.yaml
@ -0,0 +1 @@
 ../../llama_stack/templates/nvidia/build.yaml
--- a/distributions/remote-nvidia/compose.yaml
+++ b/distributions/remote-nvidia/compose.yaml
@ -0,0 +1,19 @@
 # Remote NVIDIA distribution: only the Llama Stack server is started here;
 # no local inference container is defined in this file.
 services:
  # Llama Stack server configured by the bind-mounted run.yaml.
  llamastack:
    image: distribution-nvidia:dev
    # Host networking: the server is reachable on the host directly, so no
    # `ports:` mapping is declared (Compose does not allow port mappings
    # together with `network_mode: host`). The server presumably listens on
    # port 5000, the port the former mapping published - confirm.
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      # Mounted at the path passed to --yaml-config in the entrypoint.
      - ./run.yaml:/root/llamastack-run-nvidia.yaml
    environment:
      - INFERENCE_MODEL=${INFERENCE_MODEL:-Llama3.1-8B-Instruct}
      # Defaults to an empty string when NVIDIA_API_KEY is not set.
      - NVIDIA_API_KEY=${NVIDIA_API_KEY:-}
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml-config /root/llamastack-run-nvidia.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s
--- a/distributions/remote-nvidia/run.yaml
+++ b/distributions/remote-nvidia/run.yaml
@ -0,0 +1 @@
 ../../llama_stack/templates/nvidia/run.yaml
		`@ -0,0 +1 @@`
							`../../llama_stack/templates/nvidia/build.yaml`