# llama-stack/distributions/inline-nvidia/compose.yaml

services:
  # NVIDIA NIM container serving the LLM over HTTP (port set by
  # NIM_HTTP_API_PORT below).
  nim:
    image: ${DOCKER_IMAGE:-nvcr.io/nim/meta/llama-3.1-8b-instruct:latest}
    # Host networking: the container shares the host's network stack, so the
    # API port is reachable on the host directly. A `ports:` mapping cannot be
    # combined with host networking (it is ignored or rejected), so none is
    # declared here.
    network_mode: "host"
    volumes:
      # Named volume (declared under the top-level `volumes:` key) mounted at
      # the container's /opt/nim/.cache so its contents survive re-creation.
      - nim-llm-cache:/opt/nim/.cache
    shm_size: 16G
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - NIM_HTTP_API_PORT=8000
      - NIM_TRITON_LOG_VERBOSE=1
      # Resolution order: NIM_NGC_API_KEY, then NGC_API_KEY, then the literal
      # placeholder "ngcapikey".
      - NGC_API_KEY=${NIM_NGC_API_KEY:-${NGC_API_KEY:-ngcapikey}}
    command: []
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # Closest analogue to `docker run --gpus`: an integer number of
              # devices, or 'all'.
              count: 1
              # Devices are reserved by capability list, which makes
              # `capabilities` the only required field. A device MUST satisfy
              # every requested capability for the reservation to succeed.
              capabilities: [gpu]
    runtime: nvidia
    healthcheck:
      # --fail makes curl exit non-zero on an HTTP error status; without it
      # any HTTP response (including "not ready" errors) counts as healthy.
      # --silent keeps the progress meter out of the health-check log.
      test:
        - "CMD"
        - "curl"
        - "--fail"
        - "--silent"
        - "http://localhost:8000/v1/health/ready"
      interval: 5s
      timeout: 5s
      retries: 30
      start_period: 120s
  # Llama Stack server, started from the mounted run.yaml configuration.
  llamastack:
    # Short-form depends_on only orders container start-up; it does not wait
    # for the nim health check to pass. The restart policy below covers the
    # window in which nim is not yet ready.
    depends_on:
      - nim
    image: distribution-nvidia:dev
    # Host networking: the server's listening port is exposed on the host
    # directly. A `ports:` mapping cannot be combined with host networking
    # (it is ignored or rejected), so none is declared here.
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      # Stack configuration consumed by the entrypoint's --yaml-config flag.
      - ./run.yaml:/root/llamastack-run-nvidia.yaml
    environment:
      - INFERENCE_MODEL=${INFERENCE_MODEL:-Llama3.1-8B-Instruct}
      # Defaults to an empty string when NVIDIA_API_KEY is unset on the host.
      - NVIDIA_API_KEY=${NVIDIA_API_KEY:-}
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml-config /root/llamastack-run-nvidia.yaml"
    deploy:
      # Restart only after a failure: wait 3s between attempts, give up after
      # 5 attempts, and use a 60s window to judge whether a restart succeeded.
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s
# Named volumes shared by the services above.
volumes:
  # Mounted by the nim service at /opt/nim/.cache.
  nim-llm-cache:
    driver: local