diff --git a/distributions/inline-nvidia/build.yaml b/distributions/inline-nvidia/build.yaml
new file mode 120000
index 000000000..8903d2e57
--- /dev/null
+++ b/distributions/inline-nvidia/build.yaml
@@ -0,0 +1 @@
+../../llama_stack/templates/nvidia/build.yaml
\ No newline at end of file
diff --git a/distributions/inline-nvidia/compose.yaml b/distributions/inline-nvidia/compose.yaml
new file mode 100644
index 000000000..f7320b968
--- /dev/null
+++ b/distributions/inline-nvidia/compose.yaml
@@ -0,0 +1,58 @@
+services:
+  nim:
+    image: nvcr.io/nim/meta/llama-3.1-8b-instruct:latest
+    network_mode: "host"
+    volumes:
+      - nim-llm-cache:/opt/nim/.cache
+    ports:
+      - "8000:8000"
+    shm_size: 16G
+    environment:
+      - CUDA_VISIBLE_DEVICES=0
+      - NIM_HTTP_API_PORT=8000
+      - NIM_TRITON_LOG_VERBOSE=1
+      - NGC_API_KEY=${NIM_NGC_API_KEY:-${NGC_API_KEY:-ngcapikey}}
+    command: []
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              # that's the closest analogue to --gpus; provide
+              # an integer amount of devices or 'all'
+              count: 1
+              # Devices are reserved using a list of capabilities, making
+              # capabilities the only required field. A device MUST
+              # satisfy all the requested capabilities for a successful
+              # reservation.
+              capabilities: [gpu]
+    runtime: nvidia
+    healthcheck:
+      test: ["CMD", "curl", "http://localhost:8000/v1/health/ready"]
+      interval: 5s
+      timeout: 5s
+      retries: 30
+      start_period: 120s
+  llamastack:
+    depends_on:
+      - nim
+    image: distribution-nvidia:dev
+    network_mode: "host"
+    volumes:
+      - ~/.llama:/root/.llama
+      - ./run.yaml:/root/llamastack-run-nvidia.yaml
+    ports:
+      - "5000:5000"
+    environment:
+      - INFERENCE_MODEL=${INFERENCE_MODEL:-Llama3.1-8B-Instruct}
+      - NVIDIA_API_KEY=${NVIDIA_API_KEY:-}
+    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml-config /root/llamastack-run-nvidia.yaml"
+    deploy:
+      restart_policy:
+        condition: on-failure
+        delay: 3s
+        max_attempts: 5
+        window: 60s
+volumes:
+  nim-llm-cache:
+    driver: local
\ No newline at end of file
diff --git a/distributions/inline-nvidia/run.yaml b/distributions/inline-nvidia/run.yaml
new file mode 100644
index 000000000..81e9e7f1c
--- /dev/null
+++ b/distributions/inline-nvidia/run.yaml
@@ -0,0 +1,56 @@
+version: '2'
+image_name: nvidia
+docker_image: null
+conda_env: nvidia
+apis:
+- agents
+- inference
+- memory
+- safety
+- telemetry
+providers:
+  inference:
+  - provider_id: nvidia
+    provider_type: remote::nvidia
+    config:
+      url: http://localhost:8000
+      api_key: ${env.NVIDIA_API_KEY}
+  memory:
+  - provider_id: faiss
+    provider_type: inline::faiss
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/faiss_store.db
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config: {}
+  agents:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/agents_store.db
+  telemetry:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config: {}
+metadata_store:
+  namespace: null
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/registry.db
+models:
+- metadata: {}
+  model_id: ${env.INFERENCE_MODEL}
+  provider_id: nvidia
+  provider_model_id: null
+shields: []
+memory_banks: []
+datasets: []
+scoring_fns: []
+eval_tasks: []
+
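Usage note (reviewer commentary, not part of the patch): a minimal launch sketch for the inline distribution above, assuming a host with an NVIDIA GPU and the NVIDIA Container Toolkit installed. The variable names, ports, and health endpoint are taken from the compose file; the key value is a placeholder, not a real credential.

    export NGC_API_KEY=<your-ngc-api-key>        # consumed by the nim service; NIM_NGC_API_KEY takes precedence if set
    export INFERENCE_MODEL=Llama3.1-8B-Instruct  # compose default shown above
    cd distributions/inline-nvidia               # the bind mount resolves ./run.yaml relative to this directory
    docker compose up -d
    curl http://localhost:8000/v1/health/ready   # same endpoint the nim healthcheck polls; Llama Stack serves on port 5000

Two observations on the compose file: with network_mode: "host" the ports: mappings are effectively ignored on Linux, since both containers share the host network namespace; and the short-form depends_on only orders container start-up, it does not wait for the healthcheck, so early requests may land inside NIM's 120s start_period.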
diff --git a/distributions/remote-nvidia/build.yaml b/distributions/remote-nvidia/build.yaml
new file mode 120000
index 000000000..8903d2e57
--- /dev/null
+++ b/distributions/remote-nvidia/build.yaml
@@ -0,0 +1 @@
+../../llama_stack/templates/nvidia/build.yaml
\ No newline at end of file
diff --git a/distributions/remote-nvidia/compose.yaml b/distributions/remote-nvidia/compose.yaml
new file mode 100644
index 000000000..04b12d0da
--- /dev/null
+++ b/distributions/remote-nvidia/compose.yaml
@@ -0,0 +1,19 @@
+services:
+  llamastack:
+    image: distribution-nvidia:dev
+    network_mode: "host"
+    volumes:
+      - ~/.llama:/root/.llama
+      - ./run.yaml:/root/llamastack-run-nvidia.yaml
+    ports:
+      - "5000:5000"
+    environment:
+      - INFERENCE_MODEL=${INFERENCE_MODEL:-Llama3.1-8B-Instruct}
+      - NVIDIA_API_KEY=${NVIDIA_API_KEY:-}
+    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml-config /root/llamastack-run-nvidia.yaml"
+    deploy:
+      restart_policy:
+        condition: on-failure
+        delay: 3s
+        max_attempts: 5
+        window: 60s
diff --git a/distributions/remote-nvidia/run.yaml b/distributions/remote-nvidia/run.yaml
new file mode 120000
index 000000000..85da3e26b
--- /dev/null
+++ b/distributions/remote-nvidia/run.yaml
@@ -0,0 +1 @@
+../../llama_stack/templates/nvidia/run.yaml
\ No newline at end of file
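Usage note (reviewer commentary, not part of the patch): the remote distribution needs no local GPU. A minimal launch sketch, assuming the symlinked template run.yaml reads the same ${env.NVIDIA_API_KEY} variable as the inline run.yaml above; the key value is a placeholder.

    export NVIDIA_API_KEY=<your-nvidia-api-key>  # forwarded into the container via the compose environment block
    export INFERENCE_MODEL=Llama3.1-8B-Instruct  # compose default shown above
    cd distributions/remote-nvidia               # the bind mount resolves ./run.yaml relative to this directory
    docker compose up -d                         # Llama Stack API on port 5000, per the compose file

Because run.yaml here is a symlink into llama_stack/templates/nvidia/, the served configuration tracks the template; only compose.yaml is distribution-specific.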