add NIM k8s solution

Kai Wu 2025-07-29 09:01:21 -07:00
parent 95d25ddfe2
commit 8c0f328cbc
4 changed files with 133 additions and 128 deletions

@@ -0,0 +1,58 @@
#!/usr/bin/env bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
export POSTGRES_USER=llamastack
export POSTGRES_DB=llamastack
export POSTGRES_PASSWORD=llamastack
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
# Set USE_EBS to false if you don't have permission to use EKS EBS
export USE_EBS=${USE_EBS:-false}
# HF_TOKEN should be set by the user; base64 encode it for the secret
if [ -n "${HF_TOKEN:-}" ]; then
  export HF_TOKEN_BASE64=$(echo -n "$HF_TOKEN" | base64)
fi
set -euo pipefail
set -x
# Delete resources in reverse order of creation to handle dependencies properly
# Delete UI deployment
envsubst < ./ui-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete ingress
envsubst < ./ingress-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete stack deployment
envsubst < ./stack-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete configmap
kubectl delete configmap llama-stack-config --ignore-not-found=true
# Delete chroma deployment
envsubst < ./chroma-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete postgres deployment
envsubst < ./postgres-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete vllm-safety deployment
envsubst < ./vllm-safety-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete vllm deployment
envsubst < ./vllm-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete the HF token secret if it exists
if [ -n "${HF_TOKEN:-}" ]; then
  envsubst < ./hf-token-secret.yaml.template | kubectl delete -f - --ignore-not-found=true
fi
echo "All LlamaStack Kubernetes resources have been deleted."

@@ -0,0 +1,73 @@
# -------------------------------------------------
# NVIDIA NIM — Llama-3.1 Nemotron-Nano-4B-v1.1
# -------------------------------------------------
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-nano-nim
  labels:
    app: llama-nano-nim
spec:
  replicas: 1
  selector:
    matchLabels:
      app: llama-nano-nim
  template:
    metadata:
      labels:
        app: llama-nano-nim
    spec:
      imagePullSecrets:
      - name: ngc-secret   # docker-registry secret: nvcr.io / $oauthtoken / <NGC_DOCKER_API_KEY>
      volumes:
      - name: model-cache
        emptyDir:
          medium: Memory   # tmpfs; omit or use "" to back by node disk
          sizeLimit: 12Gi  # fits the 4B model + tensors; adjust if needed
      containers:
      - name: nim
        image: nvcr.io/nim/meta/llama-3_1-nemotron-nano-4b-v1_1:1.0.0
        ports:
        - name: http-openai
          containerPort: 8000
        resources:
          limits:
            nvidia.com/gpu: 1
        env:
        - name: NIM_MODEL_NAME
          value: "nvidia/llama-3_1-nemotron-nano-4b-v1_1"
        - name: NGC_API_KEY
          valueFrom:
            secretKeyRef:
              name: ngc-api
              key: NGC_API_KEY
        volumeMounts:
        - name: model-cache
          mountPath: /models   # default NIM cache path
        readinessProbe:
          httpGet:
            path: /v1/models
            port: http-openai
          initialDelaySeconds: 20
          periodSeconds: 10
        livenessProbe:
          httpGet:
            path: /v1/health
            port: http-openai
          initialDelaySeconds: 60
          periodSeconds: 30
---
apiVersion: v1
kind: Service
metadata:
  name: llama-nano-nim
spec:
  selector:
    app: llama-nano-nim
  ports:
  - name: http-openai
    port: 8000
    targetPort: 8000
  type: ClusterIP
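
The Deployment above references two pre-existing secrets: ngc-secret (the image-pull secret noted on imagePullSecrets) and ngc-api (the runtime NGC_API_KEY). A plausible way to create them with kubectl, assuming your NGC API key is exported as NGC_API_KEY; the literal username $oauthtoken is the convention nvcr.io expects:

# Image-pull secret for nvcr.io; the username is literally "$oauthtoken".
kubectl create secret docker-registry ngc-secret \
  --docker-server=nvcr.io \
  --docker-username='$oauthtoken' \
  --docker-password="$NGC_API_KEY"
# Generic secret consumed by the container's NGC_API_KEY env var.
kubectl create secret generic ngc-api \
  --from-literal=NGC_API_KEY="$NGC_API_KEY"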

@@ -1,128 +0,0 @@
apiVersion: v1
data:
  stack_run_config.yaml: |
    version: '2'
    image_name: kubernetes-demo
    apis:
    - agents
    - inference
    - safety
    - telemetry
    - tool_runtime
    - vector_io
    providers:
      inference:
      - provider_id: vllm-inference
        provider_type: remote::vllm
        config:
          url: ${env.VLLM_URL:=http://localhost:8000/v1}
          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
          api_token: ${env.VLLM_API_TOKEN:=fake}
          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
      - provider_id: vllm-safety
        provider_type: remote::vllm
        config:
          url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
          api_token: ${env.VLLM_API_TOKEN:=fake}
          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
      - provider_id: sentence-transformers
        provider_type: inline::sentence-transformers
        config: {}
      vector_io:
      - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
        provider_type: remote::chromadb
        config:
          url: ${env.CHROMADB_URL:=}
      safety:
      - provider_id: llama-guard
        provider_type: inline::llama-guard
        config:
          excluded_categories: []
      agents:
      - provider_id: meta-reference
        provider_type: inline::meta-reference
        config:
          persistence_store:
            type: postgres
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
          responses_store:
            type: postgres
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
      telemetry:
      - provider_id: meta-reference
        provider_type: inline::meta-reference
        config:
          service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
          sinks: ${env.TELEMETRY_SINKS:=console}
      tool_runtime:
      - provider_id: brave-search
        provider_type: remote::brave-search
        config:
          api_key: ${env.BRAVE_SEARCH_API_KEY:+}
          max_results: 3
      - provider_id: tavily-search
        provider_type: remote::tavily-search
        config:
          api_key: ${env.TAVILY_SEARCH_API_KEY:+}
          max_results: 3
      - provider_id: rag-runtime
        provider_type: inline::rag-runtime
        config: {}
      - provider_id: model-context-protocol
        provider_type: remote::model-context-protocol
        config: {}
    metadata_store:
      type: postgres
      host: ${env.POSTGRES_HOST:=localhost}
      port: ${env.POSTGRES_PORT:=5432}
      db: ${env.POSTGRES_DB:=llamastack}
      user: ${env.POSTGRES_USER:=llamastack}
      password: ${env.POSTGRES_PASSWORD:=llamastack}
      table_name: llamastack_kvstore
    inference_store:
      type: postgres
      host: ${env.POSTGRES_HOST:=localhost}
      port: ${env.POSTGRES_PORT:=5432}
      db: ${env.POSTGRES_DB:=llamastack}
      user: ${env.POSTGRES_USER:=llamastack}
      password: ${env.POSTGRES_PASSWORD:=llamastack}
    models:
    - metadata:
        embedding_dimension: 384
      model_id: all-MiniLM-L6-v2
      provider_id: sentence-transformers
      model_type: embedding
    - metadata: {}
      model_id: ${env.INFERENCE_MODEL}
      provider_id: vllm-inference
      model_type: llm
    - metadata: {}
      model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
      provider_id: vllm-safety
      model_type: llm
    shields:
    - shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
    vector_dbs: []
    datasets: []
    scoring_fns: []
    benchmarks: []
    tool_groups:
    - toolgroup_id: builtin::websearch
      provider_id: tavily-search
    - toolgroup_id: builtin::rag
      provider_id: rag-runtime
    server:
      port: 8321
kind: ConfigMap
metadata:
  creationTimestamp: null
  name: llama-stack-config

@@ -45,6 +45,8 @@ spec:
          value: http://vllm-server.default.svc.cluster.local:8000/v1
        - name: VLLM_MAX_TOKENS
          value: "3072"
        - name: NVIDIA_BASE_URL
          value: http://llama-nano-nim.default.svc.cluster.local:8000/v1
        - name: VLLM_SAFETY_URL
          value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
        - name: POSTGRES_HOST
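
NVIDIA_BASE_URL points the stack at the NIM Service defined earlier. The run config that replaces the deleted ConfigMap is not shown in this diff; a sketch of an inference provider entry that would consume this variable, assuming Llama Stack's remote::nvidia provider type and an illustrative provider_id, might look like:

# Hypothetical stack_run_config.yaml fragment (not shown in this diff);
# provider_id and the defaults are assumptions.
inference:
- provider_id: nvidia
  provider_type: remote::nvidia
  config:
    url: ${env.NVIDIA_BASE_URL:=http://localhost:8000}
    api_key: ${env.NVIDIA_API_KEY:=}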