add NIM k8s solution

2025-10-24 08:47:26 +00:00 · 2025-07-29 09:01:21 -07:00 · 2025-07-29 09:01:21 -07:00 · 8c0f328cbc
commit 8c0f328cbc
parent 95d25ddfe2
4 changed files with 133 additions and 128 deletions
--- a/docs/source/distributions/k8s/llama-nim.yaml.template
+++ b/docs/source/distributions/k8s/llama-nim.yaml.template
@ -0,0 +1,73 @@
+# -------------------------------------------------
+# NVIDIA NIM — Llama-3.1 Nemotron-Nano-4B-v1.1
+# -------------------------------------------------
+
+# Deployment: a single NIM pod serving HTTP on container port 8000
+# (named "http-openai"). Secrets expected in the target namespace:
+#   - "ngc-secret": docker-registry secret used to pull the image from nvcr.io
+#   - "ngc-api":    its NGC_API_KEY key is exposed to the container as an env var
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: llama-nano-nim
+  labels:
+    app: llama-nano-nim
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: llama-nano-nim
+  template:
+    metadata:
+      labels:
+        app: llama-nano-nim
+    spec:
+      imagePullSecrets:
+        - name: ngc-secret          # docker-registry secret: nvcr.io / $oauthtoken / <NGC_DOCKER_API_KEY>
+      volumes:
+        - name: model-cache
+          emptyDir:
+            medium: Memory          # tmpfs; omit or use "" to back by node disk
+            sizeLimit: 12Gi          # fits the 4 B model + tensors; adjust if needed
+      containers:
+        - name: nim
+          # NOTE(review): the image is published under "meta/" while
+          # NIM_MODEL_NAME below uses "nvidia/" — confirm both against the
+          # NGC catalog entry for this model.
+          image: nvcr.io/nim/meta/llama-3_1-nemotron-nano-4b-v1_1:1.0.0
+          ports:
+            - name: http-openai
+              containerPort: 8000
+          resources:
+            limits:
+              nvidia.com/gpu: 1
+          env:
+            - name: NIM_MODEL_NAME
+              value: "nvidia/llama-3_1-nemotron-nano-4b-v1_1"
+            - name: NGC_API_KEY
+              valueFrom:
+                secretKeyRef:
+                  name: ngc-api
+                  key: NGC_API_KEY
+          volumeMounts:
+            - name: model-cache
+              mountPath: /opt/nim/.cache   # NIM's default NIM_CACHE_PATH
+          # Model download and engine load can take several minutes. The
+          # startup probe holds back the readiness/liveness probes for up to
+          # failureThreshold * periodSeconds (30 min) so the kubelet does not
+          # restart the container while it is still starting.
+          startupProbe:
+            httpGet:
+              path: /v1/health/ready
+              port: http-openai
+            periodSeconds: 10
+            failureThreshold: 180
+          # No initialDelaySeconds below: both probes only begin once the
+          # startup probe has succeeded.
+          readinessProbe:
+            httpGet:
+              path: /v1/health/ready
+              port: http-openai
+            periodSeconds: 10
+          livenessProbe:
+            httpGet:
+              path: /v1/health/live
+              port: http-openai
+            periodSeconds: 30
+
+---
+# Cluster-internal Service: forwards port 8000 to port 8000 of the pods
+# labelled app=llama-nano-nim.
+apiVersion: v1
+kind: Service
+metadata:
+  name: llama-nano-nim
+spec:
+  type: ClusterIP
+  selector:
+    app: llama-nano-nim
+  ports:
+    - name: http-openai
+      targetPort: 8000
+      port: 8000