From 645e55a450eab1c5ba36ca72345ad3754068fc76 Mon Sep 17 00:00:00 2001
From: Kai Wu
Date: Sat, 2 Aug 2025 13:16:35 -0700
Subject: [PATCH] second checkpoint

---
 .../distributions/k8s/install-prometheus.sh   | 38 +++++++++++++++++++
 .../k8s/monitoring-k8s.yaml.template          | 19 +++++-----
 .../k8s/ollama-safety-k8s.yaml.template       |  2 +-
 .../distributions/k8s/stack-k8s.yaml.template | 36 ++++++++++++++----
 4 files changed, 78 insertions(+), 17 deletions(-)
 create mode 100644 docs/source/distributions/k8s/install-prometheus.sh

diff --git a/docs/source/distributions/k8s/install-prometheus.sh b/docs/source/distributions/k8s/install-prometheus.sh
new file mode 100644
index 000000000..5ac40de0a
--- /dev/null
+++ b/docs/source/distributions/k8s/install-prometheus.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+# Script to install prometheus-community/kube-prometheus-stack using Helm
+
+# Exit immediately if a command exits with a non-zero status
+set -e
+
+# Add the Prometheus community Helm repository if it doesn't exist
+if ! helm repo list | grep -q "prometheus-community"; then
+  echo "Adding prometheus-community Helm repository..."
+  helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+fi
+
+# Update Helm repositories
+echo "Updating Helm repositories..."
+helm repo update
+
+# Create namespace for monitoring if it doesn't exist
+if ! kubectl get namespace monitoring &> /dev/null; then
+  echo "Creating monitoring namespace..."
+  kubectl create namespace monitoring
+fi
+
+# Install kube-prometheus-stack
+echo "Installing kube-prometheus-stack..."
+helm install prometheus prometheus-community/kube-prometheus-stack \
+  --namespace monitoring \
+  --set grafana.enabled=true \
+  --set prometheus.enabled=true \
+  --set alertmanager.enabled=true \
+  --set prometheus.service.type=ClusterIP \
+  --set grafana.service.type=ClusterIP \
+  --set alertmanager.service.type=ClusterIP \
+  --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \
+  --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false
+
+echo "kube-prometheus-stack has been installed successfully!"
+echo "To access Grafana UI, run: kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80" +echo "Default Grafana credentials - Username: admin, Password: prom-operator" \ No newline at end of file diff --git a/docs/source/distributions/k8s/monitoring-k8s.yaml.template b/docs/source/distributions/k8s/monitoring-k8s.yaml.template index c9e7dd759..1cd4059f1 100644 --- a/docs/source/distributions/k8s/monitoring-k8s.yaml.template +++ b/docs/source/distributions/k8s/monitoring-k8s.yaml.template @@ -3062,16 +3062,15 @@ spec: port: number: 9090 --- -# NVIDIA DCGM Exporter Deployment for GPU metrics +# NVIDIA DCGM Exporter DaemonSet for GPU metrics apiVersion: apps/v1 -kind: Deployment +kind: DaemonSet metadata: name: dcgm-exporter namespace: monitoring labels: app: dcgm-exporter spec: - replicas: 1 selector: matchLabels: app: dcgm-exporter @@ -3082,7 +3081,7 @@ spec: spec: containers: - name: dcgm-exporter - image: nvcr.io/nvidia/k8s/dcgm-exporter:4.2.3-4.3.0-ubuntu22.04 + image: nvidia/dcgm-exporter:3.2.5-3.1.7-ubuntu20.04 securityContext: runAsNonRoot: false runAsUser: 0 @@ -3093,17 +3092,19 @@ spec: - -f - /etc/dcgm-exporter/dcp-metrics-included.csv volumeMounts: - - name: device-metrics - mountPath: /dev/metrics - name: dcgm-config mountPath: /etc/dcgm-exporter volumes: - - name: device-metrics - hostPath: - path: /dev/metrics - name: dcgm-config configMap: name: dcgm-config + nodeSelector: + kubernetes.io/os: linux + nvidia.com/gpu.present: "true" + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule --- # DCGM Exporter ConfigMap for metrics configuration apiVersion: v1 diff --git a/docs/source/distributions/k8s/ollama-safety-k8s.yaml.template b/docs/source/distributions/k8s/ollama-safety-k8s.yaml.template index 6519977e8..7e7d6ec60 100644 --- a/docs/source/distributions/k8s/ollama-safety-k8s.yaml.template +++ b/docs/source/distributions/k8s/ollama-safety-k8s.yaml.template @@ -63,7 +63,7 @@ spec: apiVersion: v1 kind: Service metadata: - name: ollama-server-safety + name: ollama-safety spec: selector: app.kubernetes.io/name: ollama-safety diff --git a/docs/source/distributions/k8s/stack-k8s.yaml.template b/docs/source/distributions/k8s/stack-k8s.yaml.template index 8e1569e3a..2210bba2e 100644 --- a/docs/source/distributions/k8s/stack-k8s.yaml.template +++ b/docs/source/distributions/k8s/stack-k8s.yaml.template @@ -26,12 +26,34 @@ spec: app.kubernetes.io/component: server spec: initContainers: - - name: wait-for-vllm-server + - name: wait-for-services image: busybox:1.28 - command: ['sh', '-c', 'until nc -z vllm-server.default.svc.cluster.local 8001; do echo waiting for vllm-server on port 8001; sleep 2; done;'] - - name: wait-for-llm-nim-code - image: busybox:1.28 - command: ['sh', '-c', 'until nc -z llm-nim-code.default.svc.cluster.local 8000; do echo waiting for llm-nim-code on port 8000; sleep 2; done;'] + command: ['sh', '-c', ' + echo "Waiting for all required services to be ready..."; + + echo "Checking vllm-server..."; + until nc -z vllm-server.default.svc.cluster.local 8001; do + echo "waiting for vllm-server on port 8001"; + sleep 2; + done; + echo "vllm-server is ready!"; + + echo "Checking llm-nim-code..."; + until nc -z llm-nim-code.default.svc.cluster.local 8000; do + echo "waiting for llm-nim-code on port 8000"; + sleep 2; + done; + echo "llm-nim-code is ready!"; + + echo "Checking ollama-safety..."; + until nc -z ollama-safety.default.svc.cluster.local 11434; do + echo "waiting for ollama-safety on port 11434"; + sleep 2; + done; + echo 
"ollama-safety is ready!"; + + echo "All services are ready!"; + '] containers: - name: llama-stack image: llamastack/distribution-starter:0.2.15 @@ -56,7 +78,7 @@ spec: - name: NVIDIA_BASE_URL value: http://llm-nim-code.default.svc.cluster.local:8000 - name: OLLAMA_BASE_URL - value: http://ollama-safety.default.svc.cluster.local:8000 + value: http://ollama-safety.default.svc.cluster.local:11434 - name: POSTGRES_HOST value: postgres-server.default.svc.cluster.local - name: POSTGRES_PORT @@ -69,7 +91,7 @@ spec: value: "${CODE_MODEL}" - name: TAVILY_SEARCH_API_KEY value: "${TAVILY_SEARCH_API_KEY}" - - name: OLLAMA_MODLE + - name: OLLAMA_MODEL value: "${OLLAMA_MODEL}" command: ["/bin/sh"] args: