From 645e55a450eab1c5ba36ca72345ad3754068fc76 Mon Sep 17 00:00:00 2001
From: Kai Wu
Date: Sat, 2 Aug 2025 13:16:35 -0700
Subject: [PATCH] second checkpoint

---
 .../distributions/k8s/install-prometheus.sh   | 38 +++++++++++++++++++
 .../k8s/monitoring-k8s.yaml.template          | 19 +++++-----
 .../k8s/ollama-safety-k8s.yaml.template       |  2 +-
 .../distributions/k8s/stack-k8s.yaml.template | 36 ++++++++++++++----
 4 files changed, 78 insertions(+), 17 deletions(-)
 create mode 100644 docs/source/distributions/k8s/install-prometheus.sh

diff --git a/docs/source/distributions/k8s/install-prometheus.sh b/docs/source/distributions/k8s/install-prometheus.sh
new file mode 100644
index 000000000..5ac40de0a
--- /dev/null
+++ b/docs/source/distributions/k8s/install-prometheus.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+# Script to install prometheus-community/kube-prometheus-stack using Helm
+
+# Exit immediately if a command exits with a non-zero status
+set -e
+
+# Add the Prometheus community Helm repository if it doesn't exist
+if ! helm repo list | grep -q "prometheus-community"; then
+  echo "Adding prometheus-community Helm repository..."
+  helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+fi
+
+# Update Helm repositories
+echo "Updating Helm repositories..."
+helm repo update
+
+# Create namespace for monitoring if it doesn't exist
+if ! kubectl get namespace monitoring &> /dev/null; then
+  echo "Creating monitoring namespace..."
+  kubectl create namespace monitoring
+fi
+
+# Install kube-prometheus-stack
+echo "Installing kube-prometheus-stack..."
+helm install prometheus prometheus-community/kube-prometheus-stack \
+  --namespace monitoring \
+  --set grafana.enabled=true \
+  --set prometheus.enabled=true \
+  --set alertmanager.enabled=true \
+  --set prometheus.service.type=ClusterIP \
+  --set grafana.service.type=ClusterIP \
+  --set alertmanager.service.type=ClusterIP \
+  --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \
+  --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false
+
+echo "kube-prometheus-stack has been installed successfully!"
+echo "To access Grafana UI, run: kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80" +echo "Default Grafana credentials - Username: admin, Password: prom-operator" \ No newline at end of file diff --git a/docs/source/distributions/k8s/monitoring-k8s.yaml.template b/docs/source/distributions/k8s/monitoring-k8s.yaml.template index c9e7dd759..1cd4059f1 100644 --- a/docs/source/distributions/k8s/monitoring-k8s.yaml.template +++ b/docs/source/distributions/k8s/monitoring-k8s.yaml.template @@ -3062,16 +3062,15 @@ spec: port: number: 9090 --- -# NVIDIA DCGM Exporter Deployment for GPU metrics +# NVIDIA DCGM Exporter DaemonSet for GPU metrics apiVersion: apps/v1 -kind: Deployment +kind: DaemonSet metadata: name: dcgm-exporter namespace: monitoring labels: app: dcgm-exporter spec: - replicas: 1 selector: matchLabels: app: dcgm-exporter @@ -3082,7 +3081,7 @@ spec: spec: containers: - name: dcgm-exporter - image: nvcr.io/nvidia/k8s/dcgm-exporter:4.2.3-4.3.0-ubuntu22.04 + image: nvidia/dcgm-exporter:3.2.5-3.1.7-ubuntu20.04 securityContext: runAsNonRoot: false runAsUser: 0 @@ -3093,17 +3092,19 @@ spec: - -f - /etc/dcgm-exporter/dcp-metrics-included.csv volumeMounts: - - name: device-metrics - mountPath: /dev/metrics - name: dcgm-config mountPath: /etc/dcgm-exporter volumes: - - name: device-metrics - hostPath: - path: /dev/metrics - name: dcgm-config configMap: name: dcgm-config + nodeSelector: + kubernetes.io/os: linux + nvidia.com/gpu.present: "true" + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule --- # DCGM Exporter ConfigMap for metrics configuration apiVersion: v1 diff --git a/docs/source/distributions/k8s/ollama-safety-k8s.yaml.template b/docs/source/distributions/k8s/ollama-safety-k8s.yaml.template index 6519977e8..7e7d6ec60 100644 --- a/docs/source/distributions/k8s/ollama-safety-k8s.yaml.template +++ b/docs/source/distributions/k8s/ollama-safety-k8s.yaml.template @@ -63,7 +63,7 @@ spec: apiVersion: v1 kind: Service metadata: - name: ollama-server-safety + name: ollama-safety spec: selector: app.kubernetes.io/name: ollama-safety diff --git a/docs/source/distributions/k8s/stack-k8s.yaml.template b/docs/source/distributions/k8s/stack-k8s.yaml.template index 8e1569e3a..2210bba2e 100644 --- a/docs/source/distributions/k8s/stack-k8s.yaml.template +++ b/docs/source/distributions/k8s/stack-k8s.yaml.template @@ -26,12 +26,34 @@ spec: app.kubernetes.io/component: server spec: initContainers: - - name: wait-for-vllm-server + - name: wait-for-services image: busybox:1.28 - command: ['sh', '-c', 'until nc -z vllm-server.default.svc.cluster.local 8001; do echo waiting for vllm-server on port 8001; sleep 2; done;'] - - name: wait-for-llm-nim-code - image: busybox:1.28 - command: ['sh', '-c', 'until nc -z llm-nim-code.default.svc.cluster.local 8000; do echo waiting for llm-nim-code on port 8000; sleep 2; done;'] + command: ['sh', '-c', ' + echo "Waiting for all required services to be ready..."; + + echo "Checking vllm-server..."; + until nc -z vllm-server.default.svc.cluster.local 8001; do + echo "waiting for vllm-server on port 8001"; + sleep 2; + done; + echo "vllm-server is ready!"; + + echo "Checking llm-nim-code..."; + until nc -z llm-nim-code.default.svc.cluster.local 8000; do + echo "waiting for llm-nim-code on port 8000"; + sleep 2; + done; + echo "llm-nim-code is ready!"; + + echo "Checking ollama-safety..."; + until nc -z ollama-safety.default.svc.cluster.local 11434; do + echo "waiting for ollama-safety on port 11434"; + sleep 2; + done; + echo 
"ollama-safety is ready!"; + + echo "All services are ready!"; + '] containers: - name: llama-stack image: llamastack/distribution-starter:0.2.15 @@ -56,7 +78,7 @@ spec: - name: NVIDIA_BASE_URL value: http://llm-nim-code.default.svc.cluster.local:8000 - name: OLLAMA_BASE_URL - value: http://ollama-safety.default.svc.cluster.local:8000 + value: http://ollama-safety.default.svc.cluster.local:11434 - name: POSTGRES_HOST value: postgres-server.default.svc.cluster.local - name: POSTGRES_PORT @@ -69,7 +91,7 @@ spec: value: "${CODE_MODEL}" - name: TAVILY_SEARCH_API_KEY value: "${TAVILY_SEARCH_API_KEY}" - - name: OLLAMA_MODLE + - name: OLLAMA_MODEL value: "${OLLAMA_MODEL}" command: ["/bin/sh"] args: