second checkpoint

Kai Wu 2025-08-02 13:16:35 -07:00
parent 67f19f76b2
commit 645e55a450
4 changed files with 78 additions and 17 deletions

View file

@@ -0,0 +1,38 @@
#!/bin/bash
# Script to install prometheus-community/kube-prometheus-stack using Helm
# Exit immediately if a command exits with a non-zero status
set -e
# Add the Prometheus community Helm repository if it doesn't exist
if ! helm repo list | grep -q "prometheus-community"; then
echo "Adding prometheus-community Helm repository..."
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
fi
# Update Helm repositories
echo "Updating Helm repositories..."
helm repo update
# Create namespace for monitoring if it doesn't exist
if ! kubectl get namespace monitoring &> /dev/null; then
echo "Creating monitoring namespace..."
kubectl create namespace monitoring
fi
# Install kube-prometheus-stack
echo "Installing kube-prometheus-stack..."
helm install prometheus prometheus-community/kube-prometheus-stack \
--namespace monitoring \
--set grafana.enabled=true \
--set prometheus.enabled=true \
--set alertmanager.enabled=true \
--set prometheus.service.type=ClusterIP \
--set grafana.service.type=ClusterIP \
--set alertmanager.service.type=ClusterIP \
--set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \
--set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false
echo "kube-prometheus-stack has been installed successfully!"
echo "To access Grafana UI, run: kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80"
echo "Default Grafana credentials - Username: admin, Password: prom-operator"

View file

@@ -3062,16 +3062,15 @@ spec:
             port:
               number: 9090
 ---
-# NVIDIA DCGM Exporter Deployment for GPU metrics
+# NVIDIA DCGM Exporter DaemonSet for GPU metrics
 apiVersion: apps/v1
-kind: Deployment
+kind: DaemonSet
 metadata:
   name: dcgm-exporter
   namespace: monitoring
   labels:
     app: dcgm-exporter
 spec:
-  replicas: 1
   selector:
     matchLabels:
       app: dcgm-exporter
@@ -3082,7 +3081,7 @@ spec:
     spec:
       containers:
       - name: dcgm-exporter
-        image: nvcr.io/nvidia/k8s/dcgm-exporter:4.2.3-4.3.0-ubuntu22.04
+        image: nvidia/dcgm-exporter:3.2.5-3.1.7-ubuntu20.04
         securityContext:
           runAsNonRoot: false
           runAsUser: 0
@@ -3093,17 +3092,19 @@ spec:
         - -f
         - /etc/dcgm-exporter/dcp-metrics-included.csv
         volumeMounts:
-        - name: device-metrics
-          mountPath: /dev/metrics
         - name: dcgm-config
          mountPath: /etc/dcgm-exporter
       volumes:
-      - name: device-metrics
-        hostPath:
-          path: /dev/metrics
       - name: dcgm-config
         configMap:
           name: dcgm-config
+      nodeSelector:
+        kubernetes.io/os: linux
+        nvidia.com/gpu.present: "true"
+      tolerations:
+      - key: nvidia.com/gpu
+        operator: Exists
+        effect: NoSchedule
 ---
 # DCGM Exporter ConfigMap for metrics configuration
 apiVersion: v1
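
A sketch for verifying the new DaemonSet lands on GPU nodes and serves metrics (assumes the exporter's default port 9400 and the standard DCGM_FI_DEV_GPU_UTIL metric; the app=dcgm-exporter label comes from the manifest above):

# Confirm one exporter pod per GPU node, then probe the metrics endpoint
kubectl -n monitoring get daemonset dcgm-exporter
POD=$(kubectl -n monitoring get pod -l app=dcgm-exporter -o jsonpath='{.items[0].metadata.name}')
kubectl -n monitoring port-forward "$POD" 9400:9400 &
curl -s http://localhost:9400/metrics | grep DCGM_FI_DEV_GPU_UTIL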

View file

@@ -63,7 +63,7 @@ spec:
 apiVersion: v1
 kind: Service
 metadata:
-  name: ollama-server-safety
+  name: ollama-safety
 spec:
   selector:
     app.kubernetes.io/name: ollama-safety
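
Since the Service was renamed, a minimal in-cluster check that the new name resolves (port 11434 matches the stack config in this commit; the pod name here is hypothetical):

# One-off pod to confirm DNS and the Ollama port
kubectl run nc-test --rm -i --restart=Never --image=busybox:1.28 -- \
  sh -c 'nc -z ollama-safety.default.svc.cluster.local 11434 && echo ok'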

View file

@@ -26,12 +26,34 @@ spec:
         app.kubernetes.io/component: server
     spec:
       initContainers:
-      - name: wait-for-vllm-server
+      - name: wait-for-services
         image: busybox:1.28
-        command: ['sh', '-c', 'until nc -z vllm-server.default.svc.cluster.local 8001; do echo waiting for vllm-server on port 8001; sleep 2; done;']
-      - name: wait-for-llm-nim-code
-        image: busybox:1.28
-        command: ['sh', '-c', 'until nc -z llm-nim-code.default.svc.cluster.local 8000; do echo waiting for llm-nim-code on port 8000; sleep 2; done;']
+        command: ['sh', '-c', '
+          echo "Waiting for all required services to be ready...";
+
+          echo "Checking vllm-server...";
+          until nc -z vllm-server.default.svc.cluster.local 8001; do
+            echo "waiting for vllm-server on port 8001";
+            sleep 2;
+          done;
+          echo "vllm-server is ready!";
+
+          echo "Checking llm-nim-code...";
+          until nc -z llm-nim-code.default.svc.cluster.local 8000; do
+            echo "waiting for llm-nim-code on port 8000";
+            sleep 2;
+          done;
+          echo "llm-nim-code is ready!";
+
+          echo "Checking ollama-safety...";
+          until nc -z ollama-safety.default.svc.cluster.local 11434; do
+            echo "waiting for ollama-safety on port 11434";
+            sleep 2;
+          done;
+          echo "ollama-safety is ready!";
+
+          echo "All services are ready!";
+        ']
       containers:
       - name: llama-stack
         image: llamastack/distribution-starter:0.2.15
@@ -56,7 +78,7 @@ spec:
         - name: NVIDIA_BASE_URL
           value: http://llm-nim-code.default.svc.cluster.local:8000
         - name: OLLAMA_BASE_URL
-          value: http://ollama-safety.default.svc.cluster.local:8000
+          value: http://ollama-safety.default.svc.cluster.local:11434
         - name: POSTGRES_HOST
           value: postgres-server.default.svc.cluster.local
         - name: POSTGRES_PORT
@@ -69,7 +91,7 @@ spec:
           value: "${CODE_MODEL}"
         - name: TAVILY_SEARCH_API_KEY
           value: "${TAVILY_SEARCH_API_KEY}"
-        - name: OLLAMA_MODLE
+        - name: OLLAMA_MODEL
           value: "${OLLAMA_MODEL}"
         command: ["/bin/sh"]
         args:
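
To confirm the corrected OLLAMA_BASE_URL and the renamed OLLAMA_MODEL variable made it into the running container (the "llama-stack" Deployment name is assumed from the container name above):

# Print the two env vars this commit fixes
kubectl exec deploy/llama-stack -- sh -c 'env | grep -E "OLLAMA_(BASE_URL|MODEL)"'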