not working grafana

This commit is contained in:
Kai Wu 2025-07-31 21:18:12 -07:00
parent 2589bf962e
commit a2bbb17fdd
6 changed files with 2424 additions and 0 deletions

View file

@ -88,6 +88,19 @@ if [ "$USE_EBS" = "true" ]; then
envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
# Create monitoring namespace
kubectl create namespace monitoring --dry-run=client -o yaml | kubectl apply -f -
# Install Prometheus Operator CRDs
./install-prometheus-operator.sh
# Apply RBAC for Prometheus
kubectl apply -f ./prometheus-rbac.yaml
# Apply monitoring resources after CRDs are installed
envsubst < ./monitoring-k8s.yaml.template | kubectl apply -f -
kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
--dry-run=client -o yaml > stack-configmap.yaml
@ -105,6 +118,16 @@ else
envsubst < ./postgres-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
envsubst < ./chroma-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
# Create the monitoring namespace idempotently: render the manifest client-side,
# then apply it, so re-running the script does not fail if it already exists.
kubectl create namespace monitoring --dry-run=client -o yaml | kubectl apply -f -
# Install Prometheus Operator CRDs, mirroring the USE_EBS=true branch of this script.
# NOTE(review): monitoring-k8s.yaml.template presumably defines operator custom
# resources that need these CRDs to exist first — confirm against the template.
./install-prometheus-operator.sh
# Apply RBAC for Prometheus
kubectl apply -f ./prometheus-rbac.yaml
# Apply monitoring resources after CRDs are installed
envsubst < ./monitoring-k8s.yaml.template | kubectl apply -f -
kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
--dry-run=client -o yaml > stack-configmap.yaml

View file

@ -37,6 +37,14 @@ set -x
# Delete UI deployment
envsubst < ./ui-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete monitoring resources
envsubst < ./monitoring-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete Prometheus RBAC resources
kubectl delete -f ./prometheus-rbac.yaml --ignore-not-found=true
# Delete ingress
envsubst < ./ingress-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true

View file

@ -17,6 +17,11 @@ spec:
metadata:
labels:
app: llm-nim-code
nim-type: llama-nim
annotations:
prometheus.io/scrape: 'true'
prometheus.io/port: '8000'
prometheus.io/path: '/v1/metrics'
spec:
imagePullSecrets:
- name: ngc-docker-registry # docker-registry secret: nvcr.io / $oauthtoken / <NGC_DOCKER_API_KEY>
@ -42,6 +47,12 @@ spec:
secretKeyRef:
name: ngc-api
key: NGC_API_KEY
- name: NVIDIA_VISIBLE_DEVICES
value: "all"
- name: NVIDIA_DRIVER_CAPABILITIES
value: "compute,utility"
- name: ENABLE_GPU_METRICS
value: "true"
volumeMounts:
- name: model-cache
mountPath: /models # default NIM cache path

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,41 @@
---
# ServiceAccount named "prometheus" in the monitoring namespace.
# It is the subject of the ClusterRoleBinding at the end of this file.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitoring
---
# Cluster-wide, read-only (get/list/watch) access to the core objects and
# ingresses listed below, plus GET on the /metrics non-resource URL.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  # Core API group ("" is the core group).
  - apiGroups: [""]
    resources:
      - nodes
      - nodes/proxy
      - services
      - endpoints
      - pods
    verbs: ["get", "list", "watch"]
  # Ingresses, under both the legacy and the current API group.
  - apiGroups:
      - extensions
      - networking.k8s.io
    resources:
      - ingresses
    verbs: ["get", "list", "watch"]
  # Non-resource endpoint; only GET is granted.
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]
---
# Grants the "prometheus" ClusterRole to the "prometheus" ServiceAccount
# in the monitoring namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: monitoring

View file

@ -23,7 +23,12 @@ spec:
metadata:
labels:
app.kubernetes.io/name: vllm
app: vllm
workload-type: inference
annotations:
prometheus.io/scrape: 'true'
prometheus.io/port: '8001'
prometheus.io/path: '/metrics'
spec:
# Removed nodeSelector for GPU nodes as they don't appear to exist in the cluster
# If you have GPU nodes with a different label, you can uncomment and modify this section
@ -45,6 +50,7 @@ spec:
key: token
ports:
- containerPort: 8001
name: http
resources:
limits:
nvidia.com/gpu: 1
@ -69,4 +75,5 @@ spec:
- protocol: TCP
port: 8001
targetPort: 8001
name: http
type: ClusterIP