Grafana not working

2025-10-23 08:33:09 +00:00 · 2025-07-31 21:18:12 -07:00 · 2025-07-31 21:18:12 -07:00 · a2bbb17fdd
commit a2bbb17fdd
parent 2589bf962e
6 changed files with 2424 additions and 0 deletions
--- a/docs/source/distributions/k8s/apply.sh
+++ b/docs/source/distributions/k8s/apply.sh
@ -88,6 +88,19 @@ if [ "$USE_EBS" = "true" ]; then
  envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
  envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -

+
+  # Create monitoring namespace
+  kubectl create namespace monitoring --dry-run=client -o yaml | kubectl apply -f -
+
+  # Install Prometheus Operator CRDs
+  ./install-prometheus-operator.sh
+
+  # Apply RBAC for Prometheus
+  kubectl apply -f ./prometheus-rbac.yaml
+
+  # Apply monitoring resources after CRDs are installed
+  envsubst < ./monitoring-k8s.yaml.template | kubectl apply -f -
+
  kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
    --dry-run=client -o yaml > stack-configmap.yaml

@ -105,6 +118,19 @ else
  envsubst < ./postgres-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
  envsubst < ./chroma-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -

+  # Create monitoring namespace (dry-run render piped to apply, so re-runs do not fail)
+  kubectl create namespace monitoring --dry-run=client -o yaml | kubectl apply -f -
+
+  # Install Prometheus Operator CRDs (same step as the USE_EBS=true branch);
+  # without it the "after CRDs are installed" step below has nothing to rely on
+  ./install-prometheus-operator.sh
+
+  # Apply RBAC for Prometheus
+  kubectl apply -f ./prometheus-rbac.yaml
+
+  # Apply monitoring resources after CRDs are installed
+  envsubst < ./monitoring-k8s.yaml.template | kubectl apply -f -
+
  kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
    --dry-run=client -o yaml > stack-configmap.yaml

--- a/docs/source/distributions/k8s/delete.sh
+++ b/docs/source/distributions/k8s/delete.sh
@ -37,6 +37,14 @@ set -x
 # Delete UI deployment
 envsubst < ./ui-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true

+# Delete monitoring resources
+envsubst < ./monitoring-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
+
+# Delete Prometheus RBAC resources
+kubectl delete -f ./prometheus-rbac.yaml --ignore-not-found=true
+
+
+
 # Delete ingress
 envsubst < ./ingress-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true

--- a/docs/source/distributions/k8s/llama-nim.yaml.template
+++ b/docs/source/distributions/k8s/llama-nim.yaml.template
@ -17,6 +17,11 @@ spec:
    metadata:
      labels:
        app: llm-nim-code
+        nim-type: llama-nim
+      annotations:
+        prometheus.io/scrape: 'true'
+        prometheus.io/port: '8000'
+        prometheus.io/path: '/v1/metrics'
    spec:
      imagePullSecrets:
        - name: ngc-docker-registry          # docker-registry secret: nvcr.io / $oauthtoken / <NGC_DOCKER_API_KEY>
@ -42,6 +47,12 @@ spec:
                secretKeyRef:
                  name: ngc-api
                  key: NGC_API_KEY
+            - name: NVIDIA_VISIBLE_DEVICES
+              value: "all"
+            - name: NVIDIA_DRIVER_CAPABILITIES
+              value: "compute,utility"
+            - name: ENABLE_GPU_METRICS
+              value: "true"
          volumeMounts:
            - name: model-cache
              mountPath: /models       # default NIM cache path
--- a/docs/source/distributions/k8s/monitoring-k8s.yaml.template
+++ b/docs/source/distributions/k8s/monitoring-k8s.yaml.template
--- a/docs/source/distributions/k8s/prometheus-rbac.yaml
+++ b/docs/source/distributions/k8s/prometheus-rbac.yaml
@ -0,0 +1,41 @@
+---  # RBAC objects for the "prometheus" ServiceAccount in the monitoring namespace
+apiVersion: v1
+kind: ServiceAccount  # identity that the ClusterRoleBinding below lists as its subject
+metadata:
+  name: prometheus
+  namespace: monitoring  # apply.sh creates this namespace before applying this file
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole  # cluster-scoped set of read-only rules
+metadata:
+  name: prometheus
+rules:
+- apiGroups: [""]  # "" selects the core API group
+  resources:
+  - nodes
+  - nodes/proxy
+  - services
+  - endpoints
+  - pods
+  verbs: ["get", "list", "watch"]  # read-only access
+- apiGroups:
+  - extensions
+  - networking.k8s.io
+  resources:
+  - ingresses
+  verbs: ["get", "list", "watch"]  # read-only access
+- nonResourceURLs: ["/metrics"]  # an HTTP path rather than an API resource
+  verbs: ["get"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding  # grants the ClusterRole above to the ServiceAccount above
+metadata:
+  name: prometheus
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: prometheus  # refers to the ClusterRole defined in this file
+subjects:
+- kind: ServiceAccount
+  name: prometheus
+  namespace: monitoring  # matches the ServiceAccount's namespace declared above
--- a/docs/source/distributions/k8s/vllm-k8s.yaml.template
+++ b/docs/source/distributions/k8s/vllm-k8s.yaml.template
@ -23,7 +23,12 @@ spec:
    metadata:
      labels:
        app.kubernetes.io/name: vllm
+        app: vllm
        workload-type: inference
+      annotations:
+        prometheus.io/scrape: 'true'
+        prometheus.io/port: '8001'
+        prometheus.io/path: '/metrics'
    spec:
      # Removed nodeSelector for GPU nodes as they don't appear to exist in the cluster
      # If you have GPU nodes with a different label, you can uncomment and modify this section
@ -45,6 +50,7 @@ spec:
              key: token
        ports:
          - containerPort: 8001
+            name: http
        resources:
          limits:
            nvidia.com/gpu: 1
@ -69,4 +75,5 @@ spec:
  - protocol: TCP
    port: 8001
    targetPort: 8001
+    name: http
  type: ClusterIP