Grafana not working

2025-08-02 08:44:44 +00:00 · 2025-07-31 21:18:12 -07:00 · 2025-07-31 21:18:12 -07:00 · a2bbb17fdd
commit a2bbb17fdd
parent 2589bf962e
6 changed files with 2424 additions and 0 deletions
--- a/docs/source/distributions/k8s/apply.sh
+++ b/docs/source/distributions/k8s/apply.sh
@ -88,6 +88,19 @@ if [ "$USE_EBS" = "true" ]; then
  envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
  envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
  # Monitoring setup.
  # Create the "monitoring" namespace. The manifest is rendered client-side
  # (--dry-run=client -o yaml) and piped to `kubectl apply`, so re-running the
  # script does not fail when the namespace already exists.
  kubectl create namespace monitoring --dry-run=client -o yaml | kubectl apply -f -
  # Install Prometheus Operator CRDs via the helper script. This has to run
  # before the monitoring template below is applied.
  ./install-prometheus-operator.sh
  # Apply RBAC for Prometheus (manifest: prometheus-rbac.yaml)
  kubectl apply -f ./prometheus-rbac.yaml
  # Expand environment variables in the monitoring template with envsubst and
  # apply the result, now that the CRDs are installed.
  envsubst < ./monitoring-k8s.yaml.template | kubectl apply -f -
  kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
    --dry-run=client -o yaml > stack-configmap.yaml
@ -105,6 +118,16 @@ else
  envsubst < ./postgres-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
  envsubst < ./chroma-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
  # Monitoring setup.
  # Create the "monitoring" namespace. Rendering the manifest client-side and
  # piping it to `kubectl apply` keeps this step safe to re-run.
  kubectl create namespace monitoring --dry-run=client -o yaml | kubectl apply -f -
  # Install Prometheus Operator CRDs. The USE_EBS branch of this script runs
  # the same helper at this point; without it the "after CRDs are installed"
  # step below has nothing guaranteeing that the CRDs exist.
  ./install-prometheus-operator.sh
  # Apply RBAC for Prometheus (manifest: prometheus-rbac.yaml)
  kubectl apply -f ./prometheus-rbac.yaml
  # Apply monitoring resources after CRDs are installed
  envsubst < ./monitoring-k8s.yaml.template | kubectl apply -f -
  kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
    --dry-run=client -o yaml > stack-configmap.yaml
--- a/docs/source/distributions/k8s/delete.sh
+++ b/docs/source/distributions/k8s/delete.sh
@ -37,6 +37,14 @@ set -x
 # Delete UI deployment
 envsubst < ./ui-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
 # Delete monitoring resources: expand the template with envsubst and delete
 # everything it defines. --ignore-not-found=true keeps the command from
 # failing when a resource is already gone.
 envsubst < ./monitoring-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
 # Delete Prometheus RBAC resources defined in prometheus-rbac.yaml
 kubectl delete -f ./prometheus-rbac.yaml --ignore-not-found=true
 # Delete ingress
 envsubst < ./ingress-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
--- a/docs/source/distributions/k8s/llama-nim.yaml.template
+++ b/docs/source/distributions/k8s/llama-nim.yaml.template
@ -17,6 +17,11 @@ spec:
    metadata:
      labels:
        app: llm-nim-code
        nim-type: llama-nim
      annotations:
        prometheus.io/scrape: 'true'
        prometheus.io/port: '8000'
        prometheus.io/path: '/v1/metrics'
    spec:
      imagePullSecrets:
        - name: ngc-docker-registry          # docker-registry secret: nvcr.io / $oauthtoken / <NGC_DOCKER_API_KEY>
@ -42,6 +47,12 @@ spec:
                secretKeyRef:
                  name: ngc-api
                  key: NGC_API_KEY
            - name: NVIDIA_VISIBLE_DEVICES
              value: "all"
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: "compute,utility"
            - name: ENABLE_GPU_METRICS
              value: "true"
          volumeMounts:
            - name: model-cache
              mountPath: /models       # default NIM cache path
--- a/docs/source/distributions/k8s/monitoring-k8s.yaml.template
+++ b/docs/source/distributions/k8s/monitoring-k8s.yaml.template
--- a/docs/source/distributions/k8s/prometheus-rbac.yaml
+++ b/docs/source/distributions/k8s/prometheus-rbac.yaml
@ -0,0 +1,41 @@
 ---
 # RBAC for Prometheus: a ServiceAccount in the "monitoring" namespace, a
 # cluster-wide read-only ClusterRole, and the binding that ties them together.
 #
 # ServiceAccount named "prometheus" in the "monitoring" namespace.
 apiVersion: v1
 kind: ServiceAccount
 metadata:
   name: prometheus
   namespace: monitoring
 ---
 # ClusterRole granting read-only (get/list/watch) access to the listed
 # resources, plus GET on the non-resource URL /metrics.
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
   name: prometheus
 rules:
 # Core API group ("").
 - apiGroups: [""]
   resources:
   - nodes
   - nodes/proxy
   - services
   - endpoints
   - pods
   verbs: ["get", "list", "watch"]
 # Ingresses. "extensions" is the legacy API group for Ingress (no longer
 # served as of Kubernetes 1.22); "networking.k8s.io" is the current one.
 - apiGroups:
   - extensions
   - networking.k8s.io
   resources:
   - ingresses
   verbs: ["get", "list", "watch"]
 # Non-resource endpoint: allow GET on /metrics.
 - nonResourceURLs: ["/metrics"]
   verbs: ["get"]
 ---
 # Bind the "prometheus" ClusterRole to the "prometheus" ServiceAccount in the
 # "monitoring" namespace.
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
 metadata:
   name: prometheus
 roleRef:
   apiGroup: rbac.authorization.k8s.io
   kind: ClusterRole
   name: prometheus
 subjects:
 - kind: ServiceAccount
   name: prometheus
   namespace: monitoring
--- a/docs/source/distributions/k8s/vllm-k8s.yaml.template
+++ b/docs/source/distributions/k8s/vllm-k8s.yaml.template
@ -23,7 +23,12 @@ spec:
    metadata:
      labels:
        app.kubernetes.io/name: vllm
        app: vllm
        workload-type: inference
      annotations:
        prometheus.io/scrape: 'true'
        prometheus.io/port: '8001'
        prometheus.io/path: '/metrics'
    spec:
      # Removed nodeSelector for GPU nodes as they don't appear to exist in the cluster
      # If you have GPU nodes with a different label, you can uncomment and modify this section
@ -45,6 +50,7 @@ spec:
              key: token
        ports:
          - containerPort: 8001
            name: http
        resources:
          limits:
            nvidia.com/gpu: 1
@ -69,4 +75,5 @@ spec:
  - protocol: TCP
    port: 8001
    targetPort: 8001
    name: http
  type: ClusterIP