mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-08-02 16:54:42 +00:00
temp checkpoint
This commit is contained in:
parent
a2bbb17fdd
commit
67f19f76b2
7 changed files with 1032 additions and 25 deletions
|
@ -18,7 +18,7 @@ export POSTGRES_PASSWORD=llamastack
|
||||||
|
|
||||||
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
|
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
|
||||||
export CODE_MODEL=bigcode/starcoder2-7b
|
export CODE_MODEL=bigcode/starcoder2-7b
|
||||||
|
export OLLAMA_MODEL=llama-guard3:1b
|
||||||
# Set USE_EBS to false if you don't have permission to use EKS EBS
|
# Set USE_EBS to false if you don't have permission to use EKS EBS
|
||||||
export USE_EBS=${USE_EBS:-false}
|
export USE_EBS=${USE_EBS:-false}
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
@ -44,10 +44,6 @@ else
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ -z "${LLAMA_STACK_UI_URL:-}" ]; then
|
|
||||||
echo "ERROR: LLAMA_STACK_UI_URL not set. Should be set to the external URL of the UI (excluding port). You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -84,6 +80,7 @@ echo "Secret verification successful. All required secrets are present."
|
||||||
if [ "$USE_EBS" = "true" ]; then
|
if [ "$USE_EBS" = "true" ]; then
|
||||||
echo "Using EBS storage for persistent volumes"
|
echo "Using EBS storage for persistent volumes"
|
||||||
envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
|
envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
|
||||||
|
envsubst < ./ollama-safety-k8s.yaml.template | kubectl apply -f -
|
||||||
envsubst < ./llama-nim.yaml.template | kubectl apply -f -
|
envsubst < ./llama-nim.yaml.template | kubectl apply -f -
|
||||||
envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
|
envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
|
||||||
envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
|
envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
|
||||||
|
@ -114,6 +111,7 @@ else
|
||||||
echo "Using emptyDir for storage (data will not persist across pod restarts)"
|
echo "Using emptyDir for storage (data will not persist across pod restarts)"
|
||||||
# Process templates to replace EBS storage with emptyDir
|
# Process templates to replace EBS storage with emptyDir
|
||||||
envsubst < ./vllm-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
|
envsubst < ./vllm-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
|
||||||
|
envsubst < ./ollama-safety-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
|
||||||
envsubst < ./llama-nim.yaml.template | kubectl apply -f -
|
envsubst < ./llama-nim.yaml.template | kubectl apply -f -
|
||||||
envsubst < ./postgres-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
|
envsubst < ./postgres-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
|
||||||
envsubst < ./chroma-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
|
envsubst < ./chroma-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
|
||||||
|
|
|
@ -29,48 +29,130 @@ if [ -n "${NGC_API_KEY:-}" ]; then
|
||||||
export NGC_DOCKER_CONFIG_JSON=$(echo -n "$NGC_DOCKER_CONFIG" | base64)
|
export NGC_DOCKER_CONFIG_JSON=$(echo -n "$NGC_DOCKER_CONFIG" | base64)
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Define namespace - default to current namespace if not specified
|
||||||
|
export NAMESPACE=${NAMESPACE:-$(kubectl config view --minify -o jsonpath='{..namespace}')}
|
||||||
|
if [ -z "$NAMESPACE" ]; then
|
||||||
|
export NAMESPACE="default"
|
||||||
|
fi
|
||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
set -x
|
set -x
|
||||||
|
|
||||||
# Delete resources in reverse order of creation to handle dependencies properly
|
# Delete resources in reverse order of creation to handle dependencies properly
|
||||||
|
|
||||||
# Delete UI deployment
|
echo "Starting comprehensive deletion of all LlamaStack resources..."
|
||||||
|
|
||||||
|
# Delete UI deployment and service
|
||||||
|
echo "Deleting UI resources..."
|
||||||
envsubst < ./ui-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
envsubst < ./ui-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
||||||
|
# Check for UI service template and delete if exists
|
||||||
|
if [ -f "./ui-service-k8s.yaml.template" ]; then
|
||||||
|
envsubst < ./ui-service-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
||||||
|
fi
|
||||||
|
|
||||||
# Delete monitoring resources
|
# Delete monitoring resources
|
||||||
|
echo "Deleting monitoring resources..."
|
||||||
envsubst < ./monitoring-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
envsubst < ./monitoring-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
||||||
|
|
||||||
# Delete Prometheus RBAC resources
|
# Delete Prometheus RBAC resources
|
||||||
|
echo "Deleting Prometheus RBAC resources..."
|
||||||
kubectl delete -f ./prometheus-rbac.yaml --ignore-not-found=true
|
kubectl delete -f ./prometheus-rbac.yaml --ignore-not-found=true
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Delete ingress
|
# Delete ingress
|
||||||
|
echo "Deleting ingress resources..."
|
||||||
envsubst < ./ingress-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
envsubst < ./ingress-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
||||||
|
|
||||||
# Delete stack deployment
|
# Delete stack deployment
|
||||||
|
echo "Deleting stack deployment..."
|
||||||
envsubst < ./stack-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
envsubst < ./stack-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
||||||
|
|
||||||
# Delete configmap
|
# Delete configmaps
|
||||||
|
echo "Deleting configmaps..."
|
||||||
kubectl delete configmap llama-stack-config --ignore-not-found=true
|
kubectl delete configmap llama-stack-config --ignore-not-found=true
|
||||||
|
# Check for stack configmap and delete if exists
|
||||||
|
if [ -f "./stack-configmap.yaml" ]; then
|
||||||
|
kubectl delete -f ./stack-configmap.yaml --ignore-not-found=true
|
||||||
|
fi
|
||||||
|
|
||||||
# Delete chroma deployment
|
# Delete chroma deployment
|
||||||
|
echo "Deleting chroma deployment..."
|
||||||
envsubst < ./chroma-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
envsubst < ./chroma-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
||||||
|
|
||||||
# Delete postgres deployment
|
# Delete postgres deployment
|
||||||
|
echo "Deleting postgres deployment..."
|
||||||
envsubst < ./postgres-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
envsubst < ./postgres-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
||||||
|
|
||||||
# Delete vllm-safety deployment
|
# Delete llama-nim deployment
|
||||||
|
echo "Deleting llama-nim deployment..."
|
||||||
envsubst < ./llama-nim.yaml.template | kubectl delete -f - --ignore-not-found=true
|
envsubst < ./llama-nim.yaml.template | kubectl delete -f - --ignore-not-found=true
|
||||||
|
|
||||||
|
|
||||||
|
# Delete ollama-safety deployment
|
||||||
|
echo "Deleting ollama-safety deployment..."
|
||||||
|
envsubst < ./ollama-safety-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
||||||
|
|
||||||
# Delete vllm deployment
|
# Delete vllm deployment
|
||||||
|
echo "Deleting vllm deployment..."
|
||||||
envsubst < ./vllm-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
envsubst < ./vllm-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
||||||
|
|
||||||
# Delete the HF token secret if it exists
|
# Delete the HF token secret if it exists
|
||||||
if [ -n "${HF_TOKEN:-}" ]; then
|
if [ -n "${HF_TOKEN:-}" ]; then
|
||||||
|
echo "Deleting HF token secret..."
|
||||||
envsubst < ./set-secret.yaml.template | kubectl delete -f - --ignore-not-found=true
|
envsubst < ./set-secret.yaml.template | kubectl delete -f - --ignore-not-found=true
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# NGC API key secrets are now part of llama-nim.yaml.template and are deleted with it
|
# Delete any other template files that might exist
|
||||||
|
echo "Checking for additional template files..."
|
||||||
|
for template in ./*.yaml.template; do
|
||||||
|
if [ -f "$template" ]; then
|
||||||
|
# Skip templates we've already processed
|
||||||
|
if [[ "$template" != "./ui-k8s.yaml.template" &&
|
||||||
|
"$template" != "./monitoring-k8s.yaml.template" &&
|
||||||
|
"$template" != "./ingress-k8s.yaml.template" &&
|
||||||
|
"$template" != "./stack-k8s.yaml.template" &&
|
||||||
|
"$template" != "./chroma-k8s.yaml.template" &&
|
||||||
|
"$template" != "./postgres-k8s.yaml.template" &&
|
||||||
|
"$template" != "./llama-nim.yaml.template" &&
|
||||||
|
"$template" != "./vllm-safety-k8s.yaml.template" &&
|
||||||
|
"$template" != "./ollama-safety-k8s.yaml.template" &&
|
||||||
|
"$template" != "./vllm-k8s.yaml.template" &&
|
||||||
|
"$template" != "./set-secret.yaml.template" &&
|
||||||
|
"$template" != "./ui-service-k8s.yaml.template" ]]; then
|
||||||
|
echo "Deleting resources from $template..."
|
||||||
|
envsubst < "$template" | kubectl delete -f - --ignore-not-found=true
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
echo "All LlamaStack Kubernetes resources have been deleted."
|
# Delete any PersistentVolumeClaims created by the stack
|
||||||
|
echo "Deleting PersistentVolumeClaims..."
|
||||||
|
kubectl delete pvc -l app=llama-stack --ignore-not-found=true
|
||||||
|
kubectl delete pvc -l app=chroma --ignore-not-found=true
|
||||||
|
kubectl delete pvc -l app=postgres --ignore-not-found=true
|
||||||
|
kubectl delete pvc -l app=vllm --ignore-not-found=true
|
||||||
|
kubectl delete pvc -l app.kubernetes.io/name=ollama-safety --ignore-not-found=true
|
||||||
|
|
||||||
|
# Delete any remaining services
|
||||||
|
echo "Deleting any remaining services..."
|
||||||
|
kubectl delete service -l app=llama-stack --ignore-not-found=true
|
||||||
|
kubectl delete service -l app=chroma --ignore-not-found=true
|
||||||
|
kubectl delete service -l app=postgres --ignore-not-found=true
|
||||||
|
kubectl delete service -l app=vllm --ignore-not-found=true
|
||||||
|
kubectl delete service -l app=llama-nim --ignore-not-found=true
|
||||||
|
kubectl delete service -l app.kubernetes.io/name=ollama-safety --ignore-not-found=true
|
||||||
|
|
||||||
|
# Delete any remaining secrets
|
||||||
|
echo "Deleting any remaining secrets..."
|
||||||
|
kubectl delete secret hf-secret --ignore-not-found=true
|
||||||
|
kubectl delete secret ngc-secret --ignore-not-found=true
|
||||||
|
kubectl delete secret -l app=llama-stack --ignore-not-found=true
|
||||||
|
|
||||||
|
# Verify no resources remain
|
||||||
|
echo "Verifying deletion..."
|
||||||
|
REMAINING_RESOURCES=$(kubectl get all -l app=llama-stack 2>/dev/null)
|
||||||
|
if [ -z "$REMAINING_RESOURCES" ]; then
|
||||||
|
echo "All LlamaStack Kubernetes resources have been successfully deleted."
|
||||||
|
else
|
||||||
|
echo "Some LlamaStack resources may still exist. Please check manually with:"
|
||||||
|
echo "kubectl get all -l app=llama-stack"
|
||||||
|
fi
|
||||||
|
|
|
@ -18,7 +18,7 @@ data:
|
||||||
|
|
||||||
scrape_configs:
|
scrape_configs:
|
||||||
# NVIDIA DCGM exporter for GPU metrics
|
# NVIDIA DCGM exporter for GPU metrics
|
||||||
- job_name: 'nvidia-dcgm-exporter'
|
- job_name: 'dcgm'
|
||||||
static_configs:
|
static_configs:
|
||||||
- targets: ['dcgm-exporter:9400']
|
- targets: ['dcgm-exporter:9400']
|
||||||
|
|
||||||
|
@ -98,6 +98,30 @@ data:
|
||||||
- targets: ['llm-nim-code:8000']
|
- targets: ['llm-nim-code:8000']
|
||||||
metrics_path: /v1/metrics
|
metrics_path: /v1/metrics
|
||||||
scrape_interval: 5s
|
scrape_interval: 5s
|
||||||
|
|
||||||
|
# Specific job for Ollama pods
|
||||||
|
- job_name: 'ollama-pods'
|
||||||
|
kubernetes_sd_configs:
|
||||||
|
- role: pod
|
||||||
|
relabel_configs:
|
||||||
|
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
|
||||||
|
regex: ollama-safety
|
||||||
|
action: keep
|
||||||
|
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
|
||||||
|
action: replace
|
||||||
|
target_label: __metrics_path__
|
||||||
|
regex: (.+)
|
||||||
|
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
|
||||||
|
action: replace
|
||||||
|
regex: ([^:]+)(?::\d+)?;(\d+)
|
||||||
|
replacement: $1:$2
|
||||||
|
target_label: __address__
|
||||||
|
- source_labels: [__meta_kubernetes_namespace]
|
||||||
|
action: replace
|
||||||
|
target_label: kubernetes_namespace
|
||||||
|
- source_labels: [__meta_kubernetes_pod_name]
|
||||||
|
action: replace
|
||||||
|
target_label: kubernetes_pod_name
|
||||||
---
|
---
|
||||||
# Prometheus Deployment
|
# Prometheus Deployment
|
||||||
apiVersion: apps/v1
|
apiVersion: apps/v1
|
||||||
|
@ -262,9 +286,9 @@ data:
|
||||||
"spaceLength": 10,
|
"spaceLength": 10,
|
||||||
"stack": false,
|
"stack": false,
|
||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "DCGM_FI_DEV_GPU_UTIL{pod=~\"llm-nim-code.*\"}",
|
"expr": "DCGM_FI_DEV_GPU_UTIL{instance=\"dcgm-exporter:9400\", pod=~\"(llm-nim-code|vllm).*\"}",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "GPU {{gpu}} - {{pod}}"
|
"legendFormat": "GPU {{gpu}} - {{pod}}"
|
||||||
}
|
}
|
||||||
|
@ -351,7 +375,7 @@ data:
|
||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "DCGM_FI_DEV_FB_USED{pod=~\"llm-nim-code.*\"} / 1024 / 1024 / 1024",
|
"expr": "DCGM_FI_DEV_FB_USED{instance=\"dcgm-exporter:9400\", pod=~\"(llm-nim-code|vllm).*\"} / 1024 / 1024 / 1024",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "GPU {{gpu}} - {{pod}}"
|
"legendFormat": "GPU {{gpu}} - {{pod}}"
|
||||||
}
|
}
|
||||||
|
@ -438,7 +462,7 @@ data:
|
||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "DCGM_FI_DEV_POWER_USAGE{pod=~\"llm-nim-code.*\"}",
|
"expr": "DCGM_FI_DEV_POWER_USAGE{instance=\"dcgm-exporter:9400\", pod=~\"(llm-nim-code|vllm).*\"}",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "GPU {{gpu}} - {{pod}}"
|
"legendFormat": "GPU {{gpu}} - {{pod}}"
|
||||||
}
|
}
|
||||||
|
@ -525,7 +549,7 @@ data:
|
||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "DCGM_FI_DEV_GPU_TEMP{pod=~\"llm-nim-code.*\"}",
|
"expr": "DCGM_FI_DEV_GPU_TEMP{instance=\"dcgm-exporter:9400\", pod=~\"(llm-nim-code|vllm).*\"}",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "GPU {{gpu}} - {{pod}}"
|
"legendFormat": "GPU {{gpu}} - {{pod}}"
|
||||||
}
|
}
|
||||||
|
@ -954,6 +978,763 @@ data:
|
||||||
"uid": "nim-dashboard",
|
"uid": "nim-dashboard",
|
||||||
"version": 1
|
"version": 1
|
||||||
}
|
}
|
||||||
|
ollama-dashboard.json: |
|
||||||
|
{
|
||||||
|
"annotations": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"builtIn": 1,
|
||||||
|
"datasource": "-- Grafana --",
|
||||||
|
"enable": true,
|
||||||
|
"hide": true,
|
||||||
|
"iconColor": "rgba(0, 211, 255, 1)",
|
||||||
|
"name": "Annotations & Alerts",
|
||||||
|
"type": "dashboard"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"editable": true,
|
||||||
|
"gnetId": null,
|
||||||
|
"graphTooltip": 0,
|
||||||
|
"id": 3,
|
||||||
|
"links": [],
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"aliasColors": {},
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fill": 1,
|
||||||
|
"fillGradient": 0,
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 0
|
||||||
|
},
|
||||||
|
"hiddenSeries": false,
|
||||||
|
"id": 1,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": true,
|
||||||
|
"max": true,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": true
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"nullPointMode": "null",
|
||||||
|
"options": {
|
||||||
|
"dataLinks": []
|
||||||
|
},
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 2,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"stack": false,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(ollama_requests_total{kubernetes_pod_name=~\"ollama-safety.*\"}[1m])",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{kubernetes_pod_name}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeRegions": [],
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Ollama Request Rate",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 0,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": []
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "reqps",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": "0",
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"yaxis": {
|
||||||
|
"align": false,
|
||||||
|
"alignLevel": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aliasColors": {},
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fill": 1,
|
||||||
|
"fillGradient": 0,
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 0
|
||||||
|
},
|
||||||
|
"hiddenSeries": false,
|
||||||
|
"id": 2,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": true,
|
||||||
|
"max": true,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": true
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"nullPointMode": "null",
|
||||||
|
"options": {
|
||||||
|
"dataLinks": []
|
||||||
|
},
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 2,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"stack": false,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "histogram_quantile(0.95, sum(rate(ollama_request_duration_seconds_bucket{kubernetes_pod_name=~\"ollama-safety.*\"}[5m])) by (le))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "p95"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "histogram_quantile(0.50, sum(rate(ollama_request_duration_seconds_bucket{kubernetes_pod_name=~\"ollama-safety.*\"}[5m])) by (le))",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "p50"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeRegions": [],
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Ollama Request Latency",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 0,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": []
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "s",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": "0",
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"yaxis": {
|
||||||
|
"align": false,
|
||||||
|
"alignLevel": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aliasColors": {},
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fill": 1,
|
||||||
|
"fillGradient": 0,
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 8
|
||||||
|
},
|
||||||
|
"hiddenSeries": false,
|
||||||
|
"id": 3,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": true,
|
||||||
|
"max": true,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": true
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"nullPointMode": "null",
|
||||||
|
"options": {
|
||||||
|
"dataLinks": []
|
||||||
|
},
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 2,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"stack": false,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "ollama_active_requests{kubernetes_pod_name=~\"ollama-safety.*\"}",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{kubernetes_pod_name}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeRegions": [],
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Ollama Active Requests",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 0,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": []
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": "0",
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"yaxis": {
|
||||||
|
"align": false,
|
||||||
|
"alignLevel": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aliasColors": {},
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fill": 1,
|
||||||
|
"fillGradient": 0,
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 8
|
||||||
|
},
|
||||||
|
"hiddenSeries": false,
|
||||||
|
"id": 4,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": true,
|
||||||
|
"max": true,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": true
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"nullPointMode": "null",
|
||||||
|
"options": {
|
||||||
|
"dataLinks": []
|
||||||
|
},
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 2,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"stack": false,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "ollama_tokens_per_second{kubernetes_pod_name=~\"ollama-safety.*\"}",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{kubernetes_pod_name}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeRegions": [],
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Ollama Tokens Per Second",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 0,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": []
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": "0",
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"yaxis": {
|
||||||
|
"align": false,
|
||||||
|
"alignLevel": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aliasColors": {},
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fill": 1,
|
||||||
|
"fillGradient": 0,
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 16
|
||||||
|
},
|
||||||
|
"hiddenSeries": false,
|
||||||
|
"id": 5,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": true,
|
||||||
|
"max": true,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": true
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"nullPointMode": "null",
|
||||||
|
"options": {
|
||||||
|
"dataLinks": []
|
||||||
|
},
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 2,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"stack": false,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(kubernetes_pod_name) (container_memory_usage_bytes{pod=~\"ollama-safety.*\"})",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{kubernetes_pod_name}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeRegions": [],
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Ollama Memory Usage",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 0,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": []
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "bytes",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": "0",
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"yaxis": {
|
||||||
|
"align": false,
|
||||||
|
"alignLevel": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aliasColors": {},
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fill": 1,
|
||||||
|
"fillGradient": 0,
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 16
|
||||||
|
},
|
||||||
|
"hiddenSeries": false,
|
||||||
|
"id": 6,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": true,
|
||||||
|
"max": true,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": true
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"nullPointMode": "null",
|
||||||
|
"options": {
|
||||||
|
"dataLinks": []
|
||||||
|
},
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 2,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"stack": false,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(kubernetes_pod_name) (rate(container_cpu_usage_seconds_total{pod=~\"ollama-safety.*\"}[5m]))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{kubernetes_pod_name}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeRegions": [],
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Ollama CPU Usage",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 0,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": []
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": "0",
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"yaxis": {
|
||||||
|
"align": false,
|
||||||
|
"alignLevel": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aliasColors": {},
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fill": 1,
|
||||||
|
"fillGradient": 0,
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 24
|
||||||
|
},
|
||||||
|
"hiddenSeries": false,
|
||||||
|
"id": 7,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": true,
|
||||||
|
"max": true,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": true
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"nullPointMode": "null",
|
||||||
|
"options": {
|
||||||
|
"dataLinks": []
|
||||||
|
},
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 2,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"stack": false,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "ollama_gpu_memory_used_bytes{kubernetes_pod_name=~\"ollama-safety.*\"}",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "Used - {{kubernetes_pod_name}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "ollama_gpu_memory_total_bytes{kubernetes_pod_name=~\"ollama-safety.*\"}",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "Total - {{kubernetes_pod_name}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeRegions": [],
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Ollama GPU Memory",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 0,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": []
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "bytes",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": "0",
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"yaxis": {
|
||||||
|
"align": false,
|
||||||
|
"alignLevel": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aliasColors": {},
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fill": 1,
|
||||||
|
"fillGradient": 0,
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 24
|
||||||
|
},
|
||||||
|
"hiddenSeries": false,
|
||||||
|
"id": 8,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": true,
|
||||||
|
"max": true,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": true
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"nullPointMode": "null",
|
||||||
|
"options": {
|
||||||
|
"dataLinks": []
|
||||||
|
},
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 2,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"stack": false,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "ollama_gpu_utilization{kubernetes_pod_name=~\"ollama-safety.*\"}",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{kubernetes_pod_name}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeRegions": [],
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Ollama GPU Utilization",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 0,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": []
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "percent",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": "100",
|
||||||
|
"min": "0",
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"yaxis": {
|
||||||
|
"align": false,
|
||||||
|
"alignLevel": null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"schemaVersion": 22,
|
||||||
|
"style": "dark",
|
||||||
|
"tags": [],
|
||||||
|
"templating": {
|
||||||
|
"list": []
|
||||||
|
},
|
||||||
|
"time": {
|
||||||
|
"from": "now-6h",
|
||||||
|
"to": "now"
|
||||||
|
},
|
||||||
|
"timepicker": {
|
||||||
|
"refresh_intervals": [
|
||||||
|
"5s",
|
||||||
|
"10s",
|
||||||
|
"30s",
|
||||||
|
"1m",
|
||||||
|
"5m",
|
||||||
|
"15m",
|
||||||
|
"30m",
|
||||||
|
"1h",
|
||||||
|
"2h",
|
||||||
|
"1d"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"timezone": "",
|
||||||
|
"title": "Ollama Monitoring",
|
||||||
|
"uid": "ollama-dashboard",
|
||||||
|
"version": 1
|
||||||
|
}
|
||||||
k8s-pods-dashboard.json: |
|
k8s-pods-dashboard.json: |
|
||||||
{
|
{
|
||||||
"annotations": {
|
"annotations": {
|
||||||
|
@ -1378,8 +2159,9 @@ data:
|
||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "gpu_utilization{model_name=~\".+\"}",
|
"expr": "DCGM_FI_DEV_GPU_UTIL{instance=\"dcgm-exporter:9400\"}",
|
||||||
"refId": "A"
|
"refId": "A",
|
||||||
|
"legendFormat": "GPU {{gpu}}"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"thresholds": [],
|
"thresholds": [],
|
||||||
|
@ -1464,8 +2246,9 @@ data:
|
||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "gpu_memory_used_bytes{model_name=~\".+\"} / gpu_memory_total_bytes{model_name=~\".+\"} * 100",
|
"expr": "DCGM_FI_DEV_FB_USED{instance=\"dcgm-exporter:9400\"} / (DCGM_FI_DEV_FB_USED{instance=\"dcgm-exporter:9400\"} + DCGM_FI_DEV_FB_FREE{instance=\"dcgm-exporter:9400\"}) * 100",
|
||||||
"refId": "A"
|
"refId": "A",
|
||||||
|
"legendFormat": "GPU {{gpu}}"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"thresholds": [],
|
"thresholds": [],
|
||||||
|
@ -1550,8 +2333,9 @@ data:
|
||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "gpu_power_usage_watts{model_name=~\".+\"}",
|
"expr": "DCGM_FI_DEV_POWER_USAGE{instance=\"dcgm-exporter:9400\"}",
|
||||||
"refId": "A"
|
"refId": "A",
|
||||||
|
"legendFormat": "GPU {{gpu}}"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"thresholds": [],
|
"thresholds": [],
|
||||||
|
@ -2298,20 +3082,65 @@ spec:
|
||||||
spec:
|
spec:
|
||||||
containers:
|
containers:
|
||||||
- name: dcgm-exporter
|
- name: dcgm-exporter
|
||||||
image: nvidia/dcgm-exporter:2.4.6-2.6.10-ubuntu20.04
|
image: nvcr.io/nvidia/k8s/dcgm-exporter:4.2.3-4.3.0-ubuntu22.04
|
||||||
securityContext:
|
securityContext:
|
||||||
runAsNonRoot: false
|
runAsNonRoot: false
|
||||||
runAsUser: 0
|
runAsUser: 0
|
||||||
ports:
|
ports:
|
||||||
- containerPort: 9400
|
- containerPort: 9400
|
||||||
name: metrics
|
name: metrics
|
||||||
|
args:
|
||||||
|
- -f
|
||||||
|
- /etc/dcgm-exporter/dcp-metrics-included.csv
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- name: device-metrics
|
- name: device-metrics
|
||||||
mountPath: /dev/metrics
|
mountPath: /dev/metrics
|
||||||
|
- name: dcgm-config
|
||||||
|
mountPath: /etc/dcgm-exporter
|
||||||
volumes:
|
volumes:
|
||||||
- name: device-metrics
|
- name: device-metrics
|
||||||
hostPath:
|
hostPath:
|
||||||
path: /dev/metrics
|
path: /dev/metrics
|
||||||
|
- name: dcgm-config
|
||||||
|
configMap:
|
||||||
|
name: dcgm-config
|
||||||
|
---
|
||||||
|
# DCGM Exporter ConfigMap for metrics configuration
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
name: dcgm-config
|
||||||
|
namespace: monitoring
|
||||||
|
data:
|
||||||
|
dcp-metrics-included.csv: |
|
||||||
|
# Format:
|
||||||
|
# DCGM_FI_DEV_<fieldname>, <description>, <units>
|
||||||
|
DCGM_FI_DEV_GPU_UTIL, GPU utilization, %
|
||||||
|
DCGM_FI_DEV_MEM_COPY_UTIL, Memory utilization, %
|
||||||
|
DCGM_FI_DEV_FB_FREE, Framebuffer memory free, MiB
|
||||||
|
DCGM_FI_DEV_FB_USED, Framebuffer memory used, MiB
|
||||||
|
DCGM_FI_DEV_FB_TOTAL, Framebuffer memory total, MiB
|
||||||
|
DCGM_FI_DEV_GPU_TEMP, GPU temperature, C
|
||||||
|
DCGM_FI_DEV_POWER_USAGE, Power usage, W
|
||||||
|
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, Total energy consumption, mJ
|
||||||
|
DCGM_FI_DEV_PCIE_REPLAY_COUNTER, PCIe replay counter, count
|
||||||
|
DCGM_FI_DEV_PCIE_TX_THROUGHPUT, PCIe transmit throughput, KiB/s
|
||||||
|
DCGM_FI_DEV_PCIE_RX_THROUGHPUT, PCIe receive throughput, KiB/s
|
||||||
|
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, NVLink bandwidth total, KiB/s
|
||||||
|
DCGM_FI_DEV_VGPU_LICENSE_STATUS, vGPU license status, N/A
|
||||||
|
DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, Uncorrectable remapped rows, count
|
||||||
|
DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, Correctable remapped rows, count
|
||||||
|
DCGM_FI_DEV_ROW_REMAP_FAILURE, Row remap failure, count
|
||||||
|
DCGM_FI_DEV_XID_ERRORS, XID errors, count
|
||||||
|
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, NVLink CRC flit error count total, count
|
||||||
|
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, NVLink CRC data error count total, count
|
||||||
|
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, NVLink replay error count total, count
|
||||||
|
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, NVLink recovery error count total, count
|
||||||
|
DCGM_FI_DEV_RETIRED_PAGES_SBE, Retired pages SBE, count
|
||||||
|
DCGM_FI_DEV_RETIRED_PAGES_DBE, Retired pages DBE, count
|
||||||
|
DCGM_FI_DEV_RETIRED_PAGES_PENDING, Retired pages pending, count
|
||||||
|
DCGM_FI_DEV_GRAPHICS_PIDS, Graphics processes, count
|
||||||
|
DCGM_FI_DEV_COMPUTE_PIDS, Compute processes, count
|
||||||
---
|
---
|
||||||
# NVIDIA DCGM Exporter Service
|
# NVIDIA DCGM Exporter Service
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
|
|
|
@ -0,0 +1,74 @@
|
||||||
|
apiVersion: v1
|
||||||
|
kind: PersistentVolumeClaim
|
||||||
|
metadata:
|
||||||
|
name: ollama-models-safety
|
||||||
|
spec:
|
||||||
|
accessModes:
|
||||||
|
- ReadWriteOnce
|
||||||
|
volumeMode: Filesystem
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
storage: 10Gi
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: ollama-safety
|
||||||
|
spec:
|
||||||
|
replicas: 1
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/name: ollama-safety
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: ollama-safety
|
||||||
|
workload-type: inference
|
||||||
|
annotations:
|
||||||
|
prometheus.io/scrape: "true"
|
||||||
|
prometheus.io/port: "11434"
|
||||||
|
prometheus.io/path: "/metrics"
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: ollama-safety
|
||||||
|
image: ollama/ollama:latest
|
||||||
|
command: ["/bin/sh", "-c"]
|
||||||
|
args: [
|
||||||
|
"ollama serve & sleep 5 && ollama pull llama-guard3:1b && ollama run llama-guard3:1b & wait"
|
||||||
|
]
|
||||||
|
env:
|
||||||
|
- name: OLLAMA_HOST
|
||||||
|
value: "0.0.0.0"
|
||||||
|
- name: OLLAMA_PORT
|
||||||
|
value: "11434"
|
||||||
|
- name: OLLAMA_ENABLE_METRICS
|
||||||
|
value: "true"
|
||||||
|
ports:
|
||||||
|
- containerPort: 11434
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: "8Gi"
|
||||||
|
cpu: "6000m"
|
||||||
|
limits:
|
||||||
|
memory: "16Gi"
|
||||||
|
cpu: "6000m"
|
||||||
|
volumeMounts:
|
||||||
|
- name: ollama-storage
|
||||||
|
mountPath: /root/.ollama
|
||||||
|
volumes:
|
||||||
|
- name: ollama-storage
|
||||||
|
persistentVolumeClaim:
|
||||||
|
claimName: ollama-models-safety
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: ollama-server-safety
|
||||||
|
spec:
|
||||||
|
selector:
|
||||||
|
app.kubernetes.io/name: ollama-safety
|
||||||
|
ports:
|
||||||
|
- protocol: TCP
|
||||||
|
port: 11434
|
||||||
|
targetPort: 11434
|
||||||
|
type: ClusterIP
|
|
@ -25,6 +25,10 @@ data:
|
||||||
url: ${env.NVIDIA_BASE_URL:=http://localhost:8000/v1}
|
url: ${env.NVIDIA_BASE_URL:=http://localhost:8000/v1}
|
||||||
api_key: ${env.NVIDIA_API_KEY:=}
|
api_key: ${env.NVIDIA_API_KEY:=}
|
||||||
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
|
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
|
||||||
|
- provider_id: ollama-safety
|
||||||
|
provider_type: remote::ollama
|
||||||
|
config:
|
||||||
|
url: ${env.OLLAMA_BASE_URL:=http://localhost:11434/v1}
|
||||||
- provider_id: sentence-transformers
|
- provider_id: sentence-transformers
|
||||||
provider_type: inline::sentence-transformers
|
provider_type: inline::sentence-transformers
|
||||||
config: {}
|
config: {}
|
||||||
|
@ -108,6 +112,12 @@ data:
|
||||||
model_id: ${env.CODE_MODEL:=bigcode/starcoder2-7b}
|
model_id: ${env.CODE_MODEL:=bigcode/starcoder2-7b}
|
||||||
provider_id: nvidia
|
provider_id: nvidia
|
||||||
model_type: llm
|
model_type: llm
|
||||||
|
- metadata: {}
|
||||||
|
model_id: ${env.OLLAMA_MODEL:=llama-guard3:1b}
|
||||||
|
provider_id: ollama-safety
|
||||||
|
model_type: llm
|
||||||
|
shields:
|
||||||
|
- shield_id: ${env.OLLAMA_MODEL:=llama-guard3:1b}
|
||||||
vector_dbs: []
|
vector_dbs: []
|
||||||
datasets: []
|
datasets: []
|
||||||
scoring_fns: []
|
scoring_fns: []
|
||||||
|
|
|
@ -55,6 +55,8 @@ spec:
|
||||||
value: "3072"
|
value: "3072"
|
||||||
- name: NVIDIA_BASE_URL
|
- name: NVIDIA_BASE_URL
|
||||||
value: http://llm-nim-code.default.svc.cluster.local:8000
|
value: http://llm-nim-code.default.svc.cluster.local:8000
|
||||||
|
- name: OLLAMA_BASE_URL
|
||||||
|
value: http://ollama-server-safety.default.svc.cluster.local:11434
|
||||||
- name: POSTGRES_HOST
|
- name: POSTGRES_HOST
|
||||||
value: postgres-server.default.svc.cluster.local
|
value: postgres-server.default.svc.cluster.local
|
||||||
- name: POSTGRES_PORT
|
- name: POSTGRES_PORT
|
||||||
|
@ -67,6 +69,8 @@ spec:
|
||||||
value: "${CODE_MODEL}"
|
value: "${CODE_MODEL}"
|
||||||
- name: TAVILY_SEARCH_API_KEY
|
- name: TAVILY_SEARCH_API_KEY
|
||||||
value: "${TAVILY_SEARCH_API_KEY}"
|
value: "${TAVILY_SEARCH_API_KEY}"
|
||||||
|
- name: OLLAMA_MODEL
|
||||||
|
value: "${OLLAMA_MODEL}"
|
||||||
command: ["/bin/sh"]
|
command: ["/bin/sh"]
|
||||||
args:
|
args:
|
||||||
- -c
|
- -c
|
||||||
|
|
|
@ -22,6 +22,10 @@ providers:
|
||||||
url: ${env.NVIDIA_BASE_URL:=http://localhost:8000/v1}
|
url: ${env.NVIDIA_BASE_URL:=http://localhost:8000/v1}
|
||||||
api_key: ${env.NVIDIA_API_KEY:=}
|
api_key: ${env.NVIDIA_API_KEY:=}
|
||||||
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
|
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
|
||||||
|
- provider_id: ollama-safety
|
||||||
|
provider_type: remote::ollama
|
||||||
|
config:
|
||||||
|
url: ${env.OLLAMA_BASE_URL:=http://localhost:11434/v1}
|
||||||
- provider_id: sentence-transformers
|
- provider_id: sentence-transformers
|
||||||
provider_type: inline::sentence-transformers
|
provider_type: inline::sentence-transformers
|
||||||
config: {}
|
config: {}
|
||||||
|
@ -105,6 +109,12 @@ models:
|
||||||
model_id: ${env.CODE_MODEL:=bigcode/starcoder2-7b}
|
model_id: ${env.CODE_MODEL:=bigcode/starcoder2-7b}
|
||||||
provider_id: nvidia
|
provider_id: nvidia
|
||||||
model_type: llm
|
model_type: llm
|
||||||
|
- metadata: {}
|
||||||
|
model_id: ${env.OLLAMA_MODEL:=llama-guard3:1b}
|
||||||
|
provider_id: ollama-safety
|
||||||
|
model_type: llm
|
||||||
|
shields:
|
||||||
|
- shield_id: ${env.OLLAMA_MODEL:=llama-guard3:1b}
|
||||||
vector_dbs: []
|
vector_dbs: []
|
||||||
datasets: []
|
datasets: []
|
||||||
scoring_fns: []
|
scoring_fns: []
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue