temp checkpoint

Kai Wu 2025-08-01 12:34:38 -07:00
parent a2bbb17fdd
commit 67f19f76b2
7 changed files with 1032 additions and 25 deletions

View file

@@ -18,7 +18,7 @@ export POSTGRES_PASSWORD=llamastack
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export CODE_MODEL=bigcode/starcoder2-7b
export OLLAMA_MODEL=llama-guard3:1b
# Set USE_EBS to false if you don't have permission to provision EBS volumes on EKS
export USE_EBS=${USE_EBS:-false}
set -euo pipefail
@@ -44,10 +44,6 @@ else
exit 1
fi
if [ -z "${LLAMA_STACK_UI_URL:-}" ]; then
echo "ERROR: LLAMA_STACK_UI_URL not set. Should be set to the external URL of the UI (excluding port). You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
exit 1
fi
@@ -84,6 +80,7 @@ echo "Secret verification successful. All required secrets are present."
if [ "$USE_EBS" = "true" ]; then
echo "Using EBS storage for persistent volumes"
envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
envsubst < ./ollama-safety-k8s.yaml.template | kubectl apply -f -
envsubst < ./llama-nim.yaml.template | kubectl apply -f -
envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
@@ -114,6 +111,7 @@ else
echo "Using emptyDir for storage (data will not persist across pod restarts)"
# Process templates to replace EBS storage with emptyDir
envsubst < ./vllm-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
envsubst < ./ollama-safety-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
envsubst < ./llama-nim.yaml.template | kubectl apply -f -
envsubst < ./postgres-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
envsubst < ./chroma-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
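A quick illustration of what that sed pipeline does to a PVC volume block, using a fragment shaped like the one in ollama-safety-k8s.yaml.template (a sketch only; the real templates carry more fields):
cat <<'EOF' | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d'
      volumes:
      - name: ollama-storage
        persistentVolumeClaim:
          claimName: ollama-models-safety
EOF
# prints:
#       volumes:
#       - name: ollama-storage
#         emptyDir: {}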

View file

@@ -29,48 +29,130 @@ if [ -n "${NGC_API_KEY:-}" ]; then
export NGC_DOCKER_CONFIG_JSON=$(echo -n "$NGC_DOCKER_CONFIG" | base64 | tr -d '\n')  # strip newlines so the value substitutes cleanly into manifests
fi
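# Optional sanity check for the encoding above: the value must round-trip
# back to the original JSON, e.g.
#   echo "$NGC_DOCKER_CONFIG_JSON" | base64 -d | head -c 80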
# Define namespace - default to current namespace if not specified
export NAMESPACE=${NAMESPACE:-$(kubectl config view --minify -o jsonpath='{..namespace}')}
if [ -z "$NAMESPACE" ]; then
export NAMESPACE="default"
fi
set -euo pipefail
set -x
# Delete resources in reverse order of creation to handle dependencies properly
# Delete UI deployment
echo "Starting comprehensive deletion of all LlamaStack resources..."
# Delete UI deployment and service
echo "Deleting UI resources..."
envsubst < ./ui-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Check for UI service template and delete if exists
if [ -f "./ui-service-k8s.yaml.template" ]; then
envsubst < ./ui-service-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
fi
# Delete monitoring resources
echo "Deleting monitoring resources..."
envsubst < ./monitoring-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete Prometheus RBAC resources
echo "Deleting Prometheus RBAC resources..."
kubectl delete -f ./prometheus-rbac.yaml --ignore-not-found=true
# Delete ingress
echo "Deleting ingress resources..."
envsubst < ./ingress-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete stack deployment
echo "Deleting stack deployment..."
envsubst < ./stack-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete configmap
# Delete configmaps
echo "Deleting configmaps..."
kubectl delete configmap llama-stack-config --ignore-not-found=true
# Check for stack configmap and delete if exists
if [ -f "./stack-configmap.yaml" ]; then
kubectl delete -f ./stack-configmap.yaml --ignore-not-found=true
fi
# Delete chroma deployment
echo "Deleting chroma deployment..."
envsubst < ./chroma-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete postgres deployment
echo "Deleting postgres deployment..."
envsubst < ./postgres-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete vllm-safety deployment
# Delete llama-nim deployment
echo "Deleting llama-nim deployment..."
envsubst < ./llama-nim.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete ollama-safety deployment
echo "Deleting ollama-safety deployment..."
envsubst < ./ollama-safety-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete vllm deployment
echo "Deleting vllm deployment..."
envsubst < ./vllm-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete the HF token secret if it exists
if [ -n "${HF_TOKEN:-}" ]; then
echo "Deleting HF token secret..."
envsubst < ./set-secret.yaml.template | kubectl delete -f - --ignore-not-found=true
fi
# NGC API key secrets are now part of llama-nim.yaml.template and are deleted with it
# Delete any other template files that might exist
echo "Checking for additional template files..."
for template in ./*.yaml.template; do
if [ -f "$template" ]; then
# Skip templates we've already processed
if [[ "$template" != "./ui-k8s.yaml.template" &&
"$template" != "./monitoring-k8s.yaml.template" &&
"$template" != "./ingress-k8s.yaml.template" &&
"$template" != "./stack-k8s.yaml.template" &&
"$template" != "./chroma-k8s.yaml.template" &&
"$template" != "./postgres-k8s.yaml.template" &&
"$template" != "./llama-nim.yaml.template" &&
"$template" != "./vllm-safety-k8s.yaml.template" &&
"$template" != "./ollama-safety-k8s.yaml.template" &&
"$template" != "./vllm-k8s.yaml.template" &&
"$template" != "./set-secret.yaml.template" &&
"$template" != "./ui-service-k8s.yaml.template" ]]; then
echo "Deleting resources from $template..."
envsubst < "$template" | kubectl delete -f - --ignore-not-found=true
fi
fi
done
echo "All LlamaStack Kubernetes resources have been deleted."
# Delete any PersistentVolumeClaims created by the stack
echo "Deleting PersistentVolumeClaims..."
kubectl delete pvc -l app=llama-stack --ignore-not-found=true
kubectl delete pvc -l app=chroma --ignore-not-found=true
kubectl delete pvc -l app=postgres --ignore-not-found=true
kubectl delete pvc -l app=vllm --ignore-not-found=true
kubectl delete pvc -l app.kubernetes.io/name=ollama-safety --ignore-not-found=true
# Delete any remaining services
echo "Deleting any remaining services..."
kubectl delete service -l app=llama-stack --ignore-not-found=true
kubectl delete service -l app=chroma --ignore-not-found=true
kubectl delete service -l app=postgres --ignore-not-found=true
kubectl delete service -l app=vllm --ignore-not-found=true
kubectl delete service -l app=llama-nim --ignore-not-found=true
kubectl delete service -l app.kubernetes.io/name=ollama-safety --ignore-not-found=true
# Delete any remaining secrets
echo "Deleting any remaining secrets..."
kubectl delete secret hf-secret --ignore-not-found=true
kubectl delete secret ngc-secret --ignore-not-found=true
kubectl delete secret -l app=llama-stack --ignore-not-found=true
# Verify no resources remain
echo "Verifying deletion..."
REMAINING_RESOURCES=$(kubectl get all -l app=llama-stack 2>/dev/null || true)
if [ -z "$REMAINING_RESOURCES" ]; then
echo "All LlamaStack Kubernetes resources have been successfully deleted."
else
echo "Some LlamaStack resources may still exist. Please check manually with:"
echo "kubectl get all -l app=llama-stack"
fi
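# If the label check above reports leftovers, a broader sweep helps,
# since "kubectl get all" does not cover PVCs, secrets, or configmaps:
kubectl get all,pvc,secret,configmap -l app=llama-stack
kubectl get pvc ollama-models-safety --ignore-not-found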

View file

@@ -18,7 +18,7 @@ data:
scrape_configs:
# NVIDIA DCGM exporter for GPU metrics
- job_name: 'nvidia-dcgm-exporter'
- job_name: 'dcgm'
static_configs:
- targets: ['dcgm-exporter:9400']
@@ -98,6 +98,30 @@ data:
- targets: ['llm-nim-code:8000']
metrics_path: /v1/metrics
scrape_interval: 5s
# Specific job for Ollama pods
- job_name: 'ollama-pods'
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
regex: ollama-safety
action: keep
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
target_label: __address__
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: kubernetes_pod_name
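# To lint the rendered scrape config before Prometheus loads it (ConfigMap
# and data-key names here are assumptions; requires promtool locally):
#   kubectl get configmap prometheus-config -o jsonpath='{.data.prometheus\.yml}' > /tmp/prometheus.yml
#   promtool check config /tmp/prometheus.yml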
---
# Prometheus Deployment
apiVersion: apps/v1
@@ -262,9 +286,9 @@ data:
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
"targets": [
{
"expr": "DCGM_FI_DEV_GPU_UTIL{pod=~\"llm-nim-code.*\"}",
"expr": "DCGM_FI_DEV_GPU_UTIL{instance=\"dcgm-exporter:9400\", pod=~\"(llm-nim-code|vllm).*\"}",
"refId": "A",
"legendFormat": "GPU {{gpu}} - {{pod}}"
}
@@ -351,7 +375,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "DCGM_FI_DEV_FB_USED{pod=~\"llm-nim-code.*\"} / 1024 / 1024 / 1024",
"expr": "DCGM_FI_DEV_FB_USED{instance=\"dcgm-exporter:9400\", pod=~\"(llm-nim-code|vllm).*\"} / 1024 / 1024 / 1024",
"refId": "A",
"legendFormat": "GPU {{gpu}} - {{pod}}"
}
@@ -438,7 +462,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "DCGM_FI_DEV_POWER_USAGE{pod=~\"llm-nim-code.*\"}",
"expr": "DCGM_FI_DEV_POWER_USAGE{instance=\"dcgm-exporter:9400\", pod=~\"(llm-nim-code|vllm).*\"}",
"refId": "A",
"legendFormat": "GPU {{gpu}} - {{pod}}"
}
@@ -525,7 +549,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "DCGM_FI_DEV_GPU_TEMP{pod=~\"llm-nim-code.*\"}",
"expr": "DCGM_FI_DEV_GPU_TEMP{instance=\"dcgm-exporter:9400\", pod=~\"(llm-nim-code|vllm).*\"}",
"refId": "A",
"legendFormat": "GPU {{gpu}} - {{pod}}"
}
@@ -954,6 +978,763 @@ data:
"uid": "nim-dashboard",
"version": 1
}
ollama-dashboard.json: |
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": 3,
"links": [],
"panels": [
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"hiddenSeries": false,
"id": 1,
"legend": {
"avg": false,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "rate(ollama_requests_total{kubernetes_pod_name=~\"ollama-safety.*\"}[1m])",
"refId": "A",
"legendFormat": "{{kubernetes_pod_name}}"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Ollama Request Rate",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "reqps",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"hiddenSeries": false,
"id": 2,
"legend": {
"avg": false,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.95, sum(rate(ollama_request_duration_seconds_bucket{kubernetes_pod_name=~\"ollama-safety.*\"}[5m])) by (le))",
"refId": "A",
"legendFormat": "p95"
},
{
"expr": "histogram_quantile(0.50, sum(rate(ollama_request_duration_seconds_bucket{kubernetes_pod_name=~\"ollama-safety.*\"}[5m])) by (le))",
"refId": "B",
"legendFormat": "p50"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Ollama Request Latency",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "s",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"hiddenSeries": false,
"id": 3,
"legend": {
"avg": false,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "ollama_active_requests{kubernetes_pod_name=~\"ollama-safety.*\"}",
"refId": "A",
"legendFormat": "{{kubernetes_pod_name}}"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Ollama Active Requests",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
},
"hiddenSeries": false,
"id": 4,
"legend": {
"avg": false,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "ollama_tokens_per_second{kubernetes_pod_name=~\"ollama-safety.*\"}",
"refId": "A",
"legendFormat": "{{kubernetes_pod_name}}"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Ollama Tokens Per Second",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"hiddenSeries": false,
"id": 5,
"legend": {
"avg": false,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum by(kubernetes_pod_name) (container_memory_usage_bytes{pod=~\"ollama-safety.*\"})",
"refId": "A",
"legendFormat": "{{kubernetes_pod_name}}"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Ollama Memory Usage",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
},
"hiddenSeries": false,
"id": 6,
"legend": {
"avg": false,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum by(kubernetes_pod_name) (rate(container_cpu_usage_seconds_total{pod=~\"ollama-safety.*\"}[5m]))",
"refId": "A",
"legendFormat": "{{kubernetes_pod_name}}"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Ollama CPU Usage",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 24
},
"hiddenSeries": false,
"id": 7,
"legend": {
"avg": false,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "ollama_gpu_memory_used_bytes{kubernetes_pod_name=~\"ollama-safety.*\"}",
"refId": "A",
"legendFormat": "Used - {{kubernetes_pod_name}}"
},
{
"expr": "ollama_gpu_memory_total_bytes{kubernetes_pod_name=~\"ollama-safety.*\"}",
"refId": "B",
"legendFormat": "Total - {{kubernetes_pod_name}}"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Ollama GPU Memory",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 24
},
"hiddenSeries": false,
"id": 8,
"legend": {
"avg": false,
"current": true,
"max": true,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "ollama_gpu_utilization{kubernetes_pod_name=~\"ollama-safety.*\"}",
"refId": "A",
"legendFormat": "{{kubernetes_pod_name}}"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Ollama GPU Utilization",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "percent",
"label": null,
"logBase": 1,
"max": "100",
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"schemaVersion": 22,
"style": "dark",
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
]
},
"timezone": "",
"title": "Ollama Monitoring",
"uid": "ollama-dashboard",
"version": 1
}
k8s-pods-dashboard.json: |
{
"annotations": {
@@ -1378,8 +2159,9 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "gpu_utilization{model_name=~\".+\"}",
"refId": "A"
"expr": "DCGM_FI_DEV_GPU_UTIL{instance=\"dcgm-exporter:9400\"}",
"refId": "A",
"legendFormat": "GPU {{gpu}}"
}
],
"thresholds": [],
@@ -1464,8 +2246,9 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "gpu_memory_used_bytes{model_name=~\".+\"} / gpu_memory_total_bytes{model_name=~\".+\"} * 100",
"refId": "A"
"expr": "DCGM_FI_DEV_FB_USED{instance=\"dcgm-exporter:9400\"} / (DCGM_FI_DEV_FB_USED{instance=\"dcgm-exporter:9400\"} + DCGM_FI_DEV_FB_FREE{instance=\"dcgm-exporter:9400\"}) * 100",
"refId": "A",
"legendFormat": "GPU {{gpu}}"
}
],
"thresholds": [],
@@ -1550,8 +2333,9 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "gpu_power_usage_watts{model_name=~\".+\"}",
"refId": "A"
"expr": "DCGM_FI_DEV_POWER_USAGE{instance=\"dcgm-exporter:9400\"}",
"refId": "A",
"legendFormat": "GPU {{gpu}}"
}
],
"thresholds": [],
@@ -2298,20 +3082,65 @@ spec:
spec:
containers:
- name: dcgm-exporter
image: nvidia/dcgm-exporter:2.4.6-2.6.10-ubuntu20.04
image: nvcr.io/nvidia/k8s/dcgm-exporter:4.2.3-4.3.0-ubuntu22.04
securityContext:
runAsNonRoot: false
runAsUser: 0
ports:
- containerPort: 9400
name: metrics
args:
- -f
- /etc/dcgm-exporter/dcp-metrics-included.csv
volumeMounts:
- name: device-metrics
mountPath: /dev/metrics
- name: dcgm-config
mountPath: /etc/dcgm-exporter
volumes:
- name: device-metrics
hostPath:
path: /dev/metrics
- name: dcgm-config
configMap:
name: dcgm-config
---
# DCGM Exporter ConfigMap for metrics configuration
apiVersion: v1
kind: ConfigMap
metadata:
name: dcgm-config
namespace: monitoring
data:
dcp-metrics-included.csv: |
# Format:
# DCGM_FI_DEV_<fieldname>, <description>, <units>
DCGM_FI_DEV_GPU_UTIL, GPU utilization, %
DCGM_FI_DEV_MEM_COPY_UTIL, Memory utilization, %
DCGM_FI_DEV_FB_FREE, Framebuffer memory free, MiB
DCGM_FI_DEV_FB_USED, Framebuffer memory used, MiB
DCGM_FI_DEV_FB_TOTAL, Framebuffer memory total, MiB
DCGM_FI_DEV_GPU_TEMP, GPU temperature, C
DCGM_FI_DEV_POWER_USAGE, Power usage, W
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, Total energy consumption, mJ
DCGM_FI_DEV_PCIE_REPLAY_COUNTER, PCIe replay counter, count
DCGM_FI_DEV_PCIE_TX_THROUGHPUT, PCIe transmit throughput, KiB/s
DCGM_FI_DEV_PCIE_RX_THROUGHPUT, PCIe receive throughput, KiB/s
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, NVLink bandwidth total, KiB/s
DCGM_FI_DEV_VGPU_LICENSE_STATUS, vGPU license status, N/A
DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, Uncorrectable remapped rows, count
DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, Correctable remapped rows, count
DCGM_FI_DEV_ROW_REMAP_FAILURE, Row remap failure, count
DCGM_FI_DEV_XID_ERRORS, XID errors, count
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, NVLink CRC flit error count total, count
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, NVLink CRC data error count total, count
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, NVLink replay error count total, count
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, NVLink recovery error count total, count
DCGM_FI_DEV_RETIRED_PAGES_SBE, Retired pages SBE, count
DCGM_FI_DEV_RETIRED_PAGES_DBE, Retired pages DBE, count
DCGM_FI_DEV_RETIRED_PAGES_PENDING, Retired pages pending, count
DCGM_FI_DEV_GRAPHICS_PIDS, Graphics processes, count
DCGM_FI_DEV_COMPUTE_PIDS, Compute processes, count
---
# NVIDIA DCGM Exporter Service
apiVersion: v1
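A quick check that the exporter actually serves the fields whitelisted above (the namespace and workload name are assumptions; run the port-forward in a second terminal):
kubectl -n monitoring port-forward deploy/dcgm-exporter 9400:9400
curl -s localhost:9400/metrics | grep -E 'DCGM_FI_DEV_(GPU_UTIL|FB_USED|POWER_USAGE)'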

View file

@@ -0,0 +1,74 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: ollama-models-safety
spec:
accessModes:
- ReadWriteOnce
volumeMode: Filesystem
resources:
requests:
storage: 10Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: ollama-safety
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: ollama-safety
template:
metadata:
labels:
app.kubernetes.io/name: ollama-safety
workload-type: inference
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "11434"
prometheus.io/path: "/metrics"
spec:
containers:
- name: ollama-safety
image: ollama/ollama:latest
command: ["/bin/sh", "-c"]
args: [
"ollama serve & sleep 5 && ollama pull llama-guard3:1b && ollama run llama-guard3:1b & wait"
]
env:
- name: OLLAMA_HOST
value: "0.0.0.0"
- name: OLLAMA_PORT
value: "11434"
- name: OLLAMA_ENABLE_METRICS
value: "true"
ports:
- containerPort: 11434
resources:
requests:
memory: "8Gi"
cpu: "6000m"
limits:
memory: "16Gi"
cpu: "6000m"
volumeMounts:
- name: ollama-storage
mountPath: /root/.ollama
volumes:
- name: ollama-storage
persistentVolumeClaim:
claimName: ollama-models-safety
---
apiVersion: v1
kind: Service
metadata:
name: ollama-server-safety
spec:
selector:
app.kubernetes.io/name: ollama-safety
ports:
- protocol: TCP
port: 11434
targetPort: 11434
type: ClusterIP
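Once the pod reports Ready, a smoke test against the Service above (name and port taken from this manifest; /api/tags lists the models Ollama has pulled):
kubectl run ollama-probe --rm -i --restart=Never --image=curlimages/curl -- \
  curl -s http://ollama-server-safety:11434/api/tags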

View file

@@ -25,6 +25,10 @@ data:
url: ${env.NVIDIA_BASE_URL:=http://localhost:8000/v1}
api_key: ${env.NVIDIA_API_KEY:=}
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
- provider_id: ollama-safety
provider_type: remote::ollama
config:
url: ${env.OLLAMA_BASE_URL:=http://localhost:11434/v1}
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
config: {}
@@ -108,6 +112,12 @@ data:
model_id: ${env.CODE_MODEL:=bigcode/starcoder2-7b}
provider_id: nvidia
model_type: llm
- metadata: {}
model_id: ${env.OLLAMA_MODEL:=llama-guard3:1b}
provider_id: ollama-safety
model_type: llm
shields:
- shield_id: ${env.OLLAMA_MODEL:=llama-guard3:1b}
vector_dbs: []
datasets: []
scoring_fns: []

View file

@@ -55,6 +55,8 @@ spec:
value: "3072"
- name: NVIDIA_BASE_URL
value: http://llm-nim-code.default.svc.cluster.local:8000
- name: OLLAMA_BASE_URL
value: http://ollama-server-safety.default.svc.cluster.local:11434
- name: POSTGRES_HOST
value: postgres-server.default.svc.cluster.local
- name: POSTGRES_PORT
@@ -67,6 +69,8 @@ spec:
value: "${CODE_MODEL}"
- name: TAVILY_SEARCH_API_KEY
value: "${TAVILY_SEARCH_API_KEY}"
- name: OLLAMA_MODEL
value: "${OLLAMA_MODEL}"
command: ["/bin/sh"]
args:
- -c
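To confirm the rendered values inside the running pod (the deployment name llama-stack is an assumption):
kubectl exec deploy/llama-stack -- env | grep -E '^OLLAMA_(BASE_URL|MODEL)='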

View file

@@ -22,6 +22,10 @@ providers:
url: ${env.NVIDIA_BASE_URL:=http://localhost:8000/v1}
api_key: ${env.NVIDIA_API_KEY:=}
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
- provider_id: ollama-safety
provider_type: remote::ollama
config:
url: ${env.OLLAMA_BASE_URL:=http://localhost:11434/v1}
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
config: {}
@@ -105,6 +109,12 @@ models:
model_id: ${env.CODE_MODEL:=bigcode/starcoder2-7b}
provider_id: nvidia
model_type: llm
- metadata: {}
model_id: ${env.OLLAMA_MODEL:=llama-guard3:1b}
provider_id: ollama-safety
model_type: llm
shields:
- shield_id: ${env.OLLAMA_MODEL:=llama-guard3:1b}
vector_dbs: []
datasets: []
scoring_fns: []
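Once the stack restarts with this config, the new model and shield should be visible over the HTTP API (8321 is the llama-stack default port; adjust host and port to your deployment):
curl -s http://localhost:8321/v1/models | grep -o 'llama-guard3:1b'
curl -s http://localhost:8321/v1/shields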