--- apiVersion: v1 kind: Namespace metadata: name: monitoring --- # Prometheus ConfigMap apiVersion: v1 kind: ConfigMap metadata: name: prometheus-config namespace: monitoring data: prometheus.yml: | global: scrape_interval: 15s evaluation_interval: 15s scrape_configs: # NVIDIA DCGM exporter for GPU metrics - job_name: 'dcgm' static_configs: - targets: ['dcgm-exporter:9400'] - job_name: 'kubernetes-pods' kubernetes_sd_configs: - role: pod relabel_configs: - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] action: keep regex: true - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] action: replace target_label: __metrics_path__ regex: (.+) - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] action: replace regex: ([^:]+)(?::\d+)?;(\d+) replacement: $1:$2 target_label: __address__ - action: labelmap regex: __meta_kubernetes_pod_label_(.+) - source_labels: [__meta_kubernetes_namespace] action: replace target_label: kubernetes_namespace - source_labels: [__meta_kubernetes_pod_name] action: replace target_label: kubernetes_pod_name # Specific job for vLLM pods - job_name: 'vllm-pods' kubernetes_sd_configs: - role: pod relabel_configs: - source_labels: [__meta_kubernetes_pod_label_app] regex: vllm action: keep - source_labels: [__meta_kubernetes_pod_container_port_name] regex: http action: keep - source_labels: [__meta_kubernetes_namespace] action: replace target_label: kubernetes_namespace - source_labels: [__meta_kubernetes_pod_name] action: replace target_label: kubernetes_pod_name # Specific job for NIM pods - job_name: 'nim-pods' kubernetes_sd_configs: - role: pod relabel_configs: - source_labels: [__meta_kubernetes_pod_label_nim_type] regex: llama-nim action: keep - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] action: replace target_label: __metrics_path__ regex: (.+) - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] action: replace regex: ([^:]+)(?::\d+)?;(\d+) replacement: $1:$2 target_label: __address__ - source_labels: [__meta_kubernetes_namespace] action: replace target_label: kubernetes_namespace - source_labels: [__meta_kubernetes_pod_name] action: replace target_label: kubernetes_pod_name - source_labels: [__meta_kubernetes_pod_label_app] action: replace target_label: app # Direct scrape for NIM service - job_name: 'nim-service' static_configs: - targets: ['llm-nim-code:8000'] metrics_path: /v1/metrics scrape_interval: 5s # Specific job for Ollama pods - job_name: 'ollama-pods' kubernetes_sd_configs: - role: pod relabel_configs: - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name] regex: ollama-safety action: keep - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] action: replace target_label: __metrics_path__ regex: (.+) - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] action: replace regex: ([^:]+)(?::\d+)?;(\d+) replacement: $1:$2 target_label: __address__ - source_labels: [__meta_kubernetes_namespace] action: replace target_label: kubernetes_namespace - source_labels: [__meta_kubernetes_pod_name] action: replace target_label: kubernetes_pod_name --- # Prometheus Deployment apiVersion: apps/v1 kind: Deployment metadata: name: prometheus namespace: monitoring labels: app: prometheus spec: replicas: 1 selector: matchLabels: app: prometheus template: metadata: labels: app: prometheus spec: serviceAccountName: prometheus containers: - name: prometheus image: prom/prometheus:v2.45.0 args: - "--config.file=/etc/prometheus/prometheus.yml" - "--storage.tsdb.path=/prometheus" - "--web.console.libraries=/etc/prometheus/console_libraries" - "--web.console.templates=/etc/prometheus/consoles" - "--web.enable-lifecycle" ports: - containerPort: 9090 name: http volumeMounts: - name: prometheus-config mountPath: /etc/prometheus/ - name: prometheus-storage mountPath: /prometheus volumes: - name: prometheus-config configMap: name: prometheus-config - name: prometheus-storage emptyDir: {} --- # Prometheus Service apiVersion: v1 kind: Service metadata: name: prometheus namespace: monitoring annotations: prometheus.io/scrape: 'true' prometheus.io/port: '9090' spec: selector: app: prometheus ports: - port: 9090 targetPort: 9090 name: http type: NodePort # Changed from ClusterIP to NodePort to make it accessible outside the cluster --- # Grafana ConfigMap for datasources apiVersion: v1 kind: ConfigMap metadata: name: grafana-datasources namespace: monitoring data: prometheus.yaml: | apiVersion: 1 datasources: - name: Prometheus type: prometheus url: http://prometheus:9090 access: proxy isDefault: true --- # Grafana ConfigMap for dashboards provider apiVersion: v1 kind: ConfigMap metadata: name: grafana-dashboards-provider namespace: monitoring data: dashboards.yaml: | apiVersion: 1 providers: - name: 'default' orgId: 1 folder: '' type: file disableDeletion: false editable: true options: path: /var/lib/grafana/dashboards --- # Grafana ConfigMap for dashboards apiVersion: v1 kind: ConfigMap metadata: name: grafana-dashboards namespace: monitoring data: nim-dashboard.json: | { "annotations": { "list": [ { "builtIn": 1, "datasource": "-- Grafana --", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 0, "id": 2, "links": [], "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, "hiddenSeries": false, "id": 1, "legend": { "avg": false, "current": true, "max": true, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "DCGM_FI_DEV_GPU_UTIL{instance=\"dcgm-exporter:9400\", pod=~\"(llm-nim-code|vllm).*\"}", "refId": "A", "legendFormat": "GPU {{gpu}} - {{pod}}" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "NIM GPU Utilization", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percent", "label": null, "logBase": 1, "max": "100", "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, "hiddenSeries": false, "id": 2, "legend": { "avg": false, "current": true, "max": true, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "DCGM_FI_DEV_FB_USED{instance=\"dcgm-exporter:9400\", pod=~\"(llm-nim-code|vllm).*\"} / 1024 / 1024 / 1024", "refId": "A", "legendFormat": "GPU {{gpu}} - {{pod}}" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "NIM GPU Memory Used (GB)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "gbytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, "hiddenSeries": false, "id": 3, "legend": { "avg": false, "current": true, "max": true, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "DCGM_FI_DEV_POWER_USAGE{instance=\"dcgm-exporter:9400\", pod=~\"(llm-nim-code|vllm).*\"}", "refId": "A", "legendFormat": "GPU {{gpu}} - {{pod}}" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "NIM GPU Power Usage (W)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "watt", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, "hiddenSeries": false, "id": 4, "legend": { "avg": false, "current": true, "max": true, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "DCGM_FI_DEV_GPU_TEMP{instance=\"dcgm-exporter:9400\", pod=~\"(llm-nim-code|vllm).*\"}", "refId": "A", "legendFormat": "GPU {{gpu}} - {{pod}}" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "NIM GPU Temperature (°C)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "celsius", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, "hiddenSeries": false, "id": 5, "legend": { "avg": false, "current": true, "max": true, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(nim_requests_total{pod=~\"llm-nim-code.*\"}[1m])", "refId": "A", "legendFormat": "{{pod}} - {{endpoint}}" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "NIM Request Rate", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "reqps", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, "hiddenSeries": false, "id": 6, "legend": { "avg": false, "current": true, "max": true, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.95, sum(rate(nim_request_duration_seconds_bucket{pod=~\"llm-nim-code.*\"}[5m])) by (le, endpoint))", "refId": "A", "legendFormat": "p95 - {{endpoint}}" }, { "expr": "histogram_quantile(0.50, sum(rate(nim_request_duration_seconds_bucket{pod=~\"llm-nim-code.*\"}[5m])) by (le, endpoint))", "refId": "B", "legendFormat": "p50 - {{endpoint}}" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "NIM Request Latency", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, "hiddenSeries": false, "id": 7, "legend": { "avg": false, "current": true, "max": true, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "nim_active_requests{pod=~\"llm-nim-code.*\"}", "refId": "A", "legendFormat": "{{pod}}" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "NIM Active Requests", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, "hiddenSeries": false, "id": 8, "legend": { "avg": false, "current": true, "max": true, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "nim_tokens_per_second{pod=~\"llm-nim-code.*\"}", "refId": "A", "legendFormat": "{{pod}}" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "NIM Tokens Per Second", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "schemaVersion": 22, "style": "dark", "tags": [], "templating": { "list": [] }, "time": { "from": "now-6h", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ] }, "timezone": "", "title": "NVIDIA NIM Monitoring", "uid": "nim-dashboard", "version": 1 } ollama-dashboard.json: | { "annotations": { "list": [ { "builtIn": 1, "datasource": "-- Grafana --", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 0, "id": 3, "links": [], "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, "hiddenSeries": false, "id": 1, "legend": { "avg": false, "current": true, "max": true, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(ollama_requests_total{kubernetes_pod_name=~\"ollama-safety.*\"}[1m])", "refId": "A", "legendFormat": "{{kubernetes_pod_name}}" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Ollama Request Rate", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "reqps", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, "hiddenSeries": false, "id": 2, "legend": { "avg": false, "current": true, "max": true, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.95, sum(rate(ollama_request_duration_seconds_bucket{kubernetes_pod_name=~\"ollama-safety.*\"}[5m])) by (le))", "refId": "A", "legendFormat": "p95" }, { "expr": "histogram_quantile(0.50, sum(rate(ollama_request_duration_seconds_bucket{kubernetes_pod_name=~\"ollama-safety.*\"}[5m])) by (le))", "refId": "B", "legendFormat": "p50" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Ollama Request Latency", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, "hiddenSeries": false, "id": 3, "legend": { "avg": false, "current": true, "max": true, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "ollama_active_requests{kubernetes_pod_name=~\"ollama-safety.*\"}", "refId": "A", "legendFormat": "{{kubernetes_pod_name}}" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Ollama Active Requests", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, "hiddenSeries": false, "id": 4, "legend": { "avg": false, "current": true, "max": true, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "ollama_tokens_per_second{kubernetes_pod_name=~\"ollama-safety.*\"}", "refId": "A", "legendFormat": "{{kubernetes_pod_name}}" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Ollama Tokens Per Second", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, "hiddenSeries": false, "id": 5, "legend": { "avg": false, "current": true, "max": true, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum by(kubernetes_pod_name) (container_memory_usage_bytes{pod=~\"ollama-safety.*\"})", "refId": "A", "legendFormat": "{{kubernetes_pod_name}}" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Ollama Memory Usage", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, "hiddenSeries": false, "id": 6, "legend": { "avg": false, "current": true, "max": true, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum by(kubernetes_pod_name) (rate(container_cpu_usage_seconds_total{pod=~\"ollama-safety.*\"}[5m]))", "refId": "A", "legendFormat": "{{kubernetes_pod_name}}" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Ollama CPU Usage", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, "hiddenSeries": false, "id": 7, "legend": { "avg": false, "current": true, "max": true, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "ollama_gpu_memory_used_bytes{kubernetes_pod_name=~\"ollama-safety.*\"}", "refId": "A", "legendFormat": "Used - {{kubernetes_pod_name}}" }, { "expr": "ollama_gpu_memory_total_bytes{kubernetes_pod_name=~\"ollama-safety.*\"}", "refId": "B", "legendFormat": "Total - {{kubernetes_pod_name}}" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Ollama GPU Memory", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, "hiddenSeries": false, "id": 8, "legend": { "avg": false, "current": true, "max": true, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "ollama_gpu_utilization{kubernetes_pod_name=~\"ollama-safety.*\"}", "refId": "A", "legendFormat": "{{kubernetes_pod_name}}" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Ollama GPU Utilization", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percent", "label": null, "logBase": 1, "max": "100", "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "schemaVersion": 22, "style": "dark", "tags": [], "templating": { "list": [] }, "time": { "from": "now-6h", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ] }, "timezone": "", "title": "Ollama Monitoring", "uid": "ollama-dashboard", "version": 1 } k8s-pods-dashboard.json: | { "annotations": { "list": [ { "builtIn": 1, "datasource": "-- Grafana --", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 0, "id": 1, "links": [], "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, "hiddenSeries": false, "id": 11, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "gpu_cache_usage_perc{model_name=~\".+\"} * 100", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GPU KV-Cache Usage (%)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percent", "label": null, "logBase": 1, "max": "100", "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, "hiddenSeries": false, "id": 12, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "num_requests_running{model_name=~\".+\"}", "refId": "A", "legendFormat": "Running - {{model_name}}" }, { "expr": "num_requests_waiting{model_name=~\".+\"}", "refId": "B", "legendFormat": "Waiting - {{model_name}}" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Request Queue", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, "hiddenSeries": false, "id": 13, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.95, sum(rate(time_to_first_token_seconds_bucket{model_name=~\".+\"}[5m])) by (le, model_name))", "refId": "A", "legendFormat": "p95 - {{model_name}}" }, { "expr": "histogram_quantile(0.50, sum(rate(time_to_first_token_seconds_bucket{model_name=~\".+\"}[5m])) by (le, model_name))", "refId": "B", "legendFormat": "p50 - {{model_name}}" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Time to First Token (seconds)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, "hiddenSeries": false, "id": 14, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.95, sum(rate(time_per_output_token_seconds_bucket{model_name=~\".+\"}[5m])) by (le, model_name))", "refId": "A", "legendFormat": "p95 - {{model_name}}" }, { "expr": "histogram_quantile(0.50, sum(rate(time_per_output_token_seconds_bucket{model_name=~\".+\"}[5m])) by (le, model_name))", "refId": "B", "legendFormat": "p50 - {{model_name}}" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Time per Output Token (seconds)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, "hiddenSeries": false, "id": 8, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "DCGM_FI_DEV_GPU_UTIL{instance=\"dcgm-exporter:9400\"}", "refId": "A", "legendFormat": "GPU {{gpu}}" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GPU Utilization", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percent", "label": null, "logBase": 1, "max": "100", "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, "hiddenSeries": false, "id": 9, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "DCGM_FI_DEV_FB_USED{instance=\"dcgm-exporter:9400\"} / (DCGM_FI_DEV_FB_USED{instance=\"dcgm-exporter:9400\"} + DCGM_FI_DEV_FB_FREE{instance=\"dcgm-exporter:9400\"}) * 100", "refId": "A", "legendFormat": "GPU {{gpu}}" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GPU Memory Utilization", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percent", "label": null, "logBase": 1, "max": "100", "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, "hiddenSeries": false, "id": 10, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "DCGM_FI_DEV_POWER_USAGE{instance=\"dcgm-exporter:9400\"}", "refId": "A", "legendFormat": "GPU {{gpu}}" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GPU Power Usage (Watts)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "celsius", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, "hiddenSeries": false, "id": 2, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{pod=~\"vllm.*|llama-nim.*\"}[5m]))", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "CPU Usage", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, "hiddenSeries": false, "id": 3, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum by(pod) (container_memory_usage_bytes{pod=~\"vllm.*|llama-nim.*\"})", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Memory Usage", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, "hiddenSeries": false, "id": 4, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum by(pod) (rate(container_network_receive_bytes_total{pod=~\"vllm.*|llama-nim.*\"}[5m]))", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Network Receive", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, "hiddenSeries": false, "id": 5, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum by(pod) (rate(container_network_transmit_bytes_total{pod=~\"vllm.*|llama-nim.*\"}[5m]))", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Network Transmit", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, "hiddenSeries": false, "id": 6, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum by(pod) (rate(container_fs_reads_total{pod=~\"vllm.*|llama-nim.*\"}[5m]))", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Reads", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "fill": 1, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, "hiddenSeries": false, "id": 7, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum by(pod) (rate(container_fs_writes_total{pod=~\"vllm.*|llama-nim.*\"}[5m]))", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Writes", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "schemaVersion": 22, "style": "dark", "tags": [], "templating": { "list": [] }, "time": { "from": "now-6h", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ] }, "timezone": "", "title": "Llama Stack Monitoring", "uid": "llama-stack", "version": 1 } --- # Grafana Deployment apiVersion: apps/v1 kind: Deployment metadata: name: grafana namespace: monitoring labels: app: grafana spec: replicas: 1 selector: matchLabels: app: grafana template: metadata: labels: app: grafana spec: containers: - name: grafana image: grafana/grafana:10.0.3 ports: - containerPort: 3000 name: http volumeMounts: - name: grafana-storage mountPath: /var/lib/grafana - name: grafana-datasources mountPath: /etc/grafana/provisioning/datasources - name: grafana-dashboards-provider mountPath: /etc/grafana/provisioning/dashboards - name: grafana-dashboards mountPath: /var/lib/grafana/dashboards env: - name: GF_SECURITY_ADMIN_USER value: admin - name: GF_SECURITY_ADMIN_PASSWORD value: admin # In production, use a secret - name: GF_USERS_ALLOW_SIGN_UP value: "false" volumes: - name: grafana-storage emptyDir: {} - name: grafana-datasources configMap: name: grafana-datasources - name: grafana-dashboards-provider configMap: name: grafana-dashboards-provider - name: grafana-dashboards configMap: name: grafana-dashboards --- # Grafana Service apiVersion: v1 kind: Service metadata: name: grafana namespace: monitoring spec: selector: app: grafana ports: - port: 3000 targetPort: 3000 name: http type: ClusterIP --- # Service Monitor for Prometheus Operator (if using it) apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: vllm-monitor namespace: monitoring labels: release: prometheus spec: selector: matchLabels: app: vllm namespaceSelector: matchNames: - default endpoints: - port: http interval: 15s --- apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: nim-monitor namespace: monitoring labels: release: prometheus spec: selector: matchLabels: nim-type: llama-nim namespaceSelector: matchNames: - default endpoints: - port: http-openai interval: 15s --- # Ingress for Grafana and Prometheus (if needed) apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: monitoring-ingress namespace: monitoring annotations: nginx.ingress.kubernetes.io/rewrite-target: / spec: rules: - host: grafana.local http: paths: - path: / pathType: Prefix backend: service: name: grafana port: number: 3000 - host: prometheus.local http: paths: - path: / pathType: Prefix backend: service: name: prometheus port: number: 9090 --- # NVIDIA DCGM Exporter Deployment for GPU metrics apiVersion: apps/v1 kind: Deployment metadata: name: dcgm-exporter namespace: monitoring labels: app: dcgm-exporter spec: replicas: 1 selector: matchLabels: app: dcgm-exporter template: metadata: labels: app: dcgm-exporter spec: containers: - name: dcgm-exporter image: nvcr.io/nvidia/k8s/dcgm-exporter:4.2.3-4.3.0-ubuntu22.04 securityContext: runAsNonRoot: false runAsUser: 0 ports: - containerPort: 9400 name: metrics args: - -f - /etc/dcgm-exporter/dcp-metrics-included.csv volumeMounts: - name: device-metrics mountPath: /dev/metrics - name: dcgm-config mountPath: /etc/dcgm-exporter volumes: - name: device-metrics hostPath: path: /dev/metrics - name: dcgm-config configMap: name: dcgm-config --- # DCGM Exporter ConfigMap for metrics configuration apiVersion: v1 kind: ConfigMap metadata: name: dcgm-config namespace: monitoring data: dcp-metrics-included.csv: | # Format: # DCGM_FI_DEV_, , DCGM_FI_DEV_GPU_UTIL, GPU utilization, % DCGM_FI_DEV_MEM_COPY_UTIL, Memory utilization, % DCGM_FI_DEV_FB_FREE, Framebuffer memory free, MiB DCGM_FI_DEV_FB_USED, Framebuffer memory used, MiB DCGM_FI_DEV_FB_TOTAL, Framebuffer memory total, MiB DCGM_FI_DEV_GPU_TEMP, GPU temperature, C DCGM_FI_DEV_POWER_USAGE, Power usage, W DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, Total energy consumption, mJ DCGM_FI_DEV_PCIE_REPLAY_COUNTER, PCIe replay counter, count DCGM_FI_DEV_PCIE_TX_THROUGHPUT, PCIe transmit throughput, KiB/s DCGM_FI_DEV_PCIE_RX_THROUGHPUT, PCIe receive throughput, KiB/s DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, NVLink bandwidth total, KiB/s DCGM_FI_DEV_VGPU_LICENSE_STATUS, vGPU license status, N/A DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, Uncorrectable remapped rows, count DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, Correctable remapped rows, count DCGM_FI_DEV_ROW_REMAP_FAILURE, Row remap failure, count DCGM_FI_DEV_XID_ERRORS, XID errors, count DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, NVLink CRC flit error count total, count DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, NVLink CRC data error count total, count DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, NVLink replay error count total, count DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, NVLink recovery error count total, count DCGM_FI_DEV_RETIRED_PAGES_SBE, Retired pages SBE, count DCGM_FI_DEV_RETIRED_PAGES_DBE, Retired pages DBE, count DCGM_FI_DEV_RETIRED_PAGES_PENDING, Retired pages pending, count DCGM_FI_DEV_GRAPHICS_PIDS, Graphics processes, count DCGM_FI_DEV_COMPUTE_PIDS, Compute processes, count --- # NVIDIA DCGM Exporter Service apiVersion: v1 kind: Service metadata: name: dcgm-exporter namespace: monitoring labels: app: dcgm-exporter annotations: prometheus.io/scrape: 'true' prometheus.io/port: '9400' spec: selector: app: dcgm-exporter ports: - port: 9400 targetPort: 9400 name: metrics type: ClusterIP