mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-10-23 16:37:28 +00:00
2334 lines
58 KiB
Text
2334 lines
58 KiB
Text
---
|
|
apiVersion: v1
|
|
kind: Namespace
|
|
metadata:
|
|
name: monitoring
|
|
---
|
|
# Prometheus ConfigMap
|
|
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: prometheus-config
|
|
namespace: monitoring
|
|
data:
|
|
prometheus.yml: |
|
|
global:
|
|
scrape_interval: 15s
|
|
evaluation_interval: 15s
|
|
|
|
scrape_configs:
|
|
# NVIDIA DCGM exporter for GPU metrics
|
|
- job_name: 'nvidia-dcgm-exporter'
|
|
static_configs:
|
|
- targets: ['dcgm-exporter:9400']
|
|
|
|
- job_name: 'kubernetes-pods'
|
|
kubernetes_sd_configs:
|
|
- role: pod
|
|
relabel_configs:
|
|
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
|
|
action: keep
|
|
regex: true
|
|
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
|
|
action: replace
|
|
target_label: __metrics_path__
|
|
regex: (.+)
|
|
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
|
|
action: replace
|
|
regex: ([^:]+)(?::\d+)?;(\d+)
|
|
replacement: $1:$2
|
|
target_label: __address__
|
|
- action: labelmap
|
|
regex: __meta_kubernetes_pod_label_(.+)
|
|
- source_labels: [__meta_kubernetes_namespace]
|
|
action: replace
|
|
target_label: kubernetes_namespace
|
|
- source_labels: [__meta_kubernetes_pod_name]
|
|
action: replace
|
|
target_label: kubernetes_pod_name
|
|
|
|
# Specific job for vLLM pods
|
|
- job_name: 'vllm-pods'
|
|
kubernetes_sd_configs:
|
|
- role: pod
|
|
relabel_configs:
|
|
- source_labels: [__meta_kubernetes_pod_label_app]
|
|
regex: vllm
|
|
action: keep
|
|
- source_labels: [__meta_kubernetes_pod_container_port_name]
|
|
regex: http
|
|
action: keep
|
|
- source_labels: [__meta_kubernetes_namespace]
|
|
action: replace
|
|
target_label: kubernetes_namespace
|
|
- source_labels: [__meta_kubernetes_pod_name]
|
|
action: replace
|
|
target_label: kubernetes_pod_name
|
|
|
|
# Specific job for NIM pods
|
|
- job_name: 'nim-pods'
|
|
kubernetes_sd_configs:
|
|
- role: pod
|
|
relabel_configs:
|
|
- source_labels: [__meta_kubernetes_pod_label_nim_type]
|
|
regex: llama-nim
|
|
action: keep
|
|
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
|
|
action: replace
|
|
target_label: __metrics_path__
|
|
regex: (.+)
|
|
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
|
|
action: replace
|
|
regex: ([^:]+)(?::\d+)?;(\d+)
|
|
replacement: $1:$2
|
|
target_label: __address__
|
|
- source_labels: [__meta_kubernetes_namespace]
|
|
action: replace
|
|
target_label: kubernetes_namespace
|
|
- source_labels: [__meta_kubernetes_pod_name]
|
|
action: replace
|
|
target_label: kubernetes_pod_name
|
|
- source_labels: [__meta_kubernetes_pod_label_app]
|
|
action: replace
|
|
target_label: app
|
|
|
|
# Direct scrape for NIM service
|
|
- job_name: 'nim-service'
|
|
static_configs:
|
|
- targets: ['llm-nim-code:8000']
|
|
metrics_path: /v1/metrics
|
|
scrape_interval: 5s
|
|
---
|
|
# Prometheus Deployment
|
|
apiVersion: apps/v1
|
|
kind: Deployment
|
|
metadata:
|
|
name: prometheus
|
|
namespace: monitoring
|
|
labels:
|
|
app: prometheus
|
|
spec:
|
|
replicas: 1
|
|
selector:
|
|
matchLabels:
|
|
app: prometheus
|
|
template:
|
|
metadata:
|
|
labels:
|
|
app: prometheus
|
|
spec:
|
|
serviceAccountName: prometheus
|
|
containers:
|
|
- name: prometheus
|
|
image: prom/prometheus:v2.45.0
|
|
args:
|
|
- "--config.file=/etc/prometheus/prometheus.yml"
|
|
- "--storage.tsdb.path=/prometheus"
|
|
- "--web.console.libraries=/etc/prometheus/console_libraries"
|
|
- "--web.console.templates=/etc/prometheus/consoles"
|
|
- "--web.enable-lifecycle"
|
|
ports:
|
|
- containerPort: 9090
|
|
name: http
|
|
volumeMounts:
|
|
- name: prometheus-config
|
|
mountPath: /etc/prometheus/
|
|
- name: prometheus-storage
|
|
mountPath: /prometheus
|
|
volumes:
|
|
- name: prometheus-config
|
|
configMap:
|
|
name: prometheus-config
|
|
- name: prometheus-storage
|
|
emptyDir: {}
|
|
---
|
|
# Prometheus Service
|
|
apiVersion: v1
|
|
kind: Service
|
|
metadata:
|
|
name: prometheus
|
|
namespace: monitoring
|
|
annotations:
|
|
prometheus.io/scrape: 'true'
|
|
prometheus.io/port: '9090'
|
|
spec:
|
|
selector:
|
|
app: prometheus
|
|
ports:
|
|
- port: 9090
|
|
targetPort: 9090
|
|
name: http
|
|
type: NodePort # Changed from ClusterIP to NodePort to make it accessible outside the cluster
|
|
---
|
|
# Grafana ConfigMap for datasources
|
|
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: grafana-datasources
|
|
namespace: monitoring
|
|
data:
|
|
prometheus.yaml: |
|
|
apiVersion: 1
|
|
datasources:
|
|
- name: Prometheus
|
|
type: prometheus
|
|
url: http://prometheus:9090
|
|
access: proxy
|
|
isDefault: true
|
|
---
|
|
# Grafana ConfigMap for dashboards provider
|
|
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: grafana-dashboards-provider
|
|
namespace: monitoring
|
|
data:
|
|
dashboards.yaml: |
|
|
apiVersion: 1
|
|
providers:
|
|
- name: 'default'
|
|
orgId: 1
|
|
folder: ''
|
|
type: file
|
|
disableDeletion: false
|
|
editable: true
|
|
options:
|
|
path: /var/lib/grafana/dashboards
|
|
---
|
|
# Grafana ConfigMap for dashboards
|
|
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: grafana-dashboards
|
|
namespace: monitoring
|
|
data:
|
|
nim-dashboard.json: |
|
|
{
|
|
"annotations": {
|
|
"list": [
|
|
{
|
|
"builtIn": 1,
|
|
"datasource": "-- Grafana --",
|
|
"enable": true,
|
|
"hide": true,
|
|
"iconColor": "rgba(0, 211, 255, 1)",
|
|
"name": "Annotations & Alerts",
|
|
"type": "dashboard"
|
|
}
|
|
]
|
|
},
|
|
"editable": true,
|
|
"gnetId": null,
|
|
"graphTooltip": 0,
|
|
"id": 2,
|
|
"links": [],
|
|
"panels": [
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "Prometheus",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 0
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 1,
|
|
"legend": {
|
|
"avg": false,
|
|
"current": true,
|
|
"max": true,
|
|
"min": false,
|
|
"show": true,
|
|
"total": false,
|
|
"values": true
|
|
},
|
|
"lines": true,
|
|
"linewidth": 1,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "DCGM_FI_DEV_GPU_UTIL{pod=~\"llm-nim-code.*\"}",
|
|
"refId": "A",
|
|
"legendFormat": "GPU {{gpu}} - {{pod}}"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "NIM GPU Utilization",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "individual"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "percent",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": "100",
|
|
"min": "0",
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "Prometheus",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 12,
|
|
"y": 0
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 2,
|
|
"legend": {
|
|
"avg": false,
|
|
"current": true,
|
|
"max": true,
|
|
"min": false,
|
|
"show": true,
|
|
"total": false,
|
|
"values": true
|
|
},
|
|
"lines": true,
|
|
"linewidth": 1,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "DCGM_FI_DEV_FB_USED{pod=~\"llm-nim-code.*\"} / 1024 / 1024 / 1024",
|
|
"refId": "A",
|
|
"legendFormat": "GPU {{gpu}} - {{pod}}"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "NIM GPU Memory Used (GB)",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "individual"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "gbytes",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": "0",
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "Prometheus",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 8
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 3,
|
|
"legend": {
|
|
"avg": false,
|
|
"current": true,
|
|
"max": true,
|
|
"min": false,
|
|
"show": true,
|
|
"total": false,
|
|
"values": true
|
|
},
|
|
"lines": true,
|
|
"linewidth": 1,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "DCGM_FI_DEV_POWER_USAGE{pod=~\"llm-nim-code.*\"}",
|
|
"refId": "A",
|
|
"legendFormat": "GPU {{gpu}} - {{pod}}"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "NIM GPU Power Usage (W)",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "individual"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "watt",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": "0",
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "Prometheus",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 12,
|
|
"y": 8
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 4,
|
|
"legend": {
|
|
"avg": false,
|
|
"current": true,
|
|
"max": true,
|
|
"min": false,
|
|
"show": true,
|
|
"total": false,
|
|
"values": true
|
|
},
|
|
"lines": true,
|
|
"linewidth": 1,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "DCGM_FI_DEV_GPU_TEMP{pod=~\"llm-nim-code.*\"}",
|
|
"refId": "A",
|
|
"legendFormat": "GPU {{gpu}} - {{pod}}"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "NIM GPU Temperature (°C)",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "individual"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "celsius",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": "0",
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "Prometheus",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 16
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 5,
|
|
"legend": {
|
|
"avg": false,
|
|
"current": true,
|
|
"max": true,
|
|
"min": false,
|
|
"show": true,
|
|
"total": false,
|
|
"values": true
|
|
},
|
|
"lines": true,
|
|
"linewidth": 1,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "rate(nim_requests_total{pod=~\"llm-nim-code.*\"}[1m])",
|
|
"refId": "A",
|
|
"legendFormat": "{{pod}} - {{endpoint}}"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "NIM Request Rate",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "individual"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "reqps",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": "0",
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "Prometheus",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 12,
|
|
"y": 16
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 6,
|
|
"legend": {
|
|
"avg": false,
|
|
"current": true,
|
|
"max": true,
|
|
"min": false,
|
|
"show": true,
|
|
"total": false,
|
|
"values": true
|
|
},
|
|
"lines": true,
|
|
"linewidth": 1,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.95, sum(rate(nim_request_duration_seconds_bucket{pod=~\"llm-nim-code.*\"}[5m])) by (le, endpoint))",
|
|
"refId": "A",
|
|
"legendFormat": "p95 - {{endpoint}}"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.50, sum(rate(nim_request_duration_seconds_bucket{pod=~\"llm-nim-code.*\"}[5m])) by (le, endpoint))",
|
|
"refId": "B",
|
|
"legendFormat": "p50 - {{endpoint}}"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "NIM Request Latency",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "individual"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "s",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": "0",
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "Prometheus",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 24
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 7,
|
|
"legend": {
|
|
"avg": false,
|
|
"current": true,
|
|
"max": true,
|
|
"min": false,
|
|
"show": true,
|
|
"total": false,
|
|
"values": true
|
|
},
|
|
"lines": true,
|
|
"linewidth": 1,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "nim_active_requests{pod=~\"llm-nim-code.*\"}",
|
|
"refId": "A",
|
|
"legendFormat": "{{pod}}"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "NIM Active Requests",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "individual"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": "0",
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "Prometheus",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 12,
|
|
"y": 24
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 8,
|
|
"legend": {
|
|
"avg": false,
|
|
"current": true,
|
|
"max": true,
|
|
"min": false,
|
|
"show": true,
|
|
"total": false,
|
|
"values": true
|
|
},
|
|
"lines": true,
|
|
"linewidth": 1,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "nim_tokens_per_second{pod=~\"llm-nim-code.*\"}",
|
|
"refId": "A",
|
|
"legendFormat": "{{pod}}"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "NIM Tokens Per Second",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "individual"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": "0",
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
}
|
|
],
|
|
"schemaVersion": 22,
|
|
"style": "dark",
|
|
"tags": [],
|
|
"templating": {
|
|
"list": []
|
|
},
|
|
"time": {
|
|
"from": "now-6h",
|
|
"to": "now"
|
|
},
|
|
"timepicker": {
|
|
"refresh_intervals": [
|
|
"5s",
|
|
"10s",
|
|
"30s",
|
|
"1m",
|
|
"5m",
|
|
"15m",
|
|
"30m",
|
|
"1h",
|
|
"2h",
|
|
"1d"
|
|
]
|
|
},
|
|
"timezone": "",
|
|
"title": "NVIDIA NIM Monitoring",
|
|
"uid": "nim-dashboard",
|
|
"version": 1
|
|
}
|
|
k8s-pods-dashboard.json: |
|
|
{
|
|
"annotations": {
|
|
"list": [
|
|
{
|
|
"builtIn": 1,
|
|
"datasource": "-- Grafana --",
|
|
"enable": true,
|
|
"hide": true,
|
|
"iconColor": "rgba(0, 211, 255, 1)",
|
|
"name": "Annotations & Alerts",
|
|
"type": "dashboard"
|
|
}
|
|
]
|
|
},
|
|
"editable": true,
|
|
"gnetId": null,
|
|
"graphTooltip": 0,
|
|
"id": 1,
|
|
"links": [],
|
|
"panels": [
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "Prometheus",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 16
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 11,
|
|
"legend": {
|
|
"avg": false,
|
|
"current": false,
|
|
"max": false,
|
|
"min": false,
|
|
"show": true,
|
|
"total": false,
|
|
"values": false
|
|
},
|
|
"lines": true,
|
|
"linewidth": 1,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "gpu_cache_usage_perc{model_name=~\".+\"} * 100",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "GPU KV-Cache Usage (%)",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "individual"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "percent",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": "100",
|
|
"min": "0",
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "Prometheus",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 12,
|
|
"y": 16
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 12,
|
|
"legend": {
|
|
"avg": false,
|
|
"current": false,
|
|
"max": false,
|
|
"min": false,
|
|
"show": true,
|
|
"total": false,
|
|
"values": false
|
|
},
|
|
"lines": true,
|
|
"linewidth": 1,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "num_requests_running{model_name=~\".+\"}",
|
|
"refId": "A",
|
|
"legendFormat": "Running - {{model_name}}"
|
|
},
|
|
{
|
|
"expr": "num_requests_waiting{model_name=~\".+\"}",
|
|
"refId": "B",
|
|
"legendFormat": "Waiting - {{model_name}}"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "Request Queue",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "individual"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": "0",
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "Prometheus",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 24
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 13,
|
|
"legend": {
|
|
"avg": false,
|
|
"current": false,
|
|
"max": false,
|
|
"min": false,
|
|
"show": true,
|
|
"total": false,
|
|
"values": false
|
|
},
|
|
"lines": true,
|
|
"linewidth": 1,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.95, sum(rate(time_to_first_token_seconds_bucket{model_name=~\".+\"}[5m])) by (le, model_name))",
|
|
"refId": "A",
|
|
"legendFormat": "p95 - {{model_name}}"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.50, sum(rate(time_to_first_token_seconds_bucket{model_name=~\".+\"}[5m])) by (le, model_name))",
|
|
"refId": "B",
|
|
"legendFormat": "p50 - {{model_name}}"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "Time to First Token (seconds)",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "individual"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "s",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": "0",
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "Prometheus",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 12,
|
|
"y": 24
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 14,
|
|
"legend": {
|
|
"avg": false,
|
|
"current": false,
|
|
"max": false,
|
|
"min": false,
|
|
"show": true,
|
|
"total": false,
|
|
"values": false
|
|
},
|
|
"lines": true,
|
|
"linewidth": 1,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.95, sum(rate(time_per_output_token_seconds_bucket{model_name=~\".+\"}[5m])) by (le, model_name))",
|
|
"refId": "A",
|
|
"legendFormat": "p95 - {{model_name}}"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.50, sum(rate(time_per_output_token_seconds_bucket{model_name=~\".+\"}[5m])) by (le, model_name))",
|
|
"refId": "B",
|
|
"legendFormat": "p50 - {{model_name}}"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "Time per Output Token (seconds)",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "individual"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "s",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": "0",
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "Prometheus",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 0
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 8,
|
|
"legend": {
|
|
"avg": false,
|
|
"current": false,
|
|
"max": false,
|
|
"min": false,
|
|
"show": true,
|
|
"total": false,
|
|
"values": false
|
|
},
|
|
"lines": true,
|
|
"linewidth": 1,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "gpu_utilization{model_name=~\".+\"}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "GPU Utilization",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "individual"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "percent",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": "100",
|
|
"min": "0",
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "Prometheus",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 12,
|
|
"y": 0
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 9,
|
|
"legend": {
|
|
"avg": false,
|
|
"current": false,
|
|
"max": false,
|
|
"min": false,
|
|
"show": true,
|
|
"total": false,
|
|
"values": false
|
|
},
|
|
"lines": true,
|
|
"linewidth": 1,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "gpu_memory_used_bytes{model_name=~\".+\"} / gpu_memory_total_bytes{model_name=~\".+\"} * 100",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "GPU Memory Utilization",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "individual"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "percent",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": "100",
|
|
"min": "0",
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "Prometheus",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 8
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 10,
|
|
"legend": {
|
|
"avg": false,
|
|
"current": false,
|
|
"max": false,
|
|
"min": false,
|
|
"show": true,
|
|
"total": false,
|
|
"values": false
|
|
},
|
|
"lines": true,
|
|
"linewidth": 1,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "gpu_power_usage_watts{model_name=~\".+\"}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "GPU Power Usage (Watts)",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "individual"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "celsius",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "Prometheus",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 0
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 2,
|
|
"legend": {
|
|
"avg": false,
|
|
"current": false,
|
|
"max": false,
|
|
"min": false,
|
|
"show": true,
|
|
"total": false,
|
|
"values": false
|
|
},
|
|
"lines": true,
|
|
"linewidth": 1,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{pod=~\"vllm.*|llama-nim.*\"}[5m]))",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "CPU Usage",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "individual"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "Prometheus",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 12,
|
|
"y": 0
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 3,
|
|
"legend": {
|
|
"avg": false,
|
|
"current": false,
|
|
"max": false,
|
|
"min": false,
|
|
"show": true,
|
|
"total": false,
|
|
"values": false
|
|
},
|
|
"lines": true,
|
|
"linewidth": 1,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "sum by(pod) (container_memory_usage_bytes{pod=~\"vllm.*|llama-nim.*\"})",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "Memory Usage",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "individual"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "bytes",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "Prometheus",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 8
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 4,
|
|
"legend": {
|
|
"avg": false,
|
|
"current": false,
|
|
"max": false,
|
|
"min": false,
|
|
"show": true,
|
|
"total": false,
|
|
"values": false
|
|
},
|
|
"lines": true,
|
|
"linewidth": 1,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "sum by(pod) (rate(container_network_receive_bytes_total{pod=~\"vllm.*|llama-nim.*\"}[5m]))",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "Network Receive",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "individual"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "bytes",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "Prometheus",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 12,
|
|
"y": 8
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 5,
|
|
"legend": {
|
|
"avg": false,
|
|
"current": false,
|
|
"max": false,
|
|
"min": false,
|
|
"show": true,
|
|
"total": false,
|
|
"values": false
|
|
},
|
|
"lines": true,
|
|
"linewidth": 1,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "sum by(pod) (rate(container_network_transmit_bytes_total{pod=~\"vllm.*|llama-nim.*\"}[5m]))",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "Network Transmit",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "individual"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "bytes",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "Prometheus",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 16
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 6,
|
|
"legend": {
|
|
"avg": false,
|
|
"current": false,
|
|
"max": false,
|
|
"min": false,
|
|
"show": true,
|
|
"total": false,
|
|
"values": false
|
|
},
|
|
"lines": true,
|
|
"linewidth": 1,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "sum by(pod) (rate(container_fs_reads_total{pod=~\"vllm.*|llama-nim.*\"}[5m]))",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "Disk Reads",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "individual"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "Prometheus",
|
|
"fill": 1,
|
|
"fillGradient": 0,
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 12,
|
|
"y": 16
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 7,
|
|
"legend": {
|
|
"avg": false,
|
|
"current": false,
|
|
"max": false,
|
|
"min": false,
|
|
"show": true,
|
|
"total": false,
|
|
"values": false
|
|
},
|
|
"lines": true,
|
|
"linewidth": 1,
|
|
"nullPointMode": "null",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 2,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "sum by(pod) (rate(container_fs_writes_total{pod=~\"vllm.*|llama-nim.*\"}[5m]))",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "Disk Writes",
|
|
"tooltip": {
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "individual"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"label": null,
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
}
|
|
],
|
|
"schemaVersion": 22,
|
|
"style": "dark",
|
|
"tags": [],
|
|
"templating": {
|
|
"list": []
|
|
},
|
|
"time": {
|
|
"from": "now-6h",
|
|
"to": "now"
|
|
},
|
|
"timepicker": {
|
|
"refresh_intervals": [
|
|
"5s",
|
|
"10s",
|
|
"30s",
|
|
"1m",
|
|
"5m",
|
|
"15m",
|
|
"30m",
|
|
"1h",
|
|
"2h",
|
|
"1d"
|
|
]
|
|
},
|
|
"timezone": "",
|
|
"title": "Llama Stack Monitoring",
|
|
"uid": "llama-stack",
|
|
"version": 1
|
|
}
|
|
---
|
|
# Grafana Deployment
|
|
apiVersion: apps/v1
|
|
kind: Deployment
|
|
metadata:
|
|
name: grafana
|
|
namespace: monitoring
|
|
labels:
|
|
app: grafana
|
|
spec:
|
|
replicas: 1
|
|
selector:
|
|
matchLabels:
|
|
app: grafana
|
|
template:
|
|
metadata:
|
|
labels:
|
|
app: grafana
|
|
spec:
|
|
containers:
|
|
- name: grafana
|
|
image: grafana/grafana:10.0.3
|
|
ports:
|
|
- containerPort: 3000
|
|
name: http
|
|
volumeMounts:
|
|
- name: grafana-storage
|
|
mountPath: /var/lib/grafana
|
|
- name: grafana-datasources
|
|
mountPath: /etc/grafana/provisioning/datasources
|
|
- name: grafana-dashboards-provider
|
|
mountPath: /etc/grafana/provisioning/dashboards
|
|
- name: grafana-dashboards
|
|
mountPath: /var/lib/grafana/dashboards
|
|
env:
|
|
- name: GF_SECURITY_ADMIN_USER
|
|
value: admin
|
|
- name: GF_SECURITY_ADMIN_PASSWORD
|
|
value: admin # In production, use a secret
|
|
- name: GF_USERS_ALLOW_SIGN_UP
|
|
value: "false"
|
|
volumes:
|
|
- name: grafana-storage
|
|
emptyDir: {}
|
|
- name: grafana-datasources
|
|
configMap:
|
|
name: grafana-datasources
|
|
- name: grafana-dashboards-provider
|
|
configMap:
|
|
name: grafana-dashboards-provider
|
|
- name: grafana-dashboards
|
|
configMap:
|
|
name: grafana-dashboards
|
|
---
|
|
# Grafana Service
|
|
apiVersion: v1
|
|
kind: Service
|
|
metadata:
|
|
name: grafana
|
|
namespace: monitoring
|
|
spec:
|
|
selector:
|
|
app: grafana
|
|
ports:
|
|
- port: 3000
|
|
targetPort: 3000
|
|
name: http
|
|
type: ClusterIP
|
|
---
|
|
# Service Monitor for Prometheus Operator (if using it)
|
|
apiVersion: monitoring.coreos.com/v1
|
|
kind: ServiceMonitor
|
|
metadata:
|
|
name: vllm-monitor
|
|
namespace: monitoring
|
|
labels:
|
|
release: prometheus
|
|
spec:
|
|
selector:
|
|
matchLabels:
|
|
app: vllm
|
|
namespaceSelector:
|
|
matchNames:
|
|
- default
|
|
endpoints:
|
|
- port: http
|
|
interval: 15s
|
|
---
|
|
apiVersion: monitoring.coreos.com/v1
|
|
kind: ServiceMonitor
|
|
metadata:
|
|
name: nim-monitor
|
|
namespace: monitoring
|
|
labels:
|
|
release: prometheus
|
|
spec:
|
|
selector:
|
|
matchLabels:
|
|
nim-type: llama-nim
|
|
namespaceSelector:
|
|
matchNames:
|
|
- default
|
|
endpoints:
|
|
- port: http-openai
|
|
interval: 15s
|
|
---
|
|
# Ingress for Grafana and Prometheus (if needed)
|
|
apiVersion: networking.k8s.io/v1
|
|
kind: Ingress
|
|
metadata:
|
|
name: monitoring-ingress
|
|
namespace: monitoring
|
|
annotations:
|
|
nginx.ingress.kubernetes.io/rewrite-target: /
|
|
spec:
|
|
rules:
|
|
- host: grafana.local
|
|
http:
|
|
paths:
|
|
- path: /
|
|
pathType: Prefix
|
|
backend:
|
|
service:
|
|
name: grafana
|
|
port:
|
|
number: 3000
|
|
- host: prometheus.local
|
|
http:
|
|
paths:
|
|
- path: /
|
|
pathType: Prefix
|
|
backend:
|
|
service:
|
|
name: prometheus
|
|
port:
|
|
number: 9090
|
|
---
|
|
# NVIDIA DCGM Exporter Deployment for GPU metrics
|
|
apiVersion: apps/v1
|
|
kind: Deployment
|
|
metadata:
|
|
name: dcgm-exporter
|
|
namespace: monitoring
|
|
labels:
|
|
app: dcgm-exporter
|
|
spec:
|
|
replicas: 1
|
|
selector:
|
|
matchLabels:
|
|
app: dcgm-exporter
|
|
template:
|
|
metadata:
|
|
labels:
|
|
app: dcgm-exporter
|
|
spec:
|
|
containers:
|
|
- name: dcgm-exporter
|
|
image: nvidia/dcgm-exporter:2.4.6-2.6.10-ubuntu20.04
|
|
securityContext:
|
|
runAsNonRoot: false
|
|
runAsUser: 0
|
|
ports:
|
|
- containerPort: 9400
|
|
name: metrics
|
|
volumeMounts:
|
|
- name: device-metrics
|
|
mountPath: /dev/metrics
|
|
volumes:
|
|
- name: device-metrics
|
|
hostPath:
|
|
path: /dev/metrics
|
|
---
|
|
# NVIDIA DCGM Exporter Service
|
|
apiVersion: v1
|
|
kind: Service
|
|
metadata:
|
|
name: dcgm-exporter
|
|
namespace: monitoring
|
|
labels:
|
|
app: dcgm-exporter
|
|
annotations:
|
|
prometheus.io/scrape: 'true'
|
|
prometheus.io/port: '9400'
|
|
spec:
|
|
selector:
|
|
app: dcgm-exporter
|
|
ports:
|
|
- port: 9400
|
|
targetPort: 9400
|
|
name: metrics
|
|
type: ClusterIP
|