diff --git a/docs/source/distributions/k8s/all-manifests.yaml b/docs/source/distributions/k8s/all-manifests.yaml deleted file mode 100644 index 7bee389d1..000000000 --- a/docs/source/distributions/k8s/all-manifests.yaml +++ /dev/null @@ -1,6284 +0,0 @@ ---- -# Source: kube-prometheus-stack/charts/grafana/templates/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -automountServiceAccountToken: true -metadata: - labels: - helm.sh/chart: grafana-9.3.0 - app.kubernetes.io/name: grafana - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "12.1.0" - name: kube-prometheus-stack-1754164871-grafana - namespace: prometheus ---- -# Source: kube-prometheus-stack/charts/kube-state-metrics/templates/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -automountServiceAccountToken: true -metadata: - labels: - helm.sh/chart: kube-state-metrics-6.1.0 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "2.16.0" - release: kube-prometheus-stack-1754164871 - name: kube-prometheus-stack-1754164871-kube-state-metrics - namespace: prometheus ---- -# Source: kube-prometheus-stack/charts/prometheus-node-exporter/templates/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: kube-prometheus-stack-1754164871-prometheus-node-exporter - namespace: prometheus - labels: - helm.sh/chart: prometheus-node-exporter-4.47.3 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: prometheus-node-exporter - app.kubernetes.io/name: prometheus-node-exporter - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "1.9.1" - release: kube-prometheus-stack-1754164871 -automountServiceAccountToken: false ---- -# Source: kube-prometheus-stack/templates/alertmanager/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: kube-prometheus-stack-1754-alertmanager - namespace: prometheus - labels: - app: kube-prometheus-stack-alertmanager - app.kubernetes.io/name: kube-prometheus-stack-alertmanager - app.kubernetes.io/component: alertmanager - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -automountServiceAccountToken: true ---- -# Source: kube-prometheus-stack/templates/prometheus-operator/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: kube-prometheus-stack-1754-operator - namespace: prometheus - labels: - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" - app: kube-prometheus-stack-operator - app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator - app.kubernetes.io/component: prometheus-operator -automountServiceAccountToken: true ---- -# Source: kube-prometheus-stack/templates/prometheus/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: kube-prometheus-stack-1754-prometheus - namespace: prometheus - labels: - app: kube-prometheus-stack-prometheus - app.kubernetes.io/name: kube-prometheus-stack-prometheus - app.kubernetes.io/component: prometheus - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -automountServiceAccountToken: true ---- -# Source: kube-prometheus-stack/charts/grafana/templates/secret.yaml -apiVersion: v1 -kind: Secret -metadata: - name: kube-prometheus-stack-1754164871-grafana - namespace: prometheus - labels: - helm.sh/chart: grafana-9.3.0 - app.kubernetes.io/name: grafana - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "12.1.0" -type: Opaque -data: - - admin-user: "YWRtaW4=" - admin-password: "cHJvbS1vcGVyYXRvcg==" - ldap-toml: "" ---- -# Source: kube-prometheus-stack/templates/alertmanager/secret.yaml -apiVersion: v1 -kind: Secret -metadata: - name: alertmanager-kube-prometheus-stack-1754-alertmanager - namespace: prometheus - labels: - app: kube-prometheus-stack-alertmanager - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - alertmanager.yaml: "Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0KaW5oaWJpdF9ydWxlczoKLSBlcXVhbDoKICAtIG5hbWVzcGFjZQogIC0gYWxlcnRuYW1lCiAgc291cmNlX21hdGNoZXJzOgogIC0gc2V2ZXJpdHkgPSBjcml0aWNhbAogIHRhcmdldF9tYXRjaGVyczoKICAtIHNldmVyaXR5ID1+IHdhcm5pbmd8aW5mbwotIGVxdWFsOgogIC0gbmFtZXNwYWNlCiAgLSBhbGVydG5hbWUKICBzb3VyY2VfbWF0Y2hlcnM6CiAgLSBzZXZlcml0eSA9IHdhcm5pbmcKICB0YXJnZXRfbWF0Y2hlcnM6CiAgLSBzZXZlcml0eSA9IGluZm8KLSBlcXVhbDoKICAtIG5hbWVzcGFjZQogIHNvdXJjZV9tYXRjaGVyczoKICAtIGFsZXJ0bmFtZSA9IEluZm9JbmhpYml0b3IKICB0YXJnZXRfbWF0Y2hlcnM6CiAgLSBzZXZlcml0eSA9IGluZm8KLSB0YXJnZXRfbWF0Y2hlcnM6CiAgLSBhbGVydG5hbWUgPSBJbmZvSW5oaWJpdG9yCnJlY2VpdmVyczoKLSBuYW1lOiAibnVsbCIKcm91dGU6CiAgZ3JvdXBfYnk6CiAgLSBuYW1lc3BhY2UKICBncm91cF9pbnRlcnZhbDogNW0KICBncm91cF93YWl0OiAzMHMKICByZWNlaXZlcjogIm51bGwiCiAgcmVwZWF0X2ludGVydmFsOiAxMmgKICByb3V0ZXM6CiAgLSBtYXRjaGVyczoKICAgIC0gYWxlcnRuYW1lID0gIldhdGNoZG9nIgogICAgcmVjZWl2ZXI6ICJudWxsIgp0ZW1wbGF0ZXM6Ci0gL2V0Yy9hbGVydG1hbmFnZXIvY29uZmlnLyoudG1wbA==" ---- -# Source: kube-prometheus-stack/charts/grafana/templates/configmap-dashboard-provider.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - labels: - helm.sh/chart: grafana-9.3.0 - app.kubernetes.io/name: grafana - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "12.1.0" - name: kube-prometheus-stack-1754164871-grafana-config-dashboards - namespace: prometheus -data: - provider.yaml: |- - apiVersion: 1 - providers: - - name: 'sidecarProvider' - orgId: 1 - folder: '' - folderUid: '' - type: file - disableDeletion: false - allowUiUpdates: false - updateIntervalSeconds: 30 - options: - foldersFromFilesStructure: false - path: /tmp/dashboards ---- -# Source: kube-prometheus-stack/charts/grafana/templates/configmap.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - name: kube-prometheus-stack-1754164871-grafana - namespace: prometheus - labels: - helm.sh/chart: grafana-9.3.0 - app.kubernetes.io/name: grafana - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "12.1.0" -data: - - grafana.ini: | - [analytics] - check_for_updates = true - [grafana_net] - url = https://grafana.net - [log] - mode = console - [paths] - data = /var/lib/grafana/ - logs = /var/log/grafana - plugins = /var/lib/grafana/plugins - provisioning = /etc/grafana/provisioning - [server] - domain = '' ---- -# Source: kube-prometheus-stack/templates/grafana/configmaps-datasources.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - name: kube-prometheus-stack-1754-grafana-datasource - namespace: prometheus - labels: - grafana_datasource: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - datasource.yaml: |- - apiVersion: 1 - datasources: - - name: "Prometheus" - type: prometheus - uid: prometheus - url: http://kube-prometheus-stack-1754-prometheus.prometheus:9090/ - access: proxy - isDefault: true - jsonData: - httpMethod: POST - timeInterval: 30s - - name: "Alertmanager" - type: alertmanager - uid: alertmanager - url: http://kube-prometheus-stack-1754-alertmanager.prometheus:9093/ - access: proxy - jsonData: - handleGrafanaManagedAlerts: false - implementation: prometheus ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/alertmanager-overview.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-alertmanager-overview - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - alertmanager-overview.json: |- - {"graphTooltip":1,"panels":[{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":0},"id":1,"panels":[],"title":"Alerts","type":"row"},{"datasource":{"type":"prometheus","uid":"$datasource"},"description":"current set of alerts stored in the Alertmanager","fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","stacking":{"mode":"normal"}},"unit":"none"}},"gridPos":{"h":7,"w":12,"x":0,"y":1},"id":2,"options":{"legend":{"showLegend":false},"tooltip":{"mode":"multi"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"sum(alertmanager_alerts{namespace=~\"$namespace\",service=~\"$service\"}) by (namespace,service,instance)","intervalFactor":2,"legendFormat":"{{instance}}"}],"title":"Alerts","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"$datasource"},"description":"rate of successful and invalid alerts received by the Alertmanager","fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","stacking":{"mode":"normal"}},"unit":"ops"}},"gridPos":{"h":7,"w":12,"x":12,"y":1},"id":3,"options":{"legend":{"showLegend":false},"tooltip":{"mode":"multi"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"sum(rate(alertmanager_alerts_received_total{namespace=~\"$namespace\",service=~\"$service\"}[$__rate_interval])) by (namespace,service,instance)","intervalFactor":2,"legendFormat":"{{instance}} Received"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"sum(rate(alertmanager_alerts_invalid_total{namespace=~\"$namespace\",service=~\"$service\"}[$__rate_interval])) by (namespace,service,instance)","intervalFactor":2,"legendFormat":"{{instance}} Invalid"}],"title":"Alerts receive rate","type":"timeseries"},{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":8},"id":4,"panels":[],"title":"Notifications","type":"row"},{"datasource":{"type":"prometheus","uid":"$datasource"},"description":"rate of successful and invalid notifications sent by the Alertmanager","fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","stacking":{"mode":"normal"}},"unit":"ops"}},"gridPos":{"h":7,"w":12,"x":0,"y":9},"id":5,"options":{"legend":{"showLegend":false},"tooltip":{"mode":"multi"}},"pluginVersion":"v11.4.0","repeat":"integration","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"sum(rate(alertmanager_notifications_total{namespace=~\"$namespace\",service=~\"$service\", integration=\"$integration\"}[$__rate_interval])) by (integration,namespace,service,instance)","intervalFactor":2,"legendFormat":"{{instance}} Total"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"sum(rate(alertmanager_notifications_failed_total{namespace=~\"$namespace\",service=~\"$service\", integration=\"$integration\"}[$__rate_interval])) by (integration,namespace,service,instance)","intervalFactor":2,"legendFormat":"{{instance}} Failed"}],"title":"$integration: Notifications Send Rate","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"$datasource"},"description":"latency of notifications sent by the Alertmanager","fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","stacking":{"mode":"normal"}},"unit":"s"}},"gridPos":{"h":7,"w":12,"x":12,"y":9},"id":6,"options":{"legend":{"showLegend":false},"tooltip":{"mode":"multi"}},"pluginVersion":"v11.4.0","repeat":"integration","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"histogram_quantile(0.99,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=~\"$namespace\",service=~\"$service\", integration=\"$integration\"}[$__rate_interval])) by (le,namespace,service,instance)\n)\n","intervalFactor":2,"legendFormat":"{{instance}} 99th Percentile"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"histogram_quantile(0.50,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=~\"$namespace\",service=~\"$service\", integration=\"$integration\"}[$__rate_interval])) by (le,namespace,service,instance)\n)\n","intervalFactor":2,"legendFormat":"{{instance}} Median"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"sum(rate(alertmanager_notification_latency_seconds_sum{namespace=~\"$namespace\",service=~\"$service\", integration=\"$integration\"}[$__rate_interval])) by (namespace,service,instance)\n/\nsum(rate(alertmanager_notification_latency_seconds_count{namespace=~\"$namespace\",service=~\"$service\", integration=\"$integration\"}[$__rate_interval])) by (namespace,service,instance)\n","intervalFactor":2,"legendFormat":"{{instance}} Average"}],"title":"$integration: Notification Duration","type":"timeseries"}],"schemaVersion":39,"tags":["alertmanager-mixin"],"templating":{"list":[{"current":{"selected":false,"text":"Prometheus","value":"Prometheus"},"hide":0,"label":"Data Source","name":"datasource","query":"prometheus","type":"datasource"},{"current":{"selected":false,"text":"","value":""},"datasource":{"type":"prometheus","uid":"${datasource}"},"includeAll":false,"label":"namespace","name":"namespace","query":"label_values(alertmanager_alerts, namespace)","refresh":2,"sort":1,"type":"query"},{"current":{"selected":false,"text":"","value":""},"datasource":{"type":"prometheus","uid":"${datasource}"},"includeAll":false,"label":"service","name":"service","query":"label_values(alertmanager_alerts, service)","refresh":2,"sort":1,"type":"query"},{"current":{"selected":false,"text":"$__all","value":"$__all"},"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":2,"includeAll":true,"name":"integration","query":"label_values(alertmanager_notifications_total{integration=~\".*\"}, integration)","refresh":2,"sort":1,"type":"query"}]},"time":{"from":"now-1h","to":"now"},"timepicker":{"refresh_intervals":["30s"]},"timezone": "utc","title":"Alertmanager / Overview","uid":"alertmanager-overview"} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/apiserver.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-apiserver - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - apiserver.json: |- - {"editable":true,"links":[{"asDropdown":true,"includeVars":true,"keepTime":true,"tags":["kubernetes-mixin"],"targetBlank":false,"title":"Kubernetes","type":"dashboards"}],"panels":[{"datasource":{"type":"datasource","uid":"-- Mixed --"},"description":"The SLO (service level objective) and other metrics displayed on this dashboard are for informational purposes only.","gridPos":{"h":2,"w":24,"x":0,"y":0},"id":1,"options":{"content":"The SLO (service level objective) and other metrics displayed on this dashboard are for informational purposes only."},"pluginVersion":"v11.4.0","title":"Notice","type":"text"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"description":"How many percent of requests (both read and write) in 30 days have been answered successfully and fast enough?","fieldConfig":{"defaults":{"decimals":3,"unit":"percentunit"}},"gridPos":{"h":7,"w":8,"x":0,"y":2},"id":2,"interval":"1m","pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"apiserver_request:availability30d{verb=\"all\", cluster=\"$cluster\"}"}],"title":"Availability (30d) > 99.000%","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"description":"How much error budget is left looking at our 0.990% availability guarantees?","fieldConfig":{"defaults":{"custom":{"fillOpacity":100},"decimals":3,"unit":"percentunit"}},"gridPos":{"h":7,"w":16,"x":8,"y":2},"id":3,"interval":"1m","options":{"legend":{"asTable":true,"placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"100 * (apiserver_request:availability30d{verb=\"all\", cluster=\"$cluster\"} - 0.990000)","legendFormat":"errorbudget"}],"title":"ErrorBudget (30d) > 99.000%","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"description":"How many percent of read requests (LIST,GET) in 30 days have been answered successfully and fast enough?","fieldConfig":{"defaults":{"decimals":3,"unit":"percentunit"}},"gridPos":{"h":7,"w":6,"x":0,"y":9},"id":4,"interval":"1m","pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"apiserver_request:availability30d{verb=\"read\", cluster=\"$cluster\"}"}],"title":"Read Availability (30d)","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"description":"How many read requests (LIST,GET) per second do the apiservers get by code?","fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"stacking":{"mode":"normal"}},"unit":"reqps"},"overrides":[{"matcher":{"id":"byRegexp","options":"/2../i"},"properties":[{"id":"color","value":"#56A64B"}]},{"matcher":{"id":"byRegexp","options":"/3../i"},"properties":[{"id":"color","value":"#F2CC0C"}]},{"matcher":{"id":"byRegexp","options":"/4../i"},"properties":[{"id":"color","value":"#3274D9"}]},{"matcher":{"id":"byRegexp","options":"/5../i"},"properties":[{"id":"color","value":"#E02F44"}]}]},"gridPos":{"h":7,"w":6,"x":6,"y":9},"id":5,"interval":"1m","options":{"legend":{"asTable":true,"placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (code) (code_resource:apiserver_request_total:rate5m{verb=\"read\", cluster=\"$cluster\"})","legendFormat":"{{ code }}"}],"title":"Read SLI - Requests","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"description":"How many percent of read requests (LIST,GET) per second are returned with errors (5xx)?","fieldConfig":{"defaults":{"min":0,"unit":"percentunit"}},"gridPos":{"h":7,"w":6,"x":12,"y":9},"id":6,"interval":"1m","options":{"legend":{"asTable":true,"placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"read\",code=~\"5..\", cluster=\"$cluster\"}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"read\", cluster=\"$cluster\"})","legendFormat":"{{ resource }}"}],"title":"Read SLI - Errors","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"description":"How many seconds is the 99th percentile for reading (LIST|GET) a given resource?","fieldConfig":{"defaults":{"unit":"s"}},"gridPos":{"h":7,"w":6,"x":18,"y":9},"id":7,"interval":"1m","options":{"legend":{"asTable":true,"placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile{verb=\"read\", cluster=\"$cluster\"}","legendFormat":"{{ resource }}"}],"title":"Read SLI - Duration","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"description":"How many percent of write requests (POST|PUT|PATCH|DELETE) in 30 days have been answered successfully and fast enough?","fieldConfig":{"defaults":{"decimals":3,"unit":"percentunit"}},"gridPos":{"h":7,"w":6,"x":0,"y":16},"id":8,"interval":"1m","pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"apiserver_request:availability30d{verb=\"write\", cluster=\"$cluster\"}"}],"title":"Write Availability (30d)","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"description":"How many write requests (POST|PUT|PATCH|DELETE) per second do the apiservers get by code?","fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"stacking":{"mode":"normal"}},"unit":"reqps"},"overrides":[{"matcher":{"id":"byRegexp","options":"/2../i"},"properties":[{"id":"color","value":"#56A64B"}]},{"matcher":{"id":"byRegexp","options":"/3../i"},"properties":[{"id":"color","value":"#F2CC0C"}]},{"matcher":{"id":"byRegexp","options":"/4../i"},"properties":[{"id":"color","value":"#3274D9"}]},{"matcher":{"id":"byRegexp","options":"/5../i"},"properties":[{"id":"color","value":"#E02F44"}]}]},"gridPos":{"h":7,"w":6,"x":6,"y":16},"id":9,"interval":"1m","options":{"legend":{"asTable":true,"placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (code) (code_resource:apiserver_request_total:rate5m{verb=\"write\", cluster=\"$cluster\"})","legendFormat":"{{ code }}"}],"title":"Write SLI - Requests","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"description":"How many percent of write requests (POST|PUT|PATCH|DELETE) per second are returned with errors (5xx)?","fieldConfig":{"defaults":{"min":0,"unit":"percentunit"}},"gridPos":{"h":7,"w":6,"x":12,"y":16},"id":10,"interval":"1m","options":{"legend":{"asTable":true,"placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"write\",code=~\"5..\", cluster=\"$cluster\"}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"write\", cluster=\"$cluster\"})","legendFormat":"{{ resource }}"}],"title":"Write SLI - Errors","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"description":"How many seconds is the 99th percentile for writing (POST|PUT|PATCH|DELETE) a given resource?","fieldConfig":{"defaults":{"unit":"s"}},"gridPos":{"h":7,"w":6,"x":18,"y":16},"id":11,"interval":"1m","options":{"legend":{"asTable":true,"placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile{verb=\"write\", cluster=\"$cluster\"}","legendFormat":"{{ resource }}"}],"title":"Write SLI - Duration","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"min":0,"unit":"ops"}},"gridPos":{"h":7,"w":12,"x":0,"y":23},"id":12,"interval":"1m","options":{"legend":{"asTable":true,"placement":"right","showLegend":false},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(workqueue_adds_total{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[$__rate_interval])) by (instance, name)","legendFormat":"{{instance}} {{name}}"}],"title":"Work Queue Add Rate","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"min":0,"unit":"short"}},"gridPos":{"h":7,"w":12,"x":12,"y":23},"id":13,"interval":"1m","options":{"legend":{"asTable":true,"placement":"right","showLegend":false},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(workqueue_depth{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[$__rate_interval])) by (instance, name)","legendFormat":"{{instance}} {{name}}"}],"title":"Work Queue Depth","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"min":0,"unit":"s"}},"gridPos":{"h":7,"w":24,"x":0,"y":30},"id":14,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[$__rate_interval])) by (instance, name, le))","legendFormat":"{{instance}} {{name}}"}],"title":"Work Queue Latency","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"bytes"}},"gridPos":{"h":7,"w":8,"x":0,"y":37},"id":15,"interval":"1m","options":{"legend":{"asTable":true,"placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"process_resident_memory_bytes{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}","legendFormat":"{{instance}}"}],"title":"Memory","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"min":0,"unit":"short"}},"gridPos":{"h":7,"w":8,"x":8,"y":37},"id":16,"interval":"1m","options":{"legend":{"asTable":true,"placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"rate(process_cpu_seconds_total{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}[$__rate_interval])","legendFormat":"{{instance}}"}],"title":"CPU usage","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"short"}},"gridPos":{"h":7,"w":8,"x":16,"y":37},"id":17,"interval":"1m","options":{"legend":{"asTable":true,"placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"go_goroutines{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}","legendFormat":"{{instance}}"}],"title":"Goroutines","type":"timeseries"}],"refresh":"10s","schemaVersion":39,"tags":["kubernetes-mixin"],"templating":{"list":[{"current":{"selected":true,"text":"default","value":"default"},"hide":0,"label":"Data source","name":"datasource","query":"prometheus","regex":"","type":"datasource"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":2,"label":"cluster","name":"cluster","query":"label_values(up{job=\"apiserver\"}, cluster)","refresh":2,"sort":1,"type":"query","allValue":".*"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"includeAll":true,"name":"instance","query":"label_values(up{job=\"apiserver\", cluster=\"$cluster\"}, instance)","refresh":2,"sort":1,"type":"query"}]},"time":{"from":"now-1h","to":"now"},"timezone": "utc","title":"Kubernetes / API server","uid":"09ec8aa1e996d6ffcd6817bbaff4db1b"} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/cluster-total.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-cluster-total - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - cluster-total.json: |- - {"editable":true,"links":[{"asDropdown":true,"includeVars":true,"keepTime":true,"tags":["kubernetes-mixin"],"targetBlank":false,"title":"Kubernetes","type":"dashboards"}],"panels":[{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"},"unit":"binBps"}},"gridPos":{"h":9,"w":12,"x":0,"y":0},"id":1,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (namespace) (\n rate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace!=\"\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","legendFormat":"__auto"}],"title":"Current Rate of Bytes Received","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"},"unit":"binBps"}},"gridPos":{"h":9,"w":12,"x":12,"y":0},"id":2,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (namespace) (\n rate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace!=\"\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","legendFormat":"__auto"}],"title":"Current Rate of Bytes Transmitted","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"overrides":[{"matcher":{"id":"byRegexp","options":"/Bytes/"},"properties":[{"id":"unit","value":"binBps"}]},{"matcher":{"id":"byRegexp","options":"/Packets/"},"properties":[{"id":"unit","value":"pps"}]},{"matcher":{"id":"byName","options":"Namespace"},"properties":[{"id":"links","value":[{"title":"Drill down","url":"/d/8b7a8b326d7a6f1f04244066368c67af/kubernetes-networking-namespace-pods?${datasource:queryparam}&var-cluster=${cluster}&var-namespace=${__data.fields.Namespace}"}]}]}]},"gridPos":{"h":9,"w":24,"x":0,"y":9},"id":3,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (namespace) (\n rate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace!=\"\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (namespace) (\n rate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace!=\"\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"avg by (namespace) (\n rate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace!=\"\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"avg by (namespace) (\n rate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace!=\"\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (namespace) (\n rate(container_network_receive_packets_total{cluster=\"$cluster\",namespace!=\"\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (namespace) (\n rate(container_network_transmit_packets_total{cluster=\"$cluster\",namespace!=\"\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (namespace) (\n rate(container_network_receive_packets_dropped_total{cluster=\"$cluster\",namespace!=\"\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (namespace) (\n rate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\",namespace!=\"\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","format":"table","instant":true}],"title":"Current Status","transformations":[{"id":"joinByField","options":{"byField":"namespace","mode":"outer"}},{"id":"organize","options":{"excludeByName":{"Time":true,"Time 1":true,"Time 2":true,"Time 3":true,"Time 4":true,"Time 5":true,"Time 6":true,"Time 7":true,"Time 8":true},"indexByName":{"Time 1":0,"Time 2":1,"Time 3":2,"Time 4":3,"Time 5":4,"Time 6":5,"Time 7":6,"Time 8":7,"Value #A":9,"Value #B":10,"Value #C":11,"Value #D":12,"Value #E":13,"Value #F":14,"Value #G":15,"Value #H":16,"namespace":8},"renameByName":{"Value #A":"Rx Bytes","Value #B":"Tx Bytes","Value #C":"Rx Bytes (Avg)","Value #D":"Tx Bytes (Avg)","Value #E":"Rx Packets","Value #F":"Tx Packets","Value #G":"Rx Packets Dropped","Value #H":"Tx Packets Dropped","namespace":"Namespace"}}}],"type":"table"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"},"unit":"binBps"}},"gridPos":{"h":9,"w":12,"x":0,"y":18},"id":4,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"avg by (namespace) (\n rate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace!=\"\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","legendFormat":"__auto"}],"title":"Average Rate of Bytes Received","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"},"unit":"binBps"}},"gridPos":{"h":9,"w":12,"x":12,"y":18},"id":5,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"avg by (namespace) (\n rate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace!=\"\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","legendFormat":"__auto"}],"title":"Average Rate of Bytes Transmitted","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"},"unit":"binBps"}},"gridPos":{"h":9,"w":12,"x":0,"y":27},"id":6,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (namespace) (\n rate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace!=\"\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","legendFormat":"__auto"}],"title":"Receive Bandwidth","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"},"unit":"binBps"}},"gridPos":{"h":9,"w":12,"x":12,"y":27},"id":7,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (namespace) (\n rate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace!=\"\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","legendFormat":"__auto"}],"title":"Transmit Bandwidth","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"},"unit":"pps"}},"gridPos":{"h":9,"w":12,"x":0,"y":36},"id":8,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (namespace) (\n rate(container_network_receive_packets_total{cluster=\"$cluster\",namespace!=\"\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","legendFormat":"__auto"}],"title":"Rate of Received Packets","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"},"unit":"pps"}},"gridPos":{"h":9,"w":12,"x":12,"y":36},"id":9,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (namespace) (\n rate(container_network_transmit_packets_total{cluster=\"$cluster\",namespace!=\"\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","legendFormat":"__auto"}],"title":"Rate of Transmitted Packets","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"},"unit":"pps"}},"gridPos":{"h":9,"w":12,"x":0,"y":45},"id":10,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (namespace) (\n rate(container_network_receive_packets_dropped_total{cluster=\"$cluster\",namespace!=\"\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","legendFormat":"__auto"}],"title":"Rate of Received Packets Dropped","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"},"unit":"pps"}},"gridPos":{"h":9,"w":12,"x":12,"y":45},"id":11,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (namespace) (\n rate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\",namespace!=\"\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","legendFormat":"__auto"}],"title":"Rate of Transmitted Packets Dropped","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"},"unit":"percentunit"}},"gridPos":{"h":9,"w":12,"x":0,"y":54},"id":12,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (instance) (\n rate(node_netstat_Tcp_RetransSegs{cluster=\"$cluster\"}[$__rate_interval]) / rate(node_netstat_Tcp_OutSegs{cluster=\"$cluster\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","legendFormat":"__auto"}],"title":"Rate of TCP Retransmits out of all sent segments","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"},"unit":"percentunit"}},"gridPos":{"h":9,"w":12,"x":12,"y":54},"id":13,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (instance) (\n rate(node_netstat_TcpExt_TCPSynRetrans{cluster=\"$cluster\"}[$__rate_interval]) / rate(node_netstat_Tcp_RetransSegs{cluster=\"$cluster\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","legendFormat":"__auto"}],"title":"Rate of TCP SYN Retransmits out of all retransmits","type":"timeseries"}],"refresh":"10s","schemaVersion":39,"tags":["kubernetes-mixin"],"templating":{"list":[{"current":{"selected":true,"text":"default","value":"default"},"hide":0,"label":"Data source","name":"datasource","query":"prometheus","regex":"","type":"datasource"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":2,"label":"cluster","name":"cluster","query":"label_values(up{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\"}, cluster)","refresh":2,"sort":1,"type":"query","allValue":".*"}]},"time":{"from":"now-1h","to":"now"},"timezone": "utc","title":"Kubernetes / Networking / Cluster","uid":"ff635a025bcfea7bc3dd4f508990a3e9"} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/controller-manager.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-controller-manager - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - controller-manager.json: |- - {"editable":true,"links":[{"asDropdown":true,"includeVars":true,"keepTime":true,"tags":["kubernetes-mixin"],"targetBlank":false,"title":"Kubernetes","type":"dashboards"}],"panels":[{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"none"}},"gridPos":{"h":7,"w":4,"x":0,"y":0},"id":1,"interval":"1m","options":{"colorMode":"none"},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(up{cluster=\"$cluster\", job=\"kube-controller-manager\"})","instant":true}],"title":"Up","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"ops"}},"gridPos":{"h":7,"w":20,"x":4,"y":0},"id":2,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(workqueue_adds_total{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[$__rate_interval])) by (cluster, instance, name)","legendFormat":"{{cluster}} {{instance}} {{name}}"}],"title":"Work Queue Add Rate","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"short"}},"gridPos":{"h":7,"w":24,"x":0,"y":7},"id":3,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(workqueue_depth{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[$__rate_interval])) by (cluster, instance, name)","legendFormat":"{{cluster}} {{instance}} {{name}}"}],"title":"Work Queue Depth","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"s"}},"gridPos":{"h":7,"w":24,"x":0,"y":14},"id":4,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[$__rate_interval])) by (cluster, instance, name, le))","legendFormat":"{{cluster}} {{instance}} {{name}}"}],"title":"Work Queue Latency","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"ops"}},"gridPos":{"h":7,"w":8,"x":0,"y":21},"id":5,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(rest_client_requests_total{job=\"kube-controller-manager\", instance=~\"$instance\",code=~\"2..\"}[$__rate_interval]))","legendFormat":"2xx"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(rest_client_requests_total{job=\"kube-controller-manager\", instance=~\"$instance\",code=~\"3..\"}[$__rate_interval]))","legendFormat":"3xx"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(rest_client_requests_total{job=\"kube-controller-manager\", instance=~\"$instance\",code=~\"4..\"}[$__rate_interval]))","legendFormat":"4xx"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(rest_client_requests_total{job=\"kube-controller-manager\", instance=~\"$instance\",code=~\"5..\"}[$__rate_interval]))","legendFormat":"5xx"}],"title":"Kube API Request Rate","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"s"}},"gridPos":{"h":7,"w":16,"x":8,"y":21},"id":6,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"POST\"}[$__rate_interval])) by (verb, le))","legendFormat":"{{verb}}"}],"title":"Post Request Latency 99th Quantile","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"s"}},"gridPos":{"h":7,"w":24,"x":0,"y":28},"id":7,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"GET\"}[$__rate_interval])) by (verb, le))","legendFormat":"{{verb}}"}],"title":"Get Request Latency 99th Quantile","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"bytes"}},"gridPos":{"h":7,"w":8,"x":0,"y":35},"id":8,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"process_resident_memory_bytes{cluster=\"$cluster\", job=\"kube-controller-manager\",instance=~\"$instance\"}","legendFormat":"{{instance}}"}],"title":"Memory","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"short"}},"gridPos":{"h":7,"w":8,"x":8,"y":35},"id":9,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-controller-manager\",instance=~\"$instance\"}[$__rate_interval])","legendFormat":"{{instance}}"}],"title":"CPU usage","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"short"}},"gridPos":{"h":7,"w":8,"x":16,"y":35},"id":10,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"go_goroutines{cluster=\"$cluster\", job=\"kube-controller-manager\",instance=~\"$instance\"}","legendFormat":"{{instance}}"}],"title":"Goroutines","type":"timeseries"}],"refresh":"10s","schemaVersion":39,"tags":["kubernetes-mixin"],"templating":{"list":[{"current":{"selected":true,"text":"default","value":"default"},"hide":0,"label":"Data source","name":"datasource","query":"prometheus","regex":"","type":"datasource"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":2,"label":"cluster","name":"cluster","query":"label_values(up{job=\"kube-controller-manager\"}, cluster)","refresh":2,"sort":1,"type":"query","allValue":".*"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"includeAll":true,"label":"instance","name":"instance","query":"label_values(up{cluster=\"$cluster\", job=\"kube-controller-manager\"}, instance)","refresh":2,"sort":1,"type":"query"}]},"time":{"from":"now-1h","to":"now"},"timezone": "utc","title":"Kubernetes / Controller Manager","uid":"72e0e05bef5099e5f049b05fdc429ed4"} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/etcd.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-etcd - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - etcd.json: |- - {"description":"etcd sample Grafana dashboard with Prometheus","panels":[{"datasource":{"type":"datasource","uid":"-- Mixed --"},"gridPos":{"h":7,"w":6,"x":0,"y":0},"id":1,"interval":"1m","options":{"colorMode":"none","graphMode":"none","reduceOptions":{"calcs":["lastNotNull"]}},"pluginVersion":"v10.0.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"sum(etcd_server_has_leader{job=~\".*etcd.*\", job=\"$cluster\"})","legendFormat":"{{cluster}} - {{namespace}}\n"}],"title":"Up","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":0,"lineWidth":2,"showPoints":"never"},"unit":"ops"}},"gridPos":{"h":7,"w":10,"x":6,"y":0},"id":2,"interval":"1m","pluginVersion":"v10.0.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"sum(rate(grpc_server_started_total{job=~\".*etcd.*\", job=\"$cluster\",grpc_type=\"unary\"}[$__rate_interval]))","legendFormat":"RPC rate"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"sum(rate(grpc_server_handled_total{job=~\".*etcd.*\", job=\"$cluster\",grpc_type=\"unary\",grpc_code=~\"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"}[$__rate_interval]))","legendFormat":"RPC failed rate"}],"title":"RPC rate","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":0,"lineWidth":2,"showPoints":"never"}}},"gridPos":{"h":7,"w":8,"x":16,"y":0},"id":3,"interval":"1m","pluginVersion":"v10.0.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"sum(grpc_server_started_total{job=~\".*etcd.*\",job=\"$cluster\",grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total{job=\"$cluster\",grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"})","legendFormat":"Watch streams"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"sum(grpc_server_started_total{job=~\".*etcd.*\",job=\"$cluster\",grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total{job=\"$cluster\",grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"})","legendFormat":"Lease streams"}],"title":"Active streams","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":0,"lineWidth":2,"showPoints":"never"},"unit":"bytes"}},"gridPos":{"h":7,"w":8,"x":0,"y":25},"id":4,"interval":"1m","pluginVersion":"v10.0.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"etcd_mvcc_db_total_size_in_bytes{job=~\".*etcd.*\", job=\"$cluster\"}","legendFormat":"{{instance}} DB size"}],"title":"DB size","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":0,"lineWidth":2,"showPoints":"never"},"unit":"s"}},"gridPos":{"h":7,"w":8,"x":8,"y":25},"id":5,"interval":"1m","pluginVersion":"v10.0.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~\".*etcd.*\", job=\"$cluster\"}[$__rate_interval])) by (instance, le))","legendFormat":"{{instance}} WAL fsync"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~\".*etcd.*\", job=\"$cluster\"}[$__rate_interval])) by (instance, le))","legendFormat":"{{instance}} DB fsync"}],"title":"Disk sync duration","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":0,"lineWidth":2,"showPoints":"never"},"unit":"bytes"}},"gridPos":{"h":7,"w":8,"x":16,"y":25},"id":6,"interval":"1m","pluginVersion":"v10.0.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"process_resident_memory_bytes{job=~\".*etcd.*\", job=\"$cluster\"}","legendFormat":"{{instance}} resident memory"}],"title":"Memory","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":0,"lineWidth":2,"showPoints":"never"},"unit":"Bps"}},"gridPos":{"h":7,"w":6,"x":0,"y":50},"id":7,"interval":"1m","pluginVersion":"v10.0.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"rate(etcd_network_client_grpc_received_bytes_total{job=~\".*etcd.*\", job=\"$cluster\"}[$__rate_interval])","legendFormat":"{{instance}} client traffic in"}],"title":"Client traffic in","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":0,"lineWidth":2,"showPoints":"never"},"unit":"Bps"}},"gridPos":{"h":7,"w":6,"x":6,"y":50},"id":8,"interval":"1m","pluginVersion":"v10.0.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"rate(etcd_network_client_grpc_sent_bytes_total{job=~\".*etcd.*\", job=\"$cluster\"}[$__rate_interval])","legendFormat":"{{instance}} client traffic out"}],"title":"Client traffic out","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":0,"lineWidth":2,"showPoints":"never"},"unit":"Bps"}},"gridPos":{"h":7,"w":6,"x":12,"y":50},"id":9,"interval":"1m","pluginVersion":"v10.0.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"sum(rate(etcd_network_peer_received_bytes_total{job=~\".*etcd.*\", job=\"$cluster\"}[$__rate_interval])) by (instance)","legendFormat":"{{instance}} peer traffic in"}],"title":"Peer traffic in","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":0,"lineWidth":2,"showPoints":"never"},"unit":"Bps"}},"gridPos":{"h":7,"w":6,"x":18,"y":50},"id":10,"interval":"1m","pluginVersion":"v10.0.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"sum(rate(etcd_network_peer_sent_bytes_total{job=~\".*etcd.*\", job=\"$cluster\"}[$__rate_interval])) by (instance)","legendFormat":"{{instance}} peer traffic out"}],"title":"Peer traffic out","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":0,"lineWidth":2,"showPoints":"never"}}},"gridPos":{"h":7,"w":8,"x":0,"y":75},"id":11,"interval":"1m","pluginVersion":"v10.0.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"changes(etcd_server_leader_changes_seen_total{job=~\".*etcd.*\", job=\"$cluster\"}[1d])","legendFormat":"{{instance}} total leader elections per day"}],"title":"Raft proposals","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":0,"lineWidth":2,"showPoints":"never"}}},"gridPos":{"h":7,"w":8,"x":8,"y":75},"id":12,"interval":"1m","pluginVersion":"v10.0.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"changes(etcd_server_leader_changes_seen_total{job=~\".*etcd.*\", job=\"$cluster\"}[1d])","legendFormat":"{{instance}} total leader elections per day"}],"title":"Total leader elections per day","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":0,"lineWidth":2,"showPoints":"never"},"unit":"s"}},"gridPos":{"h":7,"w":8,"x":16,"y":75},"id":13,"interval":"1m","pluginVersion":"v10.0.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"histogram_quantile(0.99, sum by (instance, le) (rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~\".*etcd.*\", job=\"$cluster\"}[$__rate_interval])))","legendFormat":"{{instance}} peer round trip time"}],"title":"Peer round trip time","type":"timeseries"}],"refresh":"10s","schemaVersion":36,"tags":["etcd-mixin"],"templating":{"list":[{"label":"Data Source","name":"datasource","query":"prometheus","type":"datasource"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"label":"cluster","name":"cluster","query":"label_values(etcd_server_has_leader{job=~\".*etcd.*\"}, job)","refresh":2,"type":"query","allValue":".*","hide":2}]},"time":{"from":"now-15m","to":"now"},"timezone": "utc","title":"etcd","uid":"c2f4e12cdf69feb95caa41a5a1b423d9"} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/grafana-overview.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-grafana-overview - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - grafana-overview.json: |- - {"annotations":{"list":[{"builtIn":1,"datasource":"-- Grafana --","enable":true,"hide":true,"iconColor":"rgba(0, 211, 255, 1)","name":"Annotations & Alerts","target":{"limit":100,"matchAny":false,"tags":[],"type":"dashboard"},"type":"dashboard"}]},"editable":true,"gnetId":null,"graphTooltip":0,"id":3085,"iteration":1631554945276,"links":[],"panels":[{"datasource":"$datasource","fieldConfig":{"defaults":{"mappings":[],"noValue":"0","thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[]},"gridPos":{"h":5,"w":6,"x":0,"y":0},"id":6,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["mean"],"fields":"","values":false},"text":{},"textMode":"auto"},"pluginVersion":"8.1.3","targets":[{"expr":"grafana_alerting_result_total{job=~\"$job\", instance=~\"$instance\", state=\"alerting\"}","instant":true,"interval":"1m","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Firing Alerts","type":"stat"},{"datasource":"$datasource","fieldConfig":{"defaults":{"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[]},"gridPos":{"h":5,"w":6,"x":6,"y":0},"id":8,"options":{"colorMode":"value","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["mean"],"fields":"","values":false},"text":{},"textMode":"auto"},"pluginVersion":"8.1.3","targets":[{"expr":"sum(grafana_stat_totals_dashboard{job=~\"$job\", instance=~\"$instance\"})","interval":"1m","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Dashboards","type":"stat"},{"datasource":"$datasource","fieldConfig":{"defaults":{"custom":{"align":null,"displayMode":"auto"},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]}},"overrides":[]},"gridPos":{"h":5,"w":12,"x":12,"y":0},"id":10,"options":{"showHeader":true},"pluginVersion":"8.1.3","targets":[{"expr":"grafana_build_info{job=~\"$job\", instance=~\"$instance\"}","instant":true,"interval":"1m","legendFormat":"","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Build Info","transformations":[{"id":"labelsToFields","options":{}},{"id":"organize","options":{"excludeByName":{"Time":true,"Value":true,"branch":true,"container":true,"goversion":true,"namespace":true,"pod":true,"revision":true},"indexByName":{"Time":7,"Value":11,"branch":4,"container":8,"edition":2,"goversion":6,"instance":1,"job":0,"namespace":9,"pod":10,"revision":5,"version":3},"renameByName":{}}}],"type":"table"},{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"$datasource","fieldConfig":{"defaults":{"links":[]},"overrides":[]},"fill":1,"fillGradient":0,"gridPos":{"h":8,"w":12,"x":0,"y":5},"hiddenSeries":false,"id":2,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":1,"nullPointMode":"null","options":{"alertThreshold":true},"percentage":false,"pluginVersion":"8.1.3","pointradius":2,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"stack":true,"steppedLine":false,"targets":[{"expr":"sum by (status_code) (irate(grafana_http_request_duration_seconds_count{job=~\"$job\", instance=~\"$instance\"}[1m])) ","interval":"1m","legendFormat":"{{status_code}}","refId":"A"}],"thresholds":[],"timeFrom":null,"timeRegions":[],"timeShift":null,"title":"RPS","tooltip":{"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"$$hashKey":"object:157","format":"reqps","label":null,"logBase":1,"max":null,"min":null,"show":true},{"$$hashKey":"object:158","format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}],"yaxis":{"align":false,"alignLevel":null}},{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"$datasource","fieldConfig":{"defaults":{"links":[]},"overrides":[]},"fill":1,"fillGradient":0,"gridPos":{"h":8,"w":12,"x":12,"y":5},"hiddenSeries":false,"id":4,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":1,"nullPointMode":"null","options":{"alertThreshold":true},"percentage":false,"pluginVersion":"8.1.3","pointradius":2,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"stack":false,"steppedLine":false,"targets":[{"exemplar":true,"expr":"histogram_quantile(0.99, sum(irate(grafana_http_request_duration_seconds_bucket{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval])) by (le)) * 1","interval":"1m","legendFormat":"99th Percentile","refId":"A"},{"exemplar":true,"expr":"histogram_quantile(0.50, sum(irate(grafana_http_request_duration_seconds_bucket{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval])) by (le)) * 1","interval":"1m","legendFormat":"50th Percentile","refId":"B"},{"exemplar":true,"expr":"sum(irate(grafana_http_request_duration_seconds_sum{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval])) * 1 / sum(irate(grafana_http_request_duration_seconds_count{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval]))","interval":"1m","legendFormat":"Average","refId":"C"}],"thresholds":[],"timeFrom":null,"timeRegions":[],"timeShift":null,"title":"Request Latency","tooltip":{"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"$$hashKey":"object:210","format":"ms","label":null,"logBase":1,"max":null,"min":null,"show":true},{"$$hashKey":"object:211","format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}],"yaxis":{"align":false,"alignLevel":null}}],"schemaVersion":30,"tags":[],"templating":{"list":[{"current":{"selected":true,"text":"dev-cortex","value":"dev-cortex"},"description":null,"error":null,"hide":0,"includeAll":false,"label":null,"multi":false,"name":"datasource","options":[],"query":"prometheus","queryValue":"","refresh":1,"regex":"","skipUrlSync":false,"type":"datasource"},{"allValue":".*","current":{"selected":false,"text":["default/grafana"],"value":["default/grafana"]},"datasource":"$datasource","definition":"label_values(grafana_build_info, job)","description":null,"error":null,"hide":0,"includeAll":true,"label":null,"multi":true,"name":"job","options":[],"query":{"query":"label_values(grafana_build_info, job)","refId":"Billing Admin-job-Variable-Query"},"refresh":1,"regex":"","skipUrlSync":false,"sort":0,"tagValuesQuery":"","tagsQuery":"","type":"query","useTags":false},{"allValue":".*","current":{"selected":false,"text":"All","value":"$__all"},"datasource":"$datasource","definition":"label_values(grafana_build_info, instance)","description":null,"error":null,"hide":0,"includeAll":true,"label":null,"multi":true,"name":"instance","options":[],"query":{"query":"label_values(grafana_build_info, instance)","refId":"Billing Admin-instance-Variable-Query"},"refresh":1,"regex":"","skipUrlSync":false,"sort":0,"tagValuesQuery":"","tagsQuery":"","type":"query","useTags":false}]},"time":{"from":"now-6h","to":"now"},"timepicker":{"refresh_intervals":["10s","30s","1m","5m","15m","30m","1h","2h","1d"]},"timezone": "utc","title":"Grafana Overview","uid":"6be0s85Mk","version":2} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-coredns.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-k8s-coredns - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - k8s-coredns.json: |- - {"annotations":{"list":[{"builtIn":1,"datasource":{"type":"datasource","uid":"grafana"},"enable":true,"hide":true,"iconColor":"rgba(0, 211, 255, 1)","name":"Annotations & Alerts","type":"dashboard"}]},"description":"A dashboard for the CoreDNS DNS server with updated metrics for version 1.7.0+. Based on the CoreDNS dashboard by buhay.","editable":true,"fiscalYearStartMonth":0,"gnetId":12539,"graphTooltip":0,"id":7,"links":[{"icon":"external link","tags":[],"targetBlank":true,"title":"CoreDNS.io","type":"link","url":"https://coredns.io"}],"liveNow":false,"panels":[{"datasource":{"uid":"$datasource"},"fieldConfig":{"defaults":{"color":{"mode":"palette-classic"},"custom":{"axisBorderShow":false,"axisCenteredZero":false,"axisColorMode":"text","axisLabel":"","axisPlacement":"auto","barAlignment":0,"drawStyle":"line","fillOpacity":10,"gradientMode":"none","hideFrom":{"legend":false,"tooltip":false,"viz":false},"insertNulls":false,"lineInterpolation":"linear","lineWidth":2,"pointSize":5,"scaleDistribution":{"type":"linear"},"showPoints":"never","spanNulls":true,"stacking":{"group":"A","mode":"normal"},"thresholdsStyle":{"mode":"off"}},"links":[],"mappings":[],"min":0,"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"pps","unitScale":true},"overrides":[]},"gridPos":{"h":7,"w":8,"x":0,"y":0},"id":2,"links":[],"options":{"legend":{"calcs":[],"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"10.3.3","targets":[{"datasource":{"uid":"$datasource"},"expr":"sum(rate(coredns_dns_request_count_total{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\"}[5m])) by (proto) or\nsum(rate(coredns_dns_requests_total{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\"}[5m])) by (proto)","format":"time_series","interval":"1m","intervalFactor":2,"legendFormat":"{{ proto }}","refId":"A","step":60}],"title":"Requests (total)","type":"timeseries"},{"datasource":{"uid":"$datasource"},"fieldConfig":{"defaults":{"color":{"mode":"palette-classic"},"custom":{"axisBorderShow":false,"axisCenteredZero":false,"axisColorMode":"text","axisLabel":"","axisPlacement":"auto","barAlignment":0,"drawStyle":"line","fillOpacity":10,"gradientMode":"none","hideFrom":{"legend":false,"tooltip":false,"viz":false},"insertNulls":false,"lineInterpolation":"linear","lineWidth":2,"pointSize":5,"scaleDistribution":{"type":"linear"},"showPoints":"never","spanNulls":true,"stacking":{"group":"A","mode":"normal"},"thresholdsStyle":{"mode":"off"}},"links":[],"mappings":[],"min":0,"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"pps","unitScale":true},"overrides":[]},"gridPos":{"h":7,"w":8,"x":8,"y":0},"id":4,"links":[],"options":{"legend":{"calcs":[],"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"10.3.3","targets":[{"datasource":{"uid":"$datasource"},"expr":"sum(rate(coredns_dns_request_type_count_total{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\"}[5m])) by (type) or \nsum(rate(coredns_dns_requests_total{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\"}[5m])) by (type)","interval":"1m","intervalFactor":2,"legendFormat":"{{ type }}","refId":"A","step":60}],"title":"Requests (by qtype)","type":"timeseries"},{"datasource":{"uid":"$datasource"},"fieldConfig":{"defaults":{"color":{"mode":"palette-classic"},"custom":{"axisBorderShow":false,"axisCenteredZero":false,"axisColorMode":"text","axisLabel":"","axisPlacement":"auto","barAlignment":0,"drawStyle":"line","fillOpacity":10,"gradientMode":"none","hideFrom":{"legend":false,"tooltip":false,"viz":false},"insertNulls":false,"lineInterpolation":"linear","lineWidth":2,"pointSize":5,"scaleDistribution":{"type":"linear"},"showPoints":"never","spanNulls":true,"stacking":{"group":"A","mode":"normal"},"thresholdsStyle":{"mode":"off"}},"links":[],"mappings":[],"min":0,"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"pps","unitScale":true},"overrides":[]},"gridPos":{"h":7,"w":8,"x":16,"y":0},"id":6,"links":[],"options":{"legend":{"calcs":[],"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"10.3.3","targets":[{"datasource":{"uid":"$datasource"},"expr":"sum(rate(coredns_dns_request_count_total{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\"}[5m])) by (zone) or\nsum(rate(coredns_dns_requests_total{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\"}[5m])) by (zone)","interval":"1m","intervalFactor":2,"legendFormat":"{{ zone }}","refId":"A","step":60}],"title":"Requests (by zone)","type":"timeseries"},{"datasource":{"uid":"$datasource"},"fieldConfig":{"defaults":{"color":{"mode":"palette-classic"},"custom":{"axisBorderShow":false,"axisCenteredZero":false,"axisColorMode":"text","axisLabel":"","axisPlacement":"auto","barAlignment":0,"drawStyle":"line","fillOpacity":10,"gradientMode":"none","hideFrom":{"legend":false,"tooltip":false,"viz":false},"insertNulls":false,"lineInterpolation":"linear","lineWidth":2,"pointSize":5,"scaleDistribution":{"type":"linear"},"showPoints":"never","spanNulls":true,"stacking":{"group":"A","mode":"none"},"thresholdsStyle":{"mode":"off"}},"links":[],"mappings":[],"min":0,"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"pps","unitScale":true},"overrides":[]},"gridPos":{"h":7,"w":12,"x":0,"y":7},"id":8,"links":[],"options":{"legend":{"calcs":[],"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"10.3.3","targets":[{"datasource":{"uid":"$datasource"},"expr":"sum(rate(coredns_dns_request_do_count_total{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\"}[5m])) or\nsum(rate(coredns_dns_do_requests_total{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\"}[5m]))","interval":"1m","intervalFactor":2,"legendFormat":"DO","refId":"A","step":40},{"datasource":{"uid":"$datasource"},"expr":"sum(rate(coredns_dns_request_count_total{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\"}[5m])) or\nsum(rate(coredns_dns_requests_total{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\"}[5m]))","interval":"1m","intervalFactor":2,"legendFormat":"total","refId":"B","step":40}],"title":"Requests (DO bit)","type":"timeseries"},{"datasource":{"uid":"$datasource"},"fieldConfig":{"defaults":{"color":{"mode":"palette-classic"},"custom":{"axisBorderShow":false,"axisCenteredZero":false,"axisColorMode":"text","axisLabel":"","axisPlacement":"auto","barAlignment":0,"drawStyle":"line","fillOpacity":10,"gradientMode":"none","hideFrom":{"legend":false,"tooltip":false,"viz":false},"insertNulls":false,"lineInterpolation":"linear","lineWidth":2,"pointSize":5,"scaleDistribution":{"type":"linear"},"showPoints":"never","spanNulls":true,"stacking":{"group":"A","mode":"none"},"thresholdsStyle":{"mode":"off"}},"links":[],"mappings":[],"min":0,"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"bytes","unitScale":true},"overrides":[{"matcher":{"id":"byName","options":"tcp:90"},"properties":[{"id":"unit","value":"short"}]},{"matcher":{"id":"byName","options":"tcp:99 "},"properties":[{"id":"unit","value":"short"}]},{"matcher":{"id":"byName","options":"tcp:50"},"properties":[{"id":"unit","value":"short"}]}]},"gridPos":{"h":7,"w":6,"x":12,"y":7},"id":10,"links":[],"options":{"legend":{"calcs":[],"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"none"}},"pluginVersion":"10.3.3","targets":[{"datasource":{"uid":"$datasource"},"expr":"histogram_quantile(0.99, (sum(rate(coredns_dns_request_size_bytes{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\",proto=\"udp\"}[5m])) by (proto)) or (sum(rate(coredns_dns_request_size_bytes_bucket{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\",proto=\"udp\"}[5m])) by (le,proto)))","interval":"1m","intervalFactor":2,"legendFormat":"{{ proto }}:99 ","refId":"A","step":60},{"datasource":{"uid":"$datasource"},"expr":"histogram_quantile(0.90, (sum(rate(coredns_dns_request_size_bytes{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\",proto=\"udp\"}[5m])) by (proto)) or (sum(rate(coredns_dns_request_size_bytes_bucket{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\",proto=\"udp\"}[5m])) by (le,proto)))","intervalFactor":2,"legendFormat":"{{ proto }}:90","refId":"B","step":60},{"datasource":{"uid":"$datasource"},"expr":"histogram_quantile(0.50, (sum(rate(coredns_dns_request_size_bytes{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\",proto=\"udp\"}[5m])) by (proto)) or (sum(rate(coredns_dns_request_size_bytes_bucket{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\",proto=\"udp\"}[5m])) by (le,proto)))","intervalFactor":2,"legendFormat":"{{ proto }}:50","refId":"C","step":60}],"title":"Requests (size, udp)","type":"timeseries"},{"datasource":{"uid":"$datasource"},"fieldConfig":{"defaults":{"color":{"mode":"palette-classic"},"custom":{"axisBorderShow":false,"axisCenteredZero":false,"axisColorMode":"text","axisLabel":"","axisPlacement":"auto","barAlignment":0,"drawStyle":"line","fillOpacity":10,"gradientMode":"none","hideFrom":{"legend":false,"tooltip":false,"viz":false},"insertNulls":false,"lineInterpolation":"linear","lineWidth":2,"pointSize":5,"scaleDistribution":{"type":"linear"},"showPoints":"never","spanNulls":true,"stacking":{"group":"A","mode":"none"},"thresholdsStyle":{"mode":"off"}},"links":[],"mappings":[],"min":0,"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"bytes","unitScale":true},"overrides":[]},"gridPos":{"h":7,"w":6,"x":18,"y":7},"id":12,"links":[],"options":{"legend":{"calcs":[],"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"none"}},"pluginVersion":"10.3.3","targets":[{"datasource":{"uid":"$datasource"},"expr":"histogram_quantile(0.99, (sum(rate(coredns_dns_request_size_bytes{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\",proto=\"tcp\"}[5m])) by (proto)) or (sum(rate(coredns_dns_request_size_bytes_bucket{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\",proto=\"tcp\"}[5m])) by (le,proto)))","format":"time_series","interval":"1m","intervalFactor":2,"legendFormat":"{{ proto }}:99 ","refId":"A","step":60},{"datasource":{"uid":"$datasource"},"expr":"histogram_quantile(0.90, (sum(rate(coredns_dns_request_size_bytes{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\",proto=\"tcp\"}[5m])) by (proto)) or (sum(rate(coredns_dns_request_size_bytes_bucket{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\",proto=\"tcp\"}[5m])) by (le,proto)))","format":"time_series","interval":"1m","intervalFactor":2,"legendFormat":"{{ proto }}:90","refId":"B","step":60},{"datasource":{"uid":"$datasource"},"expr":"histogram_quantile(0.50, (sum(rate(coredns_dns_request_size_bytes{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\",proto=\"tcp\"}[5m])) by (proto)) or (sum(rate(coredns_dns_request_size_bytes_bucket{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\",proto=\"tcp\"}[5m])) by (le,proto)))","format":"time_series","interval":"1m","intervalFactor":2,"legendFormat":"{{ proto }}:50","refId":"C","step":60}],"title":"Requests (size,tcp)","type":"timeseries"},{"datasource":{"uid":"$datasource"},"fieldConfig":{"defaults":{"color":{"mode":"palette-classic"},"custom":{"axisBorderShow":false,"axisCenteredZero":false,"axisColorMode":"text","axisLabel":"","axisPlacement":"auto","barAlignment":0,"drawStyle":"line","fillOpacity":10,"gradientMode":"none","hideFrom":{"legend":false,"tooltip":false,"viz":false},"insertNulls":false,"lineInterpolation":"linear","lineWidth":2,"pointSize":5,"scaleDistribution":{"type":"linear"},"showPoints":"never","spanNulls":true,"stacking":{"group":"A","mode":"normal"},"thresholdsStyle":{"mode":"off"}},"links":[],"mappings":[],"min":0,"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"pps","unitScale":true},"overrides":[]},"gridPos":{"h":7,"w":12,"x":0,"y":14},"id":14,"links":[],"options":{"legend":{"calcs":[],"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"10.3.3","targets":[{"datasource":{"uid":"$datasource"},"expr":"sum(rate(coredns_dns_response_rcode_count_total{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\"}[5m])) by (rcode) or\nsum(rate(coredns_dns_responses_total{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\"}[5m])) by (rcode)","interval":"1m","intervalFactor":2,"legendFormat":"{{ rcode }}","refId":"A","step":40}],"title":"Responses (by rcode)","type":"timeseries"},{"datasource":{"uid":"$datasource"},"fieldConfig":{"defaults":{"color":{"mode":"palette-classic"},"custom":{"axisBorderShow":false,"axisCenteredZero":false,"axisColorMode":"text","axisLabel":"","axisPlacement":"auto","barAlignment":0,"drawStyle":"line","fillOpacity":10,"gradientMode":"none","hideFrom":{"legend":false,"tooltip":false,"viz":false},"insertNulls":false,"lineInterpolation":"linear","lineWidth":2,"pointSize":5,"scaleDistribution":{"type":"linear"},"showPoints":"never","spanNulls":true,"stacking":{"group":"A","mode":"none"},"thresholdsStyle":{"mode":"off"}},"links":[],"mappings":[],"min":0,"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"s","unitScale":true},"overrides":[]},"gridPos":{"h":7,"w":12,"x":12,"y":14},"id":32,"links":[],"options":{"legend":{"calcs":[],"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"none"}},"pluginVersion":"10.3.3","targets":[{"datasource":{"uid":"$datasource"},"expr":"histogram_quantile(0.99, (sum(rate(coredns_dns_request_duration_seconds{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\"}[5m])) by (job)) or (sum(rate(coredns_dns_request_duration_seconds_bucket{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\"}[5m])) by (le, job)))","format":"time_series","intervalFactor":2,"legendFormat":"99%","refId":"A","step":40},{"datasource":{"uid":"$datasource"},"expr":"histogram_quantile(0.90, (sum(rate(coredns_dns_request_duration_seconds{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\"}[5m])) by ()) or (sum(rate(coredns_dns_request_duration_seconds_bucket{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\"}[5m])) by (le)))","format":"time_series","intervalFactor":2,"legendFormat":"90%","refId":"B","step":40},{"datasource":{"uid":"$datasource"},"expr":"histogram_quantile(0.50, (sum(rate(coredns_dns_request_duration_seconds{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\"}[5m])) by ()) or (sum(rate(coredns_dns_request_duration_seconds_bucket{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\"}[5m])) by (le)))","format":"time_series","intervalFactor":2,"legendFormat":"50%","refId":"C","step":40}],"title":"Responses (duration)","type":"timeseries"},{"datasource":{"uid":"$datasource"},"fieldConfig":{"defaults":{"color":{"mode":"palette-classic"},"custom":{"axisBorderShow":false,"axisCenteredZero":false,"axisColorMode":"text","axisLabel":"","axisPlacement":"auto","barAlignment":0,"drawStyle":"line","fillOpacity":10,"gradientMode":"none","hideFrom":{"legend":false,"tooltip":false,"viz":false},"insertNulls":false,"lineInterpolation":"linear","lineWidth":2,"pointSize":5,"scaleDistribution":{"type":"linear"},"showPoints":"never","spanNulls":true,"stacking":{"group":"A","mode":"none"},"thresholdsStyle":{"mode":"off"}},"links":[],"mappings":[],"min":0,"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"bytes","unitScale":true},"overrides":[{"matcher":{"id":"byName","options":"tcp:50%"},"properties":[{"id":"unit","value":"short"}]},{"matcher":{"id":"byName","options":"tcp:90%"},"properties":[{"id":"unit","value":"short"}]},{"matcher":{"id":"byName","options":"tcp:99%"},"properties":[{"id":"unit","value":"short"}]}]},"gridPos":{"h":7,"w":12,"x":0,"y":21},"id":18,"links":[],"options":{"legend":{"calcs":[],"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"none"}},"pluginVersion":"10.3.3","targets":[{"datasource":{"uid":"$datasource"},"expr":"histogram_quantile(0.99, (sum(rate(coredns_dns_response_size_bytes{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\",proto=\"udp\"}[5m])) by (proto)) or (sum(rate(coredns_dns_response_size_bytes_bucket{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\",proto=\"udp\"}[5m])) by (le,proto))) ","interval":"1m","intervalFactor":2,"legendFormat":"{{ proto }}:99%","refId":"A","step":40},{"datasource":{"uid":"$datasource"},"expr":"histogram_quantile(0.90, (sum(rate(coredns_dns_response_size_bytes{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\",proto=\"udp\"}[5m])) by (proto)) or (sum(rate(coredns_dns_response_size_bytes_bucket{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\",proto=\"udp\"}[5m])) by (le,proto))) ","interval":"1m","intervalFactor":2,"legendFormat":"{{ proto }}:90%","refId":"B","step":40},{"datasource":{"uid":"$datasource"},"expr":"histogram_quantile(0.50, (sum(rate(coredns_dns_response_size_bytes{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\",proto=\"udp\"}[5m])) by (proto)) or (sum(rate(coredns_dns_response_size_bytes_bucket{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\",proto=\"udp\"}[5m])) by (le,proto))) ","hide":false,"intervalFactor":2,"legendFormat":"{{ proto }}:50%","metric":"","refId":"C","step":40}],"title":"Responses (size, udp)","type":"timeseries"},{"datasource":{"uid":"$datasource"},"fieldConfig":{"defaults":{"color":{"mode":"palette-classic"},"custom":{"axisBorderShow":false,"axisCenteredZero":false,"axisColorMode":"text","axisLabel":"","axisPlacement":"auto","barAlignment":0,"drawStyle":"line","fillOpacity":10,"gradientMode":"none","hideFrom":{"legend":false,"tooltip":false,"viz":false},"insertNulls":false,"lineInterpolation":"linear","lineWidth":2,"pointSize":5,"scaleDistribution":{"type":"linear"},"showPoints":"never","spanNulls":true,"stacking":{"group":"A","mode":"none"},"thresholdsStyle":{"mode":"off"}},"links":[],"mappings":[],"min":0,"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"bytes","unitScale":true},"overrides":[]},"gridPos":{"h":7,"w":12,"x":12,"y":21},"id":20,"links":[],"options":{"legend":{"calcs":[],"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"none"}},"pluginVersion":"10.3.3","targets":[{"datasource":{"uid":"$datasource"},"expr":"histogram_quantile(0.99, (sum(rate(coredns_dns_response_size_bytes{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\",proto=\"tcp\"}[5m])) by (proto)) or (sum(rate(coredns_dns_response_size_bytes_bucket{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\",proto=\"tcp\"}[5m])) by (le,proto))) ","format":"time_series","intervalFactor":2,"legendFormat":"{{ proto }}:99%","refId":"A","step":40},{"datasource":{"uid":"$datasource"},"expr":"histogram_quantile(0.90, (sum(rate(coredns_dns_response_size_bytes{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\",proto=\"tcp\"}[5m])) by (proto)) or (sum(rate(coredns_dns_response_size_bytes_bucket{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\",proto=\"tcp\"}[5m])) by (le,proto))) ","format":"time_series","intervalFactor":2,"legendFormat":"{{ proto }}:90%","refId":"B","step":40},{"datasource":{"uid":"$datasource"},"expr":"histogram_quantile(0.50, (sum(rate(coredns_dns_response_size_bytes{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\",proto=\"tcp\"}[5m])) by (proto)) or (sum(rate(coredns_dns_response_size_bytes_bucket{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\",proto=\"tcp\"}[5m])) by (le,proto))) ","format":"time_series","intervalFactor":2,"legendFormat":"{{ proto }}:50%","metric":"","refId":"C","step":40}],"title":"Responses (size, tcp)","type":"timeseries"},{"datasource":{"uid":"$datasource"},"fieldConfig":{"defaults":{"color":{"mode":"palette-classic"},"custom":{"axisBorderShow":false,"axisCenteredZero":false,"axisColorMode":"text","axisLabel":"","axisPlacement":"auto","barAlignment":0,"drawStyle":"line","fillOpacity":10,"gradientMode":"none","hideFrom":{"legend":false,"tooltip":false,"viz":false},"insertNulls":false,"lineInterpolation":"linear","lineWidth":2,"pointSize":5,"scaleDistribution":{"type":"linear"},"showPoints":"never","spanNulls":true,"stacking":{"group":"A","mode":"normal"},"thresholdsStyle":{"mode":"off"}},"links":[],"mappings":[],"min":0,"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"decbytes","unitScale":true},"overrides":[]},"gridPos":{"h":7,"w":12,"x":0,"y":28},"id":22,"links":[],"options":{"legend":{"calcs":[],"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"10.3.3","targets":[{"datasource":{"uid":"$datasource"},"expr":"sum(coredns_cache_size{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\"}) by (type) or\nsum(coredns_cache_entries{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\"}) by (type)","interval":"1m","intervalFactor":2,"legendFormat":"{{ type }}","refId":"A","step":40}],"title":"Cache (size)","type":"timeseries"},{"datasource":{"uid":"$datasource"},"fieldConfig":{"defaults":{"color":{"mode":"palette-classic"},"custom":{"axisBorderShow":false,"axisCenteredZero":false,"axisColorMode":"text","axisLabel":"","axisPlacement":"auto","barAlignment":0,"drawStyle":"line","fillOpacity":10,"gradientMode":"none","hideFrom":{"legend":false,"tooltip":false,"viz":false},"insertNulls":false,"lineInterpolation":"linear","lineWidth":2,"pointSize":5,"scaleDistribution":{"type":"linear"},"showPoints":"never","spanNulls":true,"stacking":{"group":"A","mode":"normal"},"thresholdsStyle":{"mode":"off"}},"links":[],"mappings":[],"min":0,"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":80}]},"unit":"pps","unitScale":true},"overrides":[]},"gridPos":{"h":7,"w":12,"x":12,"y":28},"id":24,"links":[],"options":{"legend":{"calcs":[],"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"10.3.3","targets":[{"datasource":{"uid":"$datasource"},"expr":"sum(rate(coredns_cache_hits_total{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\"}[5m])) by (type)","hide":false,"intervalFactor":2,"legendFormat":"hits:{{ type }}","refId":"A","step":40},{"datasource":{"uid":"$datasource"},"expr":"sum(rate(coredns_cache_misses_total{job=~\"$job\",cluster=~\"$cluster\",instance=~\"$instance\"}[5m])) by (type)","hide":false,"intervalFactor":2,"legendFormat":"misses","refId":"B","step":40}],"title":"Cache (hitrate)","type":"timeseries"}],"refresh":"10s","schemaVersion":39,"tags":["dns","coredns"],"templating":{"list":[{"current":{},"hide":0,"includeAll":false,"multi":false,"name":"datasource","options":[],"query":"prometheus","queryValue":"","refresh":1,"regex":"","skipUrlSync":false,"type":"datasource"},{"allValue":".*","current":{"selected":false,"text":"All","value":"$__all"},"datasource":{"type":"prometheus","uid":"$datasource"},"definition":"label_values(coredns_dns_requests_total, cluster)","hide":2,"includeAll":true,"label":"Cluster","multi":false,"name":"cluster","options":[],"query":"label_values(coredns_dns_requests_total, cluster)","refresh":2,"regex":"","skipUrlSync":false,"sort":1,"tagValuesQuery":"","tagsQuery":"","type":"query","useTags":false},{"allValue":".*","current":{"selected":false,"text":"All","value":"$__all"},"datasource":{"type":"prometheus","uid":"${datasource}"},"definition":"label_values(coredns_dns_requests_total{cluster=~\"$cluster\"},job)","hide":0,"includeAll":true,"label":"Job","multi":false,"name":"job","options":[],"query":{"qryType":1,"query":"label_values(coredns_dns_requests_total{cluster=~\"$cluster\"},job)","refId":"PrometheusVariableQueryEditor-VariableQuery"},"refresh":2,"regex":"","skipUrlSync":false,"sort":1,"type":"query"},{"allValue":".*","current":{"selected":false,"text":"All","value":"$__all"},"datasource":{"type":"prometheus","uid":"$datasource"},"definition":"label_values(coredns_dns_requests_total{job=~\"$job\",cluster=~\"$cluster\"}, instance)","hide":0,"includeAll":true,"label":"Instance","multi":false,"name":"instance","options":[],"query":"label_values(coredns_dns_requests_total{job=~\"$job\",cluster=~\"$cluster\"}, instance)","refresh":2,"regex":"","skipUrlSync":false,"sort":3,"tagValuesQuery":"","tagsQuery":"","type":"query","useTags":false}]},"time":{"from":"now-3h","to":"now"},"timepicker":{"refresh_intervals":["10s","30s","1m","5m","15m","30m","1h","2h","1d"]},"timezone": "utc","title":"CoreDNS","uid":"vkQ0UHxik","version":3,"weekStart":""} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-cluster.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-k8s-resources-cluster - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - k8s-resources-cluster.json: |- - {"editable":true,"links":[{"asDropdown":true,"includeVars":true,"keepTime":true,"tags":["kubernetes-mixin"],"targetBlank":false,"title":"Kubernetes","type":"dashboards"}],"panels":[{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"percentunit"}},"gridPos":{"h":3,"w":4,"x":0,"y":0},"id":1,"interval":"1m","options":{"colorMode":"none"},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"cluster:node_cpu:ratio_rate5m{cluster=\"$cluster\"}","instant":true}],"title":"CPU Utilisation","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"percentunit"}},"gridPos":{"h":3,"w":4,"x":4,"y":0},"id":2,"interval":"1m","options":{"colorMode":"none"},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(namespace_cpu:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\",cluster=\"$cluster\"})","instant":true}],"title":"CPU Requests Commitment","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"percentunit"}},"gridPos":{"h":3,"w":4,"x":8,"y":0},"id":3,"interval":"1m","options":{"colorMode":"none"},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\",cluster=\"$cluster\"})","instant":true}],"title":"CPU Limits Commitment","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"percentunit"}},"gridPos":{"h":3,"w":4,"x":12,"y":0},"id":4,"interval":"1m","options":{"colorMode":"none"},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"1 - sum(:node_memory_MemAvailable_bytes:sum{cluster=\"$cluster\"}) / sum(node_memory_MemTotal_bytes{job=\"node-exporter\",cluster=\"$cluster\"})","instant":true}],"title":"Memory Utilisation","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"percentunit"}},"gridPos":{"h":3,"w":4,"x":16,"y":0},"id":5,"interval":"1m","options":{"colorMode":"none"},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"memory\",cluster=\"$cluster\"})","instant":true}],"title":"Memory Requests Commitment","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"percentunit"}},"gridPos":{"h":3,"w":4,"x":20,"y":0},"id":6,"interval":"1m","options":{"colorMode":"none"},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"memory\",cluster=\"$cluster\"})","instant":true}],"title":"Memory Limits Commitment","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true}}},"gridPos":{"h":6,"w":24,"x":0,"y":6},"id":7,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster=\"$cluster\"}) by (namespace)","legendFormat":"__auto"}],"title":"CPU Usage","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"overrides":[{"matcher":{"id":"byRegexp","options":"/%/"},"properties":[{"id":"unit","value":"percentunit"}]},{"matcher":{"id":"byName","options":"Namespace"},"properties":[{"id":"links","value":[{"title":"Drill down to pods","url":"/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?${datasource:queryparam}&var-cluster=$cluster&var-namespace=${__data.fields.Namespace}"}]}]}]},"gridPos":{"h":6,"w":24,"x":0,"y":12},"id":8,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(kube_pod_owner{job=\"kube-state-metrics\", cluster=\"$cluster\"}) by (namespace)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"count(avg(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\"}) by (workload, namespace)) by (namespace)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster=\"$cluster\"}) by (namespace)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(namespace_cpu:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster=\"$cluster\"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster=\"$cluster\"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)","format":"table","instant":true}],"title":"CPU Quota","transformations":[{"id":"joinByField","options":{"byField":"namespace","mode":"outer"}},{"id":"organize","options":{"excludeByName":{"Time":true,"Time 1":true,"Time 2":true,"Time 3":true,"Time 4":true,"Time 5":true,"Time 6":true,"Time 7":true},"indexByName":{"Time 1":0,"Time 2":1,"Time 3":2,"Time 4":3,"Time 5":4,"Time 6":5,"Time 7":6,"Value #A":8,"Value #B":9,"Value #C":10,"Value #D":11,"Value #E":12,"Value #F":13,"Value #G":14,"namespace":7},"renameByName":{"Value #A":"Pods","Value #B":"Workloads","Value #C":"CPU Usage","Value #D":"CPU Requests","Value #E":"CPU Requests %","Value #F":"CPU Limits","Value #G":"CPU Limits %","namespace":"Namespace"}}}],"type":"table"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"bytes"}},"gridPos":{"h":6,"w":24,"x":0,"y":18},"id":9,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(container_memory_rss{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", container!=\"\"}) by (namespace)","legendFormat":"__auto"}],"title":"Memory","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"overrides":[{"matcher":{"id":"byRegexp","options":"/%/"},"properties":[{"id":"unit","value":"percentunit"}]},{"matcher":{"id":"byName","options":"Memory Usage"},"properties":[{"id":"unit","value":"bytes"}]},{"matcher":{"id":"byName","options":"Memory Requests"},"properties":[{"id":"unit","value":"bytes"}]},{"matcher":{"id":"byName","options":"Memory Limits"},"properties":[{"id":"unit","value":"bytes"}]},{"matcher":{"id":"byName","options":"Namespace"},"properties":[{"id":"links","value":[{"title":"Drill down to pods","url":"/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?${datasource:queryparam}&var-cluster=$cluster&var-namespace=${__data.fields.Namespace}"}]}]}]},"gridPos":{"h":6,"w":24,"x":0,"y":24},"id":10,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(kube_pod_owner{job=\"kube-state-metrics\", cluster=\"$cluster\"}) by (namespace)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"count(avg(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\"}) by (workload, namespace)) by (namespace)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(container_memory_rss{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", container!=\"\"}) by (namespace)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(container_memory_rss{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(container_memory_rss{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)","format":"table","instant":true}],"title":"Memory Requests by Namespace","transformations":[{"id":"joinByField","options":{"byField":"namespace","mode":"outer"}},{"id":"organize","options":{"excludeByName":{"Time":true,"Time 1":true,"Time 2":true,"Time 3":true,"Time 4":true,"Time 5":true,"Time 6":true,"Time 7":true},"indexByName":{"Time 1":0,"Time 2":1,"Time 3":2,"Time 4":3,"Time 5":4,"Time 6":5,"Time 7":6,"Value #A":8,"Value #B":9,"Value #C":10,"Value #D":11,"Value #E":12,"Value #F":13,"Value #G":14,"namespace":7},"renameByName":{"Value #A":"Pods","Value #B":"Workloads","Value #C":"Memory Usage","Value #D":"Memory Requests","Value #E":"Memory Requests %","Value #F":"Memory Limits","Value #G":"Memory Limits %","namespace":"Namespace"}}}],"type":"table"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"overrides":[{"matcher":{"id":"byRegexp","options":"/Bandwidth/"},"properties":[{"id":"unit","value":"Bps"}]},{"matcher":{"id":"byRegexp","options":"/Packets/"},"properties":[{"id":"unit","value":"pps"}]},{"matcher":{"id":"byName","options":"Namespace"},"properties":[{"id":"links","value":[{"title":"Drill down to pods","url":"/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?${datasource:queryparam}&var-cluster=$cluster&var-namespace=${__data.fields.Namespace}"}]}]}]},"gridPos":{"h":6,"w":24,"x":0,"y":30},"id":11,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_receive_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_receive_packets_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_transmit_packets_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_receive_packets_dropped_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_transmit_packets_dropped_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)","format":"table","instant":true}],"title":"Current Network Usage","transformations":[{"id":"joinByField","options":{"byField":"namespace","mode":"outer"}},{"id":"organize","options":{"excludeByName":{"Time":true,"Time 1":true,"Time 2":true,"Time 3":true,"Time 4":true,"Time 5":true,"Time 6":true},"indexByName":{"Time 1":0,"Time 2":1,"Time 3":2,"Time 4":3,"Time 5":4,"Time 6":5,"Value #A":7,"Value #B":8,"Value #C":9,"Value #D":10,"Value #E":11,"Value #F":12,"namespace":6},"renameByName":{"Value #A":"Current Receive Bandwidth","Value #B":"Current Transmit Bandwidth","Value #C":"Rate of Received Packets","Value #D":"Rate of Transmitted Packets","Value #E":"Rate of Received Packets Dropped","Value #F":"Rate of Transmitted Packets Dropped","namespace":"Namespace"}}}],"type":"table"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"Bps"}},"gridPos":{"h":6,"w":24,"x":0,"y":36},"id":12,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_receive_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)","legendFormat":"__auto"}],"title":"Receive Bandwidth","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"Bps"}},"gridPos":{"h":6,"w":24,"x":0,"y":42},"id":13,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)","legendFormat":"__auto"}],"title":"Transmit Bandwidth","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"Bps"}},"gridPos":{"h":6,"w":24,"x":0,"y":48},"id":14,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"avg(irate(container_network_receive_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)","legendFormat":"__auto"}],"title":"Average Container Bandwidth by Namespace: Received","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"Bps"}},"gridPos":{"h":6,"w":24,"x":0,"y":54},"id":15,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"avg(irate(container_network_transmit_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)","legendFormat":"__auto"}],"title":"Average Container Bandwidth by Namespace: Transmitted","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":6,"w":24,"x":0,"y":60},"id":16,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(irate(container_network_receive_packets_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)","legendFormat":"__auto"}],"title":"Rate of Received Packets","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":6,"w":24,"x":0,"y":66},"id":17,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(irate(container_network_transmit_packets_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)","legendFormat":"__auto"}],"title":"Rate of Transmitted Packets","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":6,"w":24,"x":0,"y":72},"id":18,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(irate(container_network_receive_packets_dropped_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)","legendFormat":"__auto"}],"title":"Rate of Received Packets Dropped","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":6,"w":24,"x":0,"y":78},"id":19,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(irate(container_network_transmit_packets_dropped_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)","legendFormat":"__auto"}],"title":"Rate of Transmitted Packets Dropped","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"iops"}},"gridPos":{"h":6,"w":24,"x":0,"y":84},"id":20,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))","legendFormat":"__auto"}],"title":"IOPS(Reads+Writes)","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"Bps"}},"gridPos":{"h":6,"w":24,"x":0,"y":90},"id":21,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))","legendFormat":"__auto"}],"title":"ThroughPut(Read+Write)","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"overrides":[{"matcher":{"id":"byRegexp","options":"/IOPS/"},"properties":[{"id":"unit","value":"iops"}]},{"matcher":{"id":"byRegexp","options":"/Throughput/"},"properties":[{"id":"unit","value":"Bps"}]},{"matcher":{"id":"byName","options":"Namespace"},"properties":[{"id":"links","value":[{"title":"Drill down to pods","url":"/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?${datasource:queryparam}&var-cluster=$cluster&var-namespace=${__data.fields.Namespace}"}]}]}]},"gridPos":{"h":6,"w":24,"x":0,"y":96},"id":22,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))","format":"table","instant":true}],"title":"Current Storage IO","transformations":[{"id":"joinByField","options":{"byField":"namespace","mode":"outer"}},{"id":"organize","options":{"excludeByName":{"Time":true,"Time 1":true,"Time 2":true,"Time 3":true,"Time 4":true,"Time 5":true,"Time 6":true},"indexByName":{"Time 1":0,"Time 2":1,"Time 3":2,"Time 4":3,"Time 5":4,"Time 6":5,"Value #A":7,"Value #B":8,"Value #C":9,"Value #D":10,"Value #E":11,"Value #F":12,"namespace":6},"renameByName":{"Value #A":"IOPS(Reads)","Value #B":"IOPS(Writes)","Value #C":"IOPS(Reads + Writes)","Value #D":"Throughput(Read)","Value #E":"Throughput(Write)","Value #F":"Throughput(Read + Write)","namespace":"Namespace"}}}],"type":"table"}],"refresh":"10s","schemaVersion":39,"tags":["kubernetes-mixin"],"templating":{"list":[{"current":{"selected":true,"text":"default","value":"default"},"hide":0,"label":"Data source","name":"datasource","query":"prometheus","regex":"","type":"datasource"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":2,"label":"cluster","name":"cluster","query":"label_values(up{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\"}, cluster)","refresh":2,"sort":1,"type":"query","allValue":".*"}]},"time":{"from":"now-1h","to":"now"},"timezone": "utc","title":"Kubernetes / Compute Resources / Cluster","uid":"efa86fd1d0c121a26444b636a3f509a8"} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-multicluster.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-k8s-resources-multicluster - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - k8s-resources-multicluster.json: |- - {"editable":true,"links":[{"asDropdown":true,"includeVars":true,"keepTime":true,"tags":["kubernetes-mixin"],"targetBlank":false,"title":"Kubernetes","type":"dashboards"}],"panels":[{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"none"}},"gridPos":{"h":3,"w":4,"x":0,"y":0},"id":1,"interval":"1m","options":{"colorMode":"none"},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(cluster:node_cpu:ratio_rate5m) / count(cluster:node_cpu:ratio_rate5m)","instant":true}],"title":"CPU Utilisation","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"percentunit"}},"gridPos":{"h":3,"w":4,"x":4,"y":0},"id":2,"interval":"1m","options":{"colorMode":"none"},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(kube_pod_container_resource_requests{job=\"kube-state-metrics\", resource=\"cpu\"}) / sum(kube_node_status_allocatable{job=\"kube-state-metrics\", resource=\"cpu\"})","instant":true}],"title":"CPU Requests Commitment","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"percentunit"}},"gridPos":{"h":3,"w":4,"x":8,"y":0},"id":3,"interval":"1m","options":{"colorMode":"none"},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(kube_pod_container_resource_limits{job=\"kube-state-metrics\", resource=\"cpu\"}) / sum(kube_node_status_allocatable{job=\"kube-state-metrics\", resource=\"cpu\"})","instant":true}],"title":"CPU Limits Commitment","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"percentunit"}},"gridPos":{"h":3,"w":4,"x":12,"y":0},"id":4,"interval":"1m","options":{"colorMode":"none"},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"1 - sum(:node_memory_MemAvailable_bytes:sum) / sum(node_memory_MemTotal_bytes{job=\"node-exporter\"})","instant":true}],"title":"Memory Utilisation","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"percentunit"}},"gridPos":{"h":3,"w":4,"x":16,"y":0},"id":5,"interval":"1m","options":{"colorMode":"none"},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(kube_pod_container_resource_requests{job=\"kube-state-metrics\", resource=\"memory\"}) / sum(kube_node_status_allocatable{job=\"kube-state-metrics\", resource=\"memory\"})","instant":true}],"title":"Memory Requests Commitment","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"percentunit"}},"gridPos":{"h":3,"w":4,"x":20,"y":0},"id":6,"interval":"1m","options":{"colorMode":"none"},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(kube_pod_container_resource_limits{job=\"kube-state-metrics\", resource=\"memory\"}) / sum(kube_node_status_allocatable{job=\"kube-state-metrics\", resource=\"memory\"})","instant":true}],"title":"Memory Limits Commitment","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"}}},"gridPos":{"h":7,"w":24,"x":0,"y":1},"id":7,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m) by (cluster)","legendFormat":"__auto"}],"title":"CPU Usage","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"overrides":[{"matcher":{"id":"byRegexp","options":"/%/"},"properties":[{"id":"unit","value":"percentunit"}]},{"matcher":{"id":"byName","options":"Cluster"},"properties":[{"id":"links","value":[{"title":"Drill down","url":"/d/efa86fd1d0c121a26444b636a3f509a8/kubernetes-compute-resources-cluster?${datasource:queryparam}&var-cluster=${__data.fields.Cluster}"}]}]}]},"gridPos":{"h":7,"w":24,"x":0,"y":2},"id":8,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m) by (cluster)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(kube_pod_container_resource_requests{job=\"kube-state-metrics\", resource=\"cpu\"}) by (cluster)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m) by (cluster) / sum(kube_pod_container_resource_requests{job=\"kube-state-metrics\", resource=\"cpu\"}) by (cluster)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(kube_pod_container_resource_limits{job=\"kube-state-metrics\", resource=\"cpu\"}) by (cluster)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m) by (cluster) / sum(kube_pod_container_resource_limits{job=\"kube-state-metrics\", resource=\"cpu\"}) by (cluster)","format":"table","instant":true}],"title":"CPU Quota","transformations":[{"id":"joinByField","options":{"byField":"cluster","mode":"outer"}},{"id":"organize","options":{"excludeByName":{"Time":true,"Time 1":true,"Time 2":true,"Time 3":true,"Time 4":true,"Time 5":true},"indexByName":{"Time 1":0,"Time 2":1,"Time 3":2,"Time 4":3,"Time 5":4,"Value #A":6,"Value #B":7,"Value #C":8,"Value #D":9,"Value #E":10,"cluster":5},"renameByName":{"Value #A":"CPU Usage","Value #B":"CPU Requests","Value #C":"CPU Requests %","Value #D":"CPU Limits","Value #E":"CPU Limits %","cluster":"Cluster"}}}],"type":"table"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"},"unit":"bytes"}},"gridPos":{"h":7,"w":24,"x":0,"y":3},"id":9,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(container_memory_rss{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\"}) by (cluster)","legendFormat":"__auto"}],"title":"Memory Usage (w/o cache)","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"bytes"},"overrides":[{"matcher":{"id":"byRegexp","options":"/%/"},"properties":[{"id":"unit","value":"percentunit"}]},{"matcher":{"id":"byName","options":"Cluster"},"properties":[{"id":"links","value":[{"title":"Drill down","url":"/d/efa86fd1d0c121a26444b636a3f509a8/kubernetes-compute-resources-cluster?${datasource:queryparam}&var-cluster=${__data.fields.Cluster}"}]}]}]},"gridPos":{"h":7,"w":24,"x":0,"y":4},"id":10,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(container_memory_rss{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\"}) by (cluster)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(kube_pod_container_resource_requests{job=\"kube-state-metrics\", resource=\"memory\"}) by (cluster)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(container_memory_rss{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\"}) by (cluster) / sum(kube_pod_container_resource_requests{job=\"kube-state-metrics\", resource=\"memory\"}) by (cluster)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(kube_pod_container_resource_limits{job=\"kube-state-metrics\", resource=\"memory\"}) by (cluster)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(container_memory_rss{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\"}) by (cluster) / sum(kube_pod_container_resource_limits{job=\"kube-state-metrics\", resource=\"memory\"}) by (cluster)","format":"table","instant":true}],"title":"Memory Requests by Cluster","transformations":[{"id":"joinByField","options":{"byField":"cluster","mode":"outer"}},{"id":"organize","options":{"excludeByName":{"Time":true,"Time 1":true,"Time 2":true,"Time 3":true,"Time 4":true,"Time 5":true},"indexByName":{"Time 1":0,"Time 2":1,"Time 3":2,"Time 4":3,"Time 5":4,"Value #A":6,"Value #B":7,"Value #C":8,"Value #D":9,"Value #E":10,"cluster":5},"renameByName":{"Value #A":"Memory Usage","Value #B":"Memory Requests","Value #C":"Memory Requests %","Value #D":"Memory Limits","Value #E":"Memory Limits %","cluster":"Cluster"}}}],"type":"table"}],"refresh":"10s","schemaVersion":39,"tags":["kubernetes-mixin"],"templating":{"list":[{"current":{"selected":true,"text":"default","value":"default"},"hide":0,"label":"Data source","name":"datasource","query":"prometheus","regex":"","type":"datasource"}]},"time":{"from":"now-1h","to":"now"},"timezone": "utc","title":"Kubernetes / Compute Resources / Multi-Cluster","uid":"b59e6c9f2fcbe2e16d77fc492374cc4f"} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-namespace.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-k8s-resources-namespace - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - k8s-resources-namespace.json: |- - {"editable":true,"links":[{"asDropdown":true,"includeVars":true,"keepTime":true,"tags":["kubernetes-mixin"],"targetBlank":false,"title":"Kubernetes","type":"dashboards"}],"panels":[{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"percentunit"}},"gridPos":{"h":3,"w":6,"x":0,"y":0},"id":1,"interval":"1m","options":{"colorMode":"none"},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})","instant":true}],"title":"CPU Utilisation (from requests)","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"percentunit"}},"gridPos":{"h":3,"w":6,"x":6,"y":0},"id":2,"interval":"1m","options":{"colorMode":"none"},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})","instant":true}],"title":"CPU Utilisation (from limits)","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"percentunit"}},"gridPos":{"h":3,"w":6,"x":12,"y":0},"id":3,"interval":"1m","options":{"colorMode":"none"},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(container_memory_working_set_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) / sum(kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"})","instant":true}],"title":"Memory Utilisation (from requests)","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"percentunit"}},"gridPos":{"h":3,"w":6,"x":18,"y":0},"id":4,"interval":"1m","options":{"colorMode":"none"},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(container_memory_working_set_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) / sum(kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"})","instant":true}],"title":"Memory Utilisation (from limits)","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true}},"overrides":[{"matcher":{"id":"byFrameRefID","options":"B"},"properties":[{"id":"custom.lineStyle","value":{"fill":"dash"}},{"id":"custom.lineWidth","value":2},{"id":"color","value":{"fixedColor":"red","mode":"fixed"}}]},{"matcher":{"id":"byFrameRefID","options":"C"},"properties":[{"id":"custom.lineStyle","value":{"fill":"dash"}},{"id":"custom.lineWidth","value":2},{"id":"color","value":{"fixedColor":"orange","mode":"fixed"}}]}]},"gridPos":{"h":7,"w":24,"x":0,"y":7},"id":5,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)","legendFormat":"__auto"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"scalar(max(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"requests.cpu\"}))","legendFormat":"quota - requests"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"scalar(max(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"limits.cpu\"}))","legendFormat":"quota - limits"}],"title":"CPU Usage","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"overrides":[{"matcher":{"id":"byRegexp","options":"/%/"},"properties":[{"id":"unit","value":"percentunit"}]},{"matcher":{"id":"byName","options":"Pod"},"properties":[{"id":"links","value":[{"title":"Drill down to pods","url":"/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?${datasource:queryparam}&var-cluster=$cluster&var-namespace=$namespace&var-pod=${__data.fields.Pod}"}]}]}]},"gridPos":{"h":7,"w":24,"x":0,"y":14},"id":6,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)","format":"table","instant":true}],"title":"CPU Quota","transformations":[{"id":"joinByField","options":{"byField":"pod","mode":"outer"}},{"id":"organize","options":{"excludeByName":{"Time":true,"Time 1":true,"Time 2":true,"Time 3":true,"Time 4":true,"Time 5":true},"indexByName":{"Time 1":0,"Time 2":1,"Time 3":2,"Time 4":3,"Time 5":4,"Value #A":6,"Value #B":7,"Value #C":8,"Value #D":9,"Value #E":10,"pod":5},"renameByName":{"Value #A":"CPU Usage","Value #B":"CPU Requests","Value #C":"CPU Requests %","Value #D":"CPU Limits","Value #E":"CPU Limits %","pod":"Pod"}}}],"type":"table"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"bytes"},"overrides":[{"matcher":{"id":"byFrameRefID","options":"B"},"properties":[{"id":"custom.lineStyle","value":{"fill":"dash"}},{"id":"custom.lineWidth","value":2},{"id":"color","value":{"fixedColor":"red","mode":"fixed"}}]},{"matcher":{"id":"byFrameRefID","options":"C"},"properties":[{"id":"custom.lineStyle","value":{"fill":"dash"}},{"id":"custom.lineWidth","value":2},{"id":"color","value":{"fixedColor":"orange","mode":"fixed"}}]}]},"gridPos":{"h":7,"w":24,"x":0,"y":21},"id":7,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(container_memory_working_set_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}) by (pod)","legendFormat":"__auto"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"scalar(max(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"requests.memory\"}))","legendFormat":"quota - requests"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"scalar(max(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"limits.memory\"}))","legendFormat":"quota - limits"}],"title":"Memory Usage (w/o cache)","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"bytes"},"overrides":[{"matcher":{"id":"byRegexp","options":"/%/"},"properties":[{"id":"unit","value":"percentunit"}]},{"matcher":{"id":"byName","options":"Pod"},"properties":[{"id":"links","value":[{"title":"Drill down to pods","url":"/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?${datasource:queryparam}&var-cluster=$cluster&var-namespace=$namespace&var-pod=${__data.fields.Pod}"}]}]}]},"gridPos":{"h":7,"w":24,"x":0,"y":28},"id":8,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(container_memory_working_set_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(container_memory_working_set_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(container_memory_working_set_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(container_memory_rss{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(container_memory_cache{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(container_memory_swap{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) by (pod)","format":"table","instant":true}],"title":"Memory Quota","transformations":[{"id":"joinByField","options":{"byField":"pod","mode":"outer"}},{"id":"organize","options":{"excludeByName":{"Time":true,"Time 1":true,"Time 2":true,"Time 3":true,"Time 4":true,"Time 5":true,"Time 6":true,"Time 7":true,"Time 8":true},"indexByName":{"Time 1":0,"Time 2":1,"Time 3":2,"Time 4":3,"Time 5":4,"Time 6":5,"Time 7":6,"Time 8":7,"Value #A":9,"Value #B":10,"Value #C":11,"Value #D":12,"Value #E":13,"Value #F":14,"Value #G":15,"Value #H":16,"pod":8},"renameByName":{"Value #A":"Memory Usage","Value #B":"Memory Requests","Value #C":"Memory Requests %","Value #D":"Memory Limits","Value #E":"Memory Limits %","Value #F":"Memory Usage (RSS)","Value #G":"Memory Usage (Cache)","Value #H":"Memory Usage (Swap)","pod":"Pod"}}}],"type":"table"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"overrides":[{"matcher":{"id":"byRegexp","options":"/Bandwidth/"},"properties":[{"id":"unit","value":"Bps"}]},{"matcher":{"id":"byRegexp","options":"/Packets/"},"properties":[{"id":"unit","value":"pps"}]},{"matcher":{"id":"byName","options":"Pod"},"properties":[{"id":"links","value":[{"title":"Drill down to pods","url":"/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?${datasource:queryparam}&var-cluster=$cluster&var-namespace=$namespace&var-pod=${__data.fields.Pod}"}]}]}]},"gridPos":{"h":7,"w":24,"x":0,"y":35},"id":9,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_receive_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_receive_packets_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_transmit_packets_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_receive_packets_dropped_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_transmit_packets_dropped_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)","format":"table","instant":true}],"title":"Current Network Usage","transformations":[{"id":"joinByField","options":{"byField":"pod","mode":"outer"}},{"id":"organize","options":{"excludeByName":{"Time":true,"Time 1":true,"Time 2":true,"Time 3":true,"Time 4":true,"Time 5":true,"Time 6":true},"indexByName":{"Time 1":0,"Time 2":1,"Time 3":2,"Time 4":3,"Time 5":4,"Time 6":5,"Value #A":7,"Value #B":8,"Value #C":9,"Value #D":10,"Value #E":11,"Value #F":12,"pod":6},"renameByName":{"Value #A":"Current Receive Bandwidth","Value #B":"Current Transmit Bandwidth","Value #C":"Rate of Received Packets","Value #D":"Rate of Transmitted Packets","Value #E":"Rate of Received Packets Dropped","Value #F":"Rate of Transmitted Packets Dropped","pod":"Pod"}}}],"type":"table"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"Bps"}},"gridPos":{"h":7,"w":12,"x":0,"y":42},"id":10,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)","legendFormat":"__auto"}],"title":"Receive Bandwidth","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"Bps"}},"gridPos":{"h":7,"w":12,"x":12,"y":42},"id":11,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)","legendFormat":"__auto"}],"title":"Transmit Bandwidth","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":7,"w":12,"x":0,"y":49},"id":12,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)","legendFormat":"__auto"}],"title":"Rate of Received Packets","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":7,"w":12,"x":12,"y":49},"id":13,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)","legendFormat":"__auto"}],"title":"Rate of Transmitted Packets","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":7,"w":12,"x":0,"y":56},"id":14,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)","legendFormat":"__auto"}],"title":"Rate of Received Packets Dropped","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":7,"w":12,"x":12,"y":56},"id":15,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)","legendFormat":"__auto"}],"title":"Rate of Transmitted Packets Dropped","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"iops"}},"gridPos":{"h":7,"w":12,"x":0,"y":63},"id":16,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))","legendFormat":"__auto"}],"title":"IOPS(Reads+Writes)","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"Bps"}},"gridPos":{"h":7,"w":12,"x":12,"y":63},"id":17,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))","legendFormat":"__auto"}],"title":"ThroughPut(Read+Write)","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"overrides":[{"matcher":{"id":"byRegexp","options":"/IOPS/"},"properties":[{"id":"unit","value":"iops"}]},{"matcher":{"id":"byRegexp","options":"/Throughput/"},"properties":[{"id":"unit","value":"Bps"}]},{"matcher":{"id":"byName","options":"Pod"},"properties":[{"id":"links","value":[{"title":"Drill down to pods","url":"/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?${datasource:queryparam}&var-cluster=$cluster&var-namespace=$namespace&var-pod=${__data.fields.Pod}"}]}]}]},"gridPos":{"h":7,"w":24,"x":0,"y":70},"id":18,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))","format":"table","instant":true}],"title":"Current Storage IO","transformations":[{"id":"joinByField","options":{"byField":"pod","mode":"outer"}},{"id":"organize","options":{"excludeByName":{"Time":true,"Time 1":true,"Time 2":true,"Time 3":true,"Time 4":true,"Time 5":true,"Time 6":true},"indexByName":{"Time 1":0,"Time 2":1,"Time 3":2,"Time 4":3,"Time 5":4,"Time 6":5,"Value #A":7,"Value #B":8,"Value #C":9,"Value #D":10,"Value #E":11,"Value #F":12,"pod":6},"renameByName":{"Value #A":"IOPS(Reads)","Value #B":"IOPS(Writes)","Value #C":"IOPS(Reads + Writes)","Value #D":"Throughput(Read)","Value #E":"Throughput(Write)","Value #F":"Throughput(Read + Write)","pod":"Pod"}}}],"type":"table"}],"refresh":"10s","schemaVersion":39,"tags":["kubernetes-mixin"],"templating":{"list":[{"current":{"selected":true,"text":"default","value":"default"},"hide":0,"label":"Data source","name":"datasource","query":"prometheus","regex":"","type":"datasource"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":2,"label":"cluster","name":"cluster","query":"label_values(up{job=\"kube-state-metrics\"}, cluster)","refresh":2,"sort":1,"type":"query","allValue":".*"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"label":"namespace","name":"namespace","query":"label_values(kube_namespace_status_phase{job=\"kube-state-metrics\", cluster=\"$cluster\"}, namespace)","refresh":2,"sort":1,"type":"query"}]},"time":{"from":"now-1h","to":"now"},"timezone": "utc","title":"Kubernetes / Compute Resources / Namespace (Pods)","uid":"85a562078cdf77779eaa1add43ccec1e"} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-node.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-k8s-resources-node - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - k8s-resources-node.json: |- - {"editable":true,"links":[{"asDropdown":true,"includeVars":true,"keepTime":true,"tags":["kubernetes-mixin"],"targetBlank":false,"title":"Kubernetes","type":"dashboards"}],"panels":[{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true,"stacking":{"mode":"normal"}}},"overrides":[{"matcher":{"id":"byName","options":"max capacity"},"properties":[{"id":"color","value":{"fixedColor":"red","mode":"fixed"}},{"id":"custom.stacking","value":{"mode":"none"}},{"id":"custom.hideFrom","value":{"legend":false,"tooltip":true,"viz":false}},{"id":"custom.lineStyle","value":{"dash":[10,10],"fill":"dash"}}]}]},"gridPos":{"h":6,"w":24,"x":0,"y":0},"id":1,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(kube_node_status_capacity{cluster=\"$cluster\", job=\"kube-state-metrics\", node=~\"$node\", resource=\"cpu\"})","legendFormat":"max capacity"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster=\"$cluster\", node=~\"$node\"}) by (pod)","legendFormat":"{{pod}}"}],"title":"CPU Usage","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"overrides":[{"matcher":{"id":"byRegexp","options":"/%/"},"properties":[{"id":"unit","value":"percentunit"}]},{"matcher":{"id":"byName","options":"Pod"},"properties":[{"id":"links","value":[{"title":"Drill down to pods","url":"/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?${datasource:queryparam}&var-cluster=$cluster&var-namespace=$namespace&var-pod=${__data.fields.Pod}"}]}]}]},"gridPos":{"h":6,"w":24,"x":0,"y":6},"id":2,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster=\"$cluster\", node=~\"$node\"}) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\"}) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\"}) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\"}) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\"}) by (pod)","format":"table","instant":true}],"title":"CPU Quota","transformations":[{"id":"joinByField","options":{"byField":"pod","mode":"outer"}},{"id":"organize","options":{"excludeByName":{"Time":true,"Time 1":true,"Time 2":true,"Time 3":true,"Time 4":true,"Time 5":true},"renameByName":{"Value #A":"CPU Usage","Value #B":"CPU Requests","Value #C":"CPU Requests %","Value #D":"CPU Limits","Value #E":"CPU Limits %","pod":"Pod"}}}],"type":"table"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true,"stacking":{"mode":"normal"}},"unit":"bytes"},"overrides":[{"matcher":{"id":"byName","options":"max capacity"},"properties":[{"id":"color","value":{"fixedColor":"red","mode":"fixed"}},{"id":"custom.stacking","value":{"mode":"none"}},{"id":"custom.hideFrom","value":{"legend":false,"tooltip":true,"viz":false}},{"id":"custom.lineStyle","value":{"dash":[10,10],"fill":"dash"}}]}]},"gridPos":{"h":6,"w":24,"x":0,"y":12},"id":3,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(kube_node_status_capacity{cluster=\"$cluster\", job=\"kube-state-metrics\", node=~\"$node\", resource=\"memory\"})","legendFormat":"max capacity"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\", container!=\"\"}) by (pod)","legendFormat":"{{pod}}"}],"title":"Memory Usage (w/cache)","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true,"stacking":{"mode":"normal"}},"unit":"bytes"},"overrides":[{"matcher":{"id":"byName","options":"max capacity"},"properties":[{"id":"color","value":{"fixedColor":"red","mode":"fixed"}},{"id":"custom.stacking","value":{"mode":"none"}},{"id":"custom.hideFrom","value":{"legend":false,"tooltip":true,"viz":false}},{"id":"custom.lineStyle","value":{"dash":[10,10],"fill":"dash"}}]}]},"gridPos":{"h":6,"w":24,"x":0,"y":18},"id":4,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(kube_node_status_capacity{cluster=\"$cluster\", job=\"kube-state-metrics\", node=~\"$node\", resource=\"memory\"})","legendFormat":"max capacity"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_memory_rss{cluster=\"$cluster\", node=~\"$node\", container!=\"\"}) by (pod)","legendFormat":"{{pod}}"}],"title":"Memory Usage (w/o cache)","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"bytes"},"overrides":[{"matcher":{"id":"byRegexp","options":"/%/"},"properties":[{"id":"unit","value":"percentunit"}]},{"matcher":{"id":"byName","options":"Pod"},"properties":[{"id":"links","value":[{"title":"Drill down to pods","url":"/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?${datasource:queryparam}&var-cluster=$cluster&var-namespace=$namespace&var-pod=${__data.fields.Pod}"}]}]}]},"gridPos":{"h":6,"w":24,"x":0,"y":24},"id":5,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\"}) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\"}) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\"}) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\"}) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_memory_rss{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_memory_cache{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_memory_swap{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod)","format":"table","instant":true}],"title":"Memory Quota","transformations":[{"id":"joinByField","options":{"byField":"pod","mode":"outer"}},{"id":"organize","options":{"excludeByName":{"Time":true,"Time 1":true,"Time 2":true,"Time 3":true,"Time 4":true,"Time 5":true,"Time 6":true,"Time 7":true,"Time 8":true},"renameByName":{"Value #A":"Memory Usage","Value #B":"Memory Requests","Value #C":"Memory Requests %","Value #D":"Memory Limits","Value #E":"Memory Limits %","Value #F":"Memory Usage (RSS)","Value #G":"Memory Usage (Cache)","Value #H":"Memory Usage (Swap)","pod":"Pod"}}}],"type":"table"}],"refresh":"10s","schemaVersion":39,"tags":["kubernetes-mixin"],"templating":{"list":[{"current":{"selected":true,"text":"default","value":"default"},"hide":0,"label":"Data source","name":"datasource","query":"prometheus","regex":"","type":"datasource"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":2,"label":"cluster","name":"cluster","query":"label_values(up{job=\"kube-state-metrics\"}, cluster)","refresh":2,"sort":1,"type":"query","allValue":".*"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"label":"node","multi":true,"name":"node","query":"label_values(kube_node_info{cluster=\"$cluster\"}, node)","refresh":2,"type":"query"}]},"time":{"from":"now-1h","to":"now"},"timezone": "utc","title":"Kubernetes / Compute Resources / Node (Pods)","uid":"200ac8fdbfbb74b39aff88118e4d1c2c"} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-pod.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-k8s-resources-pod - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - k8s-resources-pod.json: |- - {"editable":true,"links":[{"asDropdown":true,"includeVars":true,"keepTime":true,"tags":["kubernetes-mixin"],"targetBlank":false,"title":"Kubernetes","type":"dashboards"}],"panels":[{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true}},"overrides":[{"matcher":{"id":"byFrameRefID","options":"B"},"properties":[{"id":"custom.lineStyle","value":{"fill":"dash"}},{"id":"custom.lineWidth","value":2},{"id":"color","value":{"fixedColor":"red","mode":"fixed"}}]},{"matcher":{"id":"byFrameRefID","options":"C"},"properties":[{"id":"custom.lineStyle","value":{"fill":"dash"}},{"id":"custom.lineWidth","value":2},{"id":"color","value":{"fixedColor":"orange","mode":"fixed"}}]}]},"gridPos":{"h":7,"w":24,"x":0,"y":0},"id":1,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\", container!=\"\"}) by (container)","legendFormat":"__auto"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}\n)\n","legendFormat":"requests"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}\n)\n","legendFormat":"limits"}],"title":"CPU Usage","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"axisColorMode":"thresholds","axisSoftMax":1,"axisSoftMin":0,"fillOpacity":10,"showPoints":"never","spanNulls":true,"thresholdsStyle":{"mode":"dashed+area"}},"unit":"percentunit"},"overrides":[{"matcher":{"id":"byFrameRefID","options":"A"},"properties":[{"id":"thresholds","value":{"mode":"absolute","steps":[{"color":"green","value":null},{"color":"red","value":0.25}]}},{"id":"color","value":{"mode":"thresholds","seriesBy":"lastNotNull"}}]}]},"gridPos":{"h":7,"w":24,"x":0,"y":7},"id":2,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(increase(container_cpu_cfs_throttled_periods_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (container) /sum(increase(container_cpu_cfs_periods_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (container)","legendFormat":"__auto"}],"title":"CPU Throttling","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"overrides":[{"matcher":{"id":"byRegexp","options":"/%/"},"properties":[{"id":"unit","value":"percentunit"}]}]},"gridPos":{"h":7,"w":24,"x":0,"y":14},"id":3,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\"}) by (container)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\"}) by (container)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\"}) by (container) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\"}) by (container)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\"}) by (container)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\"}) by (container) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\"}) by (container)","format":"table","instant":true}],"title":"CPU Quota","transformations":[{"id":"joinByField","options":{"byField":"container","mode":"outer"}},{"id":"organize","options":{"excludeByName":{"Time":true,"Time 1":true,"Time 2":true,"Time 3":true,"Time 4":true,"Time 5":true},"indexByName":{"Time 1":0,"Time 2":1,"Time 3":2,"Time 4":3,"Time 5":4,"Value #A":6,"Value #B":7,"Value #C":8,"Value #D":9,"Value #E":10,"container":5},"renameByName":{"Value #A":"CPU Usage","Value #B":"CPU Requests","Value #C":"CPU Requests %","Value #D":"CPU Limits","Value #E":"CPU Limits %","container":"Container"}}}],"type":"table"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"bytes"},"overrides":[{"matcher":{"id":"byFrameRefID","options":"B"},"properties":[{"id":"custom.lineStyle","value":{"fill":"dash"}},{"id":"custom.lineWidth","value":2},{"id":"color","value":{"fixedColor":"red","mode":"fixed"}}]},{"matcher":{"id":"byFrameRefID","options":"C"},"properties":[{"id":"custom.lineStyle","value":{"fill":"dash"}},{"id":"custom.lineWidth","value":2},{"id":"color","value":{"fixedColor":"orange","mode":"fixed"}}]}]},"gridPos":{"h":7,"w":24,"x":0,"y":21},"id":4,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(container_memory_working_set_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container)","legendFormat":"__auto"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}\n)\n","legendFormat":"requests"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}\n)\n","legendFormat":"limits"}],"title":"Memory Usage (WSS)","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"bytes"},"overrides":[{"matcher":{"id":"byRegexp","options":"/%/"},"properties":[{"id":"unit","value":"percentunit"}]}]},"gridPos":{"h":7,"w":24,"x":0,"y":28},"id":5,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(container_memory_working_set_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(container_memory_working_set_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", image!=\"\"}) by (container) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(container_memory_working_set_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(container_memory_rss{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container != \"\", container != \"POD\"}) by (container)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(container_memory_cache{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container != \"\", container != \"POD\"}) by (container)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(container_memory_swap{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container != \"\", container != \"POD\"}) by (container)","format":"table","instant":true}],"title":"Memory Quota","transformations":[{"id":"joinByField","options":{"byField":"container","mode":"outer"}},{"id":"organize","options":{"excludeByName":{"Time":true,"Time 1":true,"Time 2":true,"Time 3":true,"Time 4":true,"Time 5":true,"Time 6":true,"Time 7":true,"Time 8":true},"indexByName":{"Time 1":0,"Time 2":1,"Time 3":2,"Time 4":3,"Time 5":4,"Time 6":5,"Time 7":6,"Time 8":7,"Value #A":9,"Value #B":10,"Value #C":11,"Value #D":12,"Value #E":13,"Value #F":14,"Value #G":15,"Value #H":16,"container":8},"renameByName":{"Value #A":"Memory Usage","Value #B":"Memory Requests","Value #C":"Memory Requests %","Value #D":"Memory Limits","Value #E":"Memory Limits %","Value #F":"Memory Usage (RSS)","Value #G":"Memory Usage (Cache)","Value #H":"Memory Usage (Swap)","container":"Container"}}}],"type":"table"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"Bps"}},"gridPos":{"h":7,"w":12,"x":0,"y":35},"id":6,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(irate(container_network_receive_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)","legendFormat":"__auto"}],"title":"Receive Bandwidth","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"Bps"}},"gridPos":{"h":7,"w":12,"x":12,"y":35},"id":7,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)","legendFormat":"__auto"}],"title":"Transmit Bandwidth","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":7,"w":12,"x":0,"y":42},"id":8,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_receive_packets_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)","legendFormat":"__auto"}],"title":"Rate of Received Packets","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":7,"w":12,"x":12,"y":42},"id":9,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_transmit_packets_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)","legendFormat":"__auto"}],"title":"Rate of Transmitted Packets","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":7,"w":12,"x":0,"y":49},"id":10,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_receive_packets_dropped_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)","legendFormat":"__auto"}],"title":"Rate of Received Packets Dropped","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":7,"w":12,"x":12,"y":49},"id":11,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_transmit_packets_dropped_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)","legendFormat":"__auto"}],"title":"Rate of Transmitted Packets Dropped","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"iops"}},"gridPos":{"h":7,"w":12,"x":0,"y":56},"id":12,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))","legendFormat":"Reads"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))","legendFormat":"Writes"}],"title":"IOPS (Pod)","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"Bps"}},"gridPos":{"h":7,"w":12,"x":12,"y":56},"id":13,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))","legendFormat":"Reads"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))","legendFormat":"Writes"}],"title":"ThroughPut (Pod)","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"iops"}},"gridPos":{"h":7,"w":12,"x":0,"y":63},"id":14,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"ceil(sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval])))","legendFormat":"__auto"}],"title":"IOPS (Containers)","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"Bps"}},"gridPos":{"h":7,"w":12,"x":12,"y":63},"id":15,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))","legendFormat":"__auto"}],"title":"ThroughPut (Containers)","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"overrides":[{"matcher":{"id":"byRegexp","options":"/IOPS/"},"properties":[{"id":"unit","value":"iops"}]},{"matcher":{"id":"byRegexp","options":"/Throughput/"},"properties":[{"id":"unit","value":"Bps"}]}]},"gridPos":{"h":7,"w":24,"x":0,"y":70},"id":16,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))","format":"table","instant":true}],"title":"Current Storage IO","transformations":[{"id":"joinByField","options":{"byField":"container","mode":"outer"}},{"id":"organize","options":{"excludeByName":{"Time":true,"Time 1":true,"Time 2":true,"Time 3":true,"Time 4":true,"Time 5":true,"Time 6":true},"indexByName":{"Time 1":0,"Time 2":1,"Time 3":2,"Time 4":3,"Time 5":4,"Time 6":5,"Value #A":7,"Value #B":8,"Value #C":9,"Value #D":10,"Value #E":11,"Value #F":12,"container":6},"renameByName":{"Value #A":"IOPS(Reads)","Value #B":"IOPS(Writes)","Value #C":"IOPS(Reads + Writes)","Value #D":"Throughput(Read)","Value #E":"Throughput(Write)","Value #F":"Throughput(Read + Write)","container":"Container"}}}],"type":"table"}],"refresh":"10s","schemaVersion":39,"tags":["kubernetes-mixin"],"templating":{"list":[{"current":{"selected":true,"text":"default","value":"default"},"hide":0,"label":"Data source","name":"datasource","query":"prometheus","regex":"","type":"datasource"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":2,"label":"cluster","name":"cluster","query":"label_values(up{job=\"kube-state-metrics\"}, cluster)","refresh":2,"sort":1,"type":"query","allValue":".*"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"label":"namespace","name":"namespace","query":"label_values(kube_namespace_status_phase{job=\"kube-state-metrics\", cluster=\"$cluster\"}, namespace)","refresh":2,"sort":1,"type":"query"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"label":"pod","name":"pod","query":"label_values(kube_pod_info{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\"}, pod)","refresh":2,"sort":1,"type":"query"}]},"time":{"from":"now-1h","to":"now"},"timezone": "utc","title":"Kubernetes / Compute Resources / Pod","uid":"6581e46e4e5c7ba40a07646395ef7b23"} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-workload.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-k8s-resources-workload - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - k8s-resources-workload.json: |- - {"editable":true,"links":[{"asDropdown":true,"includeVars":true,"keepTime":true,"tags":["kubernetes-mixin"],"targetBlank":false,"title":"Kubernetes","type":"dashboards"}],"panels":[{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true}}},"gridPos":{"h":7,"w":24,"x":0,"y":0},"id":1,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=~\"$type\"}\n) by (pod)\n","legendFormat":"__auto"}],"title":"CPU Usage","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"overrides":[{"matcher":{"id":"byRegexp","options":"/%/"},"properties":[{"id":"unit","value":"percentunit"}]},{"matcher":{"id":"byName","options":"Pod"},"properties":[{"id":"links","value":[{"title":"Drill down to pods","url":"/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?${datasource:queryparam}&var-cluster=$cluster&var-namespace=$namespace&var-pod=${__data.fields.Pod}"}]}]}]},"gridPos":{"h":7,"w":24,"x":0,"y":7},"id":2,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=~\"$type\"}\n) by (pod)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=~\"$type\"}\n) by (pod)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=~\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=~\"$type\"}\n) by (pod)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=~\"$type\"}\n) by (pod)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=~\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=~\"$type\"}\n) by (pod)\n","format":"table","instant":true}],"title":"CPU Quota","transformations":[{"id":"joinByField","options":{"byField":"pod","mode":"outer"}},{"id":"organize","options":{"excludeByName":{"Time":true,"Time 1":true,"Time 2":true,"Time 3":true,"Time 4":true,"Time 5":true},"indexByName":{"Time 1":0,"Time 2":1,"Time 3":2,"Time 4":3,"Time 5":4,"Value #A":6,"Value #B":7,"Value #C":8,"Value #D":9,"Value #E":10,"pod":5},"renameByName":{"Value #A":"CPU Usage","Value #B":"CPU Requests","Value #C":"CPU Requests %","Value #D":"CPU Limits","Value #E":"CPU Limits %","pod":"Pod"}}}],"type":"table"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"bytes"}},"gridPos":{"h":7,"w":24,"x":0,"y":14},"id":3,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=~\"$type\"}\n) by (pod)\n","legendFormat":"__auto"}],"title":"Memory Usage","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"bytes"},"overrides":[{"matcher":{"id":"byRegexp","options":"/%/"},"properties":[{"id":"unit","value":"percentunit"}]},{"matcher":{"id":"byName","options":"Pod"},"properties":[{"id":"links","value":[{"title":"Drill down to pods","url":"/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?${datasource:queryparam}&var-cluster=$cluster&var-namespace=$namespace&var-pod=${__data.fields.Pod}"}]}]}]},"gridPos":{"h":7,"w":24,"x":0,"y":21},"id":4,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=~\"$type\"}\n) by (pod)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=~\"$type\"}\n) by (pod)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=~\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=~\"$type\"}\n) by (pod)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=~\"$type\"}\n) by (pod)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=~\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=~\"$type\"}\n) by (pod)\n","format":"table","instant":true}],"title":"Memory Quota","transformations":[{"id":"joinByField","options":{"byField":"pod","mode":"outer"}},{"id":"organize","options":{"excludeByName":{"Time":true,"Time 1":true,"Time 2":true,"Time 3":true,"Time 4":true,"Time 5":true},"indexByName":{"Time 1":0,"Time 2":1,"Time 3":2,"Time 4":3,"Time 5":4,"Value #A":9,"Value #B":10,"Value #C":11,"Value #D":12,"Value #E":13,"pod":8},"renameByName":{"Value #A":"Memory Usage","Value #B":"Memory Requests","Value #C":"Memory Requests %","Value #D":"Memory Limits","Value #E":"Memory Limits %","pod":"Pod"}}}],"type":"table"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"overrides":[{"matcher":{"id":"byRegexp","options":"/Bandwidth/"},"properties":[{"id":"unit","value":"Bps"}]},{"matcher":{"id":"byRegexp","options":"/Packets/"},"properties":[{"id":"unit","value":"pps"}]},{"matcher":{"id":"byName","options":"Pod"},"properties":[{"id":"links","value":[{"title":"Drill down to pods","url":"/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?${datasource:queryparam}&var-cluster=$cluster&var-namespace=$namespace&var-pod=${__data.fields.Pod}"}]}]}]},"gridPos":{"h":7,"w":24,"x":0,"y":28},"id":5,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(sum(rate(container_network_receive_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(sum(rate(container_network_receive_packets_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(sum(rate(container_network_transmit_packets_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(sum(rate(container_network_receive_packets_dropped_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(sum(rate(container_network_transmit_packets_dropped_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","format":"table","instant":true}],"title":"Current Network Usage","transformations":[{"id":"joinByField","options":{"byField":"pod","mode":"outer"}},{"id":"organize","options":{"excludeByName":{"Time":true,"Time 1":true,"Time 2":true,"Time 3":true,"Time 4":true,"Time 5":true,"Time 6":true},"indexByName":{"Time 1":0,"Time 2":1,"Time 3":2,"Time 4":3,"Time 5":4,"Time 6":5,"Value #A":7,"Value #B":8,"Value #C":9,"Value #D":10,"Value #E":11,"Value #F":12,"pod":6},"renameByName":{"Value #A":"Current Receive Bandwidth","Value #B":"Current Transmit Bandwidth","Value #C":"Rate of Received Packets","Value #D":"Rate of Transmitted Packets","Value #E":"Rate of Received Packets Dropped","Value #F":"Rate of Transmitted Packets Dropped","pod":"Pod"}}}],"type":"table"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"Bps"}},"gridPos":{"h":7,"w":12,"x":0,"y":35},"id":6,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(sum(rate(container_network_receive_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Receive Bandwidth","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"Bps"}},"gridPos":{"h":7,"w":12,"x":12,"y":35},"id":7,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Transmit Bandwidth","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"Bps"}},"gridPos":{"h":7,"w":12,"x":0,"y":42},"id":8,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(avg(rate(container_network_receive_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Average Container Bandwidth by Pod: Received","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"Bps"}},"gridPos":{"h":7,"w":12,"x":12,"y":42},"id":9,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(avg(rate(container_network_transmit_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Average Container Bandwidth by Pod: Transmitted","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":7,"w":12,"x":0,"y":49},"id":10,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(sum(rate(container_network_receive_packets_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Rate of Received Packets","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":7,"w":12,"x":12,"y":49},"id":11,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(sum(rate(container_network_transmit_packets_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Rate of Transmitted Packets","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":7,"w":12,"x":0,"y":56},"id":12,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(sum(rate(container_network_receive_packets_dropped_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Rate of Received Packets Dropped","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":7,"w":12,"x":12,"y":56},"id":13,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(sum(rate(container_network_transmit_packets_dropped_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Rate of Transmitted Packets Dropped","type":"timeseries"}],"refresh":"10s","schemaVersion":39,"tags":["kubernetes-mixin"],"templating":{"list":[{"current":{"selected":true,"text":"default","value":"default"},"hide":0,"label":"Data source","name":"datasource","query":"prometheus","regex":"","type":"datasource"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":2,"label":"cluster","name":"cluster","query":"label_values(up{job=\"kube-state-metrics\"}, cluster)","refresh":2,"sort":1,"type":"query","allValue":".*"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"label":"namespace","name":"namespace","query":"label_values(kube_namespace_status_phase{job=\"kube-state-metrics\", cluster=\"$cluster\"}, namespace)","refresh":2,"sort":1,"type":"query"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"includeAll":true,"label":"workload_type","name":"type","query":"label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\"}, workload_type)","refresh":2,"sort":1,"type":"query"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"label":"workload","name":"workload","query":"label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=~\"$type\"}, workload)","refresh":2,"sort":1,"type":"query"}]},"time":{"from":"now-1h","to":"now"},"timezone": "utc","title":"Kubernetes / Compute Resources / Workload","uid":"a164a7f0339f99e89cea5cb47e9be617"} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-workloads-namespace.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-k8s-resources-workloads-namespace - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - k8s-resources-workloads-namespace.json: |- - {"editable":true,"links":[{"asDropdown":true,"includeVars":true,"keepTime":true,"tags":["kubernetes-mixin"],"targetBlank":false,"title":"Kubernetes","type":"dashboards"}],"panels":[{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true}},"overrides":[{"matcher":{"id":"byFrameRefID","options":"B"},"properties":[{"id":"custom.lineStyle","value":{"fill":"dash"}},{"id":"custom.lineWidth","value":2},{"id":"color","value":{"fixedColor":"red","mode":"fixed"}}]},{"matcher":{"id":"byFrameRefID","options":"C"},"properties":[{"id":"custom.lineStyle","value":{"fill":"dash"}},{"id":"custom.lineWidth","value":2},{"id":"color","value":{"fixedColor":"orange","mode":"fixed"}}]}]},"gridPos":{"h":7,"w":24,"x":0,"y":0},"id":1,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=~\"$type\"}\n) by (workload, workload_type)\n","legendFormat":"{{workload}} - {{workload_type}}"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"scalar(max(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=~\"requests.cpu|cpu\"}))","legendFormat":"quota - requests"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"scalar(max(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=~\"limits.cpu\"}))","legendFormat":"quota - limits"}],"title":"CPU Usage","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"overrides":[{"matcher":{"id":"byRegexp","options":"/%/"},"properties":[{"id":"unit","value":"percentunit"}]},{"matcher":{"id":"byName","options":"Workload"},"properties":[{"id":"links","value":[{"title":"Drill down to workloads","url":"/d/a164a7f0339f99e89cea5cb47e9be617/k8s-resources-workload?${datasource:queryparam}&var-cluster=$cluster&var-namespace=$namespace&var-type=${__data.fields.Type}&var-workload=${__data.fields.Workload}"}]}]},{"matcher":{"id":"byName","options":"Running Pods"},"properties":[{"id":"unit","value":"none"}]}]},"gridPos":{"h":7,"w":24,"x":0,"y":7},"id":2,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"count(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=~\"$type\"}) by (workload, workload_type)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=~\"$type\"}\n) by (workload, workload_type)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=~\"$type\"}\n) by (workload, workload_type)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=~\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=~\"$type\"}\n) by (workload, workload_type)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=~\"$type\"}\n) by (workload, workload_type)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=~\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=~\"$type\"}\n) by (workload, workload_type)\n","format":"table","instant":true}],"title":"CPU Quota","transformations":[{"id":"joinByField","options":{"byField":"workload","mode":"outer"}},{"id":"organize","options":{"excludeByName":{"Time":true,"Time 1":true,"Time 2":true,"Time 3":true,"Time 4":true,"Time 5":true,"Time 6":true,"workload_type 2":true,"workload_type 3":true,"workload_type 4":true,"workload_type 5":true,"workload_type 6":true},"indexByName":{"Time 1":0,"Time 2":1,"Time 3":2,"Time 4":3,"Time 5":4,"Time 6":5,"Value #A":8,"Value #B":9,"Value #C":10,"Value #D":11,"Value #E":12,"Value #F":13,"workload":6,"workload_type 1":7,"workload_type 2":14,"workload_type 3":15,"workload_type 4":16,"workload_type 5":17,"workload_type 6":18},"renameByName":{"Value #A":"Running Pods","Value #B":"CPU Usage","Value #C":"CPU Requests","Value #D":"CPU Requests %","Value #E":"CPU Limits","Value #F":"CPU Limits %","workload":"Workload","workload_type 1":"Type"}}}],"type":"table"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"bytes"},"overrides":[{"matcher":{"id":"byFrameRefID","options":"B"},"properties":[{"id":"custom.lineStyle","value":{"fill":"dash"}},{"id":"custom.lineWidth","value":2},{"id":"color","value":{"fixedColor":"red","mode":"fixed"}}]},{"matcher":{"id":"byFrameRefID","options":"C"},"properties":[{"id":"custom.lineStyle","value":{"fill":"dash"}},{"id":"custom.lineWidth","value":2},{"id":"color","value":{"fixedColor":"orange","mode":"fixed"}}]}]},"gridPos":{"h":7,"w":24,"x":0,"y":14},"id":3,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n container_memory_working_set_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=~\"$type\"}\n) by (workload, workload_type)\n","legendFormat":"{{workload}} - {{workload_type}}"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"scalar(max(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=~\"requests.memory|memory\"}))","legendFormat":"quota - requests"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"scalar(max(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=~\"limits.memory\"}))","legendFormat":"quota - limits"}],"title":"Memory Usage","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"bytes"},"overrides":[{"matcher":{"id":"byRegexp","options":"/%/"},"properties":[{"id":"unit","value":"percentunit"}]},{"matcher":{"id":"byName","options":"Workload"},"properties":[{"id":"links","value":[{"title":"Drill down to workloads","url":"/d/a164a7f0339f99e89cea5cb47e9be617/k8s-resources-workload?${datasource:queryparam}&var-cluster=$cluster&var-namespace=$namespace&var-type=${__data.fields.Type}&var-workload=${__data.fields.Workload}"}]}]},{"matcher":{"id":"byName","options":"Running Pods"},"properties":[{"id":"unit","value":"none"}]}]},"gridPos":{"h":7,"w":24,"x":0,"y":21},"id":4,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"count(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=~\"$type\"}) by (workload, workload_type)","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n container_memory_working_set_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=~\"$type\"}\n) by (workload, workload_type)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=~\"$type\"}\n) by (workload, workload_type)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n container_memory_working_set_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=~\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=~\"$type\"}\n) by (workload, workload_type)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=~\"$type\"}\n) by (workload, workload_type)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(\n container_memory_working_set_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=~\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=~\"$type\"}\n) by (workload, workload_type)\n","format":"table","instant":true}],"title":"Memory Quota","transformations":[{"id":"joinByField","options":{"byField":"workload","mode":"outer"}},{"id":"organize","options":{"excludeByName":{"Time":true,"Time 1":true,"Time 2":true,"Time 3":true,"Time 4":true,"Time 5":true,"Time 6":true,"workload_type 2":true,"workload_type 3":true,"workload_type 4":true,"workload_type 5":true,"workload_type 6":true},"indexByName":{"Time 1":0,"Time 2":1,"Time 3":2,"Time 4":3,"Time 5":4,"Time 6":5,"Value #A":8,"Value #B":9,"Value #C":10,"Value #D":11,"Value #E":12,"Value #F":13,"workload":6,"workload_type 1":7,"workload_type 2":14,"workload_type 3":15,"workload_type 4":16,"workload_type 5":17,"workload_type 6":18},"renameByName":{"Value #A":"Running Pods","Value #B":"Memory Usage","Value #C":"Memory Requests","Value #D":"Memory Requests %","Value #E":"Memory Limits","Value #F":"Memory Limits %","workload":"Workload","workload_type 1":"Type"}}}],"type":"table"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"overrides":[{"matcher":{"id":"byRegexp","options":"/Bandwidth/"},"properties":[{"id":"unit","value":"Bps"}]},{"matcher":{"id":"byRegexp","options":"/Packets/"},"properties":[{"id":"unit","value":"pps"}]},{"matcher":{"id":"byName","options":"Workload"},"properties":[{"id":"links","value":[{"title":"Drill down to workloads","url":"/d/a164a7f0339f99e89cea5cb47e9be617/k8s-resources-workload?${datasource:queryparam}&var-cluster=$cluster&var-namespace=$namespace&var-type=${__data.fields.Type}&var-workload=${__data.fields.Workload}"}]}]}]},"gridPos":{"h":7,"w":24,"x":0,"y":28},"id":5,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(sum(rate(container_network_receive_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=~\"$type\"}) by (workload))\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=~\"$type\"}) by (workload))\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(sum(rate(container_network_receive_packets_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=~\"$type\"}) by (workload))\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(sum(rate(container_network_transmit_packets_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=~\"$type\"}) by (workload))\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(sum(rate(container_network_receive_packets_dropped_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=~\"$type\"}) by (workload))\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(sum(rate(container_network_transmit_packets_dropped_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=~\"$type\"}) by (workload))\n","format":"table","instant":true}],"title":"Current Network Usage","transformations":[{"id":"joinByField","options":{"byField":"workload","mode":"outer"}},{"id":"organize","options":{"excludeByName":{"Time":true,"Time 1":true,"Time 2":true,"Time 3":true,"Time 4":true,"Time 5":true,"Time 6":true},"indexByName":{"Time 1":0,"Time 2":1,"Time 3":2,"Time 4":3,"Time 5":4,"Time 6":5,"Value #A":7,"Value #B":8,"Value #C":9,"Value #D":10,"Value #E":11,"Value #F":12,"workload":6},"renameByName":{"Value #A":"Current Receive Bandwidth","Value #B":"Current Transmit Bandwidth","Value #C":"Rate of Received Packets","Value #D":"Rate of Transmitted Packets","Value #E":"Rate of Received Packets Dropped","Value #F":"Rate of Transmitted Packets Dropped","workload":"Workload"}}}],"type":"table"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"Bps"}},"gridPos":{"h":7,"w":12,"x":0,"y":35},"id":6,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(sum(rate(container_network_receive_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload))\n","legendFormat":"__auto"}],"title":"Receive Bandwidth","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"Bps"}},"gridPos":{"h":7,"w":12,"x":12,"y":35},"id":7,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload))\n","legendFormat":"__auto"}],"title":"Transmit Bandwidth","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"Bps"}},"gridPos":{"h":7,"w":12,"x":0,"y":42},"id":8,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(avg(rate(container_network_receive_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload))\n","legendFormat":"__auto"}],"title":"Average Container Bandwidth by Workload: Received","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"Bps"}},"gridPos":{"h":7,"w":12,"x":12,"y":42},"id":9,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(avg(rate(container_network_transmit_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload))\n","legendFormat":"__auto"}],"title":"Average Container Bandwidth by Workload: Transmitted","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":7,"w":12,"x":0,"y":49},"id":10,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(sum(rate(container_network_receive_packets_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload))\n","legendFormat":"__auto"}],"title":"Rate of Received Packets","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":7,"w":12,"x":12,"y":49},"id":11,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(sum(rate(container_network_transmit_packets_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload))\n","legendFormat":"__auto"}],"title":"Rate of Transmitted Packets","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":7,"w":12,"x":0,"y":56},"id":12,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(sum(rate(container_network_receive_packets_dropped_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload))\n","legendFormat":"__auto"}],"title":"Rate of Received Packets Dropped","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":7,"w":12,"x":12,"y":56},"id":13,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(sum(rate(container_network_transmit_packets_dropped_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload))\n","legendFormat":"__auto"}],"title":"Rate of Transmitted Packets Dropped","type":"timeseries"}],"refresh":"10s","schemaVersion":39,"tags":["kubernetes-mixin"],"templating":{"list":[{"current":{"selected":true,"text":"default","value":"default"},"hide":0,"label":"Data source","name":"datasource","query":"prometheus","regex":"","type":"datasource"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":2,"label":"cluster","name":"cluster","query":"label_values(up{job=\"kube-state-metrics\"}, cluster)","refresh":2,"sort":1,"type":"query","allValue":".*"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"label":"namespace","name":"namespace","query":"label_values(kube_namespace_status_phase{job=\"kube-state-metrics\", cluster=\"$cluster\"}, namespace)","refresh":2,"sort":1,"type":"query"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"includeAll":true,"label":"workload_type","name":"type","query":"label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\"}, workload_type)","refresh":2,"sort":1,"type":"query"}]},"time":{"from":"now-1h","to":"now"},"timezone": "utc","title":"Kubernetes / Compute Resources / Namespace (Workloads)","uid":"a87fb0d919ec0ea5f6543124e16c42a5"} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/kubelet.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-kubelet - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - kubelet.json: |- - {"editable":true,"links":[{"asDropdown":true,"includeVars":true,"keepTime":true,"tags":["kubernetes-mixin"],"targetBlank":false,"title":"Kubernetes","type":"dashboards"}],"panels":[{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"none"}},"gridPos":{"h":7,"w":4,"x":0,"y":0},"id":1,"interval":"1m","options":{"colorMode":"none"},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(kubelet_node_name{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\"})","instant":true}],"title":"Running Kubelets","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"none"}},"gridPos":{"h":7,"w":4,"x":4,"y":0},"id":2,"interval":"1m","options":{"colorMode":"none"},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(kubelet_running_pods{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})","instant":true}],"title":"Running Pods","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"none"}},"gridPos":{"h":7,"w":4,"x":8,"y":0},"id":3,"interval":"1m","options":{"colorMode":"none"},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(kubelet_running_containers{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})","instant":true}],"title":"Running Containers","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"none"}},"gridPos":{"h":7,"w":4,"x":12,"y":0},"id":4,"interval":"1m","options":{"colorMode":"none"},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\", state=\"actual_state_of_world\"})","instant":true}],"title":"Actual Volume Count","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"none"}},"gridPos":{"h":7,"w":4,"x":16,"y":0},"id":5,"interval":"1m","options":{"colorMode":"none"},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",state=\"desired_state_of_world\"})","instant":true}],"title":"Desired Volume Count","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"none"}},"gridPos":{"h":7,"w":4,"x":20,"y":0},"id":6,"interval":"1m","options":{"colorMode":"none"},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(kubelet_node_config_error{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[$__rate_interval]))","instant":true}],"title":"Config Error Count","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"ops"}},"gridPos":{"h":7,"w":12,"x":0,"y":7},"id":7,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(kubelet_runtime_operations_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[$__rate_interval])) by (operation_type, instance)","legendFormat":"{{instance}} {{operation_type}}"}],"title":"Operation Rate","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"ops"}},"gridPos":{"h":7,"w":12,"x":12,"y":7},"id":8,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(kubelet_runtime_operations_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[$__rate_interval])) by (instance, operation_type)","legendFormat":"{{instance}} {{operation_type}}"}],"title":"Operation Error Rate","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"s"}},"gridPos":{"h":7,"w":24,"x":0,"y":14},"id":9,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[$__rate_interval])) by (instance, operation_type, le))","legendFormat":"{{instance}} {{operation_type}}"}],"title":"Operation Duration 99th quantile","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"ops"}},"gridPos":{"h":7,"w":12,"x":0,"y":21},"id":10,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[$__rate_interval])) by (instance)","legendFormat":"{{instance}} pod"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(kubelet_pod_worker_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[$__rate_interval])) by (instance)","legendFormat":"{{instance}} worker"}],"title":"Pod Start Rate","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"s"}},"gridPos":{"h":7,"w":12,"x":12,"y":21},"id":11,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[$__rate_interval])) by (instance, le))","legendFormat":"{{instance}} pod"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[$__rate_interval])) by (instance, le))","legendFormat":"{{instance}} worker"}],"title":"Pod Start Duration","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"ops"}},"gridPos":{"h":7,"w":12,"x":0,"y":28},"id":12,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(storage_operation_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[$__rate_interval])) by (instance, operation_name, volume_plugin)","legendFormat":"{{instance}} {{operation_name}} {{volume_plugin}}"}],"title":"Storage Operation Rate","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"ops"}},"gridPos":{"h":7,"w":12,"x":12,"y":28},"id":13,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(storage_operation_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[$__rate_interval])) by (instance, operation_name, volume_plugin)","legendFormat":"{{instance}} {{operation_name}} {{volume_plugin}}"}],"title":"Storage Operation Error Rate","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"s"}},"gridPos":{"h":7,"w":24,"x":0,"y":35},"id":14,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"histogram_quantile(0.99, sum(rate(storage_operation_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[$__rate_interval])) by (instance, operation_name, volume_plugin, le))","legendFormat":"{{instance}} {{operation_name}} {{volume_plugin}}"}],"title":"Storage Operation Duration 99th quantile","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"ops"}},"gridPos":{"h":7,"w":12,"x":0,"y":42},"id":15,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(kubelet_cgroup_manager_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[$__rate_interval])) by (instance, operation_type)","legendFormat":"{{operation_type}}"}],"title":"Cgroup manager operation rate","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"s"}},"gridPos":{"h":7,"w":12,"x":12,"y":42},"id":16,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[$__rate_interval])) by (instance, operation_type, le))","legendFormat":"{{instance}} {{operation_type}}"}],"title":"Cgroup manager 99th quantile","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"ops"}},"gridPos":{"h":7,"w":12,"x":0,"y":49},"id":17,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(kubelet_pleg_relist_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[$__rate_interval])) by (instance)","legendFormat":"{{instance}}"}],"title":"PLEG relist rate","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"s"}},"gridPos":{"h":7,"w":12,"x":12,"y":49},"id":18,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_interval_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[$__rate_interval])) by (instance, le))","legendFormat":"{{instance}}"}],"title":"PLEG relist interval","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"s"}},"gridPos":{"h":7,"w":24,"x":0,"y":56},"id":19,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[$__rate_interval])) by (instance, le))","legendFormat":"{{instance}}"}],"title":"PLEG relist duration","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"ops"}},"gridPos":{"h":7,"w":24,"x":0,"y":63},"id":20,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"2..\"}[$__rate_interval]))","legendFormat":"2xx"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"3..\"}[$__rate_interval]))","legendFormat":"3xx"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"4..\"}[$__rate_interval]))","legendFormat":"4xx"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"5..\"}[$__rate_interval]))","legendFormat":"5xx"}],"title":"RPC rate","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"s"}},"gridPos":{"h":7,"w":24,"x":0,"y":70},"id":21,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[$__rate_interval])) by (instance, verb, le))","legendFormat":"{{instance}} {{verb}}"}],"title":"Request duration 99th quantile","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"bytes"}},"gridPos":{"h":7,"w":8,"x":0,"y":77},"id":22,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"process_resident_memory_bytes{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}","legendFormat":"{{instance}}"}],"title":"Memory","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"short"}},"gridPos":{"h":7,"w":8,"x":8,"y":77},"id":23,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"rate(process_cpu_seconds_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[$__rate_interval])","legendFormat":"{{instance}}"}],"title":"CPU usage","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"short"}},"gridPos":{"h":7,"w":8,"x":16,"y":77},"id":24,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"go_goroutines{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}","legendFormat":"{{instance}}"}],"title":"Goroutines","type":"timeseries"}],"refresh":"10s","schemaVersion":39,"tags":["kubernetes-mixin"],"templating":{"list":[{"current":{"selected":true,"text":"default","value":"default"},"hide":0,"label":"Data source","name":"datasource","query":"prometheus","regex":"","type":"datasource"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":2,"label":"cluster","name":"cluster","query":"label_values(up{job=\"kubelet\", metrics_path=\"/metrics\"}, cluster)","refresh":2,"sort":1,"type":"query","allValue":".*"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"includeAll":true,"label":"instance","name":"instance","query":"label_values(up{job=\"kubelet\", metrics_path=\"/metrics\",cluster=\"$cluster\"}, instance)","refresh":2,"type":"query"}]},"time":{"from":"now-1h","to":"now"},"timezone": "utc","title":"Kubernetes / Kubelet","uid":"3138fa155d5915769fbded898ac09fd9"} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/namespace-by-pod.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-namespace-by-pod - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - namespace-by-pod.json: |- - {"editable":true,"links":[{"asDropdown":true,"includeVars":true,"keepTime":true,"tags":["kubernetes-mixin"],"targetBlank":false,"title":"Kubernetes","type":"dashboards"}],"panels":[{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"displayName":"$namespace","max":10000000000,"min":0,"thresholds":{"steps":[{"color":"dark-green","index":0,"value":null},{"color":"dark-yellow","index":1,"value":5000000000},{"color":"dark-red","index":2,"value":7000000000}]},"unit":"Bps"}},"gridPos":{"h":9,"w":12,"x":0,"y":0},"id":1,"interval":"1m","pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum (\n rate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","legendFormat":"__auto"}],"title":"Current Rate of Bytes Received","type":"gauge"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"displayName":"$namespace","max":10000000000,"min":0,"thresholds":{"steps":[{"color":"dark-green","index":0,"value":null},{"color":"dark-yellow","index":1,"value":5000000000},{"color":"dark-red","index":2,"value":7000000000}]},"unit":"Bps"}},"gridPos":{"h":9,"w":12,"x":12,"y":0},"id":2,"interval":"1m","pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum (\n rate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","legendFormat":"__auto"}],"title":"Current Rate of Bytes Transmitted","type":"gauge"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"overrides":[{"matcher":{"id":"byRegexp","options":"/Bandwidth/"},"properties":[{"id":"unit","value":"Bps"}]},{"matcher":{"id":"byRegexp","options":"/Packets/"},"properties":[{"id":"unit","value":"pps"}]},{"matcher":{"id":"byName","options":"Pod"},"properties":[{"id":"links","value":[{"title":"Drill down","url":"/d/7a18067ce943a40ae25454675c19ff5c/kubernetes-networking-pod?${datasource:queryparam}&var-cluster=${cluster}&var-namespace=${namespace}&var-pod=${__data.fields.Pod}"}]}]}]},"gridPos":{"h":9,"w":24,"x":0,"y":9},"id":3,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (pod) (\n rate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (pod) (\n rate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (pod) (\n rate(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (pod) (\n rate(container_network_transmit_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (pod) (\n rate(container_network_receive_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (pod) (\n rate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","format":"table","instant":true}],"title":"Current Network Usage","transformations":[{"id":"joinByField","options":{"byField":"pod","mode":"outer"}},{"id":"organize","options":{"excludeByName":{"Time":true,"Time 1":true,"Time 2":true,"Time 3":true,"Time 4":true,"Time 5":true,"Time 6":true},"indexByName":{"Time 1":0,"Time 2":1,"Time 3":2,"Time 4":3,"Time 5":4,"Time 6":5,"Value #A":7,"Value #B":8,"Value #C":9,"Value #D":10,"Value #E":11,"Value #F":12,"pod":6},"renameByName":{"Value #A":"Current Receive Bandwidth","Value #B":"Current Transmit Bandwidth","Value #C":"Rate of Received Packets","Value #D":"Rate of Transmitted Packets","Value #E":"Rate of Received Packets Dropped","Value #F":"Rate of Transmitted Packets Dropped","pod":"Pod"}}}],"type":"table"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"},"unit":"binBps"}},"gridPos":{"h":9,"w":12,"x":0,"y":18},"id":4,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (pod) (\n rate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","legendFormat":"__auto"}],"title":"Receive Bandwidth","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"},"unit":"binBps"}},"gridPos":{"h":9,"w":12,"x":12,"y":18},"id":5,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (pod) (\n rate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","legendFormat":"__auto"}],"title":"Transmit Bandwidth","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"},"unit":"pps"}},"gridPos":{"h":9,"w":12,"x":0,"y":27},"id":6,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (pod) (\n rate(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","legendFormat":"__auto"}],"title":"Rate of Received Packets","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"},"unit":"pps"}},"gridPos":{"h":9,"w":12,"x":12,"y":27},"id":7,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (pod) (\n rate(container_network_transmit_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","legendFormat":"__auto"}],"title":"Rate of Transmitted Packets","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"},"unit":"pps"}},"gridPos":{"h":9,"w":12,"x":0,"y":36},"id":8,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (pod) (\n rate(container_network_receive_packets_dropped_total{cluster=\"$cluster\",namespace!=\"\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","legendFormat":"__auto"}],"title":"Rate of Received Packets Dropped","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"},"unit":"pps"}},"gridPos":{"h":9,"w":12,"x":12,"y":36},"id":9,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum by (pod) (\n rate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n * on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n)\n","legendFormat":"__auto"}],"title":"Rate of Transmitted Packets Dropped","type":"timeseries"}],"refresh":"10s","schemaVersion":39,"tags":["kubernetes-mixin"],"templating":{"list":[{"current":{"selected":true,"text":"default","value":"default"},"hide":0,"label":"Data source","name":"datasource","query":"prometheus","regex":"","type":"datasource"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":2,"label":"cluster","name":"cluster","query":"label_values(up{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\"}, cluster)","refresh":2,"sort":1,"type":"query","allValue":".*"},{"allValue":".+","current":{"selected":false,"text":"kube-system","value":"kube-system"},"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"includeAll":true,"label":"namespace","name":"namespace","query":"label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)","refresh":2,"sort":1,"type":"query"}]},"time":{"from":"now-1h","to":"now"},"timezone": "utc","title":"Kubernetes / Networking / Namespace (Pods)","uid":"8b7a8b326d7a6f1f04244066368c67af"} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/namespace-by-workload.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-namespace-by-workload - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - namespace-by-workload.json: |- - {"editable":true,"links":[{"asDropdown":true,"includeVars":true,"keepTime":true,"tags":["kubernetes-mixin"],"targetBlank":false,"title":"Kubernetes","type":"dashboards"}],"panels":[{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"color":{"fixedColor":"green","mode":"fixed"},"unit":"Bps"}},"gridPos":{"h":9,"w":12,"x":0,"y":0},"id":1,"interval":"1m","options":{"displayMode":"basic","showUnfilled":false},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])\n* on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n* on (cluster,namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload))\n","legendFormat":"__auto"}],"title":"Current Rate of Bytes Received","type":"bargauge"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"color":{"fixedColor":"green","mode":"fixed"},"unit":"Bps"}},"gridPos":{"h":9,"w":12,"x":12,"y":0},"id":2,"interval":"1m","options":{"displayMode":"basic","showUnfilled":false},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])\n* on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n* on (cluster,namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload))\n","legendFormat":"__auto"}],"title":"Current Rate of Bytes Transmitted","type":"bargauge"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"overrides":[{"matcher":{"id":"byRegexp","options":"/Bytes/"},"properties":[{"id":"unit","value":"binBps"}]},{"matcher":{"id":"byRegexp","options":"/Packets/"},"properties":[{"id":"unit","value":"pps"}]},{"matcher":{"id":"byName","options":"Workload"},"properties":[{"id":"links","value":[{"title":"Drill down","url":"/d/728bf77cc1166d2f3133bf25846876cc/kubernetes-networking-workload?${datasource:queryparam}&var-cluster=${cluster}&var-namespace=${namespace}&var-type=${__data.fields.Type}&var-workload=${__data.fields.Workload}"}]}]}]},"gridPos":{"h":9,"w":24,"x":0,"y":9},"id":3,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod) kube_pod_info{cluster=\"$cluster\",namespace=\"$namespace\",host_network=\"false\"}\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload, workload_type))\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod) kube_pod_info{cluster=\"$cluster\",namespace=\"$namespace\",host_network=\"false\"}\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload, workload_type))\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(avg(rate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod) kube_pod_info{cluster=\"$cluster\",namespace=\"$namespace\",host_network=\"false\"}\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload, workload_type))\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(avg(rate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod) kube_pod_info{cluster=\"$cluster\",namespace=\"$namespace\",host_network=\"false\"}\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload, workload_type))\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_receive_packets_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod) kube_pod_info{cluster=\"$cluster\",namespace=\"$namespace\",host_network=\"false\"}\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload, workload_type))\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_transmit_packets_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod) kube_pod_info{cluster=\"$cluster\",namespace=\"$namespace\",host_network=\"false\"}\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload, workload_type))\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_receive_packets_dropped_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod) kube_pod_info{cluster=\"$cluster\",namespace=\"$namespace\",host_network=\"false\"}\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload, workload_type))\n","format":"table","instant":true},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])\n* on (namespace,pod) kube_pod_info{cluster=\"$cluster\",namespace=\"$namespace\",host_network=\"false\"}\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload, workload_type))\n","format":"table","instant":true}],"title":"Current Status","transformations":[{"id":"joinByField","options":{"byField":"workload","mode":"outer"}},{"id":"organize","options":{"excludeByName":{"Time":true,"Time 1":true,"Time 2":true,"Time 3":true,"Time 4":true,"Time 5":true,"Time 6":true,"Time 7":true,"Time 8":true,"workload_type 2":true,"workload_type 3":true,"workload_type 4":true,"workload_type 5":true,"workload_type 6":true,"workload_type 7":true,"workload_type 8":true},"indexByName":{"Time 1":0,"Time 2":1,"Time 3":2,"Time 4":3,"Time 5":4,"Time 6":5,"Time 7":6,"Time 8":7,"Value #A":10,"Value #B":11,"Value #C":12,"Value #D":13,"Value #E":14,"Value #F":15,"Value #G":16,"Value #H":17,"workload":8,"workload_type 1":9,"workload_type 2":18,"workload_type 3":19,"workload_type 4":20,"workload_type 5":21,"workload_type 6":22,"workload_type 7":23,"workload_type 8":24},"renameByName":{"Value #A":"Rx Bytes","Value #B":"Tx Bytes","Value #C":"Rx Bytes (Avg)","Value #D":"Tx Bytes (Avg)","Value #E":"Rx Packets","Value #F":"Tx Packets","Value #G":"Rx Packets Dropped","Value #H":"Tx Packets Dropped","workload":"Workload","workload_type 1":"Type"}}}],"type":"table"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"Bps"}},"gridPos":{"h":9,"w":12,"x":0,"y":18},"id":4,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])\n* on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n* on (cluster,namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload))\n","legendFormat":"__auto"}],"title":"Receive Bandwidth","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"Bps"}},"gridPos":{"h":9,"w":12,"x":12,"y":18},"id":5,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])\n* on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n* on (cluster,namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload))\n","legendFormat":"__auto"}],"title":"Transmit Bandwidth","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"Bps"}},"gridPos":{"h":9,"w":12,"x":0,"y":27},"id":6,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(avg(rate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])\n* on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n* on (cluster,namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload))\n","legendFormat":"__auto"}],"title":"Average Container Bandwidth by Workload: Received","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"Bps"}},"gridPos":{"h":9,"w":12,"x":12,"y":27},"id":7,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(avg(rate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])\n* on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n* on (cluster,namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload))\n","legendFormat":"__auto"}],"title":"Average Container Bandwidth by Workload: Transmitted","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":9,"w":12,"x":0,"y":36},"id":8,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_receive_packets_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])\n* on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n* on (cluster,namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload))\n","legendFormat":"__auto"}],"title":"Rate of Received Packets","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":9,"w":12,"x":12,"y":36},"id":9,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_transmit_packets_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])\n* on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n* on (cluster,namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload))\n","legendFormat":"__auto"}],"title":"Rate of Transmitted Packets","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":9,"w":12,"x":0,"y":45},"id":10,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_receive_packets_dropped_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])\n* on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n* on (cluster,namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload))\n","legendFormat":"__auto"}],"title":"Rate of Received Packets Dropped","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":9,"w":12,"x":12,"y":45},"id":11,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])\n* on (cluster,namespace,pod) group_left ()\n topk by (cluster,namespace,pod) (\n 1,\n max by (cluster,namespace,pod) (kube_pod_info{host_network=\"false\"})\n )\n* on (cluster,namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"$namespace\", workload=~\".+\", workload_type=~\"$type\"}) by (workload))\n","legendFormat":"__auto"}],"title":"Rate of Transmitted Packets Dropped","type":"timeseries"}],"refresh":"10s","schemaVersion":39,"tags":["kubernetes-mixin"],"templating":{"list":[{"current":{"selected":true,"text":"default","value":"default"},"hide":0,"label":"Data source","name":"datasource","query":"prometheus","regex":"","type":"datasource"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":2,"label":"cluster","name":"cluster","query":"label_values(up{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\"}, cluster)","refresh":2,"sort":1,"type":"query","allValue":".*"},{"current":{"selected":false,"text":"kube-system","value":"kube-system"},"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"label":"namespace","name":"namespace","query":"label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)","refresh":2,"sort":1,"type":"query"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"includeAll":true,"label":"workload_type","name":"type","query":"label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=~\".+\"}, workload_type)","refresh":2,"sort":1,"type":"query"}]},"time":{"from":"now-1h","to":"now"},"timezone": "utc","title":"Kubernetes / Networking / Namespace (Workload)","uid":"bbb2a765a623ae38130206c7d94a160f"} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/node-cluster-rsrc-use.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-node-cluster-rsrc-use - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - node-cluster-rsrc-use.json: |- - {"graphTooltip":1,"panels":[{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":0},"id":1,"panels":[],"title":"CPU","type":"row"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"showPoints":"never","stacking":{"mode":"normal"}},"unit":"percentunit"}},"gridPos":{"h":7,"w":12,"x":0,"y":1},"id":2,"options":{"legend":{"showLegend":false},"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"((\n instance:node_cpu_utilisation:rate5m{job=\"node-exporter\", cluster=\"$cluster\"}\n *\n instance:node_num_cpu:sum{job=\"node-exporter\", cluster=\"$cluster\"}\n) != 0 )\n/ scalar(sum(instance:node_num_cpu:sum{job=\"node-exporter\", cluster=\"$cluster\"}))\n","legendFormat":"{{ instance }}"}],"title":"CPU Utilisation","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"showPoints":"never","stacking":{"mode":"normal"}},"unit":"percentunit"}},"gridPos":{"h":7,"w":12,"x":12,"y":1},"id":3,"options":{"legend":{"showLegend":false},"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"(\n instance:node_load1_per_cpu:ratio{job=\"node-exporter\", cluster=\"$cluster\"}\n / scalar(count(instance:node_load1_per_cpu:ratio{job=\"node-exporter\", cluster=\"$cluster\"}))\n) != 0\n","legendFormat":"{{ instance }}"}],"title":"CPU Saturation (Load1 per CPU)","type":"timeseries"},{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":8},"id":4,"panels":[],"title":"Memory","type":"row"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"showPoints":"never","stacking":{"mode":"normal"}},"unit":"percentunit"}},"gridPos":{"h":7,"w":12,"x":0,"y":9},"id":5,"options":{"legend":{"showLegend":false},"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"(\n instance:node_memory_utilisation:ratio{job=\"node-exporter\", cluster=\"$cluster\"}\n / scalar(count(instance:node_memory_utilisation:ratio{job=\"node-exporter\", cluster=\"$cluster\"}))\n) != 0\n","legendFormat":"{{ instance }}"}],"title":"Memory Utilisation","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"showPoints":"never","stacking":{"mode":"normal"}},"unit":"rds"}},"gridPos":{"h":7,"w":12,"x":12,"y":9},"id":6,"options":{"legend":{"showLegend":false},"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"instance:node_vmstat_pgmajfault:rate5m{job=\"node-exporter\", cluster=\"$cluster\"}","legendFormat":"{{ instance }}"}],"title":"Memory Saturation (Major Page Faults)","type":"timeseries"},{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":16},"id":7,"panels":[],"title":"Network","type":"row"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"showPoints":"never","stacking":{"mode":"normal"}},"unit":"Bps"},"overrides":[{"matcher":{"id":"byRegexp","options":"/Transmit/"},"properties":[{"id":"custom.transform","value":"negative-Y"}]}]},"gridPos":{"h":7,"w":12,"x":0,"y":17},"id":8,"options":{"legend":{"showLegend":false},"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"instance:node_network_receive_bytes_excluding_lo:rate5m{job=\"node-exporter\", cluster=\"$cluster\"} != 0","legendFormat":"{{ instance }} Receive"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"instance:node_network_transmit_bytes_excluding_lo:rate5m{job=\"node-exporter\", cluster=\"$cluster\"} != 0","legendFormat":"{{ instance }} Transmit"}],"title":"Network Utilisation (Bytes Receive/Transmit)","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"showPoints":"never","stacking":{"mode":"normal"}},"unit":"Bps"},"overrides":[{"matcher":{"id":"byRegexp","options":"/Transmit/"},"properties":[{"id":"custom.transform","value":"negative-Y"}]}]},"gridPos":{"h":7,"w":12,"x":12,"y":17},"id":9,"options":{"legend":{"showLegend":false},"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"instance:node_network_receive_drop_excluding_lo:rate5m{job=\"node-exporter\", cluster=\"$cluster\"} != 0","legendFormat":"{{ instance }} Receive"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"instance:node_network_transmit_drop_excluding_lo:rate5m{job=\"node-exporter\", cluster=\"$cluster\"} != 0","legendFormat":"{{ instance }} Transmit"}],"title":"Network Saturation (Drops Receive/Transmit)","type":"timeseries"},{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":24},"id":10,"panels":[],"title":"Disk IO","type":"row"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"showPoints":"never","stacking":{"mode":"normal"}},"unit":"percentunit"}},"gridPos":{"h":7,"w":12,"x":0,"y":25},"id":11,"options":{"legend":{"showLegend":false},"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"instance_device:node_disk_io_time_seconds:rate5m{job=\"node-exporter\", cluster=\"$cluster\"}\n/ scalar(count(instance_device:node_disk_io_time_seconds:rate5m{job=\"node-exporter\", cluster=\"$cluster\"}))\n","legendFormat":"{{ instance }} {{device}}"}],"title":"Disk IO Utilisation","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"showPoints":"never","stacking":{"mode":"normal"}},"unit":"percentunit"}},"gridPos":{"h":7,"w":12,"x":12,"y":25},"id":12,"options":{"legend":{"showLegend":false},"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"instance_device:node_disk_io_time_weighted_seconds:rate5m{job=\"node-exporter\", cluster=\"$cluster\"}\n/ scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate5m{job=\"node-exporter\", cluster=\"$cluster\"}))\n","legendFormat":"{{ instance }} {{device}}"}],"title":"Disk IO Saturation","type":"timeseries"},{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":34},"id":13,"panels":[],"title":"Disk Space","type":"row"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"showPoints":"never","stacking":{"mode":"normal"}},"unit":"percentunit"}},"gridPos":{"h":7,"w":24,"x":0,"y":35},"id":14,"options":{"legend":{"showLegend":false},"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"sum without (device) (\n max without (fstype, mountpoint) ((\n node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\", mountpoint!=\"\", cluster=\"$cluster\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", fstype!=\"\", mountpoint!=\"\", cluster=\"$cluster\"}\n ) != 0)\n)\n/ scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\", mountpoint!=\"\", cluster=\"$cluster\"})))\n","legendFormat":"{{ instance }}"}],"title":"Disk Space Utilisation","type":"timeseries"}],"refresh":"30s","schemaVersion":39,"tags":["node-exporter-mixin"],"templating":{"list":[{"name":"datasource","query":"prometheus","type":"datasource"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":2,"includeAll":false,"name":"cluster","query":"label_values(node_time_seconds, cluster)","refresh":2,"sort":1,"type":"query","allValue":".*"}]},"time":{"from":"now-1h","to":"now"},"timezone": "utc","title":"Node Exporter / USE Method / Cluster","uid":"3e97d1d02672cdd0861f4c97c64f89b2"} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/node-rsrc-use.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-node-rsrc-use - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - node-rsrc-use.json: |- - {"graphTooltip":1,"panels":[{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":0},"id":1,"panels":[],"title":"CPU","type":"row"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"showPoints":"never","stacking":{"mode":"normal"}},"unit":"percentunit"}},"gridPos":{"h":7,"w":12,"x":0,"y":1},"id":2,"options":{"legend":{"showLegend":false},"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"instance:node_cpu_utilisation:rate5m{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0","legendFormat":"Utilisation"}],"title":"CPU Utilisation","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"showPoints":"never","stacking":{"mode":"normal"}},"unit":"percentunit"}},"gridPos":{"h":7,"w":12,"x":12,"y":1},"id":3,"options":{"legend":{"showLegend":false},"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"instance:node_load1_per_cpu:ratio{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0","legendFormat":"Saturation"}],"title":"CPU Saturation (Load1 per CPU)","type":"timeseries"},{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":8},"id":4,"panels":[],"title":"Memory","type":"row"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"showPoints":"never","stacking":{"mode":"normal"}},"unit":"percentunit"}},"gridPos":{"h":7,"w":12,"x":0,"y":9},"id":5,"options":{"legend":{"showLegend":false},"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"instance:node_memory_utilisation:ratio{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0","legendFormat":"Utilisation"}],"title":"Memory Utilisation","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"showPoints":"never","stacking":{"mode":"normal"}},"unit":"rds"}},"gridPos":{"h":7,"w":12,"x":12,"y":9},"id":6,"options":{"legend":{"showLegend":false},"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"instance:node_vmstat_pgmajfault:rate5m{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0","legendFormat":"Major page Faults"}],"title":"Memory Saturation (Major Page Faults)","type":"timeseries"},{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":16},"id":7,"panels":[],"title":"Network","type":"row"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"showPoints":"never","stacking":{"mode":"normal"}},"unit":"Bps"},"overrides":[{"matcher":{"id":"byRegexp","options":"/Transmit/"},"properties":[{"id":"custom.transform","value":"negative-Y"}]}]},"gridPos":{"h":7,"w":12,"x":0,"y":17},"id":8,"options":{"legend":{"showLegend":false},"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"instance:node_network_receive_bytes_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0","legendFormat":"Receive"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"instance:node_network_transmit_bytes_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0","legendFormat":"Transmit"}],"title":"Network Utilisation (Bytes Receive/Transmit)","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"showPoints":"never","stacking":{"mode":"normal"}},"unit":"Bps"},"overrides":[{"matcher":{"id":"byRegexp","options":"/Transmit/"},"properties":[{"id":"custom.transform","value":"negative-Y"}]}]},"gridPos":{"h":7,"w":12,"x":12,"y":17},"id":9,"options":{"legend":{"showLegend":false},"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"instance:node_network_receive_drop_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0","legendFormat":"Receive"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"instance:node_network_transmit_drop_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0","legendFormat":"Transmit"}],"title":"Network Saturation (Drops Receive/Transmit)","type":"timeseries"},{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":24},"id":10,"panels":[],"title":"Disk IO","type":"row"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"showPoints":"never","stacking":{"mode":"normal"}},"unit":"percentunit"}},"gridPos":{"h":7,"w":12,"x":0,"y":25},"id":11,"options":{"legend":{"showLegend":false},"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"instance_device:node_disk_io_time_seconds:rate5m{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0","legendFormat":"{{device}}"}],"title":"Disk IO Utilisation","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"showPoints":"never","stacking":{"mode":"normal"}},"unit":"percentunit"}},"gridPos":{"h":7,"w":12,"x":12,"y":25},"id":12,"options":{"legend":{"showLegend":false},"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"instance_device:node_disk_io_time_weighted_seconds:rate5m{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0","legendFormat":"{{device}}"}],"title":"Disk IO Saturation","type":"timeseries"},{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":34},"id":13,"panels":[],"title":"Disk Space","type":"row"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"showPoints":"never","stacking":{"mode":"normal"}},"unit":"percentunit"}},"gridPos":{"h":7,"w":24,"x":0,"y":35},"id":14,"options":{"legend":{"showLegend":false},"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"sort_desc(1 -\n (\n max without (mountpoint, fstype) (node_filesystem_avail_bytes{job=\"node-exporter\", fstype!=\"\", instance=\"$instance\", cluster=\"$cluster\"})\n /\n max without (mountpoint, fstype) (node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\", instance=\"$instance\", cluster=\"$cluster\"})\n ) != 0\n)\n","legendFormat":"{{device}}"}],"title":"Disk Space Utilisation","type":"timeseries"}],"refresh":"30s","schemaVersion":39,"tags":["node-exporter-mixin"],"templating":{"list":[{"name":"datasource","query":"prometheus","type":"datasource"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":2,"includeAll":false,"name":"cluster","query":"label_values(node_time_seconds, cluster)","refresh":2,"sort":1,"type":"query","allValue":".*"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"name":"instance","query":"label_values(node_exporter_build_info{job=\"node-exporter\", cluster=\"$cluster\"}, instance)","refresh":2,"sort":1,"type":"query"}]},"time":{"from":"now-1h","to":"now"},"timezone": "utc","title":"Node Exporter / USE Method / Node","uid":"fac67cfbe174d3ef53eb473d73d9212f"} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/nodes-aix.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-nodes-aix - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - nodes-aix.json: |- - {"graphTooltip":1,"panels":[{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":0},"id":1,"panels":[],"title":"CPU","type":"row"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","stacking":{"mode":"normal"}},"max":1,"min":0,"unit":"percentunit"}},"gridPos":{"h":7,"w":12,"x":0,"y":1},"id":2,"options":{"tooltip":{"mode":"multi"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"(\n (1 - sum without (mode) (rate(node_cpu_seconds_total{job=\"node-exporter\", mode=~\"idle|iowait|steal\", instance=\"$instance\", cluster=\"$cluster\"}[$__rate_interval])))\n/ ignoring(cpu) group_left\n count without (cpu, mode) (node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\", cluster=\"$cluster\"})\n)\n","intervalFactor":5,"legendFormat":"{{cpu}}"}],"title":"CPU Usage","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":0,"showPoints":"never"},"min":0,"unit":"short"}},"gridPos":{"h":7,"w":12,"x":12,"y":1},"id":3,"options":{"tooltip":{"mode":"multi"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"node_load1{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}","legendFormat":"1m load average"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"node_load5{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}","legendFormat":"5m load average"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"node_load15{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}","legendFormat":"15m load average"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"count(node_cpu_seconds_total{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", mode=\"idle\"})","legendFormat":"logical cores"}],"title":"Load Average","type":"timeseries"},{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":8},"id":4,"title":"Memory","type":"row"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","stacking":{"mode":"none"}},"min":0,"unit":"bytes"}},"gridPos":{"h":7,"w":18,"x":0,"y":9},"id":5,"options":{"tooltip":{"mode":"multi"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"node_memory_total_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}","legendFormat":"Physical Memory"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"(\n node_memory_total_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} -\n node_memory_available_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}\n)\n","legendFormat":"Memory Used"}],"title":"Memory Usage","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"max":100,"min":0,"thresholds":{"steps":[{"color":"rgba(50, 172, 45, 0.97)"},{"color":"rgba(237, 129, 40, 0.89)","value":80},{"color":"rgba(245, 54, 54, 0.9)","value":90}]},"unit":"percent"}},"gridPos":{"h":7,"w":6,"x":18,"y":9},"id":6,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"100 -\n(\n avg(node_memory_available_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}) /\n avg(node_memory_total_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"})\n * 100\n)\n"}],"title":"Memory Usage","type":"gauge"},{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":18},"id":7,"panels":[],"title":"Disk","type":"row"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":0,"showPoints":"never"},"min":0},"overrides":[{"matcher":{"id":"byRegexp","options":"/ read| written/"},"properties":[{"id":"unit","value":"Bps"}]},{"matcher":{"id":"byRegexp","options":"/ io time/"},"properties":[{"id":"unit","value":"percentunit"}]}]},"gridPos":{"h":7,"w":12,"x":0,"y":19},"id":8,"options":{"tooltip":{"mode":"multi"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"}[$__rate_interval])","intervalFactor":1,"legendFormat":"{{device}} read"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"}[$__rate_interval])","intervalFactor":1,"legendFormat":"{{device}} written"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"}[$__rate_interval])","intervalFactor":1,"legendFormat":"{{device}} io time"}],"title":"Disk I/O","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"thresholds":{"steps":[{"color":"green"},{"color":"yellow","value":0.8},{"color":"red","value":0.9}]},"unit":"decbytes"},"overrides":[{"matcher":{"id":"byName","options":"Mounted on"},"properties":[{"id":"custom.width","value":260}]},{"matcher":{"id":"byName","options":"Size"},"properties":[{"id":"custom.width","value":93}]},{"matcher":{"id":"byName","options":"Used"},"properties":[{"id":"custom.width","value":72}]},{"matcher":{"id":"byName","options":"Available"},"properties":[{"id":"custom.width","value":88}]},{"matcher":{"id":"byName","options":"Used, %"},"properties":[{"id":"unit","value":"percentunit"},{"id":"custom.cellOptions","value":{"type":"gauge"}},{"id":"max","value":1},{"id":"min","value":0}]}]},"gridPos":{"h":7,"w":12,"x":12,"y":19},"id":9,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", fstype!=\"\", mountpoint!=\"\"})\n","format":"table","instant":true,"legendFormat":""},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", fstype!=\"\", mountpoint!=\"\"})\n","format":"table","instant":true,"legendFormat":""}],"title":"Disk Space Usage","transformations":[{"id":"groupBy","options":{"fields":{"Value #A":{"aggregations":["lastNotNull"],"operation":"aggregate"},"Value #B":{"aggregations":["lastNotNull"],"operation":"aggregate"},"mountpoint":{"aggregations":[],"operation":"groupby"}}}},{"id":"merge"},{"id":"calculateField","options":{"alias":"Used","binary":{"left":"Value #A (lastNotNull)","operator":"-","reducer":"sum","right":"Value #B (lastNotNull)"},"mode":"binary","reduce":{"reducer":"sum"}}},{"id":"calculateField","options":{"alias":"Used, %","binary":{"left":"Used","operator":"/","reducer":"sum","right":"Value #A (lastNotNull)"},"mode":"binary","reduce":{"reducer":"sum"}}},{"id":"organize","options":{"excludeByName":{},"indexByName":{},"renameByName":{"Value #A (lastNotNull)":"Size","Value #B (lastNotNull)":"Available","mountpoint":"Mounted on"}}},{"id":"sortBy","options":{"fields":{},"sort":[{"field":"Mounted on"}]}}],"type":"table"},{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":26},"id":10,"panels":[],"title":"Network","type":"row"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"description":"Network received (bits/s)","fieldConfig":{"defaults":{"custom":{"fillOpacity":0,"showPoints":"never"},"min":0,"unit":"bps"}},"gridPos":{"h":7,"w":12,"x":0,"y":27},"id":11,"options":{"tooltip":{"mode":"multi"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", device!=\"lo\"}[$__rate_interval]) * 8","intervalFactor":1,"legendFormat":"{{device}}"}],"title":"Network Received","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"description":"Network transmitted (bits/s)","fieldConfig":{"defaults":{"custom":{"fillOpacity":0},"min":0,"unit":"bps"}},"gridPos":{"h":7,"w":12,"x":12,"y":27},"id":12,"options":{"tooltip":{"mode":"multi"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", device!=\"lo\"}[$__rate_interval]) * 8","intervalFactor":1,"legendFormat":"{{device}}"}],"title":"Network Transmitted","type":"timeseries"}],"refresh":"30s","schemaVersion":39,"tags":["node-exporter-mixin"],"templating":{"list":[{"name":"datasource","query":"prometheus","type":"datasource"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":2,"label":"Cluster","name":"cluster","query":"label_values(node_uname_info{job=\"node-exporter\", sysname!=\"Darwin\"}, cluster)","refresh":2,"type":"query","allValue":".*"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"label":"Instance","name":"instance","query":"label_values(node_uname_info{job=\"node-exporter\", cluster=\"$cluster\", sysname!=\"Darwin\"}, instance)","refresh":2,"type":"query"}]},"time":{"from":"now-1h","to":"now"},"timezone": "utc","title":"Node Exporter / AIX","uid":"7e0a61e486f727d763fb1d86fdd629c2"} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/nodes-darwin.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-nodes-darwin - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - nodes-darwin.json: |- - {"graphTooltip":1,"panels":[{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":0},"id":1,"panels":[],"title":"CPU","type":"row"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","stacking":{"mode":"normal"}},"max":1,"min":0,"unit":"percentunit"}},"gridPos":{"h":7,"w":12,"x":0,"y":1},"id":2,"options":{"tooltip":{"mode":"multi"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"(\n (1 - sum without (mode) (rate(node_cpu_seconds_total{job=\"node-exporter\", mode=~\"idle|iowait|steal\", instance=\"$instance\", cluster=\"$cluster\"}[$__rate_interval])))\n/ ignoring(cpu) group_left\n count without (cpu, mode) (node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\", cluster=\"$cluster\"})\n)\n","intervalFactor":5,"legendFormat":"{{cpu}}"}],"title":"CPU Usage","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":0,"showPoints":"never"},"min":0,"unit":"short"}},"gridPos":{"h":7,"w":12,"x":12,"y":1},"id":3,"options":{"tooltip":{"mode":"multi"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"node_load1{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}","legendFormat":"1m load average"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"node_load5{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}","legendFormat":"5m load average"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"node_load15{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}","legendFormat":"15m load average"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"count(node_cpu_seconds_total{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", mode=\"idle\"})","legendFormat":"logical cores"}],"title":"Load Average","type":"timeseries"},{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":8},"id":4,"title":"Memory","type":"row"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","stacking":{"mode":"none"}},"min":0,"unit":"bytes"}},"gridPos":{"h":7,"w":18,"x":0,"y":9},"id":5,"options":{"tooltip":{"mode":"multi"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"node_memory_total_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}","legendFormat":"Physical Memory"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"(\n node_memory_internal_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} -\n node_memory_purgeable_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} +\n node_memory_wired_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} +\n node_memory_compressed_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}\n)\n","legendFormat":"Memory Used"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"(\n node_memory_internal_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} -\n node_memory_purgeable_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}\n)\n","legendFormat":"App Memory"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"node_memory_wired_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}","legendFormat":"Wired Memory"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"node_memory_compressed_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}","legendFormat":"Compressed"}],"title":"Memory Usage","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"max":100,"min":0,"thresholds":{"steps":[{"color":"rgba(50, 172, 45, 0.97)"},{"color":"rgba(237, 129, 40, 0.89)","value":80},{"color":"rgba(245, 54, 54, 0.9)","value":90}]},"unit":"percent"}},"gridPos":{"h":7,"w":6,"x":18,"y":9},"id":6,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"(\n (\n avg(node_memory_internal_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}) -\n avg(node_memory_purgeable_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}) +\n avg(node_memory_wired_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}) +\n avg(node_memory_compressed_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"})\n ) /\n avg(node_memory_total_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"})\n)\n*\n100\n"}],"title":"Memory Usage","type":"gauge"},{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":18},"id":7,"panels":[],"title":"Disk","type":"row"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":0,"showPoints":"never"},"min":0},"overrides":[{"matcher":{"id":"byRegexp","options":"/ read| written/"},"properties":[{"id":"unit","value":"Bps"}]},{"matcher":{"id":"byRegexp","options":"/ io time/"},"properties":[{"id":"unit","value":"percentunit"}]}]},"gridPos":{"h":7,"w":12,"x":0,"y":19},"id":8,"options":{"tooltip":{"mode":"multi"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"}[$__rate_interval])","intervalFactor":1,"legendFormat":"{{device}} read"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"}[$__rate_interval])","intervalFactor":1,"legendFormat":"{{device}} written"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"}[$__rate_interval])","intervalFactor":1,"legendFormat":"{{device}} io time"}],"title":"Disk I/O","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"thresholds":{"steps":[{"color":"green"},{"color":"yellow","value":0.8},{"color":"red","value":0.9}]},"unit":"decbytes"},"overrides":[{"matcher":{"id":"byName","options":"Mounted on"},"properties":[{"id":"custom.width","value":260}]},{"matcher":{"id":"byName","options":"Size"},"properties":[{"id":"custom.width","value":93}]},{"matcher":{"id":"byName","options":"Used"},"properties":[{"id":"custom.width","value":72}]},{"matcher":{"id":"byName","options":"Available"},"properties":[{"id":"custom.width","value":88}]},{"matcher":{"id":"byName","options":"Used, %"},"properties":[{"id":"unit","value":"percentunit"},{"id":"custom.cellOptions","value":{"type":"gauge"}},{"id":"max","value":1},{"id":"min","value":0}]}]},"gridPos":{"h":7,"w":12,"x":12,"y":19},"id":9,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", fstype!=\"\", mountpoint!=\"\"})\n","format":"table","instant":true,"legendFormat":""},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", fstype!=\"\", mountpoint!=\"\"})\n","format":"table","instant":true,"legendFormat":""}],"title":"Disk Space Usage","transformations":[{"id":"groupBy","options":{"fields":{"Value #A":{"aggregations":["lastNotNull"],"operation":"aggregate"},"Value #B":{"aggregations":["lastNotNull"],"operation":"aggregate"},"mountpoint":{"aggregations":[],"operation":"groupby"}}}},{"id":"merge"},{"id":"calculateField","options":{"alias":"Used","binary":{"left":"Value #A (lastNotNull)","operator":"-","reducer":"sum","right":"Value #B (lastNotNull)"},"mode":"binary","reduce":{"reducer":"sum"}}},{"id":"calculateField","options":{"alias":"Used, %","binary":{"left":"Used","operator":"/","reducer":"sum","right":"Value #A (lastNotNull)"},"mode":"binary","reduce":{"reducer":"sum"}}},{"id":"organize","options":{"excludeByName":{},"indexByName":{},"renameByName":{"Value #A (lastNotNull)":"Size","Value #B (lastNotNull)":"Available","mountpoint":"Mounted on"}}},{"id":"sortBy","options":{"fields":{},"sort":[{"field":"Mounted on"}]}}],"type":"table"},{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":26},"id":10,"panels":[],"title":"Network","type":"row"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"description":"Network received (bits/s)","fieldConfig":{"defaults":{"custom":{"fillOpacity":0,"showPoints":"never"},"min":0,"unit":"bps"}},"gridPos":{"h":7,"w":12,"x":0,"y":27},"id":11,"options":{"tooltip":{"mode":"multi"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", device!=\"lo\"}[$__rate_interval]) * 8","intervalFactor":1,"legendFormat":"{{device}}"}],"title":"Network Received","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"description":"Network transmitted (bits/s)","fieldConfig":{"defaults":{"custom":{"fillOpacity":0},"min":0,"unit":"bps"}},"gridPos":{"h":7,"w":12,"x":12,"y":27},"id":12,"options":{"tooltip":{"mode":"multi"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", device!=\"lo\"}[$__rate_interval]) * 8","intervalFactor":1,"legendFormat":"{{device}}"}],"title":"Network Transmitted","type":"timeseries"}],"refresh":"30s","schemaVersion":39,"tags":["node-exporter-mixin"],"templating":{"list":[{"name":"datasource","query":"prometheus","type":"datasource"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":2,"label":"Cluster","name":"cluster","query":"label_values(node_uname_info{job=\"node-exporter\", sysname=\"Darwin\"}, cluster)","refresh":2,"type":"query","allValue":".*"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"label":"Instance","name":"instance","query":"label_values(node_uname_info{job=\"node-exporter\", cluster=\"$cluster\", sysname=\"Darwin\"}, instance)","refresh":2,"type":"query"}]},"time":{"from":"now-1h","to":"now"},"timezone": "utc","title":"Node Exporter / MacOS","uid":"629701ea43bf69291922ea45f4a87d37"} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/nodes.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-nodes - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - nodes.json: |- - {"graphTooltip":1,"panels":[{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":0},"id":1,"panels":[],"title":"CPU","type":"row"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","stacking":{"mode":"normal"}},"max":1,"min":0,"unit":"percentunit"}},"gridPos":{"h":7,"w":12,"x":0,"y":1},"id":2,"options":{"tooltip":{"mode":"multi"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"(\n (1 - sum without (mode) (rate(node_cpu_seconds_total{job=\"node-exporter\", mode=~\"idle|iowait|steal\", instance=\"$instance\", cluster=\"$cluster\"}[$__rate_interval])))\n/ ignoring(cpu) group_left\n count without (cpu, mode) (node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\", cluster=\"$cluster\"})\n)\n","intervalFactor":5,"legendFormat":"{{cpu}}"}],"title":"CPU Usage","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":0,"showPoints":"never"},"min":0,"unit":"short"}},"gridPos":{"h":7,"w":12,"x":12,"y":1},"id":3,"options":{"tooltip":{"mode":"multi"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"node_load1{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}","legendFormat":"1m load average"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"node_load5{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}","legendFormat":"5m load average"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"node_load15{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}","legendFormat":"15m load average"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"count(node_cpu_seconds_total{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", mode=\"idle\"})","legendFormat":"logical cores"}],"title":"Load Average","type":"timeseries"},{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":8},"id":4,"title":"Memory","type":"row"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","stacking":{"mode":"normal"}},"min":0,"unit":"bytes"}},"gridPos":{"h":7,"w":18,"x":0,"y":9},"id":5,"options":{"tooltip":{"mode":"multi"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"(\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}\n-\n node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}\n-\n node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}\n-\n node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}\n)\n","legendFormat":"memory used"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}","legendFormat":"memory buffers"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}","legendFormat":"memory cached"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}","legendFormat":"memory free"}],"title":"Memory Usage","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"max":100,"min":0,"thresholds":{"steps":[{"color":"rgba(50, 172, 45, 0.97)"},{"color":"rgba(237, 129, 40, 0.89)","value":80},{"color":"rgba(245, 54, 54, 0.9)","value":90}]},"unit":"percent"}},"gridPos":{"h":7,"w":6,"x":18,"y":9},"id":6,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"100 -\n(\n avg(node_memory_MemAvailable_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"}) /\n avg(node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"})\n* 100\n)\n"}],"title":"Memory Usage","type":"gauge"},{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":18},"id":7,"panels":[],"title":"Disk","type":"row"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":0,"showPoints":"never"},"min":0},"overrides":[{"matcher":{"id":"byRegexp","options":"/ read| written/"},"properties":[{"id":"unit","value":"Bps"}]},{"matcher":{"id":"byRegexp","options":"/ io time/"},"properties":[{"id":"unit","value":"percentunit"}]}]},"gridPos":{"h":7,"w":12,"x":0,"y":19},"id":8,"options":{"tooltip":{"mode":"multi"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"}[$__rate_interval])","intervalFactor":1,"legendFormat":"{{device}} read"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"}[$__rate_interval])","intervalFactor":1,"legendFormat":"{{device}} written"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"}[$__rate_interval])","intervalFactor":1,"legendFormat":"{{device}} io time"}],"title":"Disk I/O","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"fieldConfig":{"defaults":{"thresholds":{"steps":[{"color":"green"},{"color":"yellow","value":0.8},{"color":"red","value":0.9}]},"unit":"decbytes"},"overrides":[{"matcher":{"id":"byName","options":"Mounted on"},"properties":[{"id":"custom.width","value":260}]},{"matcher":{"id":"byName","options":"Size"},"properties":[{"id":"custom.width","value":93}]},{"matcher":{"id":"byName","options":"Used"},"properties":[{"id":"custom.width","value":72}]},{"matcher":{"id":"byName","options":"Available"},"properties":[{"id":"custom.width","value":88}]},{"matcher":{"id":"byName","options":"Used, %"},"properties":[{"id":"unit","value":"percentunit"},{"id":"custom.cellOptions","value":{"type":"gauge"}},{"id":"max","value":1},{"id":"min","value":0}]}]},"gridPos":{"h":7,"w":12,"x":12,"y":19},"id":9,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", fstype!=\"\", mountpoint!=\"\"})\n","format":"table","instant":true,"legendFormat":""},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", fstype!=\"\", mountpoint!=\"\"})\n","format":"table","instant":true,"legendFormat":""}],"title":"Disk Space Usage","transformations":[{"id":"groupBy","options":{"fields":{"Value #A":{"aggregations":["lastNotNull"],"operation":"aggregate"},"Value #B":{"aggregations":["lastNotNull"],"operation":"aggregate"},"mountpoint":{"aggregations":[],"operation":"groupby"}}}},{"id":"merge"},{"id":"calculateField","options":{"alias":"Used","binary":{"left":"Value #A (lastNotNull)","operator":"-","reducer":"sum","right":"Value #B (lastNotNull)"},"mode":"binary","reduce":{"reducer":"sum"}}},{"id":"calculateField","options":{"alias":"Used, %","binary":{"left":"Used","operator":"/","reducer":"sum","right":"Value #A (lastNotNull)"},"mode":"binary","reduce":{"reducer":"sum"}}},{"id":"organize","options":{"excludeByName":{},"indexByName":{},"renameByName":{"Value #A (lastNotNull)":"Size","Value #B (lastNotNull)":"Available","mountpoint":"Mounted on"}}},{"id":"sortBy","options":{"fields":{},"sort":[{"field":"Mounted on"}]}}],"type":"table"},{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":26},"id":10,"panels":[],"title":"Network","type":"row"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"description":"Network received (bits/s)","fieldConfig":{"defaults":{"custom":{"fillOpacity":0,"showPoints":"never"},"min":0,"unit":"bps"}},"gridPos":{"h":7,"w":12,"x":0,"y":27},"id":11,"options":{"tooltip":{"mode":"multi"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", device!=\"lo\"}[$__rate_interval]) * 8","intervalFactor":1,"legendFormat":"{{device}}"}],"title":"Network Received","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"description":"Network transmitted (bits/s)","fieldConfig":{"defaults":{"custom":{"fillOpacity":0},"min":0,"unit":"bps"}},"gridPos":{"h":7,"w":12,"x":12,"y":27},"id":12,"options":{"tooltip":{"mode":"multi"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\", device!=\"lo\"}[$__rate_interval]) * 8","intervalFactor":1,"legendFormat":"{{device}}"}],"title":"Network Transmitted","type":"timeseries"}],"refresh":"30s","schemaVersion":39,"tags":["node-exporter-mixin"],"templating":{"list":[{"name":"datasource","query":"prometheus","type":"datasource"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":2,"label":"Cluster","name":"cluster","query":"label_values(node_uname_info{job=\"node-exporter\", sysname!=\"Darwin\"}, cluster)","refresh":2,"type":"query","allValue":".*"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"label":"Instance","name":"instance","query":"label_values(node_uname_info{job=\"node-exporter\", cluster=\"$cluster\", sysname!=\"Darwin\"}, instance)","refresh":2,"type":"query"}]},"time":{"from":"now-1h","to":"now"},"timezone": "utc","title":"Node Exporter / Nodes","uid":"7d57716318ee0dddbac5a7f451fb7753"} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/persistentvolumesusage.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-persistentvolumesusage - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - persistentvolumesusage.json: |- - {"editable":true,"links":[{"asDropdown":true,"includeVars":true,"keepTime":true,"tags":["kubernetes-mixin"],"targetBlank":false,"title":"Kubernetes","type":"dashboards"}],"panels":[{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"bytes"}},"gridPos":{"h":7,"w":18,"y":0},"id":1,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(\n sum without(instance, node) (topk(1, (kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n -\n sum without(instance, node) (topk(1, (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n)\n","legendFormat":"Used Space"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum without(instance, node) (topk(1, (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n","legendFormat":"Free Space"}],"title":"Volume Space Usage","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"color":{"mode":"thresholds"},"max":100,"min":0,"thresholds":{"mode":"absolute","steps":[{"color":"green","value":0},{"color":"orange","value":80},{"color":"red","value":90}]},"unit":"percent"}},"gridPos":{"h":7,"w":6,"x":18,"y":0},"id":2,"interval":"1m","pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"max without(instance,node) (\n(\n topk(1, kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n -\n topk(1, kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n/\ntopk(1, kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n* 100)\n","instant":true}],"title":"Volume Space Usage","type":"gauge"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"none"}},"gridPos":{"h":7,"w":18,"y":7},"id":3,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))","legendFormat":"Used inodes"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"(\n sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n -\n sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n)\n","legendFormat":"Free inodes"}],"title":"Volume inodes Usage","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"color":{"mode":"thresholds"},"max":100,"min":0,"thresholds":{"mode":"absolute","steps":[{"color":"green","value":0},{"color":"orange","value":80},{"color":"red","value":90}]},"unit":"percent"}},"gridPos":{"h":7,"w":6,"x":18,"y":7},"id":4,"interval":"1m","pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"max without(instance,node) (\ntopk(1, kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n/\ntopk(1, kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n* 100)\n","instant":true}],"title":"Volume inodes Usage","type":"gauge"}],"refresh":"10s","schemaVersion":39,"tags":["kubernetes-mixin"],"templating":{"list":[{"current":{"selected":true,"text":"default","value":"default"},"hide":0,"label":"Data source","name":"datasource","query":"prometheus","regex":"","type":"datasource"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":2,"label":"cluster","name":"cluster","query":"label_values(kubelet_volume_stats_capacity_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}, cluster)","refresh":2,"sort":1,"type":"query","allValue":".*"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"label":"Namespace","name":"namespace","query":"label_values(kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\"}, namespace)","refresh":2,"sort":1,"type":"query"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"label":"PersistentVolumeClaim","name":"volume","query":"label_values(kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\"}, persistentvolumeclaim)","refresh":2,"sort":1,"type":"query"}]},"time":{"from":"now-1h","to":"now"},"timezone": "utc","title":"Kubernetes / Persistent Volumes","uid":"919b92a8e8041bd567af9edab12c840c"} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/pod-total.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-pod-total - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - pod-total.json: |- - {"editable":true,"links":[{"asDropdown":true,"includeVars":true,"keepTime":true,"tags":["kubernetes-mixin"],"targetBlank":false,"title":"Kubernetes","type":"dashboards"}],"panels":[{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"displayName":"$pod","max":10000000000,"min":0,"thresholds":{"steps":[{"color":"dark-green","index":0,"value":null},{"color":"dark-yellow","index":1,"value":5000000000},{"color":"dark-red","index":2,"value":7000000000}]},"unit":"Bps"}},"gridPos":{"h":9,"w":12,"x":0,"y":0},"id":1,"interval":"1m","pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))","legendFormat":"__auto"}],"title":"Current Rate of Bytes Received","type":"gauge"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"displayName":"$pod","max":10000000000,"min":0,"thresholds":{"steps":[{"color":"dark-green","index":0,"value":null},{"color":"dark-yellow","index":1,"value":5000000000},{"color":"dark-red","index":2,"value":7000000000}]},"unit":"Bps"}},"gridPos":{"h":9,"w":12,"x":12,"y":0},"id":2,"interval":"1m","pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))","legendFormat":"__auto"}],"title":"Current Rate of Bytes Transmitted","type":"gauge"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"},"unit":"binBps"}},"gridPos":{"h":9,"w":12,"x":0,"y":9},"id":3,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)","legendFormat":"__auto"}],"title":"Receive Bandwidth","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"},"unit":"binBps"}},"gridPos":{"h":9,"w":12,"x":12,"y":9},"id":4,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)","legendFormat":"__auto"}],"title":"Transmit Bandwidth","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"},"unit":"pps"}},"gridPos":{"h":9,"w":12,"x":0,"y":18},"id":5,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)","legendFormat":"__auto"}],"title":"Rate of Received Packets","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"},"unit":"pps"}},"gridPos":{"h":9,"w":12,"x":12,"y":18},"id":6,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_transmit_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)","legendFormat":"__auto"}],"title":"Rate of Transmitted Packets","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"},"unit":"pps"}},"gridPos":{"h":9,"w":12,"x":0,"y":27},"id":7,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_receive_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)","legendFormat":"__auto"}],"title":"Rate of Received Packets Dropped","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"showPoints":"never"},"unit":"pps"}},"gridPos":{"h":9,"w":12,"x":12,"y":27},"id":8,"interval":"1m","options":{"legend":{"asTable":true,"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)","legendFormat":"__auto"}],"title":"Rate of Transmitted Packets Dropped","type":"timeseries"}],"refresh":"10s","schemaVersion":39,"tags":["kubernetes-mixin"],"templating":{"list":[{"current":{"selected":true,"text":"default","value":"default"},"hide":0,"label":"Data source","name":"datasource","query":"prometheus","regex":"","type":"datasource"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":2,"label":"cluster","name":"cluster","query":"label_values(up{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\"}, cluster)","refresh":2,"sort":1,"type":"query","allValue":".*"},{"allValue":".+","current":{"selected":false,"text":"kube-system","value":"kube-system"},"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"includeAll":true,"label":"namespace","name":"namespace","query":"label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)","refresh":2,"sort":1,"type":"query"},{"current":{"selected":false,"text":"kube-system","value":"kube-system"},"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"label":"pod","name":"pod","query":"label_values(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}, pod)","refresh":2,"sort":1,"type":"query"}]},"time":{"from":"now-1h","to":"now"},"timezone": "utc","title":"Kubernetes / Networking / Pod","uid":"7a18067ce943a40ae25454675c19ff5c"} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/prometheus.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-prometheus - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - prometheus.json: |- - {"panels":[{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":0},"id":1,"panels":[],"title":"Prometheus Stats","type":"row"},{"datasource":{"type":"prometheus","uid":"$datasource"},"fieldConfig":{"defaults":{"decimals":2,"displayName":"","unit":"short"},"overrides":[{"matcher":{"id":"byName","options":"Time"},"properties":[{"id":"displayName","value":"Time"},{"id":"custom.align","value":null},{"id":"custom.hidden","value":"true"}]},{"matcher":{"id":"byName","options":"cluster"},"properties":[{"id":"custom.align","value":null},{"id":"unit","value":"short"},{"id":"decimals","value":2},{"id":"displayName","value":"Cluster"}]},{"matcher":{"id":"byName","options":"job"},"properties":[{"id":"custom.align","value":null},{"id":"unit","value":"short"},{"id":"decimals","value":2},{"id":"displayName","value":"Job"}]},{"matcher":{"id":"byName","options":"instance"},"properties":[{"id":"displayName","value":"Instance"},{"id":"custom.align","value":null},{"id":"unit","value":"short"},{"id":"decimals","value":2}]},{"matcher":{"id":"byName","options":"version"},"properties":[{"id":"displayName","value":"Version"},{"id":"custom.align","value":null},{"id":"unit","value":"short"},{"id":"decimals","value":2}]},{"matcher":{"id":"byName","options":"Value #A"},"properties":[{"id":"displayName","value":"Count"},{"id":"custom.align","value":null},{"id":"unit","value":"short"},{"id":"decimals","value":2},{"id":"custom.hidden","value":"true"}]},{"matcher":{"id":"byName","options":"Value #B"},"properties":[{"id":"displayName","value":"Uptime"},{"id":"custom.align","value":null},{"id":"unit","value":"s"}]}]},"gridPos":{"h":7,"w":24,"x":0,"y":1},"id":2,"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"count by (cluster, job, instance, version) (prometheus_build_info{cluster=~\"$cluster\", job=~\"$job\", instance=~\"$instance\"})","format":"table","instant":true,"legendFormat":""},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"max by (cluster, job, instance) (time() - process_start_time_seconds{cluster=~\"$cluster\", job=~\"$job\", instance=~\"$instance\"})","format":"table","instant":true,"legendFormat":""}],"title":"Prometheus Stats","type":"table"},{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":8},"id":3,"panels":[],"title":"Discovery","type":"row"},{"datasource":{"type":"prometheus","uid":"$datasource"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never"},"min":0,"unit":"ms"}},"gridPos":{"h":7,"w":12,"x":0,"y":9},"id":4,"options":{"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"sum(rate(prometheus_target_sync_length_seconds_sum{cluster=~\"$cluster\",job=~\"$job\",instance=~\"$instance\"}[5m])) by (cluster, job, scrape_job, instance) * 1e3","format":"time_series","legendFormat":"{{cluster}}:{{job}}:{{instance}}:{{scrape_job}}"}],"title":"Target Sync","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"$datasource"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"lineWidth":0,"showPoints":"never","stacking":{"mode":"normal"}},"min":0,"unit":"short"}},"gridPos":{"h":7,"w":12,"x":12,"y":9},"id":5,"options":{"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"sum by (cluster, job, instance) (prometheus_sd_discovered_targets{cluster=~\"$cluster\", job=~\"$job\",instance=~\"$instance\"})","format":"time_series","legendFormat":"{{cluster}}:{{job}}:{{instance}}"}],"title":"Targets","type":"timeseries"},{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":16},"id":6,"panels":[],"title":"Retrieval","type":"row"},{"datasource":{"type":"prometheus","uid":"$datasource"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never"},"min":0,"unit":"ms"}},"gridPos":{"h":7,"w":8,"x":0,"y":17},"id":7,"options":{"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"rate(prometheus_target_interval_length_seconds_sum{cluster=~\"$cluster\", job=~\"$job\",instance=~\"$instance\"}[5m]) / rate(prometheus_target_interval_length_seconds_count{cluster=~\"$cluster\", job=~\"$job\",instance=~\"$instance\"}[5m]) * 1e3","format":"time_series","legendFormat":"{{cluster}}:{{job}}:{{instance}} {{interval}} configured"}],"title":"Average Scrape Interval Duration","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"$datasource"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"lineWidth":0,"showPoints":"never","stacking":{"mode":"normal"}},"min":0,"unit":"short"}},"gridPos":{"h":7,"w":8,"x":8,"y":17},"id":8,"options":{"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"sum by (cluster, job, instance) (rate(prometheus_target_scrapes_exceeded_body_size_limit_total{cluster=~\"$cluster\",job=~\"$job\",instance=~\"$instance\"}[1m]))","format":"time_series","legendFormat":"exceeded body size limit: {{cluster}} {{job}} {{instance}}"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"sum by (cluster, job, instance) (rate(prometheus_target_scrapes_exceeded_sample_limit_total{cluster=~\"$cluster\",job=~\"$job\",instance=~\"$instance\"}[1m]))","format":"time_series","legendFormat":"exceeded sample limit: {{cluster}} {{job}} {{instance}}"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"sum by (cluster, job, instance) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{cluster=~\"$cluster\",job=~\"$job\",instance=~\"$instance\"}[1m]))","format":"time_series","legendFormat":"duplicate timestamp: {{cluster}} {{job}} {{instance}}"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"sum by (cluster, job, instance) (rate(prometheus_target_scrapes_sample_out_of_bounds_total{cluster=~\"$cluster\",job=~\"$job\",instance=~\"$instance\"}[1m]))","format":"time_series","legendFormat":"out of bounds: {{cluster}} {{job}} {{instance}}"},{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"sum by (cluster, job, instance) (rate(prometheus_target_scrapes_sample_out_of_order_total{cluster=~\"$cluster\",job=~\"$job\",instance=~\"$instance\"}[1m]))","format":"time_series","legendFormat":"out of order: {{cluster}} {{job}} {{instance}}"}],"title":"Scrape failures","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"$datasource"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"lineWidth":0,"showPoints":"never","stacking":{"mode":"normal"}},"min":0,"unit":"short"}},"gridPos":{"h":7,"w":8,"x":16,"y":17},"id":9,"options":{"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"rate(prometheus_tsdb_head_samples_appended_total{cluster=~\"$cluster\", job=~\"$job\",instance=~\"$instance\"}[5m])","format":"time_series","legendFormat":"{{cluster}} {{job}} {{instance}}"}],"title":"Appended Samples","type":"timeseries"},{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":24},"id":10,"panels":[],"title":"Storage","type":"row"},{"datasource":{"type":"prometheus","uid":"$datasource"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"lineWidth":0,"showPoints":"never","stacking":{"mode":"normal"}},"min":0,"unit":"short"}},"gridPos":{"h":7,"w":12,"x":0,"y":25},"id":11,"options":{"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"prometheus_tsdb_head_series{cluster=~\"$cluster\",job=~\"$job\",instance=~\"$instance\"}","format":"time_series","legendFormat":"{{cluster}} {{job}} {{instance}} head series"}],"title":"Head Series","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"$datasource"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"lineWidth":0,"showPoints":"never","stacking":{"mode":"normal"}},"min":0,"unit":"short"}},"gridPos":{"h":7,"w":12,"x":12,"y":25},"id":12,"options":{"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"prometheus_tsdb_head_chunks{cluster=~\"$cluster\",job=~\"$job\",instance=~\"$instance\"}","format":"time_series","legendFormat":"{{cluster}} {{job}} {{instance}} head chunks"}],"title":"Head Chunks","type":"timeseries"},{"collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":32},"id":13,"panels":[],"title":"Query","type":"row"},{"datasource":{"type":"prometheus","uid":"$datasource"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"lineWidth":0,"showPoints":"never","stacking":{"mode":"normal"}},"min":0,"unit":"short"}},"gridPos":{"h":7,"w":12,"x":0,"y":33},"id":14,"options":{"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"rate(prometheus_engine_query_duration_seconds_count{cluster=~\"$cluster\",job=~\"$job\",instance=~\"$instance\",slice=\"inner_eval\"}[5m])","format":"time_series","legendFormat":"{{cluster}} {{job}} {{instance}}"}],"title":"Query Rate","type":"timeseries"},{"datasource":{"type":"prometheus","uid":"$datasource"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":100,"lineWidth":0,"showPoints":"never","stacking":{"mode":"normal"}},"min":0,"unit":"ms"}},"gridPos":{"h":7,"w":12,"x":12,"y":33},"id":15,"options":{"tooltip":{"mode":"multi","sort":"desc"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"$datasource"},"expr":"max by (slice) (prometheus_engine_query_duration_seconds{quantile=\"0.9\",cluster=~\"$cluster\", job=~\"$job\",instance=~\"$instance\"}) * 1e3","format":"time_series","legendFormat":"{{slice}}"}],"title":"Stage Duration","type":"timeseries"}],"schemaVersion":39,"tags":["prometheus-mixin"],"templating":{"list":[{"current":{"selected":false,"text":"default","value":"default"},"hide":0,"label":"Data source","name":"datasource","query":"prometheus","type":"datasource"},{"allValue":".*","current":{"selected":false,"text":["$__all"],"value":["$__all"]},"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":2,"includeAll":true,"label":"cluster","multi":true,"name":"cluster","query":"label_values(prometheus_build_info{}, cluster)","refresh":2,"sort":2,"type":"query"},{"allValue":".+","datasource":{"type":"prometheus","uid":"${datasource}"},"includeAll":true,"label":"job","multi":true,"name":"job","query":"label_values(prometheus_build_info{cluster=~\"$cluster\"}, job)","refresh":2,"sort":2,"type":"query"},{"allValue":".+","datasource":{"type":"prometheus","uid":"${datasource}"},"includeAll":true,"label":"instance","multi":true,"name":"instance","query":"label_values(prometheus_build_info{cluster=~\"$cluster\", job=~\"$job\"}, instance)","refresh":2,"sort":2,"type":"query"}]},"time":{"from":"now-1h","to":"now"},"timepicker":{"refresh_intervals":["60s"]},"timezone": "utc","title":"Prometheus / Overview","uid":"9fa0d141-d019-4ad7-8bc5-42196ee308bd"} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/proxy.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-proxy - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - proxy.json: |- - {"editable":true,"links":[{"asDropdown":true,"includeVars":true,"keepTime":true,"tags":["kubernetes-mixin"],"targetBlank":false,"title":"Kubernetes","type":"dashboards"}],"panels":[{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"none"}},"gridPos":{"h":7,"w":4,"x":0,"y":0},"id":1,"interval":"1m","options":{"colorMode":"none"},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(up{cluster=\"$cluster\", job=\"kube-proxy\"})","instant":true}],"title":"Up","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"ops"}},"gridPos":{"h":7,"w":10,"x":4,"y":0},"id":2,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(kubeproxy_sync_proxy_rules_duration_seconds_count{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[$__rate_interval]))","legendFormat":"rate"}],"title":"Rules Sync Rate","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"s"}},"gridPos":{"h":7,"w":10,"x":14,"y":0},"id":3,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"histogram_quantile(0.99,rate(kubeproxy_sync_proxy_rules_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[$__rate_interval]))","legendFormat":"{{instance}}"}],"title":"Rules Sync Latency 99th Quantile","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"ops"}},"gridPos":{"h":7,"w":12,"x":0,"y":7},"id":4,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(kubeproxy_network_programming_duration_seconds_count{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[$__rate_interval]))","legendFormat":"rate"}],"title":"Network Programming Rate","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"s"}},"gridPos":{"h":7,"w":12,"x":12,"y":7},"id":5,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[$__rate_interval])) by (instance, le))","legendFormat":"{{instance}}"}],"title":"Network Programming Latency 99th Quantile","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"ops"}},"gridPos":{"h":7,"w":8,"x":0,"y":14},"id":6,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kube-proxy\", instance=~\"$instance\",code=~\"2..\"}[$__rate_interval]))","legendFormat":"2xx"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kube-proxy\", instance=~\"$instance\",code=~\"3..\"}[$__rate_interval]))","legendFormat":"3xx"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kube-proxy\", instance=~\"$instance\",code=~\"4..\"}[$__rate_interval]))","legendFormat":"4xx"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kube-proxy\", instance=~\"$instance\",code=~\"5..\"}[$__rate_interval]))","legendFormat":"5xx"}],"title":"Kube API Request Rate","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"ops"}},"gridPos":{"h":7,"w":16,"x":8,"y":14},"id":7,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\",verb=\"POST\"}[$__rate_interval])) by (verb, le))","legendFormat":"{{verb}}"}],"title":"Post Request Latency 99th Quantile","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"s"}},"gridPos":{"h":7,"w":24,"x":0,"y":21},"id":8,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\", verb=\"GET\"}[$__rate_interval])) by (verb, le))","legendFormat":"{{verb}}"}],"title":"Get Request Latency 99th Quantile","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"bytes"}},"gridPos":{"h":7,"w":8,"x":0,"y":28},"id":9,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"process_resident_memory_bytes{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\"}","legendFormat":"{{instance}}"}],"title":"Memory","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"short"}},"gridPos":{"h":7,"w":8,"x":8,"y":28},"id":10,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\"}[$__rate_interval])","legendFormat":"{{instance}}"}],"title":"CPU usage","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"short"}},"gridPos":{"h":7,"w":8,"x":16,"y":28},"id":11,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"go_goroutines{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\"}","legendFormat":"{{instance}}"}],"title":"Goroutines","type":"timeseries"}],"refresh":"10s","schemaVersion":39,"tags":["kubernetes-mixin"],"templating":{"list":[{"current":{"selected":true,"text":"default","value":"default"},"hide":0,"label":"Data source","name":"datasource","query":"prometheus","regex":"","type":"datasource"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":2,"label":"cluster","name":"cluster","query":"label_values(up{job=\"kube-proxy\"}, cluster)","refresh":2,"sort":1,"type":"query","allValue":".*"},{"allValue":".+","datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"includeAll":true,"label":"instance","name":"instance","query":"label_values(up{job=\"kube-proxy\", cluster=\"$cluster\", job=\"kube-proxy\"}, instance)","refresh":2,"type":"query"}]},"time":{"from":"now-1h","to":"now"},"timezone": "utc","title":"Kubernetes / Proxy","uid":"632e265de029684c40b21cb76bca4f94"} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/scheduler.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-scheduler - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - scheduler.json: |- - {"editable":true,"links":[{"asDropdown":true,"includeVars":true,"keepTime":true,"tags":["kubernetes-mixin"],"targetBlank":false,"title":"Kubernetes","type":"dashboards"}],"panels":[{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"unit":"none"}},"gridPos":{"h":7,"w":4,"x":0,"y":0},"id":1,"interval":"1m","options":{"colorMode":"none"},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(up{cluster=\"$cluster\", job=\"kube-scheduler\"})","instant":true}],"title":"Up","type":"stat"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"ops"}},"gridPos":{"h":7,"w":10,"x":4,"y":0},"id":2,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(scheduler_e2e_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[$__rate_interval])) by (cluster, instance)","legendFormat":"{{cluster}} {{instance}} e2e"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(scheduler_binding_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[$__rate_interval])) by (cluster, instance)","legendFormat":"{{cluster}} {{instance}} binding"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[$__rate_interval])) by (cluster, instance)","legendFormat":"{{cluster}} {{instance}} scheduling algorithm"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(scheduler_volume_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[$__rate_interval])) by (cluster, instance)","legendFormat":"{{cluster}} {{instance}} volume"}],"title":"Scheduling Rate","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"s"}},"gridPos":{"h":7,"w":10,"x":14,"y":0},"id":3,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[$__rate_interval])) by (cluster, instance, le))","legendFormat":"{{cluster}} {{instance}} e2e"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[$__rate_interval])) by (cluster, instance, le))","legendFormat":"{{cluster}} {{instance}} binding"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[$__rate_interval])) by (cluster, instance, le))","legendFormat":"{{cluster}} {{instance}} scheduling algorithm"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[$__rate_interval])) by (cluster, instance, le))","legendFormat":"{{cluster}} {{instance}} volume"}],"title":"Scheduling latency 99th Quantile","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"ops"}},"gridPos":{"h":7,"w":8,"x":0,"y":7},"id":4,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"2..\"}[$__rate_interval]))","legendFormat":"2xx"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"3..\"}[$__rate_interval]))","legendFormat":"3xx"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"4..\"}[$__rate_interval]))","legendFormat":"4xx"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"5..\"}[$__rate_interval]))","legendFormat":"5xx"}],"title":"Kube API Request Rate","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"ops"}},"gridPos":{"h":7,"w":16,"x":8,"y":7},"id":5,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\", verb=\"POST\"}[$__rate_interval])) by (verb, le))","legendFormat":"{{verb}}"}],"title":"Post Request Latency 99th Quantile","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"s"}},"gridPos":{"h":7,"w":24,"x":0,"y":14},"id":6,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\", verb=\"GET\"}[$__rate_interval])) by (verb, le))","legendFormat":"{{verb}}"}],"title":"Get Request Latency 99th Quantile","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"bytes"}},"gridPos":{"h":7,"w":8,"x":0,"y":21},"id":7,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"process_resident_memory_bytes{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}","legendFormat":"{{instance}}"}],"title":"Memory","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"short"}},"gridPos":{"h":7,"w":8,"x":8,"y":21},"id":8,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[$__rate_interval])","legendFormat":"{{instance}}"}],"title":"CPU usage","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"short"}},"gridPos":{"h":7,"w":8,"x":16,"y":21},"id":9,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"go_goroutines{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}","legendFormat":"{{instance}}"}],"title":"Goroutines","type":"timeseries"}],"refresh":"10s","schemaVersion":39,"tags":["kubernetes-mixin"],"templating":{"list":[{"current":{"selected":true,"text":"default","value":"default"},"hide":0,"label":"Data source","name":"datasource","query":"prometheus","regex":"","type":"datasource"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":2,"label":"cluster","name":"cluster","query":"label_values(up{job=\"kube-scheduler\"}, cluster)","refresh":2,"sort":1,"type":"query","allValue":".*"},{"allValue":".+","datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"includeAll":true,"label":"instance","name":"instance","query":"label_values(up{job=\"kube-scheduler\", cluster=\"$cluster\"}, instance)","refresh":2,"type":"query"}]},"time":{"from":"now-1h","to":"now"},"timezone": "utc","title":"Kubernetes / Scheduler","uid":"2e6b6a3b4bddf1427b3a55aa1311c656"} ---- -# Source: kube-prometheus-stack/templates/grafana/dashboards-1.14/workload-total.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: prometheus - name: kube-prometheus-stack-1754-workload-total - annotations: - {} - labels: - grafana_dashboard: "1" - app: kube-prometheus-stack-grafana - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -data: - workload-total.json: |- - {"editable":true,"links":[{"asDropdown":true,"includeVars":true,"keepTime":true,"tags":["kubernetes-mixin"],"targetBlank":false,"title":"Kubernetes","type":"dashboards"}],"panels":[{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"color":{"fixedColor":"green","mode":"fixed"},"unit":"Bps"}},"gridPos":{"h":9,"w":12,"x":0,"y":0},"id":1,"interval":"1m","options":{"displayMode":"basic","showUnfilled":false},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_receive_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Current Rate of Bytes Received","type":"bargauge"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"color":{"fixedColor":"green","mode":"fixed"},"unit":"Bps"}},"gridPos":{"h":9,"w":12,"x":12,"y":0},"id":2,"interval":"1m","options":{"displayMode":"basic","showUnfilled":false},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Current Rate of Bytes Transmitted","type":"bargauge"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"color":{"fixedColor":"green","mode":"fixed"},"unit":"Bps"}},"gridPos":{"h":9,"w":12,"x":0,"y":9},"id":3,"interval":"1m","options":{"displayMode":"basic","showUnfilled":false},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(avg(rate(container_network_receive_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Average Rate of Bytes Received","type":"bargauge"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"color":{"fixedColor":"green","mode":"fixed"},"unit":"Bps"}},"gridPos":{"h":9,"w":12,"x":12,"y":9},"id":4,"interval":"1m","options":{"displayMode":"basic","showUnfilled":false},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(avg(rate(container_network_transmit_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Average Rate of Bytes Transmitted","type":"bargauge"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"binBps"}},"gridPos":{"h":9,"w":12,"x":0,"y":18},"id":5,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_receive_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Receive Bandwidth","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"binBps"}},"gridPos":{"h":9,"w":12,"x":12,"y":18},"id":6,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Transmit Bandwidth","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":9,"w":12,"x":0,"y":27},"id":7,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_receive_packets_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Rate of Received Packets","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":9,"w":12,"x":12,"y":27},"id":8,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_transmit_packets_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Rate of Transmitted Packets","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":9,"w":12,"x":0,"y":36},"id":9,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_receive_packets_dropped_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Rate of Received Packets Dropped","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":9,"w":12,"x":12,"y":36},"id":10,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_transmit_packets_dropped_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Rate of Transmitted Packets Dropped","type":"timeseries"}],"refresh":"10s","schemaVersion":39,"tags":["kubernetes-mixin"],"templating":{"list":[{"current":{"selected":true,"text":"default","value":"default"},"hide":0,"label":"Data source","name":"datasource","query":"prometheus","regex":"","type":"datasource"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":2,"label":"cluster","name":"cluster","query":"label_values(kube_pod_info{job=\"kube-state-metrics\"}, cluster)","refresh":2,"sort":1,"type":"query","allValue":".*"},{"allValue":".+","current":{"selected":false,"text":"kube-system","value":"kube-system"},"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"includeAll":true,"label":"namespace","name":"namespace","query":"label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)","refresh":2,"sort":1,"type":"query"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"label":"workload","name":"workload","query":"label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\"}, workload)","refresh":2,"sort":1,"type":"query"},{"allValue":".+","datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"includeAll":true,"label":"workload_type","name":"type","query":"label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\"}, workload_type)","refresh":2,"sort":1,"type":"query"}]},"time":{"from":"now-1h","to":"now"},"timezone": "utc","title":"Kubernetes / Networking / Workload","uid":"728bf77cc1166d2f3133bf25846876cc"} ---- -# Source: kube-prometheus-stack/charts/grafana/templates/clusterrole.yaml -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - labels: - helm.sh/chart: grafana-9.3.0 - app.kubernetes.io/name: grafana - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "12.1.0" - name: kube-prometheus-stack-1754164871-grafana-clusterrole -rules: - - apiGroups: [""] # "" indicates the core API group - resources: ["configmaps", "secrets"] - verbs: ["get", "watch", "list"] ---- -# Source: kube-prometheus-stack/charts/kube-state-metrics/templates/role.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - helm.sh/chart: kube-state-metrics-6.1.0 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "2.16.0" - release: kube-prometheus-stack-1754164871 - name: kube-prometheus-stack-1754164871-kube-state-metrics -rules: - -- apiGroups: ["certificates.k8s.io"] - resources: - - certificatesigningrequests - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - configmaps - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - cronjobs - verbs: ["list", "watch"] - -- apiGroups: ["apps"] - resources: - - daemonsets - verbs: ["list", "watch"] - -- apiGroups: ["apps"] - resources: - - deployments - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - endpoints - verbs: ["list", "watch"] - -- apiGroups: ["autoscaling"] - resources: - - horizontalpodautoscalers - verbs: ["list", "watch"] - -- apiGroups: ["networking.k8s.io"] - resources: - - ingresses - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - jobs - verbs: ["list", "watch"] - -- apiGroups: ["coordination.k8s.io"] - resources: - - leases - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - limitranges - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - mutatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - namespaces - verbs: ["list", "watch"] - -- apiGroups: ["networking.k8s.io"] - resources: - - networkpolicies - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - nodes - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumeclaims - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumes - verbs: ["list", "watch"] - -- apiGroups: ["policy"] - resources: - - poddisruptionbudgets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - pods - verbs: ["list", "watch"] - -- apiGroups: ["apps"] - resources: - - replicasets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - replicationcontrollers - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - resourcequotas - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - secrets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - services - verbs: ["list", "watch"] - -- apiGroups: ["apps"] - resources: - - statefulsets - verbs: ["list", "watch"] - -- apiGroups: ["storage.k8s.io"] - resources: - - storageclasses - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - validatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: ["storage.k8s.io"] - resources: - - volumeattachments - verbs: ["list", "watch"] ---- -# Source: kube-prometheus-stack/templates/prometheus-operator/clusterrole.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: kube-prometheus-stack-1754-operator - labels: - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" - app: kube-prometheus-stack-operator - app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator - app.kubernetes.io/component: prometheus-operator -rules: -- apiGroups: - - monitoring.coreos.com - resources: - - alertmanagers - - alertmanagers/finalizers - - alertmanagers/status - - alertmanagerconfigs - - prometheuses - - prometheuses/finalizers - - prometheuses/status - - prometheusagents - - prometheusagents/finalizers - - prometheusagents/status - - thanosrulers - - thanosrulers/finalizers - - thanosrulers/status - - scrapeconfigs - - servicemonitors - - podmonitors - - probes - - prometheusrules - verbs: - - '*' -- apiGroups: - - apps - resources: - - statefulsets - verbs: - - '*' -- apiGroups: - - "" - resources: - - configmaps - - secrets - verbs: - - '*' -- apiGroups: - - "" - resources: - - pods - verbs: - - list - - delete -- apiGroups: - - "" - resources: - - services - - services/finalizers - - endpoints - verbs: - - get - - create - - update - - delete -- apiGroups: - - "" - resources: - - nodes - verbs: - - list - - watch -- apiGroups: - - "" - resources: - - namespaces - verbs: - - get - - list - - watch -- apiGroups: - - "" - resources: - - events - verbs: - - patch - - create -- apiGroups: - - networking.k8s.io - resources: - - ingresses - verbs: - - get - - list - - watch -- apiGroups: - - storage.k8s.io - resources: - - storageclasses - verbs: - - get -- apiGroups: - - discovery.k8s.io - resources: - - endpointslices - verbs: - - get - - create - - list - - watch - - update - - delete ---- -# Source: kube-prometheus-stack/templates/prometheus/clusterrole.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: kube-prometheus-stack-1754-prometheus - labels: - app: kube-prometheus-stack-prometheus - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -rules: -# These permissions (to examine all namespaces) are not in the kube-prometheus repo. -# They're grabbed from https://github.com/prometheus/prometheus/blob/master/documentation/examples/rbac-setup.yml -# kube-prometheus deliberately defaults to a more restrictive setup that is not appropriate for our general audience. -- apiGroups: [""] - resources: - - nodes - - nodes/metrics - - services - - endpoints - - pods - verbs: ["get", "list", "watch"] -- apiGroups: ["discovery.k8s.io"] - resources: - - endpointslices - verbs: ["get", "list", "watch"] -- apiGroups: - - "networking.k8s.io" - resources: - - ingresses - verbs: ["get", "list", "watch"] -- nonResourceURLs: ["/metrics", "/metrics/cadvisor"] - verbs: ["get"] ---- -# Source: kube-prometheus-stack/charts/grafana/templates/clusterrolebinding.yaml -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: kube-prometheus-stack-1754164871-grafana-clusterrolebinding - labels: - helm.sh/chart: grafana-9.3.0 - app.kubernetes.io/name: grafana - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "12.1.0" -subjects: - - kind: ServiceAccount - name: kube-prometheus-stack-1754164871-grafana - namespace: prometheus -roleRef: - kind: ClusterRole - name: kube-prometheus-stack-1754164871-grafana-clusterrole - apiGroup: rbac.authorization.k8s.io ---- -# Source: kube-prometheus-stack/charts/kube-state-metrics/templates/clusterrolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - helm.sh/chart: kube-state-metrics-6.1.0 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "2.16.0" - release: kube-prometheus-stack-1754164871 - name: kube-prometheus-stack-1754164871-kube-state-metrics -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: kube-prometheus-stack-1754164871-kube-state-metrics -subjects: -- kind: ServiceAccount - name: kube-prometheus-stack-1754164871-kube-state-metrics - namespace: prometheus ---- -# Source: kube-prometheus-stack/templates/prometheus-operator/clusterrolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: kube-prometheus-stack-1754-operator - labels: - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" - app: kube-prometheus-stack-operator - app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator - app.kubernetes.io/component: prometheus-operator -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: kube-prometheus-stack-1754-operator -subjects: -- kind: ServiceAccount - name: kube-prometheus-stack-1754-operator - namespace: prometheus ---- -# Source: kube-prometheus-stack/templates/prometheus/clusterrolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: kube-prometheus-stack-1754-prometheus - labels: - app: kube-prometheus-stack-prometheus - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: kube-prometheus-stack-1754-prometheus -subjects: - - kind: ServiceAccount - name: kube-prometheus-stack-1754-prometheus - namespace: prometheus ---- -# Source: kube-prometheus-stack/charts/grafana/templates/role.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: kube-prometheus-stack-1754164871-grafana - namespace: prometheus - labels: - helm.sh/chart: grafana-9.3.0 - app.kubernetes.io/name: grafana - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "12.1.0" -rules: [] ---- -# Source: kube-prometheus-stack/charts/grafana/templates/rolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: kube-prometheus-stack-1754164871-grafana - namespace: prometheus - labels: - helm.sh/chart: grafana-9.3.0 - app.kubernetes.io/name: grafana - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "12.1.0" -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: kube-prometheus-stack-1754164871-grafana -subjects: -- kind: ServiceAccount - name: kube-prometheus-stack-1754164871-grafana - namespace: prometheus ---- -# Source: kube-prometheus-stack/charts/grafana/templates/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: kube-prometheus-stack-1754164871-grafana - namespace: prometheus - labels: - helm.sh/chart: grafana-9.3.0 - app.kubernetes.io/name: grafana - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "12.1.0" -spec: - type: ClusterIP - ports: - - name: http-web - port: 80 - protocol: TCP - targetPort: 3000 - selector: - app.kubernetes.io/name: grafana - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 ---- -# Source: kube-prometheus-stack/charts/kube-state-metrics/templates/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: kube-prometheus-stack-1754164871-kube-state-metrics - namespace: prometheus - labels: - helm.sh/chart: kube-state-metrics-6.1.0 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "2.16.0" - release: kube-prometheus-stack-1754164871 - annotations: -spec: - type: "ClusterIP" - ports: - - name: "http" - protocol: TCP - port: 8080 - targetPort: 8080 - - selector: - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 ---- -# Source: kube-prometheus-stack/charts/prometheus-node-exporter/templates/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: kube-prometheus-stack-1754164871-prometheus-node-exporter - namespace: prometheus - labels: - helm.sh/chart: prometheus-node-exporter-4.47.3 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: prometheus-node-exporter - app.kubernetes.io/name: prometheus-node-exporter - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "1.9.1" - release: kube-prometheus-stack-1754164871 - jobLabel: node-exporter - annotations: - prometheus.io/scrape: "true" -spec: - type: ClusterIP - ports: - - port: 9100 - targetPort: 9100 - protocol: TCP - name: http-metrics - selector: - app.kubernetes.io/name: prometheus-node-exporter - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 ---- -# Source: kube-prometheus-stack/templates/alertmanager/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: kube-prometheus-stack-1754-alertmanager - namespace: prometheus - labels: - app: kube-prometheus-stack-alertmanager - self-monitor: "true" - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - ports: - - name: http-web - port: 9093 - targetPort: 9093 - protocol: TCP - - name: reloader-web - appProtocol: http - port: 8080 - targetPort: reloader-web - selector: - app.kubernetes.io/name: alertmanager - alertmanager: kube-prometheus-stack-1754-alertmanager - sessionAffinity: None - type: "ClusterIP" ---- -# Source: kube-prometheus-stack/templates/exporters/core-dns/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: kube-prometheus-stack-1754-coredns - labels: - app: kube-prometheus-stack-coredns - jobLabel: coredns - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" - namespace: kube-system -spec: - clusterIP: None - ports: - - name: http-metrics - port: 9153 - protocol: TCP - targetPort: 9153 - selector: - k8s-app: kube-dns ---- -# Source: kube-prometheus-stack/templates/exporters/kube-controller-manager/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: kube-prometheus-stack-1754-kube-controller-manager - labels: - app: kube-prometheus-stack-kube-controller-manager - jobLabel: kube-controller-manager - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" - namespace: kube-system -spec: - clusterIP: None - ports: - - name: http-metrics - port: 10257 - protocol: TCP - targetPort: 10257 - selector: - component: kube-controller-manager - type: ClusterIP ---- -# Source: kube-prometheus-stack/templates/exporters/kube-etcd/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: kube-prometheus-stack-1754-kube-etcd - labels: - app: kube-prometheus-stack-kube-etcd - jobLabel: kube-etcd - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" - namespace: kube-system -spec: - clusterIP: None - ports: - - name: http-metrics - port: 2381 - protocol: TCP - targetPort: 2381 - selector: - component: etcd - type: ClusterIP ---- -# Source: kube-prometheus-stack/templates/exporters/kube-proxy/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: kube-prometheus-stack-1754-kube-proxy - labels: - app: kube-prometheus-stack-kube-proxy - jobLabel: kube-proxy - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" - namespace: kube-system -spec: - clusterIP: None - ports: - - name: http-metrics - port: 10249 - protocol: TCP - targetPort: 10249 - selector: - k8s-app: kube-proxy - type: ClusterIP ---- -# Source: kube-prometheus-stack/templates/exporters/kube-scheduler/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: kube-prometheus-stack-1754-kube-scheduler - labels: - app: kube-prometheus-stack-kube-scheduler - jobLabel: kube-scheduler - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" - namespace: kube-system -spec: - clusterIP: None - ports: - - name: http-metrics - port: 10259 - protocol: TCP - targetPort: 10259 - selector: - component: kube-scheduler - type: ClusterIP ---- -# Source: kube-prometheus-stack/templates/prometheus-operator/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: kube-prometheus-stack-1754-operator - namespace: prometheus - labels: - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" - app: kube-prometheus-stack-operator - app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator - app.kubernetes.io/component: prometheus-operator -spec: - ports: - - name: https - port: 443 - targetPort: https - selector: - app: kube-prometheus-stack-operator - release: "kube-prometheus-stack-1754164871" - type: "ClusterIP" ---- -# Source: kube-prometheus-stack/templates/prometheus/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: kube-prometheus-stack-1754-prometheus - namespace: prometheus - labels: - app: kube-prometheus-stack-prometheus - self-monitor: "true" - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - externalTrafficPolicy: Cluster - ports: - - name: http-web - nodePort: 30090 - port: 9090 - targetPort: 9090 - - name: reloader-web - appProtocol: http - port: 8080 - targetPort: reloader-web - publishNotReadyAddresses: false - selector: - app.kubernetes.io/name: prometheus - operator.prometheus.io/name: kube-prometheus-stack-1754-prometheus - sessionAffinity: None - type: "NodePort" ---- -# Source: kube-prometheus-stack/charts/prometheus-node-exporter/templates/daemonset.yaml -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: kube-prometheus-stack-1754164871-prometheus-node-exporter - namespace: prometheus - labels: - helm.sh/chart: prometheus-node-exporter-4.47.3 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: prometheus-node-exporter - app.kubernetes.io/name: prometheus-node-exporter - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "1.9.1" - release: kube-prometheus-stack-1754164871 -spec: - selector: - matchLabels: - app.kubernetes.io/name: prometheus-node-exporter - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - revisionHistoryLimit: 10 - updateStrategy: - rollingUpdate: - maxUnavailable: 1 - type: RollingUpdate - template: - metadata: - annotations: - cluster-autoscaler.kubernetes.io/safe-to-evict: "true" - labels: - helm.sh/chart: prometheus-node-exporter-4.47.3 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: prometheus-node-exporter - app.kubernetes.io/name: prometheus-node-exporter - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "1.9.1" - release: kube-prometheus-stack-1754164871 - jobLabel: node-exporter - spec: - automountServiceAccountToken: false - securityContext: - fsGroup: 65534 - runAsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - serviceAccountName: kube-prometheus-stack-1754164871-prometheus-node-exporter - containers: - - name: node-exporter - image: quay.io/prometheus/node-exporter:v1.9.1 - imagePullPolicy: IfNotPresent - args: - - --path.procfs=/host/proc - - --path.sysfs=/host/sys - - --path.rootfs=/host/root - - --path.udev.data=/host/root/run/udev/data - - --web.listen-address=[$(HOST_IP)]:9100 - - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/) - - --collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs|erofs)$ - securityContext: - readOnlyRootFilesystem: true - env: - - name: HOST_IP - value: 0.0.0.0 - ports: - - name: http-metrics - containerPort: 9100 - protocol: TCP - livenessProbe: - failureThreshold: 3 - httpGet: - httpHeaders: - path: / - port: http-metrics - scheme: HTTP - initialDelaySeconds: 0 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 1 - readinessProbe: - failureThreshold: 3 - httpGet: - httpHeaders: - path: / - port: http-metrics - scheme: HTTP - initialDelaySeconds: 0 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 1 - volumeMounts: - - name: proc - mountPath: /host/proc - readOnly: true - - name: sys - mountPath: /host/sys - readOnly: true - - name: root - mountPath: /host/root - mountPropagation: HostToContainer - readOnly: true - hostNetwork: true - hostPID: true - hostIPC: false - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: eks.amazonaws.com/compute-type - operator: NotIn - values: - - fargate - - key: type - operator: NotIn - values: - - virtual-kubelet - nodeSelector: - kubernetes.io/os: linux - tolerations: - - effect: NoSchedule - operator: Exists - volumes: - - name: proc - hostPath: - path: /proc - - name: sys - hostPath: - path: /sys - - name: root - hostPath: - path: / ---- -# Source: kube-prometheus-stack/charts/grafana/templates/deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: kube-prometheus-stack-1754164871-grafana - namespace: prometheus - labels: - helm.sh/chart: grafana-9.3.0 - app.kubernetes.io/name: grafana - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "12.1.0" -spec: - replicas: 1 - revisionHistoryLimit: 10 - selector: - matchLabels: - app.kubernetes.io/name: grafana - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - strategy: - type: RollingUpdate - template: - metadata: - labels: - helm.sh/chart: grafana-9.3.0 - app.kubernetes.io/name: grafana - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "12.1.0" - annotations: - checksum/config: 0e9cbd0ea8e24e32f7dfca5bab17a2ba05652642f0a09a4882833ae88e4cc4a3 - checksum/sc-dashboard-provider-config: e70bf6a851099d385178a76de9757bb0bef8299da6d8443602590e44f05fdf24 - checksum/secret: 032056e9c62bbe9d1daa41ee49cd3d9524c076f51ca4c65adadf4ef08ef28712 - kubectl.kubernetes.io/default-container: grafana - spec: - - serviceAccountName: kube-prometheus-stack-1754164871-grafana - automountServiceAccountToken: true - shareProcessNamespace: false - securityContext: - fsGroup: 472 - runAsGroup: 472 - runAsNonRoot: true - runAsUser: 472 - enableServiceLinks: true - containers: - - name: grafana-sc-dashboard - image: "quay.io/kiwigrid/k8s-sidecar:1.30.3" - imagePullPolicy: IfNotPresent - env: - - name: METHOD - value: WATCH - - name: LABEL - value: "grafana_dashboard" - - name: LABEL_VALUE - value: "1" - - name: FOLDER - value: "/tmp/dashboards" - - name: RESOURCE - value: "both" - - name: NAMESPACE - value: "ALL" - - name: REQ_USERNAME - valueFrom: - secretKeyRef: - name: kube-prometheus-stack-1754164871-grafana - key: admin-user - - name: REQ_PASSWORD - valueFrom: - secretKeyRef: - name: kube-prometheus-stack-1754164871-grafana - key: admin-password - - name: REQ_URL - value: http://localhost:3000/api/admin/provisioning/dashboards/reload - - name: REQ_METHOD - value: POST - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - seccompProfile: - type: RuntimeDefault - volumeMounts: - - name: sc-dashboard-volume - mountPath: "/tmp/dashboards" - - name: grafana-sc-datasources - image: "quay.io/kiwigrid/k8s-sidecar:1.30.3" - imagePullPolicy: IfNotPresent - env: - - name: METHOD - value: WATCH - - name: LABEL - value: "grafana_datasource" - - name: LABEL_VALUE - value: "1" - - name: FOLDER - value: "/etc/grafana/provisioning/datasources" - - name: RESOURCE - value: "both" - - name: REQ_USERNAME - valueFrom: - secretKeyRef: - name: kube-prometheus-stack-1754164871-grafana - key: admin-user - - name: REQ_PASSWORD - valueFrom: - secretKeyRef: - name: kube-prometheus-stack-1754164871-grafana - key: admin-password - - name: REQ_URL - value: http://localhost:3000/api/admin/provisioning/datasources/reload - - name: REQ_METHOD - value: POST - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - seccompProfile: - type: RuntimeDefault - volumeMounts: - - name: sc-datasources-volume - mountPath: "/etc/grafana/provisioning/datasources" - - name: grafana - image: "docker.io/grafana/grafana:12.1.0" - imagePullPolicy: IfNotPresent - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - seccompProfile: - type: RuntimeDefault - volumeMounts: - - name: config - mountPath: "/etc/grafana/grafana.ini" - subPath: grafana.ini - - name: storage - mountPath: "/var/lib/grafana" - - name: sc-dashboard-volume - mountPath: "/tmp/dashboards" - - name: sc-dashboard-provider - mountPath: "/etc/grafana/provisioning/dashboards/sc-dashboardproviders.yaml" - subPath: provider.yaml - - name: sc-datasources-volume - mountPath: "/etc/grafana/provisioning/datasources" - ports: - - name: grafana - containerPort: 3000 - protocol: TCP - - name: gossip-tcp - containerPort: 9094 - protocol: TCP - - name: gossip-udp - containerPort: 9094 - protocol: UDP - - name: profiling - containerPort: 6060 - protocol: TCP - env: - - name: POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: GF_SECURITY_ADMIN_USER - valueFrom: - secretKeyRef: - name: kube-prometheus-stack-1754164871-grafana - key: admin-user - - name: GF_SECURITY_ADMIN_PASSWORD - valueFrom: - secretKeyRef: - name: kube-prometheus-stack-1754164871-grafana - key: admin-password - - name: GF_PATHS_DATA - value: /var/lib/grafana/ - - name: GF_PATHS_LOGS - value: /var/log/grafana - - name: GF_PATHS_PLUGINS - value: /var/lib/grafana/plugins - - name: GF_PATHS_PROVISIONING - value: /etc/grafana/provisioning - livenessProbe: - failureThreshold: 10 - httpGet: - path: /api/health - port: 3000 - initialDelaySeconds: 60 - timeoutSeconds: 30 - readinessProbe: - httpGet: - path: /api/health - port: 3000 - volumes: - - name: config - configMap: - name: kube-prometheus-stack-1754164871-grafana - - name: storage - emptyDir: {} - - name: sc-dashboard-volume - emptyDir: {} - - name: sc-dashboard-provider - configMap: - name: kube-prometheus-stack-1754164871-grafana-config-dashboards - - name: sc-datasources-volume - emptyDir: {} ---- -# Source: kube-prometheus-stack/charts/kube-state-metrics/templates/deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: kube-prometheus-stack-1754164871-kube-state-metrics - namespace: prometheus - labels: - helm.sh/chart: kube-state-metrics-6.1.0 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "2.16.0" - release: kube-prometheus-stack-1754164871 -spec: - selector: - matchLabels: - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - replicas: 1 - strategy: - type: RollingUpdate - revisionHistoryLimit: 10 - template: - metadata: - labels: - helm.sh/chart: kube-state-metrics-6.1.0 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "2.16.0" - release: kube-prometheus-stack-1754164871 - spec: - automountServiceAccountToken: true - hostNetwork: false - serviceAccountName: kube-prometheus-stack-1754164871-kube-state-metrics - securityContext: - fsGroup: 65534 - runAsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - seccompProfile: - type: RuntimeDefault - dnsPolicy: ClusterFirst - containers: - - name: kube-state-metrics - args: - - --port=8080 - - --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments - imagePullPolicy: IfNotPresent - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.16.0 - ports: - - containerPort: 8080 - name: "http" - livenessProbe: - failureThreshold: 3 - httpGet: - httpHeaders: - path: /livez - port: 8080 - scheme: HTTP - initialDelaySeconds: 5 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 5 - readinessProbe: - failureThreshold: 3 - httpGet: - httpHeaders: - path: /readyz - port: 8081 - scheme: HTTP - initialDelaySeconds: 5 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 5 - resources: - {} - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true ---- -# Source: kube-prometheus-stack/templates/prometheus-operator/deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: kube-prometheus-stack-1754-operator - namespace: prometheus - labels: - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" - app: kube-prometheus-stack-operator - app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator - app.kubernetes.io/component: prometheus-operator -spec: - replicas: 1 - revisionHistoryLimit: 10 - selector: - matchLabels: - app: kube-prometheus-stack-operator - release: "kube-prometheus-stack-1754164871" - template: - metadata: - labels: - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" - app: kube-prometheus-stack-operator - app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator - app.kubernetes.io/component: prometheus-operator - spec: - containers: - - name: kube-prometheus-stack - image: "quay.io/prometheus-operator/prometheus-operator:v0.83.0" - imagePullPolicy: "IfNotPresent" - args: - - --kubelet-service=kube-system/kube-prometheus-stack-1754-kubelet - - --kubelet-endpoints=true - - --kubelet-endpointslice=false - - --localhost=127.0.0.1 - - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.83.0 - - --config-reloader-cpu-request=0 - - --config-reloader-cpu-limit=0 - - --config-reloader-memory-request=0 - - --config-reloader-memory-limit=0 - - --thanos-default-base-image=quay.io/thanos/thanos:v0.39.2 - - --secret-field-selector=type!=kubernetes.io/dockercfg,type!=kubernetes.io/service-account-token,type!=helm.sh/release.v1 - - --web.enable-tls=true - - --web.cert-file=/cert/cert - - --web.key-file=/cert/key - - --web.listen-address=:10250 - - --web.tls-min-version=VersionTLS13 - ports: - - containerPort: 10250 - name: https - env: - - name: GOGC - value: "30" - resources: - {} - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - volumeMounts: - - name: tls-secret - mountPath: /cert - readOnly: true - readinessProbe: - httpGet: - path: /healthz - port: https - scheme: HTTPS - initialDelaySeconds: 0 - periodSeconds: 10 - timeoutSeconds: 1 - successThreshold: 1 - failureThreshold: 3 - livenessProbe: - httpGet: - path: /healthz - port: https - scheme: HTTPS - initialDelaySeconds: 0 - periodSeconds: 10 - timeoutSeconds: 1 - successThreshold: 1 - failureThreshold: 3 - volumes: - - name: tls-secret - secret: - defaultMode: 420 - secretName: kube-prometheus-stack-1754-admission - securityContext: - fsGroup: 65534 - runAsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - seccompProfile: - type: RuntimeDefault - serviceAccountName: kube-prometheus-stack-1754-operator - automountServiceAccountToken: true - terminationGracePeriodSeconds: 30 ---- -# Source: kube-prometheus-stack/templates/alertmanager/alertmanager.yaml -apiVersion: monitoring.coreos.com/v1 -kind: Alertmanager -metadata: - name: kube-prometheus-stack-1754-alertmanager - namespace: prometheus - labels: - app: kube-prometheus-stack-alertmanager - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - image: "quay.io/prometheus/alertmanager:v0.28.1" - imagePullPolicy: "IfNotPresent" - version: v0.28.1 - replicas: 1 - listenLocal: false - serviceAccountName: kube-prometheus-stack-1754-alertmanager - automountServiceAccountToken: true - externalUrl: http://kube-prometheus-stack-1754-alertmanager.prometheus:9093 - paused: false - logFormat: "logfmt" - logLevel: "info" - retention: "120h" - alertmanagerConfigSelector: {} - alertmanagerConfigNamespaceSelector: - {} - routePrefix: "/" - securityContext: - fsGroup: 2000 - runAsGroup: 2000 - runAsNonRoot: true - runAsUser: 1000 - seccompProfile: - type: RuntimeDefault - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 100 - podAffinityTerm: - topologyKey: kubernetes.io/hostname - labelSelector: - matchExpressions: - - {key: app.kubernetes.io/name, operator: In, values: [alertmanager]} - - {key: alertmanager, operator: In, values: [kube-prometheus-stack-1754-alertmanager]} - portName: http-web ---- -# Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml -apiVersion: admissionregistration.k8s.io/v1 -kind: MutatingWebhookConfiguration -metadata: - name: kube-prometheus-stack-1754-admission - annotations: - - labels: - app: kube-prometheus-stack-admission - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" - app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator - app.kubernetes.io/component: prometheus-operator-webhook -webhooks: - - name: prometheusrulemutate.monitoring.coreos.com - failurePolicy: Ignore - rules: - - apiGroups: - - monitoring.coreos.com - apiVersions: - - "*" - resources: - - prometheusrules - operations: - - CREATE - - UPDATE - clientConfig: - service: - namespace: prometheus - name: kube-prometheus-stack-1754-operator - path: /admission-prometheusrules/mutate - timeoutSeconds: 10 - admissionReviewVersions: ["v1", "v1beta1"] - sideEffects: None ---- -# Source: kube-prometheus-stack/templates/prometheus/prometheus.yaml -apiVersion: monitoring.coreos.com/v1 -kind: Prometheus -metadata: - name: kube-prometheus-stack-1754-prometheus - namespace: prometheus - labels: - app: kube-prometheus-stack-prometheus - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - automountServiceAccountToken: true - alerting: - alertmanagers: - - namespace: prometheus - name: kube-prometheus-stack-1754-alertmanager - port: http-web - pathPrefix: "/" - apiVersion: v2 - image: "quay.io/prometheus/prometheus:v3.5.0" - imagePullPolicy: "IfNotPresent" - version: v3.5.0 - externalUrl: http://kube-prometheus-stack-1754-prometheus.prometheus:9090 - paused: false - replicas: 1 - shards: 1 - logLevel: info - logFormat: logfmt - listenLocal: false - enableOTLPReceiver: false - enableAdminAPI: false - retention: "10d" - tsdb: - outOfOrderTimeWindow: 0s - walCompression: true - routePrefix: "/" - serviceAccountName: kube-prometheus-stack-1754-prometheus - serviceMonitorSelector: {} - serviceMonitorNamespaceSelector: {} - podMonitorSelector: - matchLabels: - release: "kube-prometheus-stack-1754164871" - - podMonitorNamespaceSelector: {} - probeSelector: - matchLabels: - release: "kube-prometheus-stack-1754164871" - - probeNamespaceSelector: {} - securityContext: - fsGroup: 2000 - runAsGroup: 2000 - runAsNonRoot: true - runAsUser: 1000 - seccompProfile: - type: RuntimeDefault - ruleNamespaceSelector: {} - ruleSelector: - matchLabels: - release: "kube-prometheus-stack-1754164871" - - scrapeConfigSelector: - matchLabels: - release: "kube-prometheus-stack-1754164871" - - scrapeConfigNamespaceSelector: {} - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 100 - podAffinityTerm: - topologyKey: kubernetes.io/hostname - labelSelector: - matchExpressions: - - {key: app.kubernetes.io/name, operator: In, values: [prometheus]} - - {key: app.kubernetes.io/instance, operator: In, values: [kube-prometheus-stack-1754-prometheus]} - portName: http-web - hostNetwork: false ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-alertmanager.rules - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: alertmanager.rules - rules: - - alert: AlertmanagerFailedReload - annotations: - description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload - summary: Reloading an Alertmanager configuration has failed. - expr: |- - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(alertmanager_config_last_reload_successful{job="kube-prometheus-stack-1754-alertmanager",namespace="prometheus"}[5m]) == 0 - for: 10m - labels: - severity: critical - - alert: AlertmanagerMembersInconsistent - annotations: - description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent - summary: A member of an Alertmanager cluster has not found all other cluster members. - expr: |- - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(alertmanager_cluster_members{job="kube-prometheus-stack-1754-alertmanager",namespace="prometheus"}[5m]) - < on (namespace,service,cluster) group_left - count by (namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="kube-prometheus-stack-1754-alertmanager",namespace="prometheus"}[5m])) - for: 15m - labels: - severity: critical - - alert: AlertmanagerFailedToSendAlerts - annotations: - description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts - summary: An Alertmanager instance failed to send notifications. - expr: |- - ( - rate(alertmanager_notifications_failed_total{job="kube-prometheus-stack-1754-alertmanager",namespace="prometheus"}[15m]) - / - ignoring (reason) group_left rate(alertmanager_notifications_total{job="kube-prometheus-stack-1754-alertmanager",namespace="prometheus"}[15m]) - ) - > 0.01 - for: 5m - labels: - severity: warning - - alert: AlertmanagerClusterFailedToSendAlerts - annotations: - description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts - summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration. - expr: |- - min by (namespace,service, integration) ( - rate(alertmanager_notifications_failed_total{job="kube-prometheus-stack-1754-alertmanager",namespace="prometheus", integration=~`.*`}[15m]) - / - ignoring (reason) group_left rate(alertmanager_notifications_total{job="kube-prometheus-stack-1754-alertmanager",namespace="prometheus", integration=~`.*`}[15m]) - ) - > 0.01 - for: 5m - labels: - severity: critical - - alert: AlertmanagerClusterFailedToSendAlerts - annotations: - description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts - summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration. - expr: |- - min by (namespace,service, integration) ( - rate(alertmanager_notifications_failed_total{job="kube-prometheus-stack-1754-alertmanager",namespace="prometheus", integration!~`.*`}[15m]) - / - ignoring (reason) group_left rate(alertmanager_notifications_total{job="kube-prometheus-stack-1754-alertmanager",namespace="prometheus", integration!~`.*`}[15m]) - ) - > 0.01 - for: 5m - labels: - severity: warning - - alert: AlertmanagerConfigInconsistent - annotations: - description: Alertmanager instances within the {{$labels.job}} cluster have different configurations. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent - summary: Alertmanager instances within the same cluster have different configurations. - expr: |- - count by (namespace,service,cluster) ( - count_values by (namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="kube-prometheus-stack-1754-alertmanager",namespace="prometheus"}) - ) - != 1 - for: 20m - labels: - severity: critical - - alert: AlertmanagerClusterDown - annotations: - description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown - summary: Half or more of the Alertmanager instances within the same cluster are down. - expr: |- - ( - count by (namespace,service,cluster) ( - avg_over_time(up{job="kube-prometheus-stack-1754-alertmanager",namespace="prometheus"}[5m]) < 0.5 - ) - / - count by (namespace,service,cluster) ( - up{job="kube-prometheus-stack-1754-alertmanager",namespace="prometheus"} - ) - ) - >= 0.5 - for: 5m - labels: - severity: critical - - alert: AlertmanagerClusterCrashlooping - annotations: - description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping - summary: Half or more of the Alertmanager instances within the same cluster are crashlooping. - expr: |- - ( - count by (namespace,service,cluster) ( - changes(process_start_time_seconds{job="kube-prometheus-stack-1754-alertmanager",namespace="prometheus"}[10m]) > 4 - ) - / - count by (namespace,service,cluster) ( - up{job="kube-prometheus-stack-1754-alertmanager",namespace="prometheus"} - ) - ) - >= 0.5 - for: 5m - labels: - severity: critical ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/config-reloaders.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-config-reloaders - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: config-reloaders - rules: - - alert: ConfigReloaderSidecarErrors - annotations: - description: 'Errors encountered while the {{$labels.pod}} config-reloader sidecar attempts to sync config in {{$labels.namespace}} namespace. - - As a result, configuration for service running in {{$labels.pod}} may be stale and cannot be updated anymore.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/configreloadersidecarerrors - summary: config-reloader sidecar has not had a successful reload for 10m - expr: max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m]) == 0 - for: 10m - labels: - severity: warning ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-etcd - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: etcd - rules: - - alert: etcdMembersDown - annotations: - description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).' - summary: etcd cluster members are down. - expr: |- - max without (endpoint) ( - sum without (instance, pod) (up{job=~".*etcd.*"} == bool 0) - or - count without (To) ( - sum without (instance, pod) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01 - ) - ) - > 0 - for: 20m - labels: - severity: warning - - alert: etcdInsufficientMembers - annotations: - description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).' - summary: etcd cluster has insufficient number of members. - expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance, pod) < ((count(up{job=~".*etcd.*"}) without (instance, pod) + 1) / 2) - for: 3m - labels: - severity: critical - - alert: etcdNoLeader - annotations: - description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.' - summary: etcd cluster has no leader. - expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 - for: 1m - labels: - severity: critical - - alert: etcdHighNumberOfLeaderChanges - annotations: - description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' - summary: etcd cluster has high number of leader changes. - expr: increase((max without (instance, pod) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4 - for: 5m - labels: - severity: warning - - alert: etcdHighNumberOfFailedGRPCRequests - annotations: - description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' - summary: etcd cluster has high number of failed grpc requests. - expr: |- - 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) - / - sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) - > 1 - for: 10m - labels: - severity: warning - - alert: etcdHighNumberOfFailedGRPCRequests - annotations: - description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' - summary: etcd cluster has high number of failed grpc requests. - expr: |- - 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) - / - sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) - > 5 - for: 5m - labels: - severity: critical - - alert: etcdGRPCRequestsSlow - annotations: - description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method }} method.' - summary: etcd grpc requests are slow - expr: |- - histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type)) - > 0.15 - for: 10m - labels: - severity: critical - - alert: etcdMemberCommunicationSlow - annotations: - description: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.' - summary: etcd cluster member communication is slow. - expr: |- - histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) - > 0.15 - for: 10m - labels: - severity: warning - - alert: etcdHighNumberOfFailedProposals - annotations: - description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.' - summary: etcd cluster has high number of proposal failures. - expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 - for: 15m - labels: - severity: warning - - alert: etcdHighFsyncDurations - annotations: - description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' - summary: etcd cluster 99th percentile fsync durations are too high. - expr: |- - histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) - > 0.5 - for: 10m - labels: - severity: warning - - alert: etcdHighFsyncDurations - annotations: - description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' - summary: etcd cluster 99th percentile fsync durations are too high. - expr: |- - histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) - > 1 - for: 10m - labels: - severity: critical - - alert: etcdHighCommitDurations - annotations: - description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.' - summary: etcd cluster 99th percentile commit durations are too high. - expr: |- - histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) - > 0.25 - for: 10m - labels: - severity: warning - - alert: etcdDatabaseQuotaLowSpace - annotations: - description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.' - summary: etcd cluster database is running full. - expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95 - for: 10m - labels: - severity: critical - - alert: etcdExcessiveDatabaseGrowth - annotations: - description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.' - summary: etcd cluster database growing very fast. - expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60) > etcd_server_quota_backend_bytes{job=~".*etcd.*"} - for: 10m - labels: - severity: warning - - alert: etcdDatabaseHighFragmentationRatio - annotations: - description: 'etcd cluster "{{ $labels.job }}": database size in use on instance {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.' - runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation - summary: etcd database size in use is less than 50% of the actual allocated storage. - expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5 and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600 - for: 10m - labels: - severity: warning ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-general.rules - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: general.rules - rules: - - alert: TargetDown - annotations: - description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown - summary: One or more targets are unreachable. - expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) BY (cluster, job, namespace, service)) > 10 - for: 10m - labels: - severity: warning - - alert: Watchdog - annotations: - description: 'This is an alert meant to ensure that the entire alerting pipeline is functional. - - This alert is always firing, therefore it should always be firing in Alertmanager - - and always fire against a receiver. There are integrations with various notification - - mechanisms that send a notification when this alert is not firing. For example the - - "DeadMansSnitch" integration in PagerDuty. - - ' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog - summary: An alert that should always be firing to certify that Alertmanager is working properly. - expr: vector(1) - labels: - severity: none - - alert: InfoInhibitor - annotations: - description: 'This is an alert that is used to inhibit info alerts. - - By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with - - other alerts. - - This alert fires whenever there''s a severity="info" alert, and stops firing when another alert with a - - severity of ''warning'' or ''critical'' starts firing on the same namespace. - - This alert should be routed to a null receiver and configured to inhibit alerts with severity="info". - - ' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor - summary: Info-level alert inhibition. - expr: ALERTS{severity = "info"} == 1 unless on (namespace) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1 - labels: - severity: none ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_cpu_usage_seconds_total.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-k8s.rules.container-cpu-usage-second - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: k8s.rules.container_cpu_usage_seconds_total - rules: - - expr: |- - sum by (cluster, namespace, pod, container) ( - rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m]) - ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( - 1, max by (cluster, namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m - - expr: |- - sum by (cluster, namespace, pod, container) ( - irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m]) - ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( - 1, max by (cluster, namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_cache.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-k8s.rules.container-memory-cache - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: k8s.rules.container_memory_cache - rules: - - expr: |- - container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1, - max by (cluster, namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_cache ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_rss.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-k8s.rules.container-memory-rss - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: k8s.rules.container_memory_rss - rules: - - expr: |- - container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1, - max by (cluster, namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_rss ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_swap.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-k8s.rules.container-memory-swap - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: k8s.rules.container_memory_swap - rules: - - expr: |- - container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1, - max by (cluster, namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_swap ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_working_set_bytes.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-k8s.rules.container-memory-working-s - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: k8s.rules.container_memory_working_set_bytes - rules: - - expr: |- - container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1, - max by (cluster, namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_working_set_bytes ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_resource.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-k8s.rules.container-resource - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: k8s.rules.container_resource - rules: - - expr: |- - kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster) - group_left() max by (namespace, pod, cluster) ( - (kube_pod_status_phase{phase=~"Pending|Running"} == 1) - ) - record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests - - expr: |- - sum by (namespace, cluster) ( - sum by (namespace, pod, cluster) ( - max by (namespace, pod, container, cluster) ( - kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} - ) * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( - kube_pod_status_phase{phase=~"Pending|Running"} == 1 - ) - ) - ) - record: namespace_memory:kube_pod_container_resource_requests:sum - - expr: |- - kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster) - group_left() max by (namespace, pod, cluster) ( - (kube_pod_status_phase{phase=~"Pending|Running"} == 1) - ) - record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests - - expr: |- - sum by (namespace, cluster) ( - sum by (namespace, pod, cluster) ( - max by (namespace, pod, container, cluster) ( - kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} - ) * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( - kube_pod_status_phase{phase=~"Pending|Running"} == 1 - ) - ) - ) - record: namespace_cpu:kube_pod_container_resource_requests:sum - - expr: |- - kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster) - group_left() max by (namespace, pod, cluster) ( - (kube_pod_status_phase{phase=~"Pending|Running"} == 1) - ) - record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits - - expr: |- - sum by (namespace, cluster) ( - sum by (namespace, pod, cluster) ( - max by (namespace, pod, container, cluster) ( - kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} - ) * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( - kube_pod_status_phase{phase=~"Pending|Running"} == 1 - ) - ) - ) - record: namespace_memory:kube_pod_container_resource_limits:sum - - expr: |- - kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster) - group_left() max by (namespace, pod, cluster) ( - (kube_pod_status_phase{phase=~"Pending|Running"} == 1) - ) - record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits - - expr: |- - sum by (namespace, cluster) ( - sum by (namespace, pod, cluster) ( - max by (namespace, pod, container, cluster) ( - kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} - ) * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( - kube_pod_status_phase{phase=~"Pending|Running"} == 1 - ) - ) - ) - record: namespace_cpu:kube_pod_container_resource_limits:sum ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.pod_owner.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-k8s.rules.pod-owner - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: k8s.rules.pod_owner - rules: - - expr: |- - max by (cluster, namespace, workload, pod) ( - label_replace( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, - "replicaset", "$1", "owner_name", "(.*)" - ) * on (cluster, replicaset, namespace) group_left(owner_name) topk by (cluster, replicaset, namespace) ( - 1, max by (cluster, replicaset, namespace, owner_name) ( - kube_replicaset_owner{job="kube-state-metrics", owner_kind=""} - ) - ), - "workload", "$1", "replicaset", "(.*)" - ) - ) - labels: - workload_type: replicaset - record: namespace_workload_pod:kube_pod_owner:relabel - - expr: |- - max by (cluster, namespace, workload, pod) ( - label_replace( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, - "replicaset", "$1", "owner_name", "(.*)" - ) * on (replicaset, namespace, cluster) group_left(owner_name) topk by (cluster, replicaset, namespace) ( - 1, max by (cluster, replicaset, namespace, owner_name) ( - kube_replicaset_owner{job="kube-state-metrics", owner_kind="Deployment"} - ) - ), - "workload", "$1", "owner_name", "(.*)" - ) - ) - labels: - workload_type: deployment - record: namespace_workload_pod:kube_pod_owner:relabel - - expr: |- - max by (cluster, namespace, workload, pod) ( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, - "workload", "$1", "owner_name", "(.*)" - ) - ) - labels: - workload_type: daemonset - record: namespace_workload_pod:kube_pod_owner:relabel - - expr: |- - max by (cluster, namespace, workload, pod) ( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, - "workload", "$1", "owner_name", "(.*)") - ) - labels: - workload_type: statefulset - record: namespace_workload_pod:kube_pod_owner:relabel - - expr: |- - group by (cluster, namespace, workload, pod) ( - label_join( - group by (cluster, namespace, job_name, pod, owner_name) ( - label_join( - kube_pod_owner{job="kube-state-metrics", owner_kind="Job"} - , "job_name", "", "owner_name") - ) - * on (cluster, namespace, job_name) group_left() - group by (cluster, namespace, job_name) ( - kube_job_owner{job="kube-state-metrics", owner_kind=~"Pod|"} - ) - , "workload", "", "owner_name") - ) - labels: - workload_type: job - record: namespace_workload_pod:kube_pod_owner:relabel - - expr: |- - max by (cluster, namespace, workload, pod) ( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="", owner_name=""}, - "workload", "$1", "pod", "(.+)") - ) - labels: - workload_type: barepod - record: namespace_workload_pod:kube_pod_owner:relabel - - expr: |- - max by (cluster, namespace, workload, pod) ( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="Node"}, - "workload", "$1", "pod", "(.+)") - ) - labels: - workload_type: staticpod - record: namespace_workload_pod:kube_pod_owner:relabel - - expr: |- - group by (cluster, namespace, workload, workload_type, pod) ( - label_join( - label_join( - group by (cluster, namespace, job_name, pod) ( - label_join( - kube_pod_owner{job="kube-state-metrics", owner_kind="Job"} - , "job_name", "", "owner_name") - ) - * on (cluster, namespace, job_name) group_left(owner_kind, owner_name) - group by (cluster, namespace, job_name, owner_kind, owner_name) ( - kube_job_owner{job="kube-state-metrics", owner_kind!="Pod", owner_kind!=""} - ) - , "workload", "", "owner_name") - , "workload_type", "", "owner_kind") - - OR - - label_replace( - label_replace( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"} - , "replicaset", "$1", "owner_name", "(.+)" - ) - * on (cluster, namespace, replicaset) group_left(owner_kind, owner_name) - group by (cluster, namespace, replicaset, owner_kind, owner_name) ( - kube_replicaset_owner{job="kube-state-metrics", owner_kind!="Deployment", owner_kind!=""} - ) - , "workload", "$1", "owner_name", "(.+)") - OR - label_replace( - group by (cluster, namespace, pod, owner_name, owner_kind) ( - kube_pod_owner{job="kube-state-metrics", owner_kind!="ReplicaSet", owner_kind!="DaemonSet", owner_kind!="StatefulSet", owner_kind!="Job", owner_kind!="Node", owner_kind!=""} - ) - , "workload", "$1", "owner_name", "(.+)" - ) - , "workload_type", "$1", "owner_kind", "(.+)") - ) - record: namespace_workload_pod:kube_pod_owner:relabel ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-availability.rules.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-kube-apiserver-availability.rules - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - interval: 3m - name: kube-apiserver-availability.rules - rules: - - expr: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30 - record: code_verb:apiserver_request_total:increase30d - - expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) - labels: - verb: read - record: code:apiserver_request_total:increase30d - - expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) - labels: - verb: write - record: code:apiserver_request_total:increase30d - - expr: sum by (cluster, verb, scope, le) (increase(apiserver_request_sli_duration_seconds_bucket[1h])) - record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h - - expr: sum by (cluster, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h[30d]) * 24 * 30) - record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d - - expr: sum by (cluster, verb, scope) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h{le="+Inf"}) - record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h - - expr: sum by (cluster, verb, scope) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{le="+Inf"}) - record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d - - expr: |- - 1 - ( - ( - # write too slow - sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) - - - sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le=~"1(\\.0)?"} or vector(0)) - ) + - ( - # read too slow - sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"}) - - - ( - sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le=~"1(\\.0)?"} or vector(0)) - + - sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le=~"5(\\.0)?"} or vector(0)) - + - sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le=~"30(\\.0)?"} or vector(0)) - ) - ) + - # errors - sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) - ) - / - sum by (cluster) (code:apiserver_request_total:increase30d) - labels: - verb: all - record: apiserver_request:availability30d - - expr: |- - 1 - ( - sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"}) - - - ( - # too slow - sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le=~"1(\\.0)?"} or vector(0)) - + - sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le=~"5(\\.0)?"} or vector(0)) - + - sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le=~"30(\\.0)?"} or vector(0)) - ) - + - # errors - sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) - ) - / - sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"}) - labels: - verb: read - record: apiserver_request:availability30d - - expr: |- - 1 - ( - ( - # too slow - sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) - - - sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le=~"1(\\.0)?"} or vector(0)) - ) - + - # errors - sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) - ) - / - sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"}) - labels: - verb: write - record: apiserver_request:availability30d - - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) - labels: - verb: read - record: code_resource:apiserver_request_total:rate5m - - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - labels: - verb: write - record: code_resource:apiserver_request_total:rate5m - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) - record: code_verb:apiserver_request_total:increase1h ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-burnrate.rules.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-kube-apiserver-burnrate.rules - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: kube-apiserver-burnrate.rules - rules: - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1d])) - - - ( - ( - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[1d])) - or - vector(0) - ) - + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[1d])) - + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[1d])) - ) - ) - + - # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) - labels: - verb: read - record: apiserver_request:burnrate1d - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1h])) - - - ( - ( - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[1h])) - or - vector(0) - ) - + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[1h])) - + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[1h])) - ) - ) - + - # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) - labels: - verb: read - record: apiserver_request:burnrate1h - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[2h])) - - - ( - ( - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[2h])) - or - vector(0) - ) - + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[2h])) - + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[2h])) - ) - ) - + - # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) - labels: - verb: read - record: apiserver_request:burnrate2h - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[30m])) - - - ( - ( - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[30m])) - or - vector(0) - ) - + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[30m])) - + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[30m])) - ) - ) - + - # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) - labels: - verb: read - record: apiserver_request:burnrate30m - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[3d])) - - - ( - ( - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[3d])) - or - vector(0) - ) - + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[3d])) - + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[3d])) - ) - ) - + - # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) - labels: - verb: read - record: apiserver_request:burnrate3d - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m])) - - - ( - ( - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[5m])) - or - vector(0) - ) - + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[5m])) - + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[5m])) - ) - ) - + - # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) - labels: - verb: read - record: apiserver_request:burnrate5m - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[6h])) - - - ( - ( - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[6h])) - or - vector(0) - ) - + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[6h])) - + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[6h])) - ) - ) - + - # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) - labels: - verb: read - record: apiserver_request:burnrate6h - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1d])) - - - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[1d])) - ) - + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) - labels: - verb: write - record: apiserver_request:burnrate1d - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1h])) - - - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[1h])) - ) - + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) - labels: - verb: write - record: apiserver_request:burnrate1h - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[2h])) - - - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[2h])) - ) - + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) - labels: - verb: write - record: apiserver_request:burnrate2h - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[30m])) - - - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[30m])) - ) - + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) - labels: - verb: write - record: apiserver_request:burnrate30m - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[3d])) - - - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[3d])) - ) - + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) - labels: - verb: write - record: apiserver_request:burnrate3d - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m])) - - - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[5m])) - ) - + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - labels: - verb: write - record: apiserver_request:burnrate5m - - expr: |- - ( - ( - # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[6h])) - - - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[6h])) - ) - + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) - ) - / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) - labels: - verb: write - record: apiserver_request:burnrate6h ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-histogram.rules.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-kube-apiserver-histogram.rules - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: kube-apiserver-histogram.rules - rules: - - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0 - labels: - quantile: '0.99' - verb: read - record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0 - labels: - quantile: '0.99' - verb: write - record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-kube-apiserver-slos - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: kube-apiserver-slos - rules: - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: |- - sum by (cluster) (apiserver_request:burnrate1h) > (14.40 * 0.01000) - and on (cluster) - sum by (cluster) (apiserver_request:burnrate5m) > (14.40 * 0.01000) - for: 2m - labels: - long: 1h - severity: critical - short: 5m - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: |- - sum by (cluster) (apiserver_request:burnrate6h) > (6.00 * 0.01000) - and on (cluster) - sum by (cluster) (apiserver_request:burnrate30m) > (6.00 * 0.01000) - for: 15m - labels: - long: 6h - severity: critical - short: 30m - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: |- - sum by (cluster) (apiserver_request:burnrate1d) > (3.00 * 0.01000) - and on (cluster) - sum by (cluster) (apiserver_request:burnrate2h) > (3.00 * 0.01000) - for: 1h - labels: - long: 1d - severity: warning - short: 2h - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: |- - sum by (cluster) (apiserver_request:burnrate3d) > (1.00 * 0.01000) - and on (cluster) - sum by (cluster) (apiserver_request:burnrate6h) > (1.00 * 0.01000) - for: 3h - labels: - long: 3d - severity: warning - short: 6h ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-general.rules.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-kube-prometheus-general.rules - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: kube-prometheus-general.rules - rules: - - expr: count without(instance, pod, node) (up == 1) - record: count:up1 - - expr: count without(instance, pod, node) (up == 0) - record: count:up0 ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-node-recording.rules.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-kube-prometheus-node-recording.rules - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: kube-prometheus-node-recording.rules - rules: - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance) - record: instance:node_cpu:rate:sum - - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) - record: instance:node_network_receive_bytes:rate:sum - - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) - record: instance:node_network_transmit_bytes:rate:sum - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON (instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance) - record: instance:node_cpu:ratio - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) - record: cluster:node_cpu:sum_rate5m - - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu)) - record: cluster:node_cpu:ratio ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kube-scheduler.rules.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-kube-scheduler.rules - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: kube-scheduler.rules - rules: - - expr: histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: '0.99' - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: '0.99' - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: '0.99' - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: '0.9' - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: '0.9' - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: '0.9' - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: '0.5' - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: '0.5' - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: '0.5' - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-kube-state-metrics - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: kube-state-metrics - rules: - - alert: KubeStateMetricsListErrors - annotations: - description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors - summary: kube-state-metrics is experiencing errors in list operations. - expr: |- - (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by (cluster) - / - sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster)) - > 0.01 - for: 15m - labels: - severity: critical - - alert: KubeStateMetricsWatchErrors - annotations: - description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors - summary: kube-state-metrics is experiencing errors in watch operations. - expr: |- - (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster) - / - sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster)) - > 0.01 - for: 15m - labels: - severity: critical - - alert: KubeStateMetricsShardingMismatch - annotations: - description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch - summary: kube-state-metrics sharding is misconfigured. - expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) != 0 - for: 15m - labels: - severity: critical - - alert: KubeStateMetricsShardsMissing - annotations: - description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing - summary: kube-state-metrics shards are missing. - expr: |- - 2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) - 1 - - - sum( 2 ^ max by (cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster) - != 0 - for: 15m - labels: - severity: critical ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubelet.rules.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-kubelet.rules - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: kubelet.rules - rules: - - expr: |- - histogram_quantile( - 0.99, - sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) - * on (cluster, instance) group_left (node) - max by (cluster, instance, node) (kubelet_node_name{job="kubelet", metrics_path="/metrics"}) - ) - labels: - quantile: '0.99' - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile - - expr: |- - histogram_quantile( - 0.9, - sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) - * on (cluster, instance) group_left (node) - max by (cluster, instance, node) (kubelet_node_name{job="kubelet", metrics_path="/metrics"}) - ) - labels: - quantile: '0.9' - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile - - expr: |- - histogram_quantile( - 0.5, - sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) - * on (cluster, instance) group_left (node) - max by (cluster, instance, node) (kubelet_node_name{job="kubelet", metrics_path="/metrics"}) - ) - labels: - quantile: '0.5' - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-kubernetes-apps - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: kubernetes-apps - rules: - - alert: KubePodCrashLooping - annotations: - description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is in waiting state (reason: "CrashLoopBackOff") on cluster {{ $labels.cluster }}.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping - summary: Pod is crash looping. - expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics", namespace=~".*"}[5m]) >= 1 - for: 15m - labels: - severity: warning - - alert: KubePodNotReady - annotations: - description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready - summary: Pod has been in a non-ready state for more than 15 minutes. - expr: |- - sum by (namespace, pod, cluster) ( - max by (namespace, pod, cluster) ( - kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown|Failed"} - ) * on (namespace, pod, cluster) group_left(owner_kind) topk by (namespace, pod, cluster) ( - 1, max by (namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"}) - ) - ) > 0 - for: 15m - labels: - severity: warning - - alert: KubeDeploymentGenerationMismatch - annotations: - description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch - summary: Deployment generation mismatch due to possible roll-back - expr: |- - kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~".*"} - != - kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~".*"} - for: 15m - labels: - severity: warning - - alert: KubeDeploymentReplicasMismatch - annotations: - description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch - summary: Deployment has not matched the expected number of replicas. - expr: |- - ( - kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~".*"} - > - kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~".*"} - ) and ( - changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeDeploymentRolloutStuck - annotations: - description: Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck - summary: Deployment rollout is not progressing. - expr: |- - kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~".*"} - != 0 - for: 15m - labels: - severity: warning - - alert: KubeStatefulSetReplicasMismatch - annotations: - description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch - summary: StatefulSet has not matched the expected number of replicas. - expr: |- - ( - kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~".*"} - != - kube_statefulset_replicas{job="kube-state-metrics", namespace=~".*"} - ) and ( - changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeStatefulSetGenerationMismatch - annotations: - description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch - summary: StatefulSet generation mismatch due to possible roll-back - expr: |- - kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~".*"} - != - kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~".*"} - for: 15m - labels: - severity: warning - - alert: KubeStatefulSetUpdateNotRolledOut - annotations: - description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout - summary: StatefulSet update has not been rolled out. - expr: |- - ( - max by (namespace, statefulset, job, cluster) ( - kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~".*"} - unless - kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~".*"} - ) - * on (namespace, statefulset, job, cluster) - ( - kube_statefulset_replicas{job="kube-state-metrics", namespace=~".*"} - != - kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"} - ) - ) and on (namespace, statefulset, job, cluster) ( - changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[5m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeDaemonSetRolloutStuck - annotations: - description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15m on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck - summary: DaemonSet rollout is stuck. - expr: |- - ( - ( - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} - ) or ( - kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"} - != - 0 - ) or ( - kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} - ) or ( - kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~".*"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} - ) - ) and ( - changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}[5m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeContainerWaiting - annotations: - description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: "{{ $labels.reason }}") on cluster {{ $labels.cluster }}.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting - summary: Pod container waiting longer than 1 hour - expr: kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", job="kube-state-metrics", namespace=~".*"} > 0 - for: 1h - labels: - severity: warning - - alert: KubeDaemonSetNotScheduled - annotations: - description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled on cluster {{ $labels.cluster }}.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled - summary: DaemonSet pods are not scheduled. - expr: |- - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} - - - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} > 0 - for: 10m - labels: - severity: warning - - alert: KubeDaemonSetMisScheduled - annotations: - description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run on cluster {{ $labels.cluster }}.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled - summary: DaemonSet pods are misscheduled. - expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"} > 0 - for: 15m - labels: - severity: warning - - alert: KubeJobNotCompleted - annotations: - description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "43200" | humanizeDuration }} to complete on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted - summary: Job did not complete in time - expr: |- - time() - max by (namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"} - and - kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200 - labels: - severity: warning - - alert: KubeJobFailed - annotations: - description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed - summary: Job failed to complete. - expr: kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0 - for: 15m - labels: - severity: warning - - alert: KubeHpaReplicasMismatch - annotations: - description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch - summary: HPA has not matched desired number of replicas. - expr: |- - (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~".*"} - != - kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}) - and - (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"} - > - kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~".*"}) - and - (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"} - < - kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"}) - and - changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}[15m]) == 0 - for: 15m - labels: - severity: warning - - alert: KubeHpaMaxedOut - annotations: - description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout - summary: HPA is running at max replicas - expr: |- - kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"} - == - kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"} - for: 15m - labels: - severity: warning - - alert: KubePdbNotEnoughHealthyPods - annotations: - description: PDB {{ $labels.cluster }}/{{ $labels.namespace }}/{{ $labels.poddisruptionbudget }} expects {{ $value }} more healthy pods. The desired number of healthy pods has not been met for at least 15m. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepdbnotenoughhealthypods - summary: PDB does not have enough healthy pods. - expr: |- - ( - kube_poddisruptionbudget_status_desired_healthy{job="kube-state-metrics", namespace=~".*"} - - - kube_poddisruptionbudget_status_current_healthy{job="kube-state-metrics", namespace=~".*"} - ) - > 0 - for: 15m - labels: - severity: warning ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-kubernetes-resources - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: kubernetes-resources - rules: - - alert: KubeCPUOvercommit - annotations: - description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Pods by {{ printf "%.2f" $value }} CPU shares and cannot tolerate node failure. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit - summary: Cluster has overcommitted CPU resource requests. - expr: |- - sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0 - and - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0 - for: 10m - labels: - severity: warning - - alert: KubeMemoryOvercommit - annotations: - description: Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit - summary: Cluster has overcommitted memory resource requests. - expr: |- - sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0 - and - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0 - for: 10m - labels: - severity: warning - - alert: KubeCPUQuotaOvercommit - annotations: - description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Namespaces. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit - summary: Cluster has overcommitted CPU resource requests. - expr: |- - sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster) - / - sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster) - > 1.5 - for: 5m - labels: - severity: warning - - alert: KubeMemoryQuotaOvercommit - annotations: - description: Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Namespaces. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit - summary: Cluster has overcommitted memory resource requests. - expr: |- - sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster) - / - sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - > 1.5 - for: 5m - labels: - severity: warning - - alert: KubeQuotaAlmostFull - annotations: - description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull - summary: Namespace quota is going to be full. - expr: |- - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - > 0.9 < 1 - for: 15m - labels: - severity: info - - alert: KubeQuotaFullyUsed - annotations: - description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused - summary: Namespace quota is fully used. - expr: |- - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - == 1 - for: 15m - labels: - severity: info - - alert: KubeQuotaExceeded - annotations: - description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded - summary: Namespace quota has exceeded the limits. - expr: |- - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - > 1 - for: 15m - labels: - severity: warning - - alert: CPUThrottlingHigh - annotations: - description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }} on cluster {{ $labels.cluster }}.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh - summary: Processes experience elevated CPU throttling. - expr: |- - sum(increase(container_cpu_cfs_throttled_periods_total{container!="", job="kubelet", metrics_path="/metrics/cadvisor", }[5m])) without (id, metrics_path, name, image, endpoint, job, node) - / on (cluster, namespace, pod, container, instance) group_left - sum(increase(container_cpu_cfs_periods_total{job="kubelet", metrics_path="/metrics/cadvisor", }[5m])) without (id, metrics_path, name, image, endpoint, job, node) - > ( 25 / 100 ) - for: 15m - labels: - severity: info ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-kubernetes-storage - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: kubernetes-storage - rules: - - alert: KubePersistentVolumeFillingUp - annotations: - description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster {{ . }} {{- end }} is only {{ $value | humanizePercentage }} free. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: |- - ( - kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} - / - kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} - ) < 0.03 - and - kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 - unless on (cluster, namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 - unless on (cluster, namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 - for: 1m - labels: - severity: critical - - alert: KubePersistentVolumeFillingUp - annotations: - description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster {{ . }} {{- end }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: |- - ( - kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} - / - kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} - ) < 0.15 - and - kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 - and - predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 - unless on (cluster, namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 - unless on (cluster, namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 - for: 1h - labels: - severity: warning - - alert: KubePersistentVolumeInodesFillingUp - annotations: - description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster {{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup - summary: PersistentVolumeInodes are filling up. - expr: |- - ( - kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"} - / - kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"} - ) < 0.03 - and - kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 - unless on (cluster, namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 - unless on (cluster, namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 - for: 1m - labels: - severity: critical - - alert: KubePersistentVolumeInodesFillingUp - annotations: - description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster {{ . }} {{- end }} is expected to run out of inodes within four days. Currently {{ $value | humanizePercentage }} of its inodes are free. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup - summary: PersistentVolumeInodes are filling up. - expr: |- - ( - kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"} - / - kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"} - ) < 0.15 - and - kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 - and - predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 - unless on (cluster, namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 - unless on (cluster, namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 - for: 1h - labels: - severity: warning - - alert: KubePersistentVolumeErrors - annotations: - description: The persistent volume {{ $labels.persistentvolume }} {{ with $labels.cluster -}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors - summary: PersistentVolume is having issues with provisioning. - expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 - for: 5m - labels: - severity: critical ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-kubernetes-system-apiserver - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: kubernetes-system-apiserver - rules: - - alert: KubeClientCertificateExpiration - annotations: - description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration - summary: Client certificate is about to expire. - expr: |- - histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 - and - on (job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 - for: 5m - labels: - severity: warning - - alert: KubeClientCertificateExpiration - annotations: - description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration - summary: Client certificate is about to expire. - expr: |- - histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 - and - on (job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 - for: 5m - labels: - severity: critical - - alert: KubeAggregatedAPIErrors - annotations: - description: Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name }} has reported {{ $labels.reason }} errors on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors - summary: Kubernetes aggregated API has reported errors. - expr: sum by (cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[1m])) > 0 - for: 10m - labels: - severity: warning - - alert: KubeAggregatedAPIDown - annotations: - description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown - summary: Kubernetes aggregated API is down. - expr: (1 - max by (name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice{job="apiserver"}[10m]))) * 100 < 85 - for: 5m - labels: - severity: warning - - alert: KubeAPIDown - annotations: - description: KubeAPI has disappeared from Prometheus target discovery. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapidown - summary: Target disappeared from Prometheus target discovery. - expr: absent(up{job="apiserver"} == 1) - for: 15m - labels: - severity: critical - - alert: KubeAPITerminatedRequests - annotations: - description: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests - summary: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests. - expr: sum by (cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum by (cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) + sum by (cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 - for: 5m - labels: - severity: warning ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-kubernetes-system-controller-manager - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: kubernetes-system-controller-manager - rules: - - alert: KubeControllerManagerDown - annotations: - description: KubeControllerManager has disappeared from Prometheus target discovery. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontrollermanagerdown - summary: Target disappeared from Prometheus target discovery. - expr: absent(up{job="kube-controller-manager"} == 1) - for: 15m - labels: - severity: critical ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kube-proxy.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-kubernetes-system-kube-proxy - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: kubernetes-system-kube-proxy - rules: - - alert: KubeProxyDown - annotations: - description: KubeProxy has disappeared from Prometheus target discovery. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeproxydown - summary: Target disappeared from Prometheus target discovery. - expr: absent(up{job="kube-proxy"} == 1) - for: 15m - labels: - severity: critical ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-kubernetes-system-kubelet - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: kubernetes-system-kubelet - rules: - - alert: KubeNodeNotReady - annotations: - description: '{{ $labels.node }} has been unready for more than 15 minutes on cluster {{ $labels.cluster }}.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodenotready - summary: Node is not ready. - expr: |- - kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 - and on (cluster, node) - kube_node_spec_unschedulable{job="kube-state-metrics"} == 0 - for: 15m - labels: - severity: warning - - alert: KubeNodePressure - annotations: - description: '{{ $labels.node }} on cluster {{ $labels.cluster }} has active Condition {{ $labels.condition }}. This is caused by resource usage exceeding eviction thresholds.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodepressure - summary: Node has as active Condition. - expr: |- - kube_node_status_condition{job="kube-state-metrics",condition=~"(MemoryPressure|DiskPressure|PIDPressure)",status="true"} == 1 - and on (cluster, node) - kube_node_spec_unschedulable{job="kube-state-metrics"} == 0 - for: 10m - labels: - severity: info - - alert: KubeNodeUnreachable - annotations: - description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled on cluster {{ $labels.cluster }}.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodeunreachable - summary: Node is unreachable. - expr: (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 - for: 15m - labels: - severity: warning - - alert: KubeletTooManyPods - annotations: - description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubelettoomanypods - summary: Kubelet is running at capacity. - expr: |- - ( - max by (cluster, instance) ( - kubelet_running_pods{job="kubelet", metrics_path="/metrics"} > 1 - ) - * on (cluster, instance) group_left(node) - max by (cluster, instance, node) ( - kubelet_node_name{job="kubelet", metrics_path="/metrics"} - ) - ) - / on (cluster, node) group_left() - max by (cluster, node) ( - kube_node_status_capacity{job="kube-state-metrics", resource="pods"} != 1 - ) > 0.95 - for: 15m - labels: - severity: info - - alert: KubeNodeReadinessFlapping - annotations: - description: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping - summary: Node readiness status is flapping. - expr: |- - sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2 - and on (cluster, node) - kube_node_spec_unschedulable{job="kube-state-metrics"} == 0 - for: 15m - labels: - severity: warning - - alert: KubeNodeEviction - annotations: - description: Node {{ $labels.node }} on {{ $labels.cluster }} is evicting Pods due to {{ $labels.eviction_signal }}. Eviction occurs when eviction thresholds are crossed, typically caused by Pods exceeding RAM/ephemeral-storage limits. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodeeviction - summary: Node is evicting pods. - expr: |- - sum(rate(kubelet_evictions{job="kubelet", metrics_path="/metrics"}[15m])) by (cluster, eviction_signal, instance) - * on (cluster, instance) group_left(node) - max by (cluster, instance, node) ( - kubelet_node_name{job="kubelet", metrics_path="/metrics"} - ) - > 0 - for: 0s - labels: - severity: info - - alert: KubeletPlegDurationHigh - annotations: - description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }} on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletplegdurationhigh - summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. - expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 - for: 5m - labels: - severity: warning - - alert: KubeletPodStartUpLatencyHigh - annotations: - description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }} on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletpodstartuplatencyhigh - summary: Kubelet Pod startup latency is too high. - expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on (cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 - for: 15m - labels: - severity: warning - - alert: KubeletClientCertificateExpiration - annotations: - description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }} on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration - summary: Kubelet client certificate is about to expire. - expr: kubelet_certificate_manager_client_ttl_seconds < 604800 - labels: - severity: warning - - alert: KubeletClientCertificateExpiration - annotations: - description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }} on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration - summary: Kubelet client certificate is about to expire. - expr: kubelet_certificate_manager_client_ttl_seconds < 86400 - labels: - severity: critical - - alert: KubeletServerCertificateExpiration - annotations: - description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }} on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration - summary: Kubelet server certificate is about to expire. - expr: kubelet_certificate_manager_server_ttl_seconds < 604800 - labels: - severity: warning - - alert: KubeletServerCertificateExpiration - annotations: - description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }} on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration - summary: Kubelet server certificate is about to expire. - expr: kubelet_certificate_manager_server_ttl_seconds < 86400 - labels: - severity: critical - - alert: KubeletClientCertificateRenewalErrors - annotations: - description: Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes) on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificaterenewalerrors - summary: Kubelet has failed to renew its client certificate. - expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 - for: 15m - labels: - severity: warning - - alert: KubeletServerCertificateRenewalErrors - annotations: - description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes) on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificaterenewalerrors - summary: Kubelet has failed to renew its server certificate. - expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0 - for: 15m - labels: - severity: warning - - alert: KubeletDown - annotations: - description: Kubelet has disappeared from Prometheus target discovery. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletdown - summary: Target disappeared from Prometheus target discovery. - expr: absent(up{job="kubelet", metrics_path="/metrics"} == 1) - for: 15m - labels: - severity: critical ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-kubernetes-system-scheduler - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: kubernetes-system-scheduler - rules: - - alert: KubeSchedulerDown - annotations: - description: KubeScheduler has disappeared from Prometheus target discovery. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeschedulerdown - summary: Target disappeared from Prometheus target discovery. - expr: absent(up{job="kube-scheduler"} == 1) - for: 15m - labels: - severity: critical ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-kubernetes-system - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: kubernetes-system - rules: - - alert: KubeVersionMismatch - annotations: - description: There are {{ $value }} different semantic versions of Kubernetes components running on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch - summary: Different semantic versions of Kubernetes components running. - expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1 - for: 15m - labels: - severity: warning - - alert: KubeClientErrors - annotations: - description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors on cluster {{ $labels.cluster }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclienterrors - summary: Kubernetes API server client is experiencing errors. - expr: |- - (sum(rate(rest_client_requests_total{job="apiserver",code=~"5.."}[5m])) by (cluster, instance, job, namespace) - / - sum(rate(rest_client_requests_total{job="apiserver"}[5m])) by (cluster, instance, job, namespace)) - > 0.01 - for: 15m - labels: - severity: warning ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.rules.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-node-exporter.rules - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: node-exporter.rules - rules: - - expr: |- - count without (cpu, mode) ( - node_cpu_seconds_total{job="node-exporter",mode="idle"} - ) - record: instance:node_num_cpu:sum - - expr: |- - 1 - avg without (cpu) ( - sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m])) - ) - record: instance:node_cpu_utilisation:rate5m - - expr: |- - ( - node_load1{job="node-exporter"} - / - instance:node_num_cpu:sum{job="node-exporter"} - ) - record: instance:node_load1_per_cpu:ratio - - expr: |- - 1 - ( - ( - node_memory_MemAvailable_bytes{job="node-exporter"} - or - ( - node_memory_Buffers_bytes{job="node-exporter"} - + - node_memory_Cached_bytes{job="node-exporter"} - + - node_memory_MemFree_bytes{job="node-exporter"} - + - node_memory_Slab_bytes{job="node-exporter"} - ) - ) - / - node_memory_MemTotal_bytes{job="node-exporter"} - ) - record: instance:node_memory_utilisation:ratio - - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) - record: instance:node_vmstat_pgmajfault:rate5m - - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) - record: instance_device:node_disk_io_time_seconds:rate5m - - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) - record: instance_device:node_disk_io_time_weighted_seconds:rate5m - - expr: |- - sum without (device) ( - rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m]) - ) - record: instance:node_network_receive_bytes_excluding_lo:rate5m - - expr: |- - sum without (device) ( - rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m]) - ) - record: instance:node_network_transmit_bytes_excluding_lo:rate5m - - expr: |- - sum without (device) ( - rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m]) - ) - record: instance:node_network_receive_drop_excluding_lo:rate5m - - expr: |- - sum without (device) ( - rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m]) - ) - record: instance:node_network_transmit_drop_excluding_lo:rate5m ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-node-exporter - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: node-exporter - rules: - - alert: NodeFilesystemSpaceFillingUp - annotations: - description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup - summary: Filesystem is predicted to run out of space within the next 24 hours. - expr: |- - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15 - and - predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemSpaceFillingUp - annotations: - description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup - summary: Filesystem is predicted to run out of space within the next 4 hours. - expr: |- - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10 - and - predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace - summary: Filesystem has less than 5% space left. - expr: |- - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 - and - node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 - ) - for: 30m - labels: - severity: warning - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace - summary: Filesystem has less than 3% space left. - expr: |- - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 - and - node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 - ) - for: 30m - labels: - severity: critical - - alert: NodeFilesystemFilesFillingUp - annotations: - description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup - summary: Filesystem is predicted to run out of inodes within the next 24 hours. - expr: |- - ( - node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40 - and - predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemFilesFillingUp - annotations: - description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup - summary: Filesystem is predicted to run out of inodes within the next 4 hours. - expr: |- - ( - node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20 - and - predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles - summary: Filesystem has less than 5% inodes left. - expr: |- - ( - node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 - and - node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles - summary: Filesystem has less than 3% inodes left. - expr: |- - ( - node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 - and - node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeNetworkReceiveErrs - annotations: - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs - summary: Network interface is reporting many receive errors. - expr: rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m]) > 0.01 - for: 1h - labels: - severity: warning - - alert: NodeNetworkTransmitErrs - annotations: - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs - summary: Network interface is reporting many transmit errors. - expr: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m]) > 0.01 - for: 1h - labels: - severity: warning - - alert: NodeHighNumberConntrackEntriesUsed - annotations: - description: '{{ $labels.instance }} {{ $value | humanizePercentage }} of conntrack entries are used.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused - summary: Number of conntrack are getting close to the limit. - expr: (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit) > 0.75 - labels: - severity: warning - - alert: NodeTextFileCollectorScrapeError - annotations: - description: Node Exporter text file collector on {{ $labels.instance }} failed to scrape. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror - summary: Node Exporter text file collector failed to scrape. - expr: node_textfile_scrape_error{job="node-exporter"} == 1 - labels: - severity: warning - - alert: NodeClockSkewDetected - annotations: - description: Clock at {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected - summary: Clock skew detected. - expr: |- - ( - node_timex_offset_seconds{job="node-exporter"} > 0.05 - and - deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0 - ) - or - ( - node_timex_offset_seconds{job="node-exporter"} < -0.05 - and - deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 - ) - for: 10m - labels: - severity: warning - - alert: NodeClockNotSynchronising - annotations: - description: Clock at {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising - summary: Clock not synchronising. - expr: |- - min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0 - and - node_timex_maxerror_seconds{job="node-exporter"} >= 16 - for: 10m - labels: - severity: warning - - alert: NodeRAIDDegraded - annotations: - description: RAID array '{{ $labels.device }}' at {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded - summary: RAID Array is degraded. - expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0 - for: 15m - labels: - severity: critical - - alert: NodeRAIDDiskFailure - annotations: - description: At least one device in RAID array at {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure - summary: Failed device in RAID array. - expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} > 0 - labels: - severity: warning - - alert: NodeFileDescriptorLimit - annotations: - description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit - summary: Kernel is predicted to exhaust file descriptors limit soon. - expr: |- - ( - node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70 - ) - for: 15m - labels: - severity: warning - - alert: NodeFileDescriptorLimit - annotations: - description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit - summary: Kernel is predicted to exhaust file descriptors limit soon. - expr: |- - ( - node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90 - ) - for: 15m - labels: - severity: critical - - alert: NodeCPUHighUsage - annotations: - description: 'CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. - - ' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage - summary: High CPU usage. - expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!~"idle|iowait"}[2m]))) * 100 > 90 - for: 15m - labels: - severity: info - - alert: NodeSystemSaturation - annotations: - description: 'System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. - - This might indicate this instance resources saturation and can cause it becoming unresponsive. - - ' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemsaturation - summary: System saturated, load per core is very high. - expr: |- - node_load1{job="node-exporter"} - / count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2 - for: 15m - labels: - severity: warning - - alert: NodeMemoryMajorPagesFaults - annotations: - description: 'Memory major pages are occurring at very high rate at {{ $labels.instance }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. - - Please check that there is enough memory available at this instance. - - ' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememorymajorpagesfaults - summary: Memory major page faults are occurring at very high rate. - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500 - for: 15m - labels: - severity: warning - - alert: NodeMemoryHighUtilization - annotations: - description: 'Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. - - ' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization - summary: Host is running out of memory. - expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100) > 90 - for: 15m - labels: - severity: warning - - alert: NodeDiskIOSaturation - annotations: - description: 'Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}. - - This symptom might indicate disk saturation. - - ' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation - summary: Disk IO queue is high. - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) > 10 - for: 30m - labels: - severity: warning - - alert: NodeSystemdServiceFailed - annotations: - description: Systemd service {{ $labels.name }} has entered failed state at {{ $labels.instance }} - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed - summary: Systemd service has entered failed state. - expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1 - for: 5m - labels: - severity: warning - - alert: NodeSystemdServiceCrashlooping - annotations: - description: Systemd service {{ $labels.name }} has being restarted too many times at {{ $labels.instance }} for the last 15 minutes. Please check if service is crash looping. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicecrashlooping - summary: Systemd service keeps restaring, possibly crash looping. - expr: increase(node_systemd_service_restart_total{job="node-exporter"}[5m]) > 2 - for: 15m - labels: - severity: warning - - alert: NodeBondingDegraded - annotations: - description: Bonding interface {{ $labels.master }} on {{ $labels.instance }} is in degraded state due to one or more slave failures. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded - summary: Bonding interface is degraded - expr: (node_bonding_slaves - node_bonding_active) != 0 - for: 5m - labels: - severity: warning ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-node-network - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: node-network - rules: - - alert: NodeNetworkInterfaceFlapping - annotations: - description: Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }} - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping - summary: Network interface is often changing its status - expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 - for: 2m - labels: - severity: warning ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/node.rules.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-node.rules - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: node.rules - rules: - - expr: |- - topk by (cluster, namespace, pod) (1, - max by (cluster, node, namespace, pod) ( - label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)") - )) - record: 'node_namespace_pod:kube_pod_info:' - - expr: |- - count by (cluster, node) ( - node_cpu_seconds_total{mode="idle",job="node-exporter"} - * on (cluster, namespace, pod) group_left(node) - topk by (cluster, namespace, pod) (1, node_namespace_pod:kube_pod_info:) - ) - record: node:node_num_cpu:sum - - expr: |- - sum( - node_memory_MemAvailable_bytes{job="node-exporter"} or - ( - node_memory_Buffers_bytes{job="node-exporter"} + - node_memory_Cached_bytes{job="node-exporter"} + - node_memory_MemFree_bytes{job="node-exporter"} + - node_memory_Slab_bytes{job="node-exporter"} - ) - ) by (cluster) - record: :node_memory_MemAvailable_bytes:sum - - expr: |- - avg by (cluster, node) ( - sum without (mode) ( - rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",job="node-exporter"}[5m]) - ) - ) - record: node:node_cpu_utilization:ratio_rate5m - - expr: |- - avg by (cluster) ( - node:node_cpu_utilization:ratio_rate5m - ) - record: cluster:node_cpu:ratio_rate5m ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-prometheus-operator - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: prometheus-operator - rules: - - alert: PrometheusOperatorListErrors - annotations: - description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorlisterrors - summary: Errors while performing list operations in controller. - expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="kube-prometheus-stack-1754-operator",namespace="prometheus"}[10m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_total{job="kube-prometheus-stack-1754-operator",namespace="prometheus"}[10m]))) > 0.4 - for: 15m - labels: - severity: warning - - alert: PrometheusOperatorWatchErrors - annotations: - description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorwatcherrors - summary: Errors while performing watch operations in controller. - expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="kube-prometheus-stack-1754-operator",namespace="prometheus"}[5m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="kube-prometheus-stack-1754-operator",namespace="prometheus"}[5m]))) > 0.4 - for: 15m - labels: - severity: warning - - alert: PrometheusOperatorSyncFailed - annotations: - description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorsyncfailed - summary: Last controller reconciliation failed - expr: min_over_time(prometheus_operator_syncs{status="failed",job="kube-prometheus-stack-1754-operator",namespace="prometheus"}[5m]) > 0 - for: 10m - labels: - severity: warning - - alert: PrometheusOperatorReconcileErrors - annotations: - description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorreconcileerrors - summary: Errors while reconciling objects. - expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="kube-prometheus-stack-1754-operator",namespace="prometheus"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="kube-prometheus-stack-1754-operator",namespace="prometheus"}[5m]))) > 0.1 - for: 10m - labels: - severity: warning - - alert: PrometheusOperatorStatusUpdateErrors - annotations: - description: '{{ $value | humanizePercentage }} of status update operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorstatusupdateerrors - summary: Errors while updating objects status. - expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="kube-prometheus-stack-1754-operator",namespace="prometheus"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="kube-prometheus-stack-1754-operator",namespace="prometheus"}[5m]))) > 0.1 - for: 10m - labels: - severity: warning - - alert: PrometheusOperatorNodeLookupErrors - annotations: - description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornodelookuperrors - summary: Errors while reconciling Prometheus. - expr: rate(prometheus_operator_node_address_lookup_errors_total{job="kube-prometheus-stack-1754-operator",namespace="prometheus"}[5m]) > 0.1 - for: 10m - labels: - severity: warning - - alert: PrometheusOperatorNotReady - annotations: - description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready - summary: Prometheus operator not ready - expr: min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="kube-prometheus-stack-1754-operator",namespace="prometheus"}[5m]) == 0) - for: 5m - labels: - severity: warning - - alert: PrometheusOperatorRejectedResources - annotations: - description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorrejectedresources - summary: Resources rejected by Prometheus operator - expr: min_over_time(prometheus_operator_managed_resources{state="rejected",job="kube-prometheus-stack-1754-operator",namespace="prometheus"}[5m]) > 0 - for: 5m - labels: - severity: warning ---- -# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: kube-prometheus-stack-1754-prometheus - namespace: prometheus - labels: - app: kube-prometheus-stack - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - groups: - - name: prometheus - rules: - - alert: PrometheusBadConfig - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusbadconfig - summary: Failed Prometheus configuration reload. - expr: |- - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(prometheus_config_last_reload_successful{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m]) == 0 - for: 10m - labels: - severity: critical - - alert: PrometheusSDRefreshFailure - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to refresh SD with mechanism {{$labels.mechanism}}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheussdrefreshfailure - summary: Failed Prometheus SD refresh. - expr: increase(prometheus_sd_refresh_failures_total{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[10m]) > 0 - for: 20m - labels: - severity: warning - - alert: PrometheusKubernetesListWatchFailures - annotations: - description: Kubernetes service discovery of Prometheus {{$labels.namespace}}/{{$labels.pod}} is experiencing {{ printf "%.0f" $value }} failures with LIST/WATCH requests to the Kubernetes API in the last 5 minutes. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuskuberneteslistwatchfailures - summary: Requests in Kubernetes SD are failing. - expr: increase(prometheus_sd_kubernetes_failures_total{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m]) > 0 - for: 15m - labels: - severity: warning - - alert: PrometheusNotificationQueueRunningFull - annotations: - description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotificationqueuerunningfull - summary: Prometheus alert notification queue predicted to run full in less than 30m. - expr: |- - # Without min_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - ( - predict_linear(prometheus_notifications_queue_length{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m], 60 * 30) - > - min_over_time(prometheus_notifications_queue_capacity{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m]) - ) - for: 15m - labels: - severity: warning - - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers - annotations: - description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}} were affected by errors.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers - summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager were affected by errors. - expr: |- - ( - rate(prometheus_notifications_errors_total{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m]) - / - rate(prometheus_notifications_sent_total{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m]) - ) - * 100 - > 1 - for: 15m - labels: - severity: warning - - alert: PrometheusNotConnectedToAlertmanagers - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotconnectedtoalertmanagers - summary: Prometheus is not connected to any Alertmanagers. - expr: |- - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(prometheus_notifications_alertmanagers_discovered{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m]) < 1 - for: 10m - labels: - severity: warning - - alert: PrometheusTSDBReloadsFailing - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbreloadsfailing - summary: Prometheus has issues reloading blocks from disk. - expr: increase(prometheus_tsdb_reloads_failures_total{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[3h]) > 0 - for: 4h - labels: - severity: warning - - alert: PrometheusTSDBCompactionsFailing - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbcompactionsfailing - summary: Prometheus has issues compacting blocks. - expr: increase(prometheus_tsdb_compactions_failed_total{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[3h]) > 0 - for: 4h - labels: - severity: warning - - alert: PrometheusNotIngestingSamples - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotingestingsamples - summary: Prometheus is not ingesting samples. - expr: |- - ( - sum without(type) (rate(prometheus_tsdb_head_samples_appended_total{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m])) <= 0 - and - ( - sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}) > 0 - or - sum without(rule_group) (prometheus_rule_group_rules{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}) > 0 - ) - ) - for: 10m - labels: - severity: warning - - alert: PrometheusDuplicateTimestamps - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusduplicatetimestamps - summary: Prometheus is dropping samples with duplicate timestamps. - expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m]) > 0 - for: 10m - labels: - severity: warning - - alert: PrometheusOutOfOrderTimestamps - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusoutofordertimestamps - summary: Prometheus drops samples with out-of-order timestamps. - expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m]) > 0 - for: 10m - labels: - severity: warning - - alert: PrometheusRemoteStorageFailures - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }} - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotestoragefailures - summary: Prometheus fails to send samples to remote storage. - expr: |- - ( - (rate(prometheus_remote_storage_failed_samples_total{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m])) - / - ( - (rate(prometheus_remote_storage_failed_samples_total{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m])) - + - (rate(prometheus_remote_storage_succeeded_samples_total{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_total{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m])) - ) - ) - * 100 - > 1 - for: 15m - labels: - severity: critical - - alert: PrometheusRemoteWriteBehind - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritebehind - summary: Prometheus remote write is behind. - expr: |- - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - ( - max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m]) - - ignoring(remote_name, url) group_right - max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m]) - ) - > 120 - for: 15m - labels: - severity: critical - - alert: PrometheusRemoteWriteDesiredShards - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}` $labels.instance | query | first | value }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritedesiredshards - summary: Prometheus remote write desired shards calculation wants to run more than configured max shards. - expr: |- - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - ( - max_over_time(prometheus_remote_storage_shards_desired{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m]) - > - max_over_time(prometheus_remote_storage_shards_max{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m]) - ) - for: 15m - labels: - severity: warning - - alert: PrometheusRuleFailures - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusrulefailures - summary: Prometheus is failing rule evaluations. - expr: increase(prometheus_rule_evaluation_failures_total{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m]) > 0 - for: 15m - labels: - severity: critical - - alert: PrometheusMissingRuleEvaluations - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusmissingruleevaluations - summary: Prometheus is missing rule evaluations due to slow rule group evaluation. - expr: increase(prometheus_rule_group_iterations_missed_total{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m]) > 0 - for: 15m - labels: - severity: warning - - alert: PrometheusTargetLimitHit - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetlimithit - summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit. - expr: increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m]) > 0 - for: 15m - labels: - severity: warning - - alert: PrometheusLabelLimitHit - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuslabellimithit - summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit. - expr: increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m]) > 0 - for: 15m - labels: - severity: warning - - alert: PrometheusScrapeBodySizeLimitHit - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed {{ printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured body_size_limit. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapebodysizelimithit - summary: Prometheus has dropped some targets that exceeded body size limit. - expr: increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m]) > 0 - for: 15m - labels: - severity: warning - - alert: PrometheusScrapeSampleLimitHit - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed {{ printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured sample_limit. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapesamplelimithit - summary: Prometheus has failed scrapes that have exceeded the configured sample limit. - expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m]) > 0 - for: 15m - labels: - severity: warning - - alert: PrometheusTargetSyncFailure - annotations: - description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.namespace}}/{{$labels.pod}} have failed to sync because invalid configuration was supplied.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetsyncfailure - summary: Prometheus has failed to sync targets. - expr: increase(prometheus_target_sync_failed_total{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[30m]) > 0 - for: 5m - labels: - severity: critical - - alert: PrometheusHighQueryLoad - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload - summary: Prometheus is reaching its maximum capacity serving concurrent requests. - expr: avg_over_time(prometheus_engine_queries{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus"}[5m]) > 0.8 - for: 15m - labels: - severity: warning - - alert: PrometheusErrorSendingAlertsToAnyAlertmanager - annotations: - description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstoanyalertmanager - summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. - expr: |- - min without (alertmanager) ( - rate(prometheus_notifications_errors_total{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus",alertmanager!~``}[5m]) - / - rate(prometheus_notifications_sent_total{job="kube-prometheus-stack-1754-prometheus",namespace="prometheus",alertmanager!~``}[5m]) - ) - * 100 - > 3 - for: 15m - labels: - severity: critical ---- -# Source: kube-prometheus-stack/charts/grafana/templates/servicemonitor.yaml -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: kube-prometheus-stack-1754164871-grafana - namespace: prometheus - labels: - helm.sh/chart: grafana-9.3.0 - app.kubernetes.io/name: grafana - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "12.1.0" -spec: - endpoints: - - port: http-web - scrapeTimeout: 30s - honorLabels: true - path: /metrics - scheme: http - jobLabel: "kube-prometheus-stack-1754164871" - selector: - matchLabels: - app.kubernetes.io/name: grafana - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - namespaceSelector: - matchNames: - - prometheus ---- -# Source: kube-prometheus-stack/charts/kube-state-metrics/templates/servicemonitor.yaml -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: kube-prometheus-stack-1754164871-kube-state-metrics - namespace: prometheus - labels: - helm.sh/chart: kube-state-metrics-6.1.0 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "2.16.0" - release: kube-prometheus-stack-1754164871 -spec: - jobLabel: app.kubernetes.io/name - selector: - matchLabels: - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - endpoints: - - port: http - honorLabels: true ---- -# Source: kube-prometheus-stack/charts/prometheus-node-exporter/templates/servicemonitor.yaml -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: kube-prometheus-stack-1754164871-prometheus-node-exporter - namespace: prometheus - labels: - helm.sh/chart: prometheus-node-exporter-4.47.3 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: prometheus-node-exporter - app.kubernetes.io/name: prometheus-node-exporter - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "1.9.1" - release: kube-prometheus-stack-1754164871 -spec: - jobLabel: jobLabel - - selector: - matchLabels: - app.kubernetes.io/name: prometheus-node-exporter - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - attachMetadata: - node: false - endpoints: - - port: http-metrics - scheme: http ---- -# Source: kube-prometheus-stack/templates/alertmanager/servicemonitor.yaml -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: kube-prometheus-stack-1754-alertmanager - namespace: prometheus - labels: - app: kube-prometheus-stack-alertmanager - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - - selector: - matchLabels: - app: kube-prometheus-stack-alertmanager - release: "kube-prometheus-stack-1754164871" - self-monitor: "true" - namespaceSelector: - matchNames: - - "prometheus" - endpoints: - - port: http-web - enableHttp2: true - path: "/metrics" - - port: reloader-web - path: "/metrics" ---- -# Source: kube-prometheus-stack/templates/exporters/core-dns/servicemonitor.yaml -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: kube-prometheus-stack-1754-coredns - namespace: prometheus - labels: - app: kube-prometheus-stack-coredns - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - jobLabel: jobLabel - - selector: - matchLabels: - app: kube-prometheus-stack-coredns - release: "kube-prometheus-stack-1754164871" - namespaceSelector: - matchNames: - - "kube-system" - endpoints: - - port: http-metrics - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token ---- -# Source: kube-prometheus-stack/templates/exporters/kube-api-server/servicemonitor.yaml -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: kube-prometheus-stack-1754-apiserver - namespace: prometheus - labels: - app: kube-prometheus-stack-apiserver - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - - endpoints: - - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - port: https - scheme: https - metricRelabelings: - - action: drop - regex: (etcd_request|apiserver_request_slo|apiserver_request_sli|apiserver_request)_duration_seconds_bucket;(0\.15|0\.2|0\.3|0\.35|0\.4|0\.45|0\.6|0\.7|0\.8|0\.9|1\.25|1\.5|1\.75|2|3|3\.5|4|4\.5|6|7|8|9|15|20|40|45|50)(\.0)? - sourceLabels: - - __name__ - - le - tlsConfig: - caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - serverName: kubernetes - insecureSkipVerify: false - jobLabel: component - namespaceSelector: - matchNames: - - default - selector: - matchLabels: - component: apiserver - provider: kubernetes ---- -# Source: kube-prometheus-stack/templates/exporters/kube-controller-manager/servicemonitor.yaml -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: kube-prometheus-stack-1754-kube-controller-manager - namespace: prometheus - labels: - app: kube-prometheus-stack-kube-controller-manager - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - jobLabel: jobLabel - - selector: - matchLabels: - app: kube-prometheus-stack-kube-controller-manager - release: "kube-prometheus-stack-1754164871" - namespaceSelector: - matchNames: - - "kube-system" - endpoints: - - port: http-metrics - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - scheme: https - tlsConfig: - caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecureSkipVerify: true ---- -# Source: kube-prometheus-stack/templates/exporters/kube-etcd/servicemonitor.yaml -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: kube-prometheus-stack-1754-kube-etcd - namespace: prometheus - labels: - app: kube-prometheus-stack-kube-etcd - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - jobLabel: jobLabel - - selector: - matchLabels: - app: kube-prometheus-stack-kube-etcd - release: "kube-prometheus-stack-1754164871" - namespaceSelector: - matchNames: - - "kube-system" - endpoints: - - port: http-metrics - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token ---- -# Source: kube-prometheus-stack/templates/exporters/kube-proxy/servicemonitor.yaml -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: kube-prometheus-stack-1754-kube-proxy - namespace: prometheus - labels: - app: kube-prometheus-stack-kube-proxy - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - jobLabel: jobLabel - - selector: - matchLabels: - app: kube-prometheus-stack-kube-proxy - release: "kube-prometheus-stack-1754164871" - namespaceSelector: - matchNames: - - "kube-system" - endpoints: - - port: http-metrics - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token ---- -# Source: kube-prometheus-stack/templates/exporters/kube-scheduler/servicemonitor.yaml -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: kube-prometheus-stack-1754-kube-scheduler - namespace: prometheus - labels: - app: kube-prometheus-stack-kube-scheduler - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - jobLabel: jobLabel - - selector: - matchLabels: - app: kube-prometheus-stack-kube-scheduler - release: "kube-prometheus-stack-1754164871" - namespaceSelector: - matchNames: - - "kube-system" - endpoints: - - port: http-metrics - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - scheme: https - tlsConfig: - caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecureSkipVerify: true ---- -# Source: kube-prometheus-stack/templates/exporters/kubelet/servicemonitor.yaml -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: kube-prometheus-stack-1754-kubelet - namespace: prometheus - labels: - app: kube-prometheus-stack-kubelet - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - - attachMetadata: - node: false - jobLabel: k8s-app - namespaceSelector: - matchNames: - - kube-system - selector: - matchLabels: - app.kubernetes.io/name: kubelet - k8s-app: kubelet - endpoints: - - port: https-metrics - scheme: https - tlsConfig: - caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecureSkipVerify: true - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - honorLabels: true - honorTimestamps: true - metricRelabelings: - - action: drop - regex: (csi_operations|storage_operation_duration)_seconds_bucket;(0.25|2.5|15|25|120|600)(\.0)? - sourceLabels: - - __name__ - - le - relabelings: - - action: replace - sourceLabels: - - __metrics_path__ - targetLabel: metrics_path - - port: https-metrics - scheme: https - path: /metrics/cadvisor - interval: 10s - honorLabels: true - honorTimestamps: true - trackTimestampsStaleness: true - tlsConfig: - caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecureSkipVerify: true - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - metricRelabelings: - - action: drop - regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total) - sourceLabels: - - __name__ - - action: drop - regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total) - sourceLabels: - - __name__ - - action: drop - regex: container_memory_(mapped_file|swap) - sourceLabels: - - __name__ - - action: drop - regex: container_(file_descriptors|tasks_state|threads_max) - sourceLabels: - - __name__ - - action: drop - regex: container_memory_failures_total;hierarchy - sourceLabels: - - __name__ - - scope - - action: drop - regex: container_network_.*;(cali|cilium|cni|lxc|nodelocaldns|tunl).* - sourceLabels: - - __name__ - - interface - - action: drop - regex: container_spec.* - sourceLabels: - - __name__ - - action: drop - regex: .+; - sourceLabels: - - id - - pod - relabelings: - - action: replace - sourceLabels: - - __metrics_path__ - targetLabel: metrics_path - - port: https-metrics - scheme: https - path: /metrics/probes - honorLabels: true - honorTimestamps: true - tlsConfig: - caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecureSkipVerify: true - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - relabelings: - - action: replace - sourceLabels: - - __metrics_path__ - targetLabel: metrics_path ---- -# Source: kube-prometheus-stack/templates/prometheus-operator/servicemonitor.yaml -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: kube-prometheus-stack-1754-operator - namespace: prometheus - labels: - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" - app: kube-prometheus-stack-operator - app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator - app.kubernetes.io/component: prometheus-operator -spec: - - endpoints: - - port: https - scheme: https - tlsConfig: - serverName: kube-prometheus-stack-1754-operator - ca: - secret: - name: kube-prometheus-stack-1754-admission - key: ca - optional: false - honorLabels: true - selector: - matchLabels: - app: kube-prometheus-stack-operator - release: "kube-prometheus-stack-1754164871" - namespaceSelector: - matchNames: - - "prometheus" ---- -# Source: kube-prometheus-stack/templates/prometheus/servicemonitor.yaml -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: kube-prometheus-stack-1754-prometheus - namespace: prometheus - labels: - app: kube-prometheus-stack-prometheus - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" -spec: - - selector: - matchLabels: - app: kube-prometheus-stack-prometheus - release: "kube-prometheus-stack-1754164871" - self-monitor: "true" - namespaceSelector: - matchNames: - - "prometheus" - endpoints: - - port: http-web - path: "/metrics" - - port: reloader-web - path: "/metrics" ---- -# Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/validatingWebhookConfiguration.yaml -apiVersion: admissionregistration.k8s.io/v1 -kind: ValidatingWebhookConfiguration -metadata: - name: kube-prometheus-stack-1754-admission - annotations: - - labels: - app: kube-prometheus-stack-admission - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/version: "75.15.1" - app.kubernetes.io/part-of: kube-prometheus-stack - chart: kube-prometheus-stack-75.15.1 - release: "kube-prometheus-stack-1754164871" - heritage: "Helm" - app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator - app.kubernetes.io/component: prometheus-operator-webhook -webhooks: - - name: prometheusrulemutate.monitoring.coreos.com - failurePolicy: Ignore - rules: - - apiGroups: - - monitoring.coreos.com - apiVersions: - - "*" - resources: - - prometheusrules - operations: - - CREATE - - UPDATE - clientConfig: - service: - namespace: prometheus - name: kube-prometheus-stack-1754-operator - path: /admission-prometheusrules/validate - timeoutSeconds: 10 - admissionReviewVersions: ["v1", "v1beta1"] - sideEffects: None - diff --git a/docs/source/distributions/k8s/all-values.yaml b/docs/source/distributions/k8s/prometheus-value.yaml similarity index 100% rename from docs/source/distributions/k8s/all-values.yaml rename to docs/source/distributions/k8s/prometheus-value.yaml diff --git a/docs/source/distributions/k8s/svc-and-pods.yaml b/docs/source/distributions/k8s/svc-and-pods.yaml deleted file mode 100644 index 51ca62fa1..000000000 --- a/docs/source/distributions/k8s/svc-and-pods.yaml +++ /dev/null @@ -1,2274 +0,0 @@ -apiVersion: v1 -items: -- apiVersion: v1 - kind: Service - metadata: - creationTimestamp: "2025-08-02T20:01:52Z" - labels: - managed-by: prometheus-operator - operated-alertmanager: "true" - name: alertmanager-operated - namespace: prometheus - ownerReferences: - - apiVersion: monitoring.coreos.com/v1 - kind: Alertmanager - name: kube-prometheus-stack-1754-alertmanager - uid: 37e64ef7-78b0-485a-9c86-e4e83188d868 - resourceVersion: "93640374" - uid: 9a4a4644-e7fd-40b5-b14b-fb0055e37cee - spec: - clusterIP: None - clusterIPs: - - None - internalTrafficPolicy: Cluster - ipFamilies: - - IPv4 - ipFamilyPolicy: SingleStack - ports: - - name: http-web - port: 9093 - protocol: TCP - targetPort: http-web - - name: tcp-mesh - port: 9094 - protocol: TCP - targetPort: 9094 - - name: udp-mesh - port: 9094 - protocol: UDP - targetPort: 9094 - publishNotReadyAddresses: true - selector: - app.kubernetes.io/name: alertmanager - sessionAffinity: None - type: ClusterIP - status: - loadBalancer: {} -- apiVersion: v1 - kind: Service - metadata: - annotations: - meta.helm.sh/release-name: kube-prometheus-stack-1754164871 - meta.helm.sh/release-namespace: prometheus - creationTimestamp: "2025-08-02T20:01:49Z" - labels: - app: kube-prometheus-stack-alertmanager - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/part-of: kube-prometheus-stack - app.kubernetes.io/version: 75.15.1 - chart: kube-prometheus-stack-75.15.1 - heritage: Helm - release: kube-prometheus-stack-1754164871 - self-monitor: "true" - name: kube-prometheus-stack-1754-alertmanager - namespace: prometheus - resourceVersion: "93640183" - uid: 552a7228-3b1d-4dd8-abd7-f491f365a295 - spec: - clusterIP: 10.100.154.228 - clusterIPs: - - 10.100.154.228 - internalTrafficPolicy: Cluster - ipFamilies: - - IPv4 - ipFamilyPolicy: SingleStack - ports: - - name: http-web - port: 9093 - protocol: TCP - targetPort: 9093 - - appProtocol: http - name: reloader-web - port: 8080 - protocol: TCP - targetPort: reloader-web - selector: - alertmanager: kube-prometheus-stack-1754-alertmanager - app.kubernetes.io/name: alertmanager - sessionAffinity: None - type: ClusterIP - status: - loadBalancer: {} -- apiVersion: v1 - kind: Service - metadata: - annotations: - meta.helm.sh/release-name: kube-prometheus-stack-1754164871 - meta.helm.sh/release-namespace: prometheus - creationTimestamp: "2025-08-02T20:01:49Z" - labels: - app: kube-prometheus-stack-operator - app.kubernetes.io/component: prometheus-operator - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator - app.kubernetes.io/part-of: kube-prometheus-stack - app.kubernetes.io/version: 75.15.1 - chart: kube-prometheus-stack-75.15.1 - heritage: Helm - release: kube-prometheus-stack-1754164871 - name: kube-prometheus-stack-1754-operator - namespace: prometheus - resourceVersion: "93640195" - uid: 240740e2-b4d2-4a14-a52d-7f73a93256c0 - spec: - clusterIP: 10.100.175.188 - clusterIPs: - - 10.100.175.188 - internalTrafficPolicy: Cluster - ipFamilies: - - IPv4 - ipFamilyPolicy: SingleStack - ports: - - name: https - port: 443 - protocol: TCP - targetPort: https - selector: - app: kube-prometheus-stack-operator - release: kube-prometheus-stack-1754164871 - sessionAffinity: None - type: ClusterIP - status: - loadBalancer: {} -- apiVersion: v1 - kind: Service - metadata: - annotations: - meta.helm.sh/release-name: kube-prometheus-stack-1754164871 - meta.helm.sh/release-namespace: prometheus - creationTimestamp: "2025-08-02T20:01:49Z" - labels: - app: kube-prometheus-stack-prometheus - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/part-of: kube-prometheus-stack - app.kubernetes.io/version: 75.15.1 - chart: kube-prometheus-stack-75.15.1 - heritage: Helm - release: kube-prometheus-stack-1754164871 - self-monitor: "true" - name: kube-prometheus-stack-1754-prometheus - namespace: prometheus - resourceVersion: "93640192" - uid: ce5f1fd7-e1a7-4656-b01b-1abf881246bd - spec: - clusterIP: 10.100.64.119 - clusterIPs: - - 10.100.64.119 - externalTrafficPolicy: Cluster - internalTrafficPolicy: Cluster - ipFamilies: - - IPv4 - ipFamilyPolicy: SingleStack - ports: - - name: http-web - nodePort: 30090 - port: 9090 - protocol: TCP - targetPort: 9090 - - appProtocol: http - name: reloader-web - nodePort: 30588 - port: 8080 - protocol: TCP - targetPort: reloader-web - selector: - app.kubernetes.io/name: prometheus - operator.prometheus.io/name: kube-prometheus-stack-1754-prometheus - sessionAffinity: None - type: NodePort - status: - loadBalancer: {} -- apiVersion: v1 - kind: Service - metadata: - annotations: - meta.helm.sh/release-name: kube-prometheus-stack-1754164871 - meta.helm.sh/release-namespace: prometheus - creationTimestamp: "2025-08-02T20:01:49Z" - labels: - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/name: grafana - app.kubernetes.io/version: 12.1.0 - helm.sh/chart: grafana-9.3.0 - name: kube-prometheus-stack-1754164871-grafana - namespace: prometheus - resourceVersion: "93643729" - uid: 49a69113-710e-4750-aa03-7c72e1306cf8 - spec: - clusterIP: 10.100.18.230 - clusterIPs: - - 10.100.18.230 - externalTrafficPolicy: Cluster - internalTrafficPolicy: Cluster - ipFamilies: - - IPv4 - ipFamilyPolicy: SingleStack - ports: - - name: http-web - nodePort: 31509 - port: 80 - protocol: TCP - targetPort: 3000 - selector: - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/name: grafana - sessionAffinity: None - type: NodePort - status: - loadBalancer: {} -- apiVersion: v1 - kind: Service - metadata: - annotations: - meta.helm.sh/release-name: kube-prometheus-stack-1754164871 - meta.helm.sh/release-namespace: prometheus - creationTimestamp: "2025-08-02T20:01:49Z" - labels: - app.kubernetes.io/component: metrics - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/version: 2.16.0 - helm.sh/chart: kube-state-metrics-6.1.0 - release: kube-prometheus-stack-1754164871 - name: kube-prometheus-stack-1754164871-kube-state-metrics - namespace: prometheus - resourceVersion: "93640202" - uid: 3fcd1cf8-7218-4624-bcde-563ef0b4c809 - spec: - clusterIP: 10.100.158.39 - clusterIPs: - - 10.100.158.39 - internalTrafficPolicy: Cluster - ipFamilies: - - IPv4 - ipFamilyPolicy: SingleStack - ports: - - name: http - port: 8080 - protocol: TCP - targetPort: 8080 - selector: - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/name: kube-state-metrics - sessionAffinity: None - type: ClusterIP - status: - loadBalancer: {} -- apiVersion: v1 - kind: Service - metadata: - annotations: - meta.helm.sh/release-name: kube-prometheus-stack-1754164871 - meta.helm.sh/release-namespace: prometheus - prometheus.io/scrape: "true" - creationTimestamp: "2025-08-02T20:01:49Z" - labels: - app.kubernetes.io/component: metrics - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/name: prometheus-node-exporter - app.kubernetes.io/part-of: prometheus-node-exporter - app.kubernetes.io/version: 1.9.1 - helm.sh/chart: prometheus-node-exporter-4.47.3 - jobLabel: node-exporter - release: kube-prometheus-stack-1754164871 - name: kube-prometheus-stack-1754164871-prometheus-node-exporter - namespace: prometheus - resourceVersion: "93640189" - uid: 36ceaa10-3544-4d2a-92bb-9d839a0a0dbc - spec: - clusterIP: 10.100.153.18 - clusterIPs: - - 10.100.153.18 - internalTrafficPolicy: Cluster - ipFamilies: - - IPv4 - ipFamilyPolicy: SingleStack - ports: - - name: http-metrics - port: 9100 - protocol: TCP - targetPort: 9100 - selector: - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/name: prometheus-node-exporter - sessionAffinity: None - type: ClusterIP - status: - loadBalancer: {} -- apiVersion: v1 - kind: Service - metadata: - creationTimestamp: "2025-08-02T20:01:52Z" - labels: - managed-by: prometheus-operator - operated-prometheus: "true" - name: prometheus-operated - namespace: prometheus - ownerReferences: - - apiVersion: monitoring.coreos.com/v1 - kind: Prometheus - name: kube-prometheus-stack-1754-prometheus - uid: f76b5a30-c66c-4d0b-b4ec-0854154752ba - resourceVersion: "93640390" - uid: c98da91f-e91c-454c-b8be-ac5471bd94a0 - spec: - clusterIP: None - clusterIPs: - - None - internalTrafficPolicy: Cluster - ipFamilies: - - IPv4 - ipFamilyPolicy: SingleStack - ports: - - name: http-web - port: 9090 - protocol: TCP - targetPort: http-web - selector: - app.kubernetes.io/name: prometheus - sessionAffinity: None - type: ClusterIP - status: - loadBalancer: {} -- apiVersion: v1 - kind: Pod - metadata: - annotations: - kubectl.kubernetes.io/default-container: alertmanager - creationTimestamp: "2025-08-02T20:01:52Z" - generateName: alertmanager-kube-prometheus-stack-1754-alertmanager- - labels: - alertmanager: kube-prometheus-stack-1754-alertmanager - app.kubernetes.io/instance: kube-prometheus-stack-1754-alertmanager - app.kubernetes.io/managed-by: prometheus-operator - app.kubernetes.io/name: alertmanager - app.kubernetes.io/version: 0.28.1 - apps.kubernetes.io/pod-index: "0" - controller-revision-hash: alertmanager-kube-prometheus-stack-1754-alertmanager-7677c479d - statefulset.kubernetes.io/pod-name: alertmanager-kube-prometheus-stack-1754-alertmanager-0 - name: alertmanager-kube-prometheus-stack-1754-alertmanager-0 - namespace: prometheus - ownerReferences: - - apiVersion: apps/v1 - blockOwnerDeletion: true - controller: true - kind: StatefulSet - name: alertmanager-kube-prometheus-stack-1754-alertmanager - uid: 5f535039-e58c-49d0-972f-e1c916e59fe2 - resourceVersion: "93640559" - uid: 87515b5e-8797-423e-ad3a-475082c2f928 - spec: - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - podAffinityTerm: - labelSelector: - matchExpressions: - - key: app.kubernetes.io/name - operator: In - values: - - alertmanager - - key: alertmanager - operator: In - values: - - kube-prometheus-stack-1754-alertmanager - topologyKey: kubernetes.io/hostname - weight: 100 - automountServiceAccountToken: true - containers: - - args: - - --config.file=/etc/alertmanager/config_out/alertmanager.env.yaml - - --storage.path=/alertmanager - - --data.retention=120h - - --cluster.listen-address= - - --web.listen-address=:9093 - - --web.external-url=http://kube-prometheus-stack-1754-alertmanager.prometheus:9093 - - --web.route-prefix=/ - - --cluster.label=prometheus/kube-prometheus-stack-1754-alertmanager - - --cluster.peer=alertmanager-kube-prometheus-stack-1754-alertmanager-0.alertmanager-operated:9094 - - --cluster.reconnect-timeout=5m - - --web.config.file=/etc/alertmanager/web_config/web-config.yaml - env: - - name: POD_IP - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: status.podIP - image: quay.io/prometheus/alertmanager:v0.28.1 - imagePullPolicy: IfNotPresent - livenessProbe: - failureThreshold: 10 - httpGet: - path: /-/healthy - port: http-web - scheme: HTTP - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 3 - name: alertmanager - ports: - - containerPort: 9093 - name: http-web - protocol: TCP - - containerPort: 9094 - name: mesh-tcp - protocol: TCP - - containerPort: 9094 - name: mesh-udp - protocol: UDP - readinessProbe: - failureThreshold: 10 - httpGet: - path: /-/ready - port: http-web - scheme: HTTP - initialDelaySeconds: 3 - periodSeconds: 5 - successThreshold: 1 - timeoutSeconds: 3 - resources: - requests: - memory: 200Mi - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: FallbackToLogsOnError - volumeMounts: - - mountPath: /etc/alertmanager/config - name: config-volume - - mountPath: /etc/alertmanager/config_out - name: config-out - readOnly: true - - mountPath: /etc/alertmanager/certs - name: tls-assets - readOnly: true - - mountPath: /alertmanager - name: alertmanager-kube-prometheus-stack-1754-alertmanager-db - - mountPath: /etc/alertmanager/web_config/web-config.yaml - name: web-config - readOnly: true - subPath: web-config.yaml - - mountPath: /etc/alertmanager/cluster_tls_config/cluster-tls-config.yaml - name: cluster-tls-config - readOnly: true - subPath: cluster-tls-config.yaml - - mountPath: /var/run/secrets/kubernetes.io/serviceaccount - name: kube-api-access-nl55p - readOnly: true - - args: - - --listen-address=:8080 - - --web-config-file=/etc/alertmanager/web_config/web-config.yaml - - --reload-url=http://127.0.0.1:9093/-/reload - - --config-file=/etc/alertmanager/config/alertmanager.yaml.gz - - --config-envsubst-file=/etc/alertmanager/config_out/alertmanager.env.yaml - - --watched-dir=/etc/alertmanager/config - command: - - /bin/prometheus-config-reloader - env: - - name: POD_NAME - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: metadata.name - - name: SHARD - value: "-1" - image: quay.io/prometheus-operator/prometheus-config-reloader:v0.83.0 - imagePullPolicy: IfNotPresent - name: config-reloader - ports: - - containerPort: 8080 - name: reloader-web - protocol: TCP - resources: {} - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: FallbackToLogsOnError - volumeMounts: - - mountPath: /etc/alertmanager/config - name: config-volume - readOnly: true - - mountPath: /etc/alertmanager/config_out - name: config-out - - mountPath: /etc/alertmanager/web_config/web-config.yaml - name: web-config - readOnly: true - subPath: web-config.yaml - - mountPath: /var/run/secrets/kubernetes.io/serviceaccount - name: kube-api-access-nl55p - readOnly: true - dnsPolicy: ClusterFirst - enableServiceLinks: true - hostname: alertmanager-kube-prometheus-stack-1754-alertmanager-0 - initContainers: - - args: - - --watch-interval=0 - - --listen-address=:8081 - - --config-file=/etc/alertmanager/config/alertmanager.yaml.gz - - --config-envsubst-file=/etc/alertmanager/config_out/alertmanager.env.yaml - - --watched-dir=/etc/alertmanager/config - command: - - /bin/prometheus-config-reloader - env: - - name: POD_NAME - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: metadata.name - - name: SHARD - value: "-1" - image: quay.io/prometheus-operator/prometheus-config-reloader:v0.83.0 - imagePullPolicy: IfNotPresent - name: init-config-reloader - ports: - - containerPort: 8081 - name: reloader-web - protocol: TCP - resources: {} - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: FallbackToLogsOnError - volumeMounts: - - mountPath: /etc/alertmanager/config - name: config-volume - readOnly: true - - mountPath: /etc/alertmanager/config_out - name: config-out - - mountPath: /etc/alertmanager/web_config/web-config.yaml - name: web-config - readOnly: true - subPath: web-config.yaml - - mountPath: /var/run/secrets/kubernetes.io/serviceaccount - name: kube-api-access-nl55p - readOnly: true - nodeName: ip-172-33-32-41.us-west-2.compute.internal - preemptionPolicy: PreemptLowerPriority - priority: 0 - restartPolicy: Always - schedulerName: default-scheduler - securityContext: - fsGroup: 2000 - runAsGroup: 2000 - runAsNonRoot: true - runAsUser: 1000 - seccompProfile: - type: RuntimeDefault - serviceAccount: kube-prometheus-stack-1754-alertmanager - serviceAccountName: kube-prometheus-stack-1754-alertmanager - subdomain: alertmanager-operated - terminationGracePeriodSeconds: 120 - tolerations: - - effect: NoExecute - key: node.kubernetes.io/not-ready - operator: Exists - tolerationSeconds: 300 - - effect: NoExecute - key: node.kubernetes.io/unreachable - operator: Exists - tolerationSeconds: 300 - volumes: - - name: config-volume - secret: - defaultMode: 420 - secretName: alertmanager-kube-prometheus-stack-1754-alertmanager-generated - - name: tls-assets - projected: - defaultMode: 420 - sources: - - secret: - name: alertmanager-kube-prometheus-stack-1754-alertmanager-tls-assets-0 - - emptyDir: - medium: Memory - name: config-out - - name: web-config - secret: - defaultMode: 420 - secretName: alertmanager-kube-prometheus-stack-1754-alertmanager-web-config - - name: cluster-tls-config - secret: - defaultMode: 420 - secretName: alertmanager-kube-prometheus-stack-1754-alertmanager-cluster-tls-config - - emptyDir: {} - name: alertmanager-kube-prometheus-stack-1754-alertmanager-db - - name: kube-api-access-nl55p - projected: - defaultMode: 420 - sources: - - serviceAccountToken: - expirationSeconds: 3607 - path: token - - configMap: - items: - - key: ca.crt - path: ca.crt - name: kube-root-ca.crt - - downwardAPI: - items: - - fieldRef: - apiVersion: v1 - fieldPath: metadata.namespace - path: namespace - status: - conditions: - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:54Z" - status: "True" - type: PodReadyToStartContainers - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:56Z" - status: "True" - type: Initialized - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:02:02Z" - status: "True" - type: Ready - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:02:02Z" - status: "True" - type: ContainersReady - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:52Z" - status: "True" - type: PodScheduled - containerStatuses: - - containerID: containerd://194f0a60e4b5062f6a7cf9a06906132d0b0edb5971121ac2374ffd9e4ab189d9 - image: quay.io/prometheus/alertmanager:v0.28.1 - imageID: quay.io/prometheus/alertmanager@sha256:27c475db5fb156cab31d5c18a4251ac7ed567746a2483ff264516437a39b15ba - lastState: {} - name: alertmanager - ready: true - restartCount: 0 - started: true - state: - running: - startedAt: "2025-08-02T20:01:58Z" - volumeMounts: - - mountPath: /etc/alertmanager/config - name: config-volume - - mountPath: /etc/alertmanager/config_out - name: config-out - readOnly: true - recursiveReadOnly: Disabled - - mountPath: /etc/alertmanager/certs - name: tls-assets - readOnly: true - recursiveReadOnly: Disabled - - mountPath: /alertmanager - name: alertmanager-kube-prometheus-stack-1754-alertmanager-db - - mountPath: /etc/alertmanager/web_config/web-config.yaml - name: web-config - readOnly: true - recursiveReadOnly: Disabled - - mountPath: /etc/alertmanager/cluster_tls_config/cluster-tls-config.yaml - name: cluster-tls-config - readOnly: true - recursiveReadOnly: Disabled - - mountPath: /var/run/secrets/kubernetes.io/serviceaccount - name: kube-api-access-nl55p - readOnly: true - recursiveReadOnly: Disabled - - containerID: containerd://c2bb73168093a0fcd50da24d316874aa1190fcc36f21f61581cd404dc1c3b79f - image: quay.io/prometheus-operator/prometheus-config-reloader:v0.83.0 - imageID: quay.io/prometheus-operator/prometheus-config-reloader@sha256:78aec597d87aa2b4ba947ab9190538dae93a58a67b8e930aefea1086534b02ef - lastState: {} - name: config-reloader - ready: true - restartCount: 0 - started: true - state: - running: - startedAt: "2025-08-02T20:01:58Z" - volumeMounts: - - mountPath: /etc/alertmanager/config - name: config-volume - readOnly: true - recursiveReadOnly: Disabled - - mountPath: /etc/alertmanager/config_out - name: config-out - - mountPath: /etc/alertmanager/web_config/web-config.yaml - name: web-config - readOnly: true - recursiveReadOnly: Disabled - - mountPath: /var/run/secrets/kubernetes.io/serviceaccount - name: kube-api-access-nl55p - readOnly: true - recursiveReadOnly: Disabled - hostIP: 172.33.32.41 - hostIPs: - - ip: 172.33.32.41 - initContainerStatuses: - - containerID: containerd://f8cac5cdbb6a72d8c8a85888059863eb716d42c3a090588042418e233f67d027 - image: quay.io/prometheus-operator/prometheus-config-reloader:v0.83.0 - imageID: quay.io/prometheus-operator/prometheus-config-reloader@sha256:78aec597d87aa2b4ba947ab9190538dae93a58a67b8e930aefea1086534b02ef - lastState: {} - name: init-config-reloader - ready: true - restartCount: 0 - started: false - state: - terminated: - containerID: containerd://f8cac5cdbb6a72d8c8a85888059863eb716d42c3a090588042418e233f67d027 - exitCode: 0 - finishedAt: "2025-08-02T20:01:54Z" - reason: Completed - startedAt: "2025-08-02T20:01:54Z" - volumeMounts: - - mountPath: /etc/alertmanager/config - name: config-volume - readOnly: true - recursiveReadOnly: Disabled - - mountPath: /etc/alertmanager/config_out - name: config-out - - mountPath: /etc/alertmanager/web_config/web-config.yaml - name: web-config - readOnly: true - recursiveReadOnly: Disabled - - mountPath: /var/run/secrets/kubernetes.io/serviceaccount - name: kube-api-access-nl55p - readOnly: true - recursiveReadOnly: Disabled - phase: Running - podIP: 172.33.46.59 - podIPs: - - ip: 172.33.46.59 - qosClass: Burstable - startTime: "2025-08-02T20:01:52Z" -- apiVersion: v1 - kind: Pod - metadata: - creationTimestamp: "2025-08-02T20:01:49Z" - generateName: kube-prometheus-stack-1754-operator-57d8448ffc- - labels: - app: kube-prometheus-stack-operator - app.kubernetes.io/component: prometheus-operator - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/name: kube-prometheus-stack-prometheus-operator - app.kubernetes.io/part-of: kube-prometheus-stack - app.kubernetes.io/version: 75.15.1 - chart: kube-prometheus-stack-75.15.1 - heritage: Helm - pod-template-hash: 57d8448ffc - release: kube-prometheus-stack-1754164871 - name: kube-prometheus-stack-1754-operator-57d8448ffc-zkxrs - namespace: prometheus - ownerReferences: - - apiVersion: apps/v1 - blockOwnerDeletion: true - controller: true - kind: ReplicaSet - name: kube-prometheus-stack-1754-operator-57d8448ffc - uid: f323c763-7775-4cb7-af35-996f6dbe58c5 - resourceVersion: "93640436" - uid: 7920b221-bd8e-4b25-984a-c4fe4ffff94d - spec: - automountServiceAccountToken: true - containers: - - args: - - --kubelet-service=kube-system/kube-prometheus-stack-1754-kubelet - - --kubelet-endpoints=true - - --kubelet-endpointslice=false - - --localhost=127.0.0.1 - - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.83.0 - - --config-reloader-cpu-request=0 - - --config-reloader-cpu-limit=0 - - --config-reloader-memory-request=0 - - --config-reloader-memory-limit=0 - - --thanos-default-base-image=quay.io/thanos/thanos:v0.39.2 - - --secret-field-selector=type!=kubernetes.io/dockercfg,type!=kubernetes.io/service-account-token,type!=helm.sh/release.v1 - - --web.enable-tls=true - - --web.cert-file=/cert/cert - - --web.key-file=/cert/key - - --web.listen-address=:10250 - - --web.tls-min-version=VersionTLS13 - env: - - name: GOGC - value: "30" - image: quay.io/prometheus-operator/prometheus-operator:v0.83.0 - imagePullPolicy: IfNotPresent - livenessProbe: - failureThreshold: 3 - httpGet: - path: /healthz - port: https - scheme: HTTPS - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 1 - name: kube-prometheus-stack - ports: - - containerPort: 10250 - name: https - protocol: TCP - readinessProbe: - failureThreshold: 3 - httpGet: - path: /healthz - port: https - scheme: HTTPS - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 1 - resources: {} - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File - volumeMounts: - - mountPath: /cert - name: tls-secret - readOnly: true - - mountPath: /var/run/secrets/kubernetes.io/serviceaccount - name: kube-api-access-7qbvh - readOnly: true - dnsPolicy: ClusterFirst - enableServiceLinks: true - nodeName: ip-172-33-32-41.us-west-2.compute.internal - preemptionPolicy: PreemptLowerPriority - priority: 0 - restartPolicy: Always - schedulerName: default-scheduler - securityContext: - fsGroup: 65534 - runAsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - seccompProfile: - type: RuntimeDefault - serviceAccount: kube-prometheus-stack-1754-operator - serviceAccountName: kube-prometheus-stack-1754-operator - terminationGracePeriodSeconds: 30 - tolerations: - - effect: NoExecute - key: node.kubernetes.io/not-ready - operator: Exists - tolerationSeconds: 300 - - effect: NoExecute - key: node.kubernetes.io/unreachable - operator: Exists - tolerationSeconds: 300 - volumes: - - name: tls-secret - secret: - defaultMode: 420 - secretName: kube-prometheus-stack-1754-admission - - name: kube-api-access-7qbvh - projected: - defaultMode: 420 - sources: - - serviceAccountToken: - expirationSeconds: 3607 - path: token - - configMap: - items: - - key: ca.crt - path: ca.crt - name: kube-root-ca.crt - - downwardAPI: - items: - - fieldRef: - apiVersion: v1 - fieldPath: metadata.namespace - path: namespace - status: - conditions: - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:52Z" - status: "True" - type: PodReadyToStartContainers - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:49Z" - status: "True" - type: Initialized - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:53Z" - status: "True" - type: Ready - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:53Z" - status: "True" - type: ContainersReady - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:49Z" - status: "True" - type: PodScheduled - containerStatuses: - - containerID: containerd://072cc3d39cda99ab1807a732bb2524dddefb0f8f1386f0aebbffe5a4ec9a1283 - image: quay.io/prometheus-operator/prometheus-operator:v0.83.0 - imageID: quay.io/prometheus-operator/prometheus-operator@sha256:b6a89b8ec08f4cca759b2d579e8545f97ffb897973fcd68148c153f2e936c8b3 - lastState: {} - name: kube-prometheus-stack - ready: true - restartCount: 0 - started: true - state: - running: - startedAt: "2025-08-02T20:01:51Z" - volumeMounts: - - mountPath: /cert - name: tls-secret - readOnly: true - recursiveReadOnly: Disabled - - mountPath: /var/run/secrets/kubernetes.io/serviceaccount - name: kube-api-access-7qbvh - readOnly: true - recursiveReadOnly: Disabled - hostIP: 172.33.32.41 - hostIPs: - - ip: 172.33.32.41 - phase: Running - podIP: 172.33.54.157 - podIPs: - - ip: 172.33.54.157 - qosClass: BestEffort - startTime: "2025-08-02T20:01:49Z" -- apiVersion: v1 - kind: Pod - metadata: - annotations: - checksum/config: 0e9cbd0ea8e24e32f7dfca5bab17a2ba05652642f0a09a4882833ae88e4cc4a3 - checksum/sc-dashboard-provider-config: e70bf6a851099d385178a76de9757bb0bef8299da6d8443602590e44f05fdf24 - checksum/secret: 032056e9c62bbe9d1daa41ee49cd3d9524c076f51ca4c65adadf4ef08ef28712 - kubectl.kubernetes.io/default-container: grafana - creationTimestamp: "2025-08-02T20:01:49Z" - generateName: kube-prometheus-stack-1754164871-grafana-85445fd75f- - labels: - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/name: grafana - app.kubernetes.io/version: 12.1.0 - helm.sh/chart: grafana-9.3.0 - pod-template-hash: 85445fd75f - name: kube-prometheus-stack-1754164871-grafana-85445fd75f-4sc65 - namespace: prometheus - ownerReferences: - - apiVersion: apps/v1 - blockOwnerDeletion: true - controller: true - kind: ReplicaSet - name: kube-prometheus-stack-1754164871-grafana-85445fd75f - uid: 8dd3307e-8b75-47c7-ac66-e03eb22764f5 - resourceVersion: "93640642" - uid: c49e9c23-10b3-4c45-b6de-a9d388d3022a - spec: - automountServiceAccountToken: true - containers: - - env: - - name: METHOD - value: WATCH - - name: LABEL - value: grafana_dashboard - - name: LABEL_VALUE - value: "1" - - name: FOLDER - value: /tmp/dashboards - - name: RESOURCE - value: both - - name: NAMESPACE - value: ALL - - name: REQ_USERNAME - valueFrom: - secretKeyRef: - key: admin-user - name: kube-prometheus-stack-1754164871-grafana - - name: REQ_PASSWORD - valueFrom: - secretKeyRef: - key: admin-password - name: kube-prometheus-stack-1754164871-grafana - - name: REQ_URL - value: http://localhost:3000/api/admin/provisioning/dashboards/reload - - name: REQ_METHOD - value: POST - image: quay.io/kiwigrid/k8s-sidecar:1.30.3 - imagePullPolicy: IfNotPresent - name: grafana-sc-dashboard - resources: {} - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - seccompProfile: - type: RuntimeDefault - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File - volumeMounts: - - mountPath: /tmp/dashboards - name: sc-dashboard-volume - - mountPath: /var/run/secrets/kubernetes.io/serviceaccount - name: kube-api-access-cdhhf - readOnly: true - - env: - - name: METHOD - value: WATCH - - name: LABEL - value: grafana_datasource - - name: LABEL_VALUE - value: "1" - - name: FOLDER - value: /etc/grafana/provisioning/datasources - - name: RESOURCE - value: both - - name: REQ_USERNAME - valueFrom: - secretKeyRef: - key: admin-user - name: kube-prometheus-stack-1754164871-grafana - - name: REQ_PASSWORD - valueFrom: - secretKeyRef: - key: admin-password - name: kube-prometheus-stack-1754164871-grafana - - name: REQ_URL - value: http://localhost:3000/api/admin/provisioning/datasources/reload - - name: REQ_METHOD - value: POST - image: quay.io/kiwigrid/k8s-sidecar:1.30.3 - imagePullPolicy: IfNotPresent - name: grafana-sc-datasources - resources: {} - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - seccompProfile: - type: RuntimeDefault - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File - volumeMounts: - - mountPath: /etc/grafana/provisioning/datasources - name: sc-datasources-volume - - mountPath: /var/run/secrets/kubernetes.io/serviceaccount - name: kube-api-access-cdhhf - readOnly: true - - env: - - name: POD_IP - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: status.podIP - - name: GF_SECURITY_ADMIN_USER - valueFrom: - secretKeyRef: - key: admin-user - name: kube-prometheus-stack-1754164871-grafana - - name: GF_SECURITY_ADMIN_PASSWORD - valueFrom: - secretKeyRef: - key: admin-password - name: kube-prometheus-stack-1754164871-grafana - - name: GF_PATHS_DATA - value: /var/lib/grafana/ - - name: GF_PATHS_LOGS - value: /var/log/grafana - - name: GF_PATHS_PLUGINS - value: /var/lib/grafana/plugins - - name: GF_PATHS_PROVISIONING - value: /etc/grafana/provisioning - image: docker.io/grafana/grafana:12.1.0 - imagePullPolicy: IfNotPresent - livenessProbe: - failureThreshold: 10 - httpGet: - path: /api/health - port: 3000 - scheme: HTTP - initialDelaySeconds: 60 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 30 - name: grafana - ports: - - containerPort: 3000 - name: grafana - protocol: TCP - - containerPort: 9094 - name: gossip-tcp - protocol: TCP - - containerPort: 9094 - name: gossip-udp - protocol: UDP - - containerPort: 6060 - name: profiling - protocol: TCP - readinessProbe: - failureThreshold: 3 - httpGet: - path: /api/health - port: 3000 - scheme: HTTP - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 1 - resources: {} - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - seccompProfile: - type: RuntimeDefault - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File - volumeMounts: - - mountPath: /etc/grafana/grafana.ini - name: config - subPath: grafana.ini - - mountPath: /var/lib/grafana - name: storage - - mountPath: /tmp/dashboards - name: sc-dashboard-volume - - mountPath: /etc/grafana/provisioning/dashboards/sc-dashboardproviders.yaml - name: sc-dashboard-provider - subPath: provider.yaml - - mountPath: /etc/grafana/provisioning/datasources - name: sc-datasources-volume - - mountPath: /var/run/secrets/kubernetes.io/serviceaccount - name: kube-api-access-cdhhf - readOnly: true - dnsPolicy: ClusterFirst - enableServiceLinks: true - nodeName: ip-172-33-32-41.us-west-2.compute.internal - preemptionPolicy: PreemptLowerPriority - priority: 0 - restartPolicy: Always - schedulerName: default-scheduler - securityContext: - fsGroup: 472 - runAsGroup: 472 - runAsNonRoot: true - runAsUser: 472 - serviceAccount: kube-prometheus-stack-1754164871-grafana - serviceAccountName: kube-prometheus-stack-1754164871-grafana - shareProcessNamespace: false - terminationGracePeriodSeconds: 30 - tolerations: - - effect: NoExecute - key: node.kubernetes.io/not-ready - operator: Exists - tolerationSeconds: 300 - - effect: NoExecute - key: node.kubernetes.io/unreachable - operator: Exists - tolerationSeconds: 300 - volumes: - - configMap: - defaultMode: 420 - name: kube-prometheus-stack-1754164871-grafana - name: config - - emptyDir: {} - name: storage - - emptyDir: {} - name: sc-dashboard-volume - - configMap: - defaultMode: 420 - name: kube-prometheus-stack-1754164871-grafana-config-dashboards - name: sc-dashboard-provider - - emptyDir: {} - name: sc-datasources-volume - - name: kube-api-access-cdhhf - projected: - defaultMode: 420 - sources: - - serviceAccountToken: - expirationSeconds: 3607 - path: token - - configMap: - items: - - key: ca.crt - path: ca.crt - name: kube-root-ca.crt - - downwardAPI: - items: - - fieldRef: - apiVersion: v1 - fieldPath: metadata.namespace - path: namespace - status: - conditions: - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:02:02Z" - status: "True" - type: PodReadyToStartContainers - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:49Z" - status: "True" - type: Initialized - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:02:09Z" - status: "True" - type: Ready - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:02:09Z" - status: "True" - type: ContainersReady - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:49Z" - status: "True" - type: PodScheduled - containerStatuses: - - containerID: containerd://114d989a3b87c0a9005a04b7661adf5ba4be8392d2dd2571d00e5c3450c38d6f - image: docker.io/grafana/grafana:12.1.0 - imageID: docker.io/grafana/grafana@sha256:6ac590e7cabc2fbe8d7b8fc1ce9c9f0582177b334e0df9c927ebd9670469440f - lastState: {} - name: grafana - ready: true - restartCount: 0 - started: true - state: - running: - startedAt: "2025-08-02T20:02:02Z" - volumeMounts: - - mountPath: /etc/grafana/grafana.ini - name: config - - mountPath: /var/lib/grafana - name: storage - - mountPath: /tmp/dashboards - name: sc-dashboard-volume - - mountPath: /etc/grafana/provisioning/dashboards/sc-dashboardproviders.yaml - name: sc-dashboard-provider - - mountPath: /etc/grafana/provisioning/datasources - name: sc-datasources-volume - - mountPath: /var/run/secrets/kubernetes.io/serviceaccount - name: kube-api-access-cdhhf - readOnly: true - recursiveReadOnly: Disabled - - containerID: containerd://e640a53015faae291d56c699e4f0ec4862fba0a0e950eca4fa9ae7ec11253882 - image: quay.io/kiwigrid/k8s-sidecar:1.30.3 - imageID: quay.io/kiwigrid/k8s-sidecar@sha256:49dcce269568b1645b0050f296da787c99119647965229919a136614123f0627 - lastState: {} - name: grafana-sc-dashboard - ready: true - restartCount: 0 - started: true - state: - running: - startedAt: "2025-08-02T20:01:52Z" - volumeMounts: - - mountPath: /tmp/dashboards - name: sc-dashboard-volume - - mountPath: /var/run/secrets/kubernetes.io/serviceaccount - name: kube-api-access-cdhhf - readOnly: true - recursiveReadOnly: Disabled - - containerID: containerd://a56f9ccb98959cb6b8918592a1a7025d6711b7940fff48b5461a401812cbe124 - image: quay.io/kiwigrid/k8s-sidecar:1.30.3 - imageID: quay.io/kiwigrid/k8s-sidecar@sha256:49dcce269568b1645b0050f296da787c99119647965229919a136614123f0627 - lastState: {} - name: grafana-sc-datasources - ready: true - restartCount: 0 - started: true - state: - running: - startedAt: "2025-08-02T20:01:52Z" - volumeMounts: - - mountPath: /etc/grafana/provisioning/datasources - name: sc-datasources-volume - - mountPath: /var/run/secrets/kubernetes.io/serviceaccount - name: kube-api-access-cdhhf - readOnly: true - recursiveReadOnly: Disabled - hostIP: 172.33.32.41 - hostIPs: - - ip: 172.33.32.41 - phase: Running - podIP: 172.33.42.38 - podIPs: - - ip: 172.33.42.38 - qosClass: BestEffort - startTime: "2025-08-02T20:01:49Z" -- apiVersion: v1 - kind: Pod - metadata: - creationTimestamp: "2025-08-02T20:01:49Z" - generateName: kube-prometheus-stack-1754164871-kube-state-metrics-6cbcf68db6- - labels: - app.kubernetes.io/component: metrics - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/version: 2.16.0 - helm.sh/chart: kube-state-metrics-6.1.0 - pod-template-hash: 6cbcf68db6 - release: kube-prometheus-stack-1754164871 - name: kube-prometheus-stack-1754164871-kube-state-metrics-6cbcf6r26h2 - namespace: prometheus - ownerReferences: - - apiVersion: apps/v1 - blockOwnerDeletion: true - controller: true - kind: ReplicaSet - name: kube-prometheus-stack-1754164871-kube-state-metrics-6cbcf68db6 - uid: 9608401d-ae24-4d58-9241-78edbe7db9c5 - resourceVersion: "93640531" - uid: 6e217ae5-4616-4431-97dc-3636cd49d37a - spec: - automountServiceAccountToken: true - containers: - - args: - - --port=8080 - - --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.16.0 - imagePullPolicy: IfNotPresent - livenessProbe: - failureThreshold: 3 - httpGet: - path: /livez - port: 8080 - scheme: HTTP - initialDelaySeconds: 5 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 5 - name: kube-state-metrics - ports: - - containerPort: 8080 - name: http - protocol: TCP - readinessProbe: - failureThreshold: 3 - httpGet: - path: /readyz - port: 8081 - scheme: HTTP - initialDelaySeconds: 5 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 5 - resources: {} - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File - volumeMounts: - - mountPath: /var/run/secrets/kubernetes.io/serviceaccount - name: kube-api-access-4htkb - readOnly: true - dnsPolicy: ClusterFirst - enableServiceLinks: true - nodeName: ip-172-33-32-41.us-west-2.compute.internal - preemptionPolicy: PreemptLowerPriority - priority: 0 - restartPolicy: Always - schedulerName: default-scheduler - securityContext: - fsGroup: 65534 - runAsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - seccompProfile: - type: RuntimeDefault - serviceAccount: kube-prometheus-stack-1754164871-kube-state-metrics - serviceAccountName: kube-prometheus-stack-1754164871-kube-state-metrics - terminationGracePeriodSeconds: 30 - tolerations: - - effect: NoExecute - key: node.kubernetes.io/not-ready - operator: Exists - tolerationSeconds: 300 - - effect: NoExecute - key: node.kubernetes.io/unreachable - operator: Exists - tolerationSeconds: 300 - volumes: - - name: kube-api-access-4htkb - projected: - defaultMode: 420 - sources: - - serviceAccountToken: - expirationSeconds: 3607 - path: token - - configMap: - items: - - key: ca.crt - path: ca.crt - name: kube-root-ca.crt - - downwardAPI: - items: - - fieldRef: - apiVersion: v1 - fieldPath: metadata.namespace - path: namespace - status: - conditions: - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:51Z" - status: "True" - type: PodReadyToStartContainers - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:49Z" - status: "True" - type: Initialized - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:59Z" - status: "True" - type: Ready - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:59Z" - status: "True" - type: ContainersReady - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:49Z" - status: "True" - type: PodScheduled - containerStatuses: - - containerID: containerd://9b193660e0e30060da4ef2c1df59c02f7d8a26ddbdd4754d2f70c002b162ddca - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.16.0 - imageID: registry.k8s.io/kube-state-metrics/kube-state-metrics@sha256:e750cd4b43f782e3106537026c2995cac85d921aedea334e1d16caad7877c360 - lastState: {} - name: kube-state-metrics - ready: true - restartCount: 0 - started: true - state: - running: - startedAt: "2025-08-02T20:01:50Z" - volumeMounts: - - mountPath: /var/run/secrets/kubernetes.io/serviceaccount - name: kube-api-access-4htkb - readOnly: true - recursiveReadOnly: Disabled - hostIP: 172.33.32.41 - hostIPs: - - ip: 172.33.32.41 - phase: Running - podIP: 172.33.53.196 - podIPs: - - ip: 172.33.53.196 - qosClass: BestEffort - startTime: "2025-08-02T20:01:49Z" -- apiVersion: v1 - kind: Pod - metadata: - annotations: - cluster-autoscaler.kubernetes.io/safe-to-evict: "true" - creationTimestamp: "2025-08-02T20:01:49Z" - generateName: kube-prometheus-stack-1754164871-prometheus-node-exporter- - labels: - app.kubernetes.io/component: metrics - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/name: prometheus-node-exporter - app.kubernetes.io/part-of: prometheus-node-exporter - app.kubernetes.io/version: 1.9.1 - controller-revision-hash: 8587f5b7cd - helm.sh/chart: prometheus-node-exporter-4.47.3 - jobLabel: node-exporter - pod-template-generation: "1" - release: kube-prometheus-stack-1754164871 - name: kube-prometheus-stack-1754164871-prometheus-node-exporter-6cr5d - namespace: prometheus - ownerReferences: - - apiVersion: apps/v1 - blockOwnerDeletion: true - controller: true - kind: DaemonSet - name: kube-prometheus-stack-1754164871-prometheus-node-exporter - uid: cc472f6b-fe45-47cb-a915-52e316494b41 - resourceVersion: "93640385" - uid: a815da5b-4963-4286-8d07-8eb975a37f1d - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchFields: - - key: metadata.name - operator: In - values: - - ip-172-33-17-181.us-west-2.compute.internal - automountServiceAccountToken: false - containers: - - args: - - --path.procfs=/host/proc - - --path.sysfs=/host/sys - - --path.rootfs=/host/root - - --path.udev.data=/host/root/run/udev/data - - --web.listen-address=[$(HOST_IP)]:9100 - - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/) - - --collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs|erofs)$ - env: - - name: HOST_IP - value: 0.0.0.0 - image: quay.io/prometheus/node-exporter:v1.9.1 - imagePullPolicy: IfNotPresent - livenessProbe: - failureThreshold: 3 - httpGet: - path: / - port: http-metrics - scheme: HTTP - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 1 - name: node-exporter - ports: - - containerPort: 9100 - hostPort: 9100 - name: http-metrics - protocol: TCP - readinessProbe: - failureThreshold: 3 - httpGet: - path: / - port: http-metrics - scheme: HTTP - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 1 - resources: {} - securityContext: - readOnlyRootFilesystem: true - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File - volumeMounts: - - mountPath: /host/proc - name: proc - readOnly: true - - mountPath: /host/sys - name: sys - readOnly: true - - mountPath: /host/root - mountPropagation: HostToContainer - name: root - readOnly: true - dnsPolicy: ClusterFirst - enableServiceLinks: true - hostNetwork: true - hostPID: true - nodeName: ip-172-33-17-181.us-west-2.compute.internal - nodeSelector: - kubernetes.io/os: linux - preemptionPolicy: PreemptLowerPriority - priority: 0 - restartPolicy: Always - schedulerName: default-scheduler - securityContext: - fsGroup: 65534 - runAsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - serviceAccount: kube-prometheus-stack-1754164871-prometheus-node-exporter - serviceAccountName: kube-prometheus-stack-1754164871-prometheus-node-exporter - terminationGracePeriodSeconds: 30 - tolerations: - - effect: NoSchedule - operator: Exists - - effect: NoExecute - key: node.kubernetes.io/not-ready - operator: Exists - - effect: NoExecute - key: node.kubernetes.io/unreachable - operator: Exists - - effect: NoSchedule - key: node.kubernetes.io/disk-pressure - operator: Exists - - effect: NoSchedule - key: node.kubernetes.io/memory-pressure - operator: Exists - - effect: NoSchedule - key: node.kubernetes.io/pid-pressure - operator: Exists - - effect: NoSchedule - key: node.kubernetes.io/unschedulable - operator: Exists - - effect: NoSchedule - key: node.kubernetes.io/network-unavailable - operator: Exists - volumes: - - hostPath: - path: /proc - type: "" - name: proc - - hostPath: - path: /sys - type: "" - name: sys - - hostPath: - path: / - type: "" - name: root - status: - conditions: - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:52Z" - status: "True" - type: PodReadyToStartContainers - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:49Z" - status: "True" - type: Initialized - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:52Z" - status: "True" - type: Ready - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:52Z" - status: "True" - type: ContainersReady - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:49Z" - status: "True" - type: PodScheduled - containerStatuses: - - containerID: containerd://e2168bde6ec19ec96668a460597bf7ffe51e3a8c731f93b6d9e1cfc5dc139f70 - image: quay.io/prometheus/node-exporter:v1.9.1 - imageID: quay.io/prometheus/node-exporter@sha256:d00a542e409ee618a4edc67da14dd48c5da66726bbd5537ab2af9c1dfc442c8a - lastState: {} - name: node-exporter - ready: true - restartCount: 0 - started: true - state: - running: - startedAt: "2025-08-02T20:01:51Z" - volumeMounts: - - mountPath: /host/proc - name: proc - readOnly: true - recursiveReadOnly: Disabled - - mountPath: /host/sys - name: sys - readOnly: true - recursiveReadOnly: Disabled - - mountPath: /host/root - name: root - readOnly: true - recursiveReadOnly: Disabled - hostIP: 172.33.17.181 - hostIPs: - - ip: 172.33.17.181 - phase: Running - podIP: 172.33.17.181 - podIPs: - - ip: 172.33.17.181 - qosClass: BestEffort - startTime: "2025-08-02T20:01:49Z" -- apiVersion: v1 - kind: Pod - metadata: - annotations: - cluster-autoscaler.kubernetes.io/safe-to-evict: "true" - creationTimestamp: "2025-08-02T20:01:49Z" - generateName: kube-prometheus-stack-1754164871-prometheus-node-exporter- - labels: - app.kubernetes.io/component: metrics - app.kubernetes.io/instance: kube-prometheus-stack-1754164871 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/name: prometheus-node-exporter - app.kubernetes.io/part-of: prometheus-node-exporter - app.kubernetes.io/version: 1.9.1 - controller-revision-hash: 8587f5b7cd - helm.sh/chart: prometheus-node-exporter-4.47.3 - jobLabel: node-exporter - pod-template-generation: "1" - release: kube-prometheus-stack-1754164871 - name: kube-prometheus-stack-1754164871-prometheus-node-exporter-zhfng - namespace: prometheus - ownerReferences: - - apiVersion: apps/v1 - blockOwnerDeletion: true - controller: true - kind: DaemonSet - name: kube-prometheus-stack-1754164871-prometheus-node-exporter - uid: cc472f6b-fe45-47cb-a915-52e316494b41 - resourceVersion: "93640352" - uid: e52dc924-4e66-4571-ae7c-1f36b4ec2472 - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchFields: - - key: metadata.name - operator: In - values: - - ip-172-33-32-41.us-west-2.compute.internal - automountServiceAccountToken: false - containers: - - args: - - --path.procfs=/host/proc - - --path.sysfs=/host/sys - - --path.rootfs=/host/root - - --path.udev.data=/host/root/run/udev/data - - --web.listen-address=[$(HOST_IP)]:9100 - - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/) - - --collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs|erofs)$ - env: - - name: HOST_IP - value: 0.0.0.0 - image: quay.io/prometheus/node-exporter:v1.9.1 - imagePullPolicy: IfNotPresent - livenessProbe: - failureThreshold: 3 - httpGet: - path: / - port: http-metrics - scheme: HTTP - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 1 - name: node-exporter - ports: - - containerPort: 9100 - hostPort: 9100 - name: http-metrics - protocol: TCP - readinessProbe: - failureThreshold: 3 - httpGet: - path: / - port: http-metrics - scheme: HTTP - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 1 - resources: {} - securityContext: - readOnlyRootFilesystem: true - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File - volumeMounts: - - mountPath: /host/proc - name: proc - readOnly: true - - mountPath: /host/sys - name: sys - readOnly: true - - mountPath: /host/root - mountPropagation: HostToContainer - name: root - readOnly: true - dnsPolicy: ClusterFirst - enableServiceLinks: true - hostNetwork: true - hostPID: true - nodeName: ip-172-33-32-41.us-west-2.compute.internal - nodeSelector: - kubernetes.io/os: linux - preemptionPolicy: PreemptLowerPriority - priority: 0 - restartPolicy: Always - schedulerName: default-scheduler - securityContext: - fsGroup: 65534 - runAsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - serviceAccount: kube-prometheus-stack-1754164871-prometheus-node-exporter - serviceAccountName: kube-prometheus-stack-1754164871-prometheus-node-exporter - terminationGracePeriodSeconds: 30 - tolerations: - - effect: NoSchedule - operator: Exists - - effect: NoExecute - key: node.kubernetes.io/not-ready - operator: Exists - - effect: NoExecute - key: node.kubernetes.io/unreachable - operator: Exists - - effect: NoSchedule - key: node.kubernetes.io/disk-pressure - operator: Exists - - effect: NoSchedule - key: node.kubernetes.io/memory-pressure - operator: Exists - - effect: NoSchedule - key: node.kubernetes.io/pid-pressure - operator: Exists - - effect: NoSchedule - key: node.kubernetes.io/unschedulable - operator: Exists - - effect: NoSchedule - key: node.kubernetes.io/network-unavailable - operator: Exists - volumes: - - hostPath: - path: /proc - type: "" - name: proc - - hostPath: - path: /sys - type: "" - name: sys - - hostPath: - path: / - type: "" - name: root - status: - conditions: - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:51Z" - status: "True" - type: PodReadyToStartContainers - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:49Z" - status: "True" - type: Initialized - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:51Z" - status: "True" - type: Ready - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:51Z" - status: "True" - type: ContainersReady - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:49Z" - status: "True" - type: PodScheduled - containerStatuses: - - containerID: containerd://36a36f636e0fb083d4a9b6d049b19e36cbe9d835b6cc41ff2e999a9264e04171 - image: quay.io/prometheus/node-exporter:v1.9.1 - imageID: quay.io/prometheus/node-exporter@sha256:d00a542e409ee618a4edc67da14dd48c5da66726bbd5537ab2af9c1dfc442c8a - lastState: {} - name: node-exporter - ready: true - restartCount: 0 - started: true - state: - running: - startedAt: "2025-08-02T20:01:51Z" - volumeMounts: - - mountPath: /host/proc - name: proc - readOnly: true - recursiveReadOnly: Disabled - - mountPath: /host/sys - name: sys - readOnly: true - recursiveReadOnly: Disabled - - mountPath: /host/root - name: root - readOnly: true - recursiveReadOnly: Disabled - hostIP: 172.33.32.41 - hostIPs: - - ip: 172.33.32.41 - phase: Running - podIP: 172.33.32.41 - podIPs: - - ip: 172.33.32.41 - qosClass: BestEffort - startTime: "2025-08-02T20:01:49Z" -- apiVersion: v1 - kind: Pod - metadata: - annotations: - kubectl.kubernetes.io/default-container: prometheus - creationTimestamp: "2025-08-02T20:01:52Z" - generateName: prometheus-kube-prometheus-stack-1754-prometheus- - labels: - app.kubernetes.io/instance: kube-prometheus-stack-1754-prometheus - app.kubernetes.io/managed-by: prometheus-operator - app.kubernetes.io/name: prometheus - app.kubernetes.io/version: 3.5.0 - apps.kubernetes.io/pod-index: "0" - controller-revision-hash: prometheus-kube-prometheus-stack-1754-prometheus-7bbd956f4 - operator.prometheus.io/name: kube-prometheus-stack-1754-prometheus - operator.prometheus.io/shard: "0" - prometheus: kube-prometheus-stack-1754-prometheus - statefulset.kubernetes.io/pod-name: prometheus-kube-prometheus-stack-1754-prometheus-0 - name: prometheus-kube-prometheus-stack-1754-prometheus-0 - namespace: prometheus - ownerReferences: - - apiVersion: apps/v1 - blockOwnerDeletion: true - controller: true - kind: StatefulSet - name: prometheus-kube-prometheus-stack-1754-prometheus - uid: 74140836-f846-474c-9bab-88205c90d209 - resourceVersion: "93640627" - uid: 6028f975-aa14-4fc8-962a-5fef7ad4ff09 - spec: - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - podAffinityTerm: - labelSelector: - matchExpressions: - - key: app.kubernetes.io/name - operator: In - values: - - prometheus - - key: app.kubernetes.io/instance - operator: In - values: - - kube-prometheus-stack-1754-prometheus - topologyKey: kubernetes.io/hostname - weight: 100 - automountServiceAccountToken: true - containers: - - args: - - --config.file=/etc/prometheus/config_out/prometheus.env.yaml - - --web.enable-lifecycle - - --web.external-url=http://kube-prometheus-stack-1754-prometheus.prometheus:9090 - - --web.route-prefix=/ - - --storage.tsdb.retention.time=10d - - --storage.tsdb.path=/prometheus - - --storage.tsdb.wal-compression - - --web.config.file=/etc/prometheus/web_config/web-config.yaml - image: quay.io/prometheus/prometheus:v3.5.0 - imagePullPolicy: IfNotPresent - livenessProbe: - failureThreshold: 6 - httpGet: - path: /-/healthy - port: http-web - scheme: HTTP - periodSeconds: 5 - successThreshold: 1 - timeoutSeconds: 3 - name: prometheus - ports: - - containerPort: 9090 - name: http-web - protocol: TCP - readinessProbe: - failureThreshold: 3 - httpGet: - path: /-/ready - port: http-web - scheme: HTTP - periodSeconds: 5 - successThreshold: 1 - timeoutSeconds: 3 - resources: {} - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - startupProbe: - failureThreshold: 60 - httpGet: - path: /-/ready - port: http-web - scheme: HTTP - periodSeconds: 15 - successThreshold: 1 - timeoutSeconds: 3 - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: FallbackToLogsOnError - volumeMounts: - - mountPath: /etc/prometheus/config_out - name: config-out - readOnly: true - - mountPath: /etc/prometheus/certs - name: tls-assets - readOnly: true - - mountPath: /prometheus - name: prometheus-kube-prometheus-stack-1754-prometheus-db - - mountPath: /etc/prometheus/rules/prometheus-kube-prometheus-stack-1754-prometheus-rulefiles-0 - name: prometheus-kube-prometheus-stack-1754-prometheus-rulefiles-0 - - mountPath: /etc/prometheus/web_config/web-config.yaml - name: web-config - readOnly: true - subPath: web-config.yaml - - mountPath: /var/run/secrets/kubernetes.io/serviceaccount - name: kube-api-access-h97gt - readOnly: true - - args: - - --listen-address=:8080 - - --reload-url=http://127.0.0.1:9090/-/reload - - --config-file=/etc/prometheus/config/prometheus.yaml.gz - - --config-envsubst-file=/etc/prometheus/config_out/prometheus.env.yaml - - --watched-dir=/etc/prometheus/rules/prometheus-kube-prometheus-stack-1754-prometheus-rulefiles-0 - command: - - /bin/prometheus-config-reloader - env: - - name: POD_NAME - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: metadata.name - - name: SHARD - value: "0" - image: quay.io/prometheus-operator/prometheus-config-reloader:v0.83.0 - imagePullPolicy: IfNotPresent - name: config-reloader - ports: - - containerPort: 8080 - name: reloader-web - protocol: TCP - resources: {} - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: FallbackToLogsOnError - volumeMounts: - - mountPath: /etc/prometheus/config - name: config - - mountPath: /etc/prometheus/config_out - name: config-out - - mountPath: /etc/prometheus/rules/prometheus-kube-prometheus-stack-1754-prometheus-rulefiles-0 - name: prometheus-kube-prometheus-stack-1754-prometheus-rulefiles-0 - - mountPath: /var/run/secrets/kubernetes.io/serviceaccount - name: kube-api-access-h97gt - readOnly: true - dnsPolicy: ClusterFirst - enableServiceLinks: true - hostname: prometheus-kube-prometheus-stack-1754-prometheus-0 - initContainers: - - args: - - --watch-interval=0 - - --listen-address=:8081 - - --config-file=/etc/prometheus/config/prometheus.yaml.gz - - --config-envsubst-file=/etc/prometheus/config_out/prometheus.env.yaml - - --watched-dir=/etc/prometheus/rules/prometheus-kube-prometheus-stack-1754-prometheus-rulefiles-0 - command: - - /bin/prometheus-config-reloader - env: - - name: POD_NAME - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: metadata.name - - name: SHARD - value: "0" - image: quay.io/prometheus-operator/prometheus-config-reloader:v0.83.0 - imagePullPolicy: IfNotPresent - name: init-config-reloader - ports: - - containerPort: 8081 - name: reloader-web - protocol: TCP - resources: {} - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: FallbackToLogsOnError - volumeMounts: - - mountPath: /etc/prometheus/config - name: config - - mountPath: /etc/prometheus/config_out - name: config-out - - mountPath: /etc/prometheus/rules/prometheus-kube-prometheus-stack-1754-prometheus-rulefiles-0 - name: prometheus-kube-prometheus-stack-1754-prometheus-rulefiles-0 - - mountPath: /var/run/secrets/kubernetes.io/serviceaccount - name: kube-api-access-h97gt - readOnly: true - nodeName: ip-172-33-32-41.us-west-2.compute.internal - preemptionPolicy: PreemptLowerPriority - priority: 0 - restartPolicy: Always - schedulerName: default-scheduler - securityContext: - fsGroup: 2000 - runAsGroup: 2000 - runAsNonRoot: true - runAsUser: 1000 - seccompProfile: - type: RuntimeDefault - serviceAccount: kube-prometheus-stack-1754-prometheus - serviceAccountName: kube-prometheus-stack-1754-prometheus - shareProcessNamespace: false - subdomain: prometheus-operated - terminationGracePeriodSeconds: 600 - tolerations: - - effect: NoExecute - key: node.kubernetes.io/not-ready - operator: Exists - tolerationSeconds: 300 - - effect: NoExecute - key: node.kubernetes.io/unreachable - operator: Exists - tolerationSeconds: 300 - volumes: - - name: config - secret: - defaultMode: 420 - secretName: prometheus-kube-prometheus-stack-1754-prometheus - - name: tls-assets - projected: - defaultMode: 420 - sources: - - secret: - name: prometheus-kube-prometheus-stack-1754-prometheus-tls-assets-0 - - emptyDir: - medium: Memory - name: config-out - - configMap: - defaultMode: 420 - name: prometheus-kube-prometheus-stack-1754-prometheus-rulefiles-0 - name: prometheus-kube-prometheus-stack-1754-prometheus-rulefiles-0 - - name: web-config - secret: - defaultMode: 420 - secretName: prometheus-kube-prometheus-stack-1754-prometheus-web-config - - emptyDir: {} - name: prometheus-kube-prometheus-stack-1754-prometheus-db - - name: kube-api-access-h97gt - projected: - defaultMode: 420 - sources: - - serviceAccountToken: - expirationSeconds: 3607 - path: token - - configMap: - items: - - key: ca.crt - path: ca.crt - name: kube-root-ca.crt - - downwardAPI: - items: - - fieldRef: - apiVersion: v1 - fieldPath: metadata.namespace - path: namespace - status: - conditions: - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:54Z" - status: "True" - type: PodReadyToStartContainers - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:56Z" - status: "True" - type: Initialized - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:02:08Z" - status: "True" - type: Ready - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:02:08Z" - status: "True" - type: ContainersReady - - lastProbeTime: null - lastTransitionTime: "2025-08-02T20:01:52Z" - status: "True" - type: PodScheduled - containerStatuses: - - containerID: containerd://2d67bd8a01943ce3b45197a534c606a8a5e0eafc1f5bcb831b8afb11e9ed6e88 - image: quay.io/prometheus-operator/prometheus-config-reloader:v0.83.0 - imageID: quay.io/prometheus-operator/prometheus-config-reloader@sha256:78aec597d87aa2b4ba947ab9190538dae93a58a67b8e930aefea1086534b02ef - lastState: {} - name: config-reloader - ready: true - restartCount: 0 - started: true - state: - running: - startedAt: "2025-08-02T20:02:02Z" - volumeMounts: - - mountPath: /etc/prometheus/config - name: config - - mountPath: /etc/prometheus/config_out - name: config-out - - mountPath: /etc/prometheus/rules/prometheus-kube-prometheus-stack-1754-prometheus-rulefiles-0 - name: prometheus-kube-prometheus-stack-1754-prometheus-rulefiles-0 - - mountPath: /var/run/secrets/kubernetes.io/serviceaccount - name: kube-api-access-h97gt - readOnly: true - recursiveReadOnly: Disabled - - containerID: containerd://f32c9431e3d0bb65fd309f5ec217348d5be0648bc9d8191e0336022e27d056a6 - image: quay.io/prometheus/prometheus:v3.5.0 - imageID: quay.io/prometheus/prometheus@sha256:63805ebb8d2b3920190daf1cb14a60871b16fd38bed42b857a3182bc621f4996 - lastState: {} - name: prometheus - ready: true - restartCount: 0 - started: true - state: - running: - startedAt: "2025-08-02T20:02:02Z" - volumeMounts: - - mountPath: /etc/prometheus/config_out - name: config-out - readOnly: true - recursiveReadOnly: Disabled - - mountPath: /etc/prometheus/certs - name: tls-assets - readOnly: true - recursiveReadOnly: Disabled - - mountPath: /prometheus - name: prometheus-kube-prometheus-stack-1754-prometheus-db - - mountPath: /etc/prometheus/rules/prometheus-kube-prometheus-stack-1754-prometheus-rulefiles-0 - name: prometheus-kube-prometheus-stack-1754-prometheus-rulefiles-0 - - mountPath: /etc/prometheus/web_config/web-config.yaml - name: web-config - readOnly: true - recursiveReadOnly: Disabled - - mountPath: /var/run/secrets/kubernetes.io/serviceaccount - name: kube-api-access-h97gt - readOnly: true - recursiveReadOnly: Disabled - hostIP: 172.33.32.41 - hostIPs: - - ip: 172.33.32.41 - initContainerStatuses: - - containerID: containerd://eb6e6b2578ed43a0861c4515d3bf4aa9b7d67fbb5b20b449ed5f07626b63cd1b - image: quay.io/prometheus-operator/prometheus-config-reloader:v0.83.0 - imageID: quay.io/prometheus-operator/prometheus-config-reloader@sha256:78aec597d87aa2b4ba947ab9190538dae93a58a67b8e930aefea1086534b02ef - lastState: {} - name: init-config-reloader - ready: true - restartCount: 0 - started: false - state: - terminated: - containerID: containerd://eb6e6b2578ed43a0861c4515d3bf4aa9b7d67fbb5b20b449ed5f07626b63cd1b - exitCode: 0 - finishedAt: "2025-08-02T20:01:54Z" - reason: Completed - startedAt: "2025-08-02T20:01:54Z" - volumeMounts: - - mountPath: /etc/prometheus/config - name: config - - mountPath: /etc/prometheus/config_out - name: config-out - - mountPath: /etc/prometheus/rules/prometheus-kube-prometheus-stack-1754-prometheus-rulefiles-0 - name: prometheus-kube-prometheus-stack-1754-prometheus-rulefiles-0 - - mountPath: /var/run/secrets/kubernetes.io/serviceaccount - name: kube-api-access-h97gt - readOnly: true - recursiveReadOnly: Disabled - phase: Running - podIP: 172.33.51.237 - podIPs: - - ip: 172.33.51.237 - qosClass: BestEffort - startTime: "2025-08-02T20:01:52Z" -kind: List -metadata: - resourceVersion: "" diff --git a/docs/source/distributions/k8s/vllm-k8s.yaml.template b/docs/source/distributions/k8s/vllm-k8s.yaml.template index 69e237987..0b03e4332 100644 --- a/docs/source/distributions/k8s/vllm-k8s.yaml.template +++ b/docs/source/distributions/k8s/vllm-k8s.yaml.template @@ -39,7 +39,7 @@ spec: image: vllm/vllm-openai:latest command: ["/bin/sh", "-c"] args: - - "vllm serve ${INFERENCE_MODEL} --enforce-eager --max-model-len 8192 --gpu-memory-utilization 0.7 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 4 --port 8001" + - "vllm serve ${INFERENCE_MODEL} --enforce-eager --max-model-len 100000 --gpu-memory-utilization 0.9 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 2 --port 8001" env: - name: INFERENCE_MODEL value: "${INFERENCE_MODEL}" diff --git a/llama_stack/distribution/ui/page/distribution/mcp_servers.py b/llama_stack/distribution/ui/page/distribution/mcp_servers.py index 9ae955dc8..537b98c43 100644 --- a/llama_stack/distribution/ui/page/distribution/mcp_servers.py +++ b/llama_stack/distribution/ui/page/distribution/mcp_servers.py @@ -7,202 +7,11 @@ import json import os import streamlit as st -from typing import Dict, List, Optional, Any from llama_stack.distribution.ui.modules.api import llama_stack_api +from llama_stack_client import LlamaStackClient +from llama_stack_client.types.toolgroup_register_params import McpEndpoint -# Constants -CONFIG_DIR = os.path.expanduser("~/.llama_stack/mcp_servers") -CONFIG_FILE = os.path.join(CONFIG_DIR, "config.json") - -# Ensure config directory exists -os.makedirs(CONFIG_DIR, exist_ok=True) - -def load_config() -> Dict[str, Any]: - """Load MCP server configurations from file.""" - if os.path.exists(CONFIG_FILE): - try: - with open(CONFIG_FILE, "r") as f: - return json.load(f) - except json.JSONDecodeError: - st.error("Error loading MCP server configuration file. Using default configuration.") - - # Default empty configuration - return { - "servers": {}, - "inputs": [] - } - -def register_mcp_servers(config: Dict[str, Any]) -> None: - """Register MCP servers as toolgroups with the LlamaStackClient.""" - servers = config.get("servers", {}) - inputs = config.get("inputs", []) - - # Process inputs to resolve values - input_values = {} - for input_config in inputs: - input_id = input_config.get("id") - if input_id: - # Get value from session state if available - input_values[input_id] = st.session_state.get(f"input_value_{input_id}", "") - - # Update provider data with MCP headers - mcp_headers = {} - - # Register each server as a toolgroup - for server_name, server_config in servers.items(): - url = server_config.get("url", "") - if not url: - continue - - try: - # Register the MCP server as a toolgroup - toolgroup_id = f"mcp::{server_name}" - - # Register the toolgroup - llama_stack_api.client.toolgroups.register( - toolgroup_id=toolgroup_id, - provider_id="model-context-protocol", - mcp_endpoint=url, - ) - - # Process headers for this server - headers = server_config.get("headers", {}) - processed_headers = {} - - for header_key, header_value in headers.items(): - # Process input references in header values - if isinstance(header_value, str) and "${input:" in header_value: - # Extract input ID from ${input:input_id} format - input_id = header_value.split("${input:")[1].split("}")[0] - if input_id in input_values: - processed_headers[header_key] = input_values[input_id] - else: - processed_headers[header_key] = header_value - - # Add headers to mcp_headers if there are any - if processed_headers: - mcp_headers[url] = processed_headers - - except Exception as e: - st.error(f"Failed to register MCP server '{server_name}': {str(e)}") - - # Update provider data with MCP headers if there are any - if mcp_headers: - provider_data = llama_stack_api.provider_data.copy() - provider_data["mcp_headers"] = mcp_headers - llama_stack_api.update_provider_data_dict(provider_data) - -def save_config(config: Dict[str, Any]) -> None: - """Save MCP server configurations to file.""" - with open(CONFIG_FILE, "w") as f: - json.dump(config, f, indent=2) - - # Register MCP servers as toolgroups - register_mcp_servers(config) - - st.success("MCP server configuration saved successfully!") - -def render_server_config(server_name: str, server_config: Dict[str, Any], config: Dict[str, Any]) -> Dict[str, Any]: - """Render and edit configuration for a specific MCP server.""" - st.subheader(f"Server: {server_name}") - - # Server type - server_type = st.selectbox( - "Server Type", - ["http", "websocket"], - index=0 if server_config.get("type", "http") == "http" else 1, - key=f"type_{server_name}" - ) - - # Server URL - url = st.text_input( - "Server URL", - value=server_config.get("url", ""), - key=f"url_{server_name}" - ) - - # Headers - st.write("Headers:") - headers = server_config.get("headers", {}) - headers_container = st.container() - - with headers_container: - # Display existing headers - for header_key, header_value in list(headers.items()): - col1, col2, col3 = st.columns([3, 6, 1]) - with col1: - st.text(header_key) - with col2: - # Check if this is a reference to an input - if isinstance(header_value, str) and "${input:" in header_value: - st.text(header_value) - else: - st.text("********" if "token" in header_key.lower() or "auth" in header_key.lower() else header_value) - with col3: - if st.button("🗑️", key=f"delete_header_{server_name}_{header_key}"): - del headers[header_key] - - # Add new header - st.write("Add Header:") - header_col1, header_col2 = st.columns([1, 1]) - new_header_key = header_col1.text_input("Key", key=f"new_header_key_{server_name}") - new_header_value = header_col2.text_input("Value", key=f"new_header_value_{server_name}") - - if st.button("Add Header", key=f"add_header_{server_name}"): - if new_header_key and new_header_value: - headers[new_header_key] = new_header_value - st.experimental_rerun() - - # Construct updated server config - updated_server_config = { - "type": server_type, - "url": url, - "headers": headers - } - - return updated_server_config - -def render_input_config(input_config: Dict[str, Any], index: int) -> Dict[str, Any]: - """Render and edit configuration for an input field.""" - st.subheader(f"Input: {input_config.get('id', '')}") - - col1, col2 = st.columns([1, 1]) - - input_type = col1.selectbox( - "Type", - ["promptString"], - index=0, - key=f"input_type_{index}" - ) - - input_id = col2.text_input( - "ID", - value=input_config.get("id", ""), - key=f"input_id_{index}" - ) - - description = st.text_input( - "Description", - value=input_config.get("description", ""), - key=f"input_desc_{index}" - ) - - is_password = st.checkbox( - "Password Field", - value=input_config.get("password", False), - key=f"input_password_{index}" - ) - - # Construct updated input config - updated_input_config = { - "type": input_type, - "id": input_id, - "description": description, - "password": is_password - } - - return updated_input_config def main(): st.title("MCP Servers Configuration") @@ -214,181 +23,318 @@ def main(): MCP servers are registered as toolgroups with the ID format `mcp::{server_name}`. """) - # Load existing configuration - config = load_config() + # MCP Server Configuration + st.header("MCP Server Configuration") - # Tabs for different sections - tab1, tab2, tab3, tab4 = st.tabs(["Servers", "Inputs", "Input Values", "JSON Config"]) - - # Servers Tab - with tab1: - st.header("Configured Servers") + # Create a form for MCP configuration + with st.form("mcp_server_form"): + # Server name + server_name = st.text_input( + "Server Name", + value="github", + help="A unique name for this MCP server. Will be used in the toolgroup ID as mcp::{name}." + ) - # List existing servers - servers = config.get("servers", {}) - if not servers: - st.info("No MCP servers configured yet. Add a new server below.") + # MCP URL + mcp_url = st.text_input( + "MCP URL", + value="https://api.githubcopilot.com/mcp/", + help="The URL of the MCP server." + ) - # Server selection or creation - server_options = list(servers.keys()) + ["+ Add New Server"] - selected_server = st.selectbox("Select Server", server_options) + # Get the current value from session state + api_token = st.session_state.get(f"mcp_token_{server_name}", "") - if selected_server == "+ Add New Server": - new_server_name = st.text_input("New Server Name") - if new_server_name and st.button("Create Server"): - if new_server_name in servers: - st.error(f"Server '{new_server_name}' already exists.") - else: - servers[new_server_name] = { - "type": "http", - "url": "", - "headers": {} - } - st.experimental_rerun() - elif selected_server in servers: - # Edit existing server - updated_config = render_server_config(selected_server, servers[selected_server], config) - - col1, col2 = st.columns([1, 1]) - if col1.button("Update Server", key=f"update_{selected_server}"): - servers[selected_server] = updated_config - save_config(config) - - if col2.button("Delete Server", key=f"delete_{selected_server}"): - del servers[selected_server] - save_config(config) - st.experimental_rerun() - - # Inputs Tab - with tab2: - st.header("Input Configurations") + # Input field for API Bearer Token + mcp_token = st.text_input( + "API Bearer Token", + value=api_token, + type="password", + help="Enter your API Bearer Token. For GitHub Copilot, this should be a GitHub Personal Access Token with Copilot scope." + ) - inputs = config.get("inputs", []) - if not inputs: - st.info("No input configurations defined yet. Add a new input below.") - # Input selection or creation - input_options = [f"{i.get('id', f'Input {idx}')} ({i.get('type', 'promptString')})" for idx, i in enumerate(inputs)] - input_options.append("+ Add New Input") + # Submit button + submit_button = st.form_submit_button("Save Configuration") - selected_input_option = st.selectbox("Select Input", input_options) - - if selected_input_option == "+ Add New Input": - if st.button("Create Input"): - inputs.append({ - "type": "promptString", - "id": f"input_{len(inputs)}", - "description": "", - "password": False - }) - save_config(config) - st.experimental_rerun() - else: - # Edit existing input - selected_idx = input_options.index(selected_input_option) - if selected_idx < len(inputs): - updated_input = render_input_config(inputs[selected_idx], selected_idx) + if submit_button: + if not server_name: + st.error("Server name is required.") + elif not mcp_url: + st.error("MCP URL is required.") + else: + # Store the token in session state + st.session_state[f"mcp_token_{server_name}"] = mcp_token - col1, col2 = st.columns([1, 1]) - if col1.button("Update Input", key=f"update_input_{selected_idx}"): - inputs[selected_idx] = updated_input - save_config(config) - - if col2.button("Delete Input", key=f"delete_input_{selected_idx}"): - inputs.pop(selected_idx) - save_config(config) - st.experimental_rerun() + try: + # Register the MCP server as a toolgroup + toolgroup_id = f"mcp::{server_name}" + try: + llama_stack_api.client.toolgroups.register( + toolgroup_id=toolgroup_id, + provider_id="model-context-protocol", + mcp_endpoint=McpEndpoint(uri=mcp_url), + timeout=10.0, # Set a reasonable timeout + ) + except Exception as e: + if "timeout" in str(e).lower(): + st.warning(f"Registration timed out, but configuration will still be saved. Error: {str(e)}") + else: + raise + + # Update provider data with MCP headers + # Check if provider_data attribute exists + if not hasattr(llama_stack_api, "provider_data"): + llama_stack_api.provider_data = {} + + provider_data = llama_stack_api.provider_data.copy() + if "mcp_headers" not in provider_data: + provider_data["mcp_headers"] = {} + + # Add MCP headers + if mcp_token: + # Clean the token (remove 'Bearer ' prefix if present) + clean_token = mcp_token + if clean_token.lower().startswith("bearer "): + clean_token = clean_token[7:] + + # Set the headers for this MCP server + # The key needs to be the exact URL used in the MCP endpoint + provider_data["mcp_headers"][mcp_url] = { + "Authorization": f"Bearer {clean_token}" + } + + # Debug information + st.info(f"Set authentication headers for {mcp_url}: Bearer {clean_token[:4]}...") + + # Also set the token directly in the provider_data using different formats + # This increases the chance that one of them will work + provider_data[f"{server_name}_api_key"] = clean_token + provider_data[f"{server_name}_token"] = clean_token + provider_data[f"{server_name}_mcp_token"] = clean_token + + # Display the current provider_data for debugging + st.write("Current provider_data:") + + # Mask the token for display + masked_token = "" + if clean_token: + if len(clean_token) > 8: + # Show first 2 and last 2 characters, mask the middle with asterisks + masked_token = f"Bearer {clean_token[:2]}{'*' * 6}{clean_token[-2:]}" + else: + # For short tokens, just show first 2 chars and asterisks + masked_token = f"Bearer {clean_token[:2]}{'*' * 4}" + + # Create a sanitized version of mcp_headers for display + sanitized_headers = {} + for url, headers in provider_data.get("mcp_headers", {}).items(): + sanitized_headers[url] = {} + for header_key, header_value in headers.items(): + if header_key.lower() == "authorization" and isinstance(header_value, str): + if header_value.lower().startswith("bearer "): + token = header_value[7:] + if len(token) > 8: + sanitized_headers[url][header_key] = f"Bearer {token[:2]}{'*' * 6}{token[-2:]}" + else: + sanitized_headers[url][header_key] = f"Bearer {token[:2]}{'*' * 4}" + else: + sanitized_headers[url][header_key] = f"{header_value[:2]}{'*' * 6}" + else: + sanitized_headers[url][header_key] = header_value + + st.json({ + "mcp_headers": sanitized_headers, + f"{server_name}_api_key": masked_token + }) + + # Add a note about authentication + st.warning(""" + **Important Authentication Notes:** + + 1. If you encounter authentication errors when using this MCP server, + try restarting the UI application to ensure the authentication headers are properly applied. + + 2. For GitHub Copilot, make sure you're using a valid GitHub Personal Access Token with the 'copilot' scope. + + 3. When using the MCP server in code, you may need to explicitly pass the authentication headers: + ```python + import json + from llama_stack_client import Agent + + agent = Agent( + model="llama-3-70b-instruct", + tools=["mcp::github"], + extra_headers={ + "X-LlamaStack-Provider-Data": json.dumps({ + "mcp_headers": { + "https://api.githubcopilot.com/mcp/": { + "Authorization": "Bearer YOUR_TOKEN_HERE" + } + } + }) + } + ) + ``` + """) + + # Update the client with the new provider data + if hasattr(llama_stack_api, "update_provider_data_dict"): + llama_stack_api.update_provider_data_dict(provider_data) + else: + # Fallback implementation if method doesn't exist + llama_stack_api.provider_data = provider_data + # Reinitialize the client with updated provider data + llama_stack_api.client = LlamaStackClient( + base_url=os.environ.get("LLAMA_STACK_ENDPOINT", "http://localhost:8321"), + provider_data=provider_data, + ) + + st.success(f"MCP server '{server_name}' configured successfully!") + except Exception as e: + st.error(f"Failed to configure MCP server '{server_name}': {str(e)}") - # Input Values Tab - with tab3: - st.header("Input Values") - - inputs = config.get("inputs", []) - if not inputs: - st.info("No input configurations defined yet. Add inputs in the Inputs tab.") - else: - st.write("Enter values for the configured inputs:") - - for input_config in inputs: - input_id = input_config.get("id", "") - description = input_config.get("description", "") - is_password = input_config.get("password", False) - - # Get value from session state if available - current_value = st.session_state.get(f"input_value_{input_id}", "") - - # Input field for value - value = st.text_input( - f"{description} ({input_id})", - value=current_value, - type="password" if is_password else "default", - key=f"input_value_{input_id}" - ) - - if st.button("Save Input Values"): - # Values are automatically saved to session state - # Register MCP servers to apply the new values - register_mcp_servers(config) - st.success("Input values saved successfully!") - - # JSON Config Tab - with tab4: - st.header("Raw JSON Configuration") - - # Display and edit raw JSON - json_str = json.dumps(config, indent=2) - edited_json = st.text_area("Edit JSON Configuration", json_str, height=400) - - if st.button("Update from JSON"): - try: - updated_config = json.loads(edited_json) - save_config(updated_config) - st.experimental_rerun() - except json.JSONDecodeError as e: - st.error(f"Invalid JSON: {str(e)}") - - # Example configuration and usage - with st.expander("Example Configuration and Usage"): + # Usage example + with st.expander("Usage Example"): st.markdown(""" - ### Example Configuration + ### Using MCP Servers in Code - ```json - { - "servers": { - "github": { - "type": "http", - "url": "https://api.githubcopilot.com/mcp/", - "headers": { - "Authorization": "Bearer ${input:github_mcp_pat}" - } - } - }, - "inputs": [ - { - "type": "promptString", - "id": "github_mcp_pat", - "description": "GitHub Personal Access Token", - "password": true - } - ] - } - ``` - - ### Usage in Code - - Once registered, you can use the MCP server in your code: + Once configured, you can use the MCP server in your code: ```python from llama_stack_client import Agent agent = Agent( model="llama-3-70b-instruct", - tools=["mcp::github"], # Use the registered MCP server + tools=["mcp::your_server_name"], # Use the registered MCP server ) - agent.create_turn("Use GitHub Copilot to help me write a function") + agent.create_turn("Use the MCP server to help me with a task") + ``` + + ### With Explicit Authentication Headers + + If you encounter authentication issues, you can explicitly pass the authentication headers: + + ```python + import json + from llama_stack_client import Agent + + agent = Agent( + model="llama-3-70b-instruct", + tools=["mcp::github"], + extra_headers={ + "X-LlamaStack-Provider-Data": json.dumps({ + "mcp_headers": { + "https://api.githubcopilot.com/mcp/": { + "Authorization": "Bearer YOUR_TOKEN_HERE" + } + } + }) + } + ) ``` """) + + # Display registered toolgroups + st.header("Registered MCP Toolgroups") + try: + toolgroups = llama_stack_api.client.toolgroups.list(timeout=5.0) # Set a reasonable timeout + + # Debug the structure of the toolgroups + if toolgroups: + # Check the first toolgroup to determine its structure + first_toolgroup = toolgroups[0] if toolgroups else None + + if first_toolgroup: + # Get the attribute names + attr_names = dir(first_toolgroup) + + # The ToolGroup class has an 'identifier' attribute as per the API definition + id_attr = 'identifier' + + # Filter MCP toolgroups based on the identifier attribute + mcp_toolgroups = [] + for tg in toolgroups: + if hasattr(tg, 'identifier') and isinstance(tg.identifier, str) and tg.identifier.startswith("mcp::"): + mcp_toolgroups.append(tg) + + if mcp_toolgroups: + # Display MCP servers with unregister buttons + st.write("Click the button next to a server to unregister it:") + + for tg in mcp_toolgroups: + col1, col2, col3, col4 = st.columns([3, 2, 3, 1]) + + with col1: + st.write(f"**{tg.identifier}**") + + with col2: + st.write(tg.provider_id) + + with col3: + if hasattr(tg, 'mcp_endpoint') and tg.mcp_endpoint: + st.write(tg.mcp_endpoint.uri) + else: + st.write("N/A") + + with col4: + # Extract server name from identifier (remove "mcp::" prefix) + server_name = tg.identifier.replace("mcp::", "") + if st.button("Unregister", key=f"unregister_{server_name}"): + try: + # Call the unregister API + llama_stack_api.client.toolgroups.unregister( + toolgroup_id=tg.identifier, + timeout=5.0 + ) + + # Also clean up provider data + if hasattr(llama_stack_api, "provider_data"): + provider_data = llama_stack_api.provider_data.copy() + + # Remove MCP headers for this server if they exist + if "mcp_headers" in provider_data and hasattr(tg, 'mcp_endpoint') and tg.mcp_endpoint: + if tg.mcp_endpoint.uri in provider_data["mcp_headers"]: + del provider_data["mcp_headers"][tg.mcp_endpoint.uri] + + # Remove server-specific tokens + keys_to_remove = [ + f"{server_name}_api_key", + f"{server_name}_token", + f"{server_name}_mcp_token" + ] + for key in keys_to_remove: + if key in provider_data: + del provider_data[key] + + # Update the client with the modified provider data + if hasattr(llama_stack_api, "update_provider_data_dict"): + llama_stack_api.update_provider_data_dict(provider_data) + else: + # Fallback implementation + llama_stack_api.provider_data = provider_data + llama_stack_api.client = LlamaStackClient( + base_url=os.environ.get("LLAMA_STACK_ENDPOINT", "http://localhost:8321"), + provider_data=provider_data, + ) + + st.success(f"Successfully unregistered {tg.identifier}") + st.rerun() # Refresh the page to update the list + except Exception as e: + st.error(f"Failed to unregister {tg.identifier}: {str(e)}") + else: + st.info("No MCP toolgroups registered yet.") + else: + st.info("No toolgroups found.") + else: + st.info("No toolgroups returned from the API.") + except Exception as e: + if "timeout" in str(e).lower(): + st.warning("Listing toolgroups timed out. The server might be busy or unreachable.") + else: + st.error(f"Failed to list toolgroups: {str(e)}") + if __name__ == "__main__": main() diff --git a/llama_stack/distribution/ui/page/playground/tools.py b/llama_stack/distribution/ui/page/playground/tools.py index 149d8cce9..f7e918b32 100644 --- a/llama_stack/distribution/ui/page/playground/tools.py +++ b/llama_stack/distribution/ui/page/playground/tools.py @@ -80,9 +80,14 @@ def tool_chat_page(): total_tools = 0 for toolgroup_id in toolgroup_selection: - tools = client.tools.list(toolgroup_id=toolgroup_id) - grouped_tools[toolgroup_id] = [tool.identifier for tool in tools] - total_tools += len(tools) + try: + # Add a timeout of 5 seconds to prevent UI freezing + tools = client.tools.list(toolgroup_id=toolgroup_id, timeout=5.0) + grouped_tools[toolgroup_id] = [tool.identifier for tool in tools] + total_tools += len(tools) + except Exception as e: + st.warning(f"Failed to list tools for {toolgroup_id}: {str(e)}") + grouped_tools[toolgroup_id] = [] st.markdown(f"Active Tools: 🛠 {total_tools}") @@ -126,25 +131,56 @@ def tool_chat_page(): @st.cache_resource def create_agent(): - if "agent_type" in st.session_state and st.session_state.agent_type == AgentType.REACT: - return ReActAgent( - client=client, - model=model, - tools=toolgroup_selection, - response_format={ - "type": "json_schema", - "json_schema": ReActOutput.model_json_schema(), - }, - sampling_params={"strategy": {"type": "greedy"}, "max_tokens": max_tokens}, - ) - else: - return Agent( - client, - model=model, - instructions="You are a helpful assistant. When you use a tool always respond with a summary of the result.", - tools=toolgroup_selection, - sampling_params={"strategy": {"type": "greedy"}, "max_tokens": max_tokens}, + try: + if "agent_type" in st.session_state and st.session_state.agent_type == AgentType.REACT: + return ReActAgent( + client=client, + model=model, + tools=toolgroup_selection, + response_format={ + "type": "json_schema", + "json_schema": ReActOutput.model_json_schema(), + }, + sampling_params={"strategy": {"type": "greedy"}, "max_tokens": max_tokens}, + ) + else: + return Agent( + client, + model=model, + instructions="You are a helpful assistant. When you use a tool always respond with a summary of the result.", + tools=toolgroup_selection, + sampling_params={"strategy": {"type": "greedy"}, "max_tokens": max_tokens}, + ) + except Exception as e: + # Log the error + st.error(f"Failed to create agent: {str(e)}") + + # Create a fallback agent without tools + st.warning( + "Creating a fallback agent without tools due to an error. " + "Some functionality may be limited. Try refreshing the page or selecting different tools." ) + + # Return a basic agent without tools + if "agent_type" in st.session_state and st.session_state.agent_type == AgentType.REACT: + return ReActAgent( + client=client, + model=model, + tools=[], # No tools + response_format={ + "type": "json_schema", + "json_schema": ReActOutput.model_json_schema(), + }, + sampling_params={"strategy": {"type": "greedy"}, "max_tokens": max_tokens}, + ) + else: + return Agent( + client, + model=model, + instructions="You are a helpful assistant. When you use a tool always respond with a summary of the result.", + tools=[], # No tools + sampling_params={"strategy": {"type": "greedy"}, "max_tokens": max_tokens}, + ) st.session_state.agent_type = agent_type