mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-08-02 16:54:42 +00:00
temp checkpoint
This commit is contained in:
parent
a2bbb17fdd
commit
67f19f76b2
7 changed files with 1032 additions and 25 deletions
|
@ -18,7 +18,7 @@ export POSTGRES_PASSWORD=llamastack
|
||||||
|
|
||||||
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
|
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
|
||||||
export CODE_MODEL=bigcode/starcoder2-7b
|
export CODE_MODEL=bigcode/starcoder2-7b
|
||||||
|
export OLLAMA_MODEL=llama-guard3:1b
|
||||||
# Set USE_EBS to false if you don't have permission to use EKS EBS
|
# Set USE_EBS to false if you don't have permission to use EKS EBS
|
||||||
export USE_EBS=${USE_EBS:-false}
|
export USE_EBS=${USE_EBS:-false}
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
@ -44,10 +44,6 @@ else
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ -z "${LLAMA_STACK_UI_URL:-}" ]; then
|
|
||||||
echo "ERROR: LLAMA_STACK_UI_URL not set. Should be set to the external URL of the UI (excluding port). You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -84,6 +80,7 @@ echo "Secret verification successful. All required secrets are present."
|
||||||
if [ "$USE_EBS" = "true" ]; then
|
if [ "$USE_EBS" = "true" ]; then
|
||||||
echo "Using EBS storage for persistent volumes"
|
echo "Using EBS storage for persistent volumes"
|
||||||
envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
|
envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
|
||||||
|
envsubst < ./ollama-safety-k8s.yaml.template | kubectl apply -f -
|
||||||
envsubst < ./llama-nim.yaml.template | kubectl apply -f -
|
envsubst < ./llama-nim.yaml.template | kubectl apply -f -
|
||||||
envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
|
envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
|
||||||
envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
|
envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
|
||||||
|
@ -114,6 +111,7 @@ else
|
||||||
echo "Using emptyDir for storage (data will not persist across pod restarts)"
|
echo "Using emptyDir for storage (data will not persist across pod restarts)"
|
||||||
# Process templates to replace EBS storage with emptyDir
|
# Process templates to replace EBS storage with emptyDir
|
||||||
envsubst < ./vllm-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
|
envsubst < ./vllm-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
|
||||||
|
envsubst < ./ollama-safety-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
|
||||||
envsubst < ./llama-nim.yaml.template | kubectl apply -f -
|
envsubst < ./llama-nim.yaml.template | kubectl apply -f -
|
||||||
envsubst < ./postgres-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
|
envsubst < ./postgres-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
|
||||||
envsubst < ./chroma-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
|
envsubst < ./chroma-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
|
||||||
|
|
|
@ -29,48 +29,130 @@ if [ -n "${NGC_API_KEY:-}" ]; then
|
||||||
export NGC_DOCKER_CONFIG_JSON=$(echo -n "$NGC_DOCKER_CONFIG" | base64)
|
export NGC_DOCKER_CONFIG_JSON=$(echo -n "$NGC_DOCKER_CONFIG" | base64)
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Define namespace - default to current namespace if not specified
|
||||||
|
export NAMESPACE=${NAMESPACE:-$(kubectl config view --minify -o jsonpath='{..namespace}')}
|
||||||
|
if [ -z "$NAMESPACE" ]; then
|
||||||
|
export NAMESPACE="default"
|
||||||
|
fi
|
||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
set -x
|
set -x
|
||||||
|
|
||||||
# Delete resources in reverse order of creation to handle dependencies properly
|
# Delete resources in reverse order of creation to handle dependencies properly
|
||||||
|
|
||||||
# Delete UI deployment
|
echo "Starting comprehensive deletion of all LlamaStack resources..."
|
||||||
|
|
||||||
|
# Delete UI deployment and service
|
||||||
|
echo "Deleting UI resources..."
|
||||||
envsubst < ./ui-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
envsubst < ./ui-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
||||||
|
# Check for UI service template and delete if exists
|
||||||
|
if [ -f "./ui-service-k8s.yaml.template" ]; then
|
||||||
|
envsubst < ./ui-service-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
||||||
|
fi
|
||||||
|
|
||||||
# Delete monitoring resources
|
# Delete monitoring resources
|
||||||
|
echo "Deleting monitoring resources..."
|
||||||
envsubst < ./monitoring-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
envsubst < ./monitoring-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
||||||
|
|
||||||
# Delete Prometheus RBAC resources
|
# Delete Prometheus RBAC resources
|
||||||
|
echo "Deleting Prometheus RBAC resources..."
|
||||||
kubectl delete -f ./prometheus-rbac.yaml --ignore-not-found=true
|
kubectl delete -f ./prometheus-rbac.yaml --ignore-not-found=true
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Delete ingress
|
# Delete ingress
|
||||||
|
echo "Deleting ingress resources..."
|
||||||
envsubst < ./ingress-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
envsubst < ./ingress-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
||||||
|
|
||||||
# Delete stack deployment
|
# Delete stack deployment
|
||||||
|
echo "Deleting stack deployment..."
|
||||||
envsubst < ./stack-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
envsubst < ./stack-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
||||||
|
|
||||||
# Delete configmap
|
# Delete configmaps
|
||||||
|
echo "Deleting configmaps..."
|
||||||
kubectl delete configmap llama-stack-config --ignore-not-found=true
|
kubectl delete configmap llama-stack-config --ignore-not-found=true
|
||||||
|
# Check for stack configmap and delete if exists
|
||||||
|
if [ -f "./stack-configmap.yaml" ]; then
|
||||||
|
kubectl delete -f ./stack-configmap.yaml --ignore-not-found=true
|
||||||
|
fi
|
||||||
|
|
||||||
# Delete chroma deployment
|
# Delete chroma deployment
|
||||||
|
echo "Deleting chroma deployment..."
|
||||||
envsubst < ./chroma-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
envsubst < ./chroma-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
||||||
|
|
||||||
# Delete postgres deployment
|
# Delete postgres deployment
|
||||||
|
echo "Deleting postgres deployment..."
|
||||||
envsubst < ./postgres-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
envsubst < ./postgres-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
||||||
|
|
||||||
# Delete vllm-safety deployment
|
# Delete llama-nim deployment
|
||||||
|
echo "Deleting llama-nim deployment..."
|
||||||
envsubst < ./llama-nim.yaml.template | kubectl delete -f - --ignore-not-found=true
|
envsubst < ./llama-nim.yaml.template | kubectl delete -f - --ignore-not-found=true
|
||||||
|
|
||||||
|
|
||||||
|
# Delete ollama-safety deployment
|
||||||
|
echo "Deleting ollama-safety deployment..."
|
||||||
|
envsubst < ./ollama-safety-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
||||||
|
|
||||||
# Delete vllm deployment
|
# Delete vllm deployment
|
||||||
|
echo "Deleting vllm deployment..."
|
||||||
envsubst < ./vllm-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
envsubst < ./vllm-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
|
||||||
|
|
||||||
# Delete the HF token secret if it exists
|
# Delete the HF token secret if it exists
|
||||||
if [ -n "${HF_TOKEN:-}" ]; then
|
if [ -n "${HF_TOKEN:-}" ]; then
|
||||||
|
echo "Deleting HF token secret..."
|
||||||
envsubst < ./set-secret.yaml.template | kubectl delete -f - --ignore-not-found=true
|
envsubst < ./set-secret.yaml.template | kubectl delete -f - --ignore-not-found=true
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# NGC API key secrets are now part of llama-nim.yaml.template and are deleted with it
|
# Delete any other template files that might exist
|
||||||
|
echo "Checking for additional template files..."
|
||||||
|
for template in ./*.yaml.template; do
|
||||||
|
if [ -f "$template" ]; then
|
||||||
|
# Skip templates we've already processed
|
||||||
|
if [[ "$template" != "./ui-k8s.yaml.template" &&
|
||||||
|
"$template" != "./monitoring-k8s.yaml.template" &&
|
||||||
|
"$template" != "./ingress-k8s.yaml.template" &&
|
||||||
|
"$template" != "./stack-k8s.yaml.template" &&
|
||||||
|
"$template" != "./chroma-k8s.yaml.template" &&
|
||||||
|
"$template" != "./postgres-k8s.yaml.template" &&
|
||||||
|
"$template" != "./llama-nim.yaml.template" &&
|
||||||
|
"$template" != "./vllm-safety-k8s.yaml.template" &&
|
||||||
|
"$template" != "./ollama-safety-k8s.yaml.template" &&
|
||||||
|
"$template" != "./vllm-k8s.yaml.template" &&
|
||||||
|
"$template" != "./set-secret.yaml.template" &&
|
||||||
|
"$template" != "./ui-service-k8s.yaml.template" ]]; then
|
||||||
|
echo "Deleting resources from $template..."
|
||||||
|
envsubst < "$template" | kubectl delete -f - --ignore-not-found=true
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
echo "All LlamaStack Kubernetes resources have been deleted."
|
# Delete any PersistentVolumeClaims created by the stack
|
||||||
|
echo "Deleting PersistentVolumeClaims..."
|
||||||
|
kubectl delete pvc -l app=llama-stack --ignore-not-found=true
|
||||||
|
kubectl delete pvc -l app=chroma --ignore-not-found=true
|
||||||
|
kubectl delete pvc -l app=postgres --ignore-not-found=true
|
||||||
|
kubectl delete pvc -l app=vllm --ignore-not-found=true
|
||||||
|
kubectl delete pvc -l app.kubernetes.io/name=ollama-safety --ignore-not-found=true
|
||||||
|
|
||||||
|
# Delete any remaining services
|
||||||
|
echo "Deleting any remaining services..."
|
||||||
|
kubectl delete service -l app=llama-stack --ignore-not-found=true
|
||||||
|
kubectl delete service -l app=chroma --ignore-not-found=true
|
||||||
|
kubectl delete service -l app=postgres --ignore-not-found=true
|
||||||
|
kubectl delete service -l app=vllm --ignore-not-found=true
|
||||||
|
kubectl delete service -l app=llama-nim --ignore-not-found=true
|
||||||
|
kubectl delete service -l app.kubernetes.io/name=ollama-safety --ignore-not-found=true
|
||||||
|
|
||||||
|
# Delete any remaining secrets
|
||||||
|
echo "Deleting any remaining secrets..."
|
||||||
|
kubectl delete secret hf-secret --ignore-not-found=true
|
||||||
|
kubectl delete secret ngc-secret --ignore-not-found=true
|
||||||
|
kubectl delete secret -l app=llama-stack --ignore-not-found=true
|
||||||
|
|
||||||
|
# Verify no resources remain
|
||||||
|
echo "Verifying deletion..."
|
||||||
|
REMAINING_RESOURCES=$(kubectl get all -l app=llama-stack 2>/dev/null)
|
||||||
|
if [ -z "$REMAINING_RESOURCES" ]; then
|
||||||
|
echo "All LlamaStack Kubernetes resources have been successfully deleted."
|
||||||
|
else
|
||||||
|
echo "Some LlamaStack resources may still exist. Please check manually with:"
|
||||||
|
echo "kubectl get all -l app=llama-stack"
|
||||||
|
fi
|
||||||
|
|
|
@ -18,7 +18,7 @@ data:
|
||||||
|
|
||||||
scrape_configs:
|
scrape_configs:
|
||||||
# NVIDIA DCGM exporter for GPU metrics
|
# NVIDIA DCGM exporter for GPU metrics
|
||||||
- job_name: 'nvidia-dcgm-exporter'
|
- job_name: 'dcgm'
|
||||||
static_configs:
|
static_configs:
|
||||||
- targets: ['dcgm-exporter:9400']
|
- targets: ['dcgm-exporter:9400']
|
||||||
|
|
||||||
|
@ -98,6 +98,30 @@ data:
|
||||||
- targets: ['llm-nim-code:8000']
|
- targets: ['llm-nim-code:8000']
|
||||||
metrics_path: /v1/metrics
|
metrics_path: /v1/metrics
|
||||||
scrape_interval: 5s
|
scrape_interval: 5s
|
||||||
|
|
||||||
|
# Specific job for Ollama pods
|
||||||
|
- job_name: 'ollama-pods'
|
||||||
|
kubernetes_sd_configs:
|
||||||
|
- role: pod
|
||||||
|
relabel_configs:
|
||||||
|
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
|
||||||
|
regex: ollama-safety
|
||||||
|
action: keep
|
||||||
|
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
|
||||||
|
action: replace
|
||||||
|
target_label: __metrics_path__
|
||||||
|
regex: (.+)
|
||||||
|
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
|
||||||
|
action: replace
|
||||||
|
regex: ([^:]+)(?::\d+)?;(\d+)
|
||||||
|
replacement: $1:$2
|
||||||
|
target_label: __address__
|
||||||
|
- source_labels: [__meta_kubernetes_namespace]
|
||||||
|
action: replace
|
||||||
|
target_label: kubernetes_namespace
|
||||||
|
- source_labels: [__meta_kubernetes_pod_name]
|
||||||
|
action: replace
|
||||||
|
target_label: kubernetes_pod_name
|
||||||
---
|
---
|
||||||
# Prometheus Deployment
|
# Prometheus Deployment
|
||||||
apiVersion: apps/v1
|
apiVersion: apps/v1
|
||||||
|
@ -262,9 +286,9 @@ data:
|
||||||
"spaceLength": 10,
|
"spaceLength": 10,
|
||||||
"stack": false,
|
"stack": false,
|
||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "DCGM_FI_DEV_GPU_UTIL{pod=~\"llm-nim-code.*\"}",
|
"expr": "DCGM_FI_DEV_GPU_UTIL{instance=\"dcgm-exporter:9400\", pod=~\"(llm-nim-code|vllm).*\"}",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "GPU {{gpu}} - {{pod}}"
|
"legendFormat": "GPU {{gpu}} - {{pod}}"
|
||||||
}
|
}
|
||||||
|
@ -351,7 +375,7 @@ data:
|
||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "DCGM_FI_DEV_FB_USED{pod=~\"llm-nim-code.*\"} / 1024 / 1024 / 1024",
|
"expr": "DCGM_FI_DEV_FB_USED{instance=\"dcgm-exporter:9400\", pod=~\"(llm-nim-code|vllm).*\"} / 1024 / 1024 / 1024",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "GPU {{gpu}} - {{pod}}"
|
"legendFormat": "GPU {{gpu}} - {{pod}}"
|
||||||
}
|
}
|
||||||
|
@ -438,7 +462,7 @@ data:
|
||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "DCGM_FI_DEV_POWER_USAGE{pod=~\"llm-nim-code.*\"}",
|
"expr": "DCGM_FI_DEV_POWER_USAGE{instance=\"dcgm-exporter:9400\", pod=~\"(llm-nim-code|vllm).*\"}",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "GPU {{gpu}} - {{pod}}"
|
"legendFormat": "GPU {{gpu}} - {{pod}}"
|
||||||
}
|
}
|
||||||
|
@ -525,7 +549,7 @@ data:
|
||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "DCGM_FI_DEV_GPU_TEMP{pod=~\"llm-nim-code.*\"}",
|
"expr": "DCGM_FI_DEV_GPU_TEMP{instance=\"dcgm-exporter:9400\", pod=~\"(llm-nim-code|vllm).*\"}",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "GPU {{gpu}} - {{pod}}"
|
"legendFormat": "GPU {{gpu}} - {{pod}}"
|
||||||
}
|
}
|
||||||
|
@ -954,6 +978,763 @@ data:
|
||||||
"uid": "nim-dashboard",
|
"uid": "nim-dashboard",
|
||||||
"version": 1
|
"version": 1
|
||||||
}
|
}
|
||||||
|
ollama-dashboard.json: |
|
||||||
|
{
|
||||||
|
"annotations": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"builtIn": 1,
|
||||||
|
"datasource": "-- Grafana --",
|
||||||
|
"enable": true,
|
||||||
|
"hide": true,
|
||||||
|
"iconColor": "rgba(0, 211, 255, 1)",
|
||||||
|
"name": "Annotations & Alerts",
|
||||||
|
"type": "dashboard"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"editable": true,
|
||||||
|
"gnetId": null,
|
||||||
|
"graphTooltip": 0,
|
||||||
|
"id": 3,
|
||||||
|
"links": [],
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"aliasColors": {},
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fill": 1,
|
||||||
|
"fillGradient": 0,
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 0
|
||||||
|
},
|
||||||
|
"hiddenSeries": false,
|
||||||
|
"id": 1,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": true,
|
||||||
|
"max": true,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": true
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"nullPointMode": "null",
|
||||||
|
"options": {
|
||||||
|
"dataLinks": []
|
||||||
|
},
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 2,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"stack": false,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(ollama_requests_total{kubernetes_pod_name=~\"ollama-safety.*\"}[1m])",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{kubernetes_pod_name}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeRegions": [],
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Ollama Request Rate",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 0,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": []
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "reqps",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": "0",
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"yaxis": {
|
||||||
|
"align": false,
|
||||||
|
"alignLevel": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aliasColors": {},
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fill": 1,
|
||||||
|
"fillGradient": 0,
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 0
|
||||||
|
},
|
||||||
|
"hiddenSeries": false,
|
||||||
|
"id": 2,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": true,
|
||||||
|
"max": true,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": true
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"nullPointMode": "null",
|
||||||
|
"options": {
|
||||||
|
"dataLinks": []
|
||||||
|
},
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 2,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"stack": false,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "histogram_quantile(0.95, sum(rate(ollama_request_duration_seconds_bucket{kubernetes_pod_name=~\"ollama-safety.*\"}[5m])) by (le))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "p95"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "histogram_quantile(0.50, sum(rate(ollama_request_duration_seconds_bucket{kubernetes_pod_name=~\"ollama-safety.*\"}[5m])) by (le))",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "p50"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeRegions": [],
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Ollama Request Latency",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 0,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": []
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "s",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": "0",
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"yaxis": {
|
||||||
|
"align": false,
|
||||||
|
"alignLevel": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aliasColors": {},
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fill": 1,
|
||||||
|
"fillGradient": 0,
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 8
|
||||||
|
},
|
||||||
|
"hiddenSeries": false,
|
||||||
|
"id": 3,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": true,
|
||||||
|
"max": true,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": true
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"nullPointMode": "null",
|
||||||
|
"options": {
|
||||||
|
"dataLinks": []
|
||||||
|
},
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 2,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"stack": false,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "ollama_active_requests{kubernetes_pod_name=~\"ollama-safety.*\"}",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{kubernetes_pod_name}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeRegions": [],
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Ollama Active Requests",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 0,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": []
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": "0",
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"yaxis": {
|
||||||
|
"align": false,
|
||||||
|
"alignLevel": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aliasColors": {},
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fill": 1,
|
||||||
|
"fillGradient": 0,
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 8
|
||||||
|
},
|
||||||
|
"hiddenSeries": false,
|
||||||
|
"id": 4,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": true,
|
||||||
|
"max": true,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": true
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"nullPointMode": "null",
|
||||||
|
"options": {
|
||||||
|
"dataLinks": []
|
||||||
|
},
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 2,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"stack": false,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "ollama_tokens_per_second{kubernetes_pod_name=~\"ollama-safety.*\"}",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{kubernetes_pod_name}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeRegions": [],
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Ollama Tokens Per Second",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 0,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": []
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": "0",
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"yaxis": {
|
||||||
|
"align": false,
|
||||||
|
"alignLevel": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aliasColors": {},
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fill": 1,
|
||||||
|
"fillGradient": 0,
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 16
|
||||||
|
},
|
||||||
|
"hiddenSeries": false,
|
||||||
|
"id": 5,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": true,
|
||||||
|
"max": true,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": true
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"nullPointMode": "null",
|
||||||
|
"options": {
|
||||||
|
"dataLinks": []
|
||||||
|
},
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 2,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"stack": false,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(kubernetes_pod_name) (container_memory_usage_bytes{pod=~\"ollama-safety.*\"})",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{kubernetes_pod_name}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeRegions": [],
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Ollama Memory Usage",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 0,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": []
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "bytes",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": "0",
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"yaxis": {
|
||||||
|
"align": false,
|
||||||
|
"alignLevel": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aliasColors": {},
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fill": 1,
|
||||||
|
"fillGradient": 0,
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 16
|
||||||
|
},
|
||||||
|
"hiddenSeries": false,
|
||||||
|
"id": 6,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": true,
|
||||||
|
"max": true,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": true
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"nullPointMode": "null",
|
||||||
|
"options": {
|
||||||
|
"dataLinks": []
|
||||||
|
},
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 2,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"stack": false,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(kubernetes_pod_name) (rate(container_cpu_usage_seconds_total{pod=~\"ollama-safety.*\"}[5m]))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{kubernetes_pod_name}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeRegions": [],
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Ollama CPU Usage",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 0,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": []
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": "0",
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"yaxis": {
|
||||||
|
"align": false,
|
||||||
|
"alignLevel": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aliasColors": {},
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fill": 1,
|
||||||
|
"fillGradient": 0,
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 24
|
||||||
|
},
|
||||||
|
"hiddenSeries": false,
|
||||||
|
"id": 7,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": true,
|
||||||
|
"max": true,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": true
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"nullPointMode": "null",
|
||||||
|
"options": {
|
||||||
|
"dataLinks": []
|
||||||
|
},
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 2,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"stack": false,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "ollama_gpu_memory_used_bytes{kubernetes_pod_name=~\"ollama-safety.*\"}",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "Used - {{kubernetes_pod_name}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "ollama_gpu_memory_total_bytes{kubernetes_pod_name=~\"ollama-safety.*\"}",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "Total - {{kubernetes_pod_name}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeRegions": [],
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Ollama GPU Memory",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 0,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": []
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "bytes",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": "0",
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"yaxis": {
|
||||||
|
"align": false,
|
||||||
|
"alignLevel": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aliasColors": {},
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fill": 1,
|
||||||
|
"fillGradient": 0,
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 24
|
||||||
|
},
|
||||||
|
"hiddenSeries": false,
|
||||||
|
"id": 8,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": true,
|
||||||
|
"max": true,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": true
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"nullPointMode": "null",
|
||||||
|
"options": {
|
||||||
|
"dataLinks": []
|
||||||
|
},
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 2,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"stack": false,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "ollama_gpu_utilization{kubernetes_pod_name=~\"ollama-safety.*\"}",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{kubernetes_pod_name}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeRegions": [],
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Ollama GPU Utilization",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 0,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": []
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "percent",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": "100",
|
||||||
|
"min": "0",
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"yaxis": {
|
||||||
|
"align": false,
|
||||||
|
"alignLevel": null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"schemaVersion": 22,
|
||||||
|
"style": "dark",
|
||||||
|
"tags": [],
|
||||||
|
"templating": {
|
||||||
|
"list": []
|
||||||
|
},
|
||||||
|
"time": {
|
||||||
|
"from": "now-6h",
|
||||||
|
"to": "now"
|
||||||
|
},
|
||||||
|
"timepicker": {
|
||||||
|
"refresh_intervals": [
|
||||||
|
"5s",
|
||||||
|
"10s",
|
||||||
|
"30s",
|
||||||
|
"1m",
|
||||||
|
"5m",
|
||||||
|
"15m",
|
||||||
|
"30m",
|
||||||
|
"1h",
|
||||||
|
"2h",
|
||||||
|
"1d"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"timezone": "",
|
||||||
|
"title": "Ollama Monitoring",
|
||||||
|
"uid": "ollama-dashboard",
|
||||||
|
"version": 1
|
||||||
|
}
|
||||||
k8s-pods-dashboard.json: |
|
k8s-pods-dashboard.json: |
|
||||||
{
|
{
|
||||||
"annotations": {
|
"annotations": {
|
||||||
|
@ -1378,8 +2159,9 @@ data:
|
||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "gpu_utilization{model_name=~\".+\"}",
|
"expr": "DCGM_FI_DEV_GPU_UTIL{instance=\"dcgm-exporter:9400\"}",
|
||||||
"refId": "A"
|
"refId": "A",
|
||||||
|
"legendFormat": "GPU {{gpu}}"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"thresholds": [],
|
"thresholds": [],
|
||||||
|
@ -1464,8 +2246,9 @@ data:
|
||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "gpu_memory_used_bytes{model_name=~\".+\"} / gpu_memory_total_bytes{model_name=~\".+\"} * 100",
|
"expr": "DCGM_FI_DEV_FB_USED{instance=\"dcgm-exporter:9400\"} / (DCGM_FI_DEV_FB_USED{instance=\"dcgm-exporter:9400\"} + DCGM_FI_DEV_FB_FREE{instance=\"dcgm-exporter:9400\"}) * 100",
|
||||||
"refId": "A"
|
"refId": "A",
|
||||||
|
"legendFormat": "GPU {{gpu}}"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"thresholds": [],
|
"thresholds": [],
|
||||||
|
@ -1550,8 +2333,9 @@ data:
|
||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "gpu_power_usage_watts{model_name=~\".+\"}",
|
"expr": "DCGM_FI_DEV_POWER_USAGE{instance=\"dcgm-exporter:9400\"}",
|
||||||
"refId": "A"
|
"refId": "A",
|
||||||
|
"legendFormat": "GPU {{gpu}}"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"thresholds": [],
|
"thresholds": [],
|
||||||
|
@ -2298,20 +3082,65 @@ spec:
|
||||||
spec:
|
spec:
|
||||||
containers:
|
containers:
|
||||||
- name: dcgm-exporter
|
- name: dcgm-exporter
|
||||||
image: nvidia/dcgm-exporter:2.4.6-2.6.10-ubuntu20.04
|
image: nvcr.io/nvidia/k8s/dcgm-exporter:4.2.3-4.3.0-ubuntu22.04
|
||||||
securityContext:
|
securityContext:
|
||||||
runAsNonRoot: false
|
runAsNonRoot: false
|
||||||
runAsUser: 0
|
runAsUser: 0
|
||||||
ports:
|
ports:
|
||||||
- containerPort: 9400
|
- containerPort: 9400
|
||||||
name: metrics
|
name: metrics
|
||||||
|
args:
|
||||||
|
- -f
|
||||||
|
- /etc/dcgm-exporter/dcp-metrics-included.csv
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- name: device-metrics
|
- name: device-metrics
|
||||||
mountPath: /dev/metrics
|
mountPath: /dev/metrics
|
||||||
|
- name: dcgm-config
|
||||||
|
mountPath: /etc/dcgm-exporter
|
||||||
volumes:
|
volumes:
|
||||||
- name: device-metrics
|
- name: device-metrics
|
||||||
hostPath:
|
hostPath:
|
||||||
path: /dev/metrics
|
path: /dev/metrics
|
||||||
|
- name: dcgm-config
|
||||||
|
configMap:
|
||||||
|
name: dcgm-config
|
||||||
|
---
|
||||||
|
# DCGM Exporter ConfigMap for metrics configuration
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
name: dcgm-config
|
||||||
|
namespace: monitoring
|
||||||
|
data:
|
||||||
|
dcp-metrics-included.csv: |
|
||||||
|
# Format:
|
||||||
|
# DCGM_FI_DEV_<fieldname>, <description>, <units>
|
||||||
|
DCGM_FI_DEV_GPU_UTIL, GPU utilization, %
|
||||||
|
DCGM_FI_DEV_MEM_COPY_UTIL, Memory utilization, %
|
||||||
|
DCGM_FI_DEV_FB_FREE, Framebuffer memory free, MiB
|
||||||
|
DCGM_FI_DEV_FB_USED, Framebuffer memory used, MiB
|
||||||
|
DCGM_FI_DEV_FB_TOTAL, Framebuffer memory total, MiB
|
||||||
|
DCGM_FI_DEV_GPU_TEMP, GPU temperature, C
|
||||||
|
DCGM_FI_DEV_POWER_USAGE, Power usage, W
|
||||||
|
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, Total energy consumption, mJ
|
||||||
|
DCGM_FI_DEV_PCIE_REPLAY_COUNTER, PCIe replay counter, count
|
||||||
|
DCGM_FI_DEV_PCIE_TX_THROUGHPUT, PCIe transmit throughput, KiB/s
|
||||||
|
DCGM_FI_DEV_PCIE_RX_THROUGHPUT, PCIe receive throughput, KiB/s
|
||||||
|
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, NVLink bandwidth total, KiB/s
|
||||||
|
DCGM_FI_DEV_VGPU_LICENSE_STATUS, vGPU license status, N/A
|
||||||
|
DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, Uncorrectable remapped rows, count
|
||||||
|
DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, Correctable remapped rows, count
|
||||||
|
DCGM_FI_DEV_ROW_REMAP_FAILURE, Row remap failure, count
|
||||||
|
DCGM_FI_DEV_XID_ERRORS, XID errors, count
|
||||||
|
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, NVLink CRC flit error count total, count
|
||||||
|
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, NVLink CRC data error count total, count
|
||||||
|
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, NVLink replay error count total, count
|
||||||
|
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, NVLink recovery error count total, count
|
||||||
|
DCGM_FI_DEV_RETIRED_PAGES_SBE, Retired pages SBE, count
|
||||||
|
DCGM_FI_DEV_RETIRED_PAGES_DBE, Retired pages DBE, count
|
||||||
|
DCGM_FI_DEV_RETIRED_PAGES_PENDING, Retired pages pending, count
|
||||||
|
DCGM_FI_DEV_GRAPHICS_PIDS, Graphics processes, count
|
||||||
|
DCGM_FI_DEV_COMPUTE_PIDS, Compute processes, count
|
||||||
---
|
---
|
||||||
# NVIDIA DCGM Exporter Service
|
# NVIDIA DCGM Exporter Service
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
|
|
|
@ -0,0 +1,74 @@
|
||||||
|
apiVersion: v1
|
||||||
|
kind: PersistentVolumeClaim
|
||||||
|
metadata:
|
||||||
|
name: ollama-models-safety
|
||||||
|
spec:
|
||||||
|
accessModes:
|
||||||
|
- ReadWriteOnce
|
||||||
|
volumeMode: Filesystem
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
storage: 10Gi
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: ollama-safety
|
||||||
|
spec:
|
||||||
|
replicas: 1
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/name: ollama-safety
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: ollama-safety
|
||||||
|
workload-type: inference
|
||||||
|
annotations:
|
||||||
|
prometheus.io/scrape: "true"
|
||||||
|
prometheus.io/port: "11434"
|
||||||
|
prometheus.io/path: "/metrics"
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: ollama-safety
|
||||||
|
image: ollama/ollama:latest
|
||||||
|
command: ["/bin/sh", "-c"]
|
||||||
|
args: [
|
||||||
|
"ollama serve & sleep 5 && ollama pull llama-guard3:1b && ollama run llama-guard3:1b & wait"
|
||||||
|
]
|
||||||
|
env:
|
||||||
|
- name: OLLAMA_HOST
|
||||||
|
value: "0.0.0.0"
|
||||||
|
- name: OLLAMA_PORT
|
||||||
|
value: "11434"
|
||||||
|
- name: OLLAMA_ENABLE_METRICS
|
||||||
|
value: "true"
|
||||||
|
ports:
|
||||||
|
- containerPort: 11434
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: "8Gi"
|
||||||
|
cpu: "6000m"
|
||||||
|
limits:
|
||||||
|
memory: "16Gi"
|
||||||
|
cpu: "6000m"
|
||||||
|
volumeMounts:
|
||||||
|
- name: ollama-storage
|
||||||
|
mountPath: /root/.ollama
|
||||||
|
volumes:
|
||||||
|
- name: ollama-storage
|
||||||
|
persistentVolumeClaim:
|
||||||
|
claimName: ollama-models-safety
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: ollama-server-safety
|
||||||
|
spec:
|
||||||
|
selector:
|
||||||
|
app.kubernetes.io/name: ollama-safety
|
||||||
|
ports:
|
||||||
|
- protocol: TCP
|
||||||
|
port: 11434
|
||||||
|
targetPort: 11434
|
||||||
|
type: ClusterIP
|
|
@ -25,6 +25,10 @@ data:
|
||||||
url: ${env.NVIDIA_BASE_URL:=http://localhost:8000/v1}
|
url: ${env.NVIDIA_BASE_URL:=http://localhost:8000/v1}
|
||||||
api_key: ${env.NVIDIA_API_KEY:=}
|
api_key: ${env.NVIDIA_API_KEY:=}
|
||||||
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
|
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
|
||||||
|
- provider_id: ollama-safety
|
||||||
|
provider_type: remote::ollama
|
||||||
|
config:
|
||||||
|
url: ${env.OLLAMA_BASE_URL:=http://localhost:11434/v1}
|
||||||
- provider_id: sentence-transformers
|
- provider_id: sentence-transformers
|
||||||
provider_type: inline::sentence-transformers
|
provider_type: inline::sentence-transformers
|
||||||
config: {}
|
config: {}
|
||||||
|
@ -108,6 +112,12 @@ data:
|
||||||
model_id: ${env.CODE_MODEL:=bigcode/starcoder2-7b}
|
model_id: ${env.CODE_MODEL:=bigcode/starcoder2-7b}
|
||||||
provider_id: nvidia
|
provider_id: nvidia
|
||||||
model_type: llm
|
model_type: llm
|
||||||
|
- metadata: {}
|
||||||
|
model_id: ${env.OLLAMA_MODEL:=llama-guard3:1b}
|
||||||
|
provider_id: ollama-safety
|
||||||
|
model_type: llm
|
||||||
|
shields:
|
||||||
|
- shield_id: ${env.OLLAMA_MODEL:=llama-guard3:1b}
|
||||||
vector_dbs: []
|
vector_dbs: []
|
||||||
datasets: []
|
datasets: []
|
||||||
scoring_fns: []
|
scoring_fns: []
|
||||||
|
|
|
@ -55,6 +55,8 @@ spec:
|
||||||
value: "3072"
|
value: "3072"
|
||||||
- name: NVIDIA_BASE_URL
|
- name: NVIDIA_BASE_URL
|
||||||
value: http://llm-nim-code.default.svc.cluster.local:8000
|
value: http://llm-nim-code.default.svc.cluster.local:8000
|
||||||
|
- name: OLLAMA_BASE_URL
|
||||||
|
value: http://ollama-server-safety.default.svc.cluster.local:11434
|
||||||
- name: POSTGRES_HOST
|
- name: POSTGRES_HOST
|
||||||
value: postgres-server.default.svc.cluster.local
|
value: postgres-server.default.svc.cluster.local
|
||||||
- name: POSTGRES_PORT
|
- name: POSTGRES_PORT
|
||||||
|
@ -67,6 +69,8 @@ spec:
|
||||||
value: "${CODE_MODEL}"
|
value: "${CODE_MODEL}"
|
||||||
- name: TAVILY_SEARCH_API_KEY
|
- name: TAVILY_SEARCH_API_KEY
|
||||||
value: "${TAVILY_SEARCH_API_KEY}"
|
value: "${TAVILY_SEARCH_API_KEY}"
|
||||||
|
- name: OLLAMA_MODEL
|
||||||
|
value: "${OLLAMA_MODEL}"
|
||||||
command: ["/bin/sh"]
|
command: ["/bin/sh"]
|
||||||
args:
|
args:
|
||||||
- -c
|
- -c
|
||||||
|
|
|
@ -22,6 +22,10 @@ providers:
|
||||||
url: ${env.NVIDIA_BASE_URL:=http://localhost:8000/v1}
|
url: ${env.NVIDIA_BASE_URL:=http://localhost:8000/v1}
|
||||||
api_key: ${env.NVIDIA_API_KEY:=}
|
api_key: ${env.NVIDIA_API_KEY:=}
|
||||||
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
|
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
|
||||||
|
- provider_id: ollama-safety
|
||||||
|
provider_type: remote::ollama
|
||||||
|
config:
|
||||||
|
url: ${env.OLLAMA_BASE_URL:=http://localhost:11434/v1}
|
||||||
- provider_id: sentence-transformers
|
- provider_id: sentence-transformers
|
||||||
provider_type: inline::sentence-transformers
|
provider_type: inline::sentence-transformers
|
||||||
config: {}
|
config: {}
|
||||||
|
@ -105,6 +109,12 @@ models:
|
||||||
model_id: ${env.CODE_MODEL:=bigcode/starcoder2-7b}
|
model_id: ${env.CODE_MODEL:=bigcode/starcoder2-7b}
|
||||||
provider_id: nvidia
|
provider_id: nvidia
|
||||||
model_type: llm
|
model_type: llm
|
||||||
|
- metadata: {}
|
||||||
|
model_id: ${env.OLLAMA_MODEL:=llama-guard3:1b}
|
||||||
|
provider_id: ollama-safety
|
||||||
|
model_type: llm
|
||||||
|
shields:
|
||||||
|
- shield_id: ${env.OLLAMA_MODEL:=llama-guard3:1b}
|
||||||
vector_dbs: []
|
vector_dbs: []
|
||||||
datasets: []
|
datasets: []
|
||||||
scoring_fns: []
|
scoring_fns: []
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue