diff --git a/docs/source/distributions/k8s/apply.sh b/docs/source/distributions/k8s/apply.sh
index e8f3a9243..8f1852c9a 100755
--- a/docs/source/distributions/k8s/apply.sh
+++ b/docs/source/distributions/k8s/apply.sh
@@ -18,7 +18,7 @@ export POSTGRES_PASSWORD=llamastack
 export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 export CODE_MODEL=bigcode/starcoder2-7b
-
+export OLLAMA_MODEL=llama-guard3:1b
 # Set USE_EBS to false if you don't have permission to use EKS EBS
 export USE_EBS=${USE_EBS:-false}
 set -euo pipefail
@@ -44,10 +44,6 @@ else
   exit 1
 fi
 
-if [ -z "${LLAMA_STACK_UI_URL:-}" ]; then
-  echo "ERROR: LLAMA_STACK_UI_URL not set. Should be set to the external URL of the UI (excluding port). You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
-  exit 1
-fi
@@ -84,6 +80,7 @@ echo "Secret verification successful. All required secrets are present."
 if [ "$USE_EBS" = "true" ]; then
   echo "Using EBS storage for persistent volumes"
   envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
+  envsubst < ./ollama-safety-k8s.yaml.template | kubectl apply -f -
   envsubst < ./llama-nim.yaml.template | kubectl apply -f -
   envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
   envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
@@ -114,6 +111,7 @@ else
   echo "Using emptyDir for storage (data will not persist across pod restarts)"
   # Process templates to replace EBS storage with emptyDir
   envsubst < ./vllm-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+  envsubst < ./ollama-safety-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
   envsubst < ./llama-nim.yaml.template | kubectl apply -f -
   envsubst < ./postgres-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
   envsubst < ./chroma-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
diff --git a/docs/source/distributions/k8s/delete.sh b/docs/source/distributions/k8s/delete.sh
index b6f379002..a381899e7 100755
--- a/docs/source/distributions/k8s/delete.sh
+++ b/docs/source/distributions/k8s/delete.sh
@@ -29,48 +29,130 @@ if [ -n "${NGC_API_KEY:-}" ]; then
   export NGC_DOCKER_CONFIG_JSON=$(echo -n "$NGC_DOCKER_CONFIG" | base64)
 fi
 
+# Define namespace - default to current namespace if not specified
+export NAMESPACE=${NAMESPACE:-$(kubectl config view --minify -o jsonpath='{..namespace}')}
+if [ -z "$NAMESPACE" ]; then
+  export NAMESPACE="default"
+fi
+
 set -euo pipefail
 set -x
 
 # Delete resources in reverse order of creation to handle dependencies properly
-# Delete UI deployment
+echo "Starting comprehensive deletion of all LlamaStack resources..."
+
+# Delete UI deployment and service
+echo "Deleting UI resources..."
 envsubst < ./ui-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
+# Check for UI service template and delete if exists
+if [ -f "./ui-service-k8s.yaml.template" ]; then
+  envsubst < ./ui-service-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
+fi
 
 # Delete monitoring resources
+echo "Deleting monitoring resources..."
 envsubst < ./monitoring-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
 
 # Delete Prometheus RBAC resources
+echo "Deleting Prometheus RBAC resources..."
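+# prometheus-rbac.yaml is plain YAML with no ${VAR} placeholders, so it is
+# deleted directly rather than being piped through envsubst.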
 kubectl delete -f ./prometheus-rbac.yaml --ignore-not-found=true
-
-
 # Delete ingress
+echo "Deleting ingress resources..."
 envsubst < ./ingress-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
 
 # Delete stack deployment
+echo "Deleting stack deployment..."
 envsubst < ./stack-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
 
-# Delete configmap
+# Delete configmaps
+echo "Deleting configmaps..."
 kubectl delete configmap llama-stack-config --ignore-not-found=true
+# Check for stack configmap and delete if exists
+if [ -f "./stack-configmap.yaml" ]; then
+  kubectl delete -f ./stack-configmap.yaml --ignore-not-found=true
+fi
 
 # Delete chroma deployment
+echo "Deleting chroma deployment..."
 envsubst < ./chroma-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
 
 # Delete postgres deployment
+echo "Deleting postgres deployment..."
 envsubst < ./postgres-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
 
-# Delete vllm-safety deployment
+# Delete llama-nim deployment
+echo "Deleting llama-nim deployment..."
 envsubst < ./llama-nim.yaml.template | kubectl delete -f - --ignore-not-found=true
+
+# Delete ollama-safety deployment
+echo "Deleting ollama-safety deployment..."
+envsubst < ./ollama-safety-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
+
 # Delete vllm deployment
+echo "Deleting vllm deployment..."
 envsubst < ./vllm-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
 
 # Delete the HF token secret if it exists
 if [ -n "${HF_TOKEN:-}" ]; then
+  echo "Deleting HF token secret..."
   envsubst < ./set-secret.yaml.template | kubectl delete -f - --ignore-not-found=true
 fi
 
-# NGC API key secrets are now part of llama-nim.yaml.template and are deleted with it
+# Delete any other template files that might exist
+echo "Checking for additional template files..."
+for template in ./*.yaml.template; do
+  if [ -f "$template" ]; then
+    # Skip templates we've already processed
+    if [[ "$template" != "./ui-k8s.yaml.template" &&
+          "$template" != "./monitoring-k8s.yaml.template" &&
+          "$template" != "./ingress-k8s.yaml.template" &&
+          "$template" != "./stack-k8s.yaml.template" &&
+          "$template" != "./chroma-k8s.yaml.template" &&
+          "$template" != "./postgres-k8s.yaml.template" &&
+          "$template" != "./llama-nim.yaml.template" &&
+          "$template" != "./vllm-safety-k8s.yaml.template" &&
+          "$template" != "./ollama-safety-k8s.yaml.template" &&
+          "$template" != "./vllm-k8s.yaml.template" &&
+          "$template" != "./set-secret.yaml.template" &&
+          "$template" != "./ui-service-k8s.yaml.template" ]]; then
+      echo "Deleting resources from $template..."
+      envsubst < "$template" | kubectl delete -f - --ignore-not-found=true
+    fi
+  fi
+done
 
-echo "All LlamaStack Kubernetes resources have been deleted."
+# Delete any PersistentVolumeClaims created by the stack
+echo "Deleting PersistentVolumeClaims..."
+kubectl delete pvc -l app=llama-stack --ignore-not-found=true
+kubectl delete pvc -l app=chroma --ignore-not-found=true
+kubectl delete pvc -l app=postgres --ignore-not-found=true
+kubectl delete pvc -l app=vllm --ignore-not-found=true
+kubectl delete pvc -l app.kubernetes.io/name=ollama-safety --ignore-not-found=true
+
+# Delete any remaining services
+echo "Deleting any remaining services..."
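+# Label-based cleanup below is best-effort: it assumes the templates label
+# their resources with app=<component> (app.kubernetes.io/name=ollama-safety
+# for the Ollama safety resources); anything created without these labels is
+# only covered by the template-driven deletes above.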
+kubectl delete service -l app=llama-stack --ignore-not-found=true
+kubectl delete service -l app=chroma --ignore-not-found=true
+kubectl delete service -l app=postgres --ignore-not-found=true
+kubectl delete service -l app=vllm --ignore-not-found=true
+kubectl delete service -l app=llama-nim --ignore-not-found=true
+kubectl delete service -l app.kubernetes.io/name=ollama-safety --ignore-not-found=true
+
+# Delete any remaining secrets
+echo "Deleting any remaining secrets..."
+kubectl delete secret hf-secret --ignore-not-found=true
+kubectl delete secret ngc-secret --ignore-not-found=true
+kubectl delete secret -l app=llama-stack --ignore-not-found=true
+
+# Verify no resources remain
+echo "Verifying deletion..."
+REMAINING_RESOURCES=$(kubectl get all -l app=llama-stack 2>/dev/null)
+if [ -z "$REMAINING_RESOURCES" ]; then
+  echo "All LlamaStack Kubernetes resources have been successfully deleted."
+else
+  echo "Some LlamaStack resources may still exist. Please check manually with:"
+  echo "kubectl get all -l app=llama-stack"
+fi
diff --git a/docs/source/distributions/k8s/monitoring-k8s.yaml.template b/docs/source/distributions/k8s/monitoring-k8s.yaml.template
index 20912fd64..c9e7dd759 100644
--- a/docs/source/distributions/k8s/monitoring-k8s.yaml.template
+++ b/docs/source/distributions/k8s/monitoring-k8s.yaml.template
@@ -18,7 +18,7 @@ data:
     scrape_configs:
       # NVIDIA DCGM exporter for GPU metrics
-      - job_name: 'nvidia-dcgm-exporter'
+      - job_name: 'dcgm'
         static_configs:
           - targets: ['dcgm-exporter:9400']
@@ -98,6 +98,30 @@ data:
           - targets: ['llm-nim-code:8000']
         metrics_path: /v1/metrics
         scrape_interval: 5s
+
+      # Specific job for Ollama pods
+      - job_name: 'ollama-pods'
+        kubernetes_sd_configs:
+          - role: pod
+        relabel_configs:
+          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
+            regex: ollama-safety
+            action: keep
+          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
+            action: replace
+            target_label: __metrics_path__
+            regex: (.+)
+          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
+            action: replace
+            regex: ([^:]+)(?::\d+)?;(\d+)
+            replacement: $1:$2
+            target_label: __address__
+          - source_labels: [__meta_kubernetes_namespace]
+            action: replace
+            target_label: kubernetes_namespace
+          - source_labels: [__meta_kubernetes_pod_name]
+            action: replace
+            target_label: kubernetes_pod_name
 ---
 # Prometheus Deployment
 apiVersion: apps/v1
@@ -262,9 +286,9 @@ data:
           "spaceLength": 10,
           "stack": false,
           "steppedLine": false,
-          "targets": [
+          "targets": [
             {
-              "expr": "DCGM_FI_DEV_GPU_UTIL{pod=~\"llm-nim-code.*\"}",
+              "expr": "DCGM_FI_DEV_GPU_UTIL{instance=\"dcgm-exporter:9400\", pod=~\"(llm-nim-code|vllm).*\"}",
              "refId": "A",
              "legendFormat": "GPU {{gpu}} - {{pod}}"
            }
@@ -351,7 +375,7 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "DCGM_FI_DEV_FB_USED{pod=~\"llm-nim-code.*\"} / 1024 / 1024 / 1024",
+              "expr": "DCGM_FI_DEV_FB_USED{instance=\"dcgm-exporter:9400\", pod=~\"(llm-nim-code|vllm).*\"} / 1024 / 1024 / 1024",
              "refId": "A",
              "legendFormat": "GPU {{gpu}} - {{pod}}"
            }
@@ -438,7 +462,7 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "DCGM_FI_DEV_POWER_USAGE{pod=~\"llm-nim-code.*\"}",
+              "expr": "DCGM_FI_DEV_POWER_USAGE{instance=\"dcgm-exporter:9400\", pod=~\"(llm-nim-code|vllm).*\"}",
              "refId": "A",
              "legendFormat": "GPU {{gpu}} - {{pod}}"
            }
@@ -525,7 +549,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "DCGM_FI_DEV_GPU_TEMP{pod=~\"llm-nim-code.*\"}", + "expr":
"DCGM_FI_DEV_GPU_TEMP{instance=\"dcgm-exporter:9400\", pod=~\"(llm-nim-code|vllm).*\"}", "refId": "A", "legendFormat": "GPU {{gpu}} - {{pod}}" } @@ -954,6 +978,763 @@ data: "uid": "nim-dashboard", "version": 1 } + ollama-dashboard.json: | + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 3, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 1, + "legend": { + "avg": false, + "current": true, + "max": true, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ollama_requests_total{kubernetes_pod_name=~\"ollama-safety.*\"}[1m])", + "refId": "A", + "legendFormat": "{{kubernetes_pod_name}}" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Ollama Request Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "reqps", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "avg": false, + "current": true, + "max": true, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(ollama_request_duration_seconds_bucket{kubernetes_pod_name=~\"ollama-safety.*\"}[5m])) by (le))", + "refId": "A", + "legendFormat": "p95" + }, + { + "expr": "histogram_quantile(0.50, sum(rate(ollama_request_duration_seconds_bucket{kubernetes_pod_name=~\"ollama-safety.*\"}[5m])) by (le))", + "refId": "B", + "legendFormat": "p50" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Ollama Request Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + 
"max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "avg": false, + "current": true, + "max": true, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "ollama_active_requests{kubernetes_pod_name=~\"ollama-safety.*\"}", + "refId": "A", + "legendFormat": "{{kubernetes_pod_name}}" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Ollama Active Requests", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "avg": false, + "current": true, + "max": true, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "ollama_tokens_per_second{kubernetes_pod_name=~\"ollama-safety.*\"}", + "refId": "A", + "legendFormat": "{{kubernetes_pod_name}}" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Ollama Tokens Per Second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "hiddenSeries": false, + "id": 5, + "legend": { + "avg": false, + "current": true, + "max": true, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + 
"seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(kubernetes_pod_name) (container_memory_usage_bytes{pod=~\"ollama-safety.*\"})", + "refId": "A", + "legendFormat": "{{kubernetes_pod_name}}" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Ollama Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "avg": false, + "current": true, + "max": true, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(kubernetes_pod_name) (rate(container_cpu_usage_seconds_total{pod=~\"ollama-safety.*\"}[5m]))", + "refId": "A", + "legendFormat": "{{kubernetes_pod_name}}" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Ollama CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "hiddenSeries": false, + "id": 7, + "legend": { + "avg": false, + "current": true, + "max": true, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "ollama_gpu_memory_used_bytes{kubernetes_pod_name=~\"ollama-safety.*\"}", + "refId": "A", + "legendFormat": "Used - {{kubernetes_pod_name}}" + }, + { + "expr": "ollama_gpu_memory_total_bytes{kubernetes_pod_name=~\"ollama-safety.*\"}", + "refId": "B", + "legendFormat": "Total - {{kubernetes_pod_name}}" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Ollama GPU Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + 
"xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "avg": false, + "current": true, + "max": true, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "ollama_gpu_utilization{kubernetes_pod_name=~\"ollama-safety.*\"}", + "refId": "A", + "legendFormat": "{{kubernetes_pod_name}}" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Ollama GPU Utilization", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "schemaVersion": 22, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Ollama Monitoring", + "uid": "ollama-dashboard", + "version": 1 + } k8s-pods-dashboard.json: | { "annotations": { @@ -1378,8 +2159,9 @@ data: "steppedLine": false, "targets": [ { - "expr": "gpu_utilization{model_name=~\".+\"}", - "refId": "A" + "expr": "DCGM_FI_DEV_GPU_UTIL{instance=\"dcgm-exporter:9400\"}", + "refId": "A", + "legendFormat": "GPU {{gpu}}" } ], "thresholds": [], @@ -1464,8 +2246,9 @@ data: "steppedLine": false, "targets": [ { - "expr": "gpu_memory_used_bytes{model_name=~\".+\"} / gpu_memory_total_bytes{model_name=~\".+\"} * 100", - "refId": "A" + "expr": "DCGM_FI_DEV_FB_USED{instance=\"dcgm-exporter:9400\"} / (DCGM_FI_DEV_FB_USED{instance=\"dcgm-exporter:9400\"} + DCGM_FI_DEV_FB_FREE{instance=\"dcgm-exporter:9400\"}) * 100", + "refId": "A", + "legendFormat": "GPU {{gpu}}" } ], "thresholds": [], @@ -1550,8 +2333,9 @@ data: "steppedLine": false, "targets": [ { - "expr": "gpu_power_usage_watts{model_name=~\".+\"}", - "refId": "A" + "expr": "DCGM_FI_DEV_POWER_USAGE{instance=\"dcgm-exporter:9400\"}", + "refId": "A", + "legendFormat": "GPU {{gpu}}" } ], "thresholds": [], @@ -2298,20 +3082,65 @@ spec: spec: containers: - name: dcgm-exporter - image: nvidia/dcgm-exporter:2.4.6-2.6.10-ubuntu20.04 + image: nvcr.io/nvidia/k8s/dcgm-exporter:4.2.3-4.3.0-ubuntu22.04 securityContext: runAsNonRoot: false runAsUser: 0 ports: - containerPort: 9400 name: metrics + args: + - -f + - 
/etc/dcgm-exporter/dcp-metrics-included.csv
         volumeMounts:
         - name: device-metrics
           mountPath: /dev/metrics
+        - name: dcgm-config
+          mountPath: /etc/dcgm-exporter
       volumes:
       - name: device-metrics
         hostPath:
           path: /dev/metrics
+      - name: dcgm-config
+        configMap:
+          name: dcgm-config
+---
+# DCGM Exporter ConfigMap for metrics configuration
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: dcgm-config
+  namespace: monitoring
+data:
+  dcp-metrics-included.csv: |
+    # Format: <DCGM field>, <Prometheus metric type>, <help text>
+    DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (%)
+    DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (%)
+    DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (MiB)
+    DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (MiB)
+    DCGM_FI_DEV_FB_TOTAL, gauge, Framebuffer memory total (MiB)
+    DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (C)
+    DCGM_FI_DEV_POWER_USAGE, gauge, Power usage (W)
+    DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption (mJ)
+    DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, PCIe replay counter
+    DCGM_FI_DEV_PCIE_TX_THROUGHPUT, counter, PCIe transmit throughput (KiB/s)
+    DCGM_FI_DEV_PCIE_RX_THROUGHPUT, counter, PCIe receive throughput (KiB/s)
+    DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, NVLink bandwidth total (KiB/s)
+    DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU license status
+    DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Uncorrectable remapped rows
+    DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Correctable remapped rows
+    DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Row remap failure
+    DCGM_FI_DEV_XID_ERRORS, gauge, XID errors
+    DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, NVLink CRC flit error count total
+    DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, NVLink CRC data error count total
+    DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, NVLink replay error count total
+    DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, NVLink recovery error count total
+    DCGM_FI_DEV_RETIRED_PAGES_SBE, counter, Retired pages SBE
+    DCGM_FI_DEV_RETIRED_PAGES_DBE, counter, Retired pages DBE
+    DCGM_FI_DEV_RETIRED_PAGES_PENDING, gauge, Retired pages pending
+    DCGM_FI_DEV_GRAPHICS_PIDS, gauge, Graphics processes
+    DCGM_FI_DEV_COMPUTE_PIDS, gauge, Compute processes
 ---
 # NVIDIA DCGM Exporter Service
 apiVersion: v1
diff --git a/docs/source/distributions/k8s/ollama-safety-k8s.yaml.template b/docs/source/distributions/k8s/ollama-safety-k8s.yaml.template
new file mode 100644
index 000000000..6519977e8
--- /dev/null
+++ b/docs/source/distributions/k8s/ollama-safety-k8s.yaml.template
@@ -0,0 +1,82 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: ollama-models-safety
+  labels:
+    app.kubernetes.io/name: ollama-safety
+spec:
+  accessModes:
+    - ReadWriteOnce
+  volumeMode: Filesystem
+  resources:
+    requests:
+      storage: 10Gi
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: ollama-safety
+  labels:
+    app.kubernetes.io/name: ollama-safety
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: ollama-safety
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: ollama-safety
+        workload-type: inference
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "11434"
+        prometheus.io/path: "/metrics"
+    spec:
+      containers:
+      - name: ollama-safety
+        image: ollama/ollama:latest
+        command: ["/bin/sh", "-c"]
+        # Start the server, then pull and warm the configured safety model;
+        # ${OLLAMA_MODEL} is substituted by envsubst in apply.sh.
+        args: [
+          "ollama serve & sleep 5 && ollama pull ${OLLAMA_MODEL} && ollama run ${OLLAMA_MODEL} & wait"
+        ]
+        env:
+        - name: OLLAMA_HOST
+          value: "0.0.0.0"
+        - name: OLLAMA_PORT
+          value: "11434"
+        - name: OLLAMA_ENABLE_METRICS
+          value: "true"
+        ports:
+        - containerPort: 11434
+        resources:
+          requests:
+            memory: "8Gi"
+            cpu: "6000m"
+          limits:
+            memory: "16Gi"
+            cpu: "6000m"
+        volumeMounts:
+        - name: ollama-storage
+          mountPath: /root/.ollama
+      volumes:
+      - name: ollama-storage
+        persistentVolumeClaim:
+          claimName: ollama-models-safety
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: ollama-server-safety
+  labels:
+    app.kubernetes.io/name: ollama-safety
+spec:
+  selector:
+    app.kubernetes.io/name: ollama-safety
+  ports:
+  - protocol: TCP
+    port: 11434
+    targetPort: 11434
+  type: ClusterIP
diff --git a/docs/source/distributions/k8s/stack-configmap.yaml b/docs/source/distributions/k8s/stack-configmap.yaml
index 081acdce8..686171b5d 100644
--- a/docs/source/distributions/k8s/stack-configmap.yaml
+++ b/docs/source/distributions/k8s/stack-configmap.yaml
@@ -25,6 +25,10 @@ data:
           url: ${env.NVIDIA_BASE_URL:=http://localhost:8000/v1}
           api_key: ${env.NVIDIA_API_KEY:=}
           append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
+      - provider_id: ollama-safety
+        provider_type: remote::ollama
+        config:
+          url: ${env.OLLAMA_BASE_URL:=http://localhost:11434/v1}
       - provider_id: sentence-transformers
         provider_type: inline::sentence-transformers
         config: {}
@@ -108,6 +112,12 @@ data:
       model_id: ${env.CODE_MODEL:=bigcode/starcoder2-7b}
       provider_id: nvidia
       model_type: llm
+    - metadata: {}
+      model_id: ${env.OLLAMA_MODEL:=llama-guard3:1b}
+      provider_id: ollama-safety
+      model_type: llm
+    shields:
+    - shield_id: ${env.OLLAMA_MODEL:=llama-guard3:1b}
     vector_dbs: []
     datasets: []
     scoring_fns: []
diff --git a/docs/source/distributions/k8s/stack-k8s.yaml.template b/docs/source/distributions/k8s/stack-k8s.yaml.template
index 47fcade0c..8e1569e3a 100644
--- a/docs/source/distributions/k8s/stack-k8s.yaml.template
+++ b/docs/source/distributions/k8s/stack-k8s.yaml.template
@@ -55,6 +55,8 @@ spec:
           value: "3072"
         - name: NVIDIA_BASE_URL
           value: http://llm-nim-code.default.svc.cluster.local:8000
+        - name: OLLAMA_BASE_URL
+          value: http://ollama-server-safety.default.svc.cluster.local:11434/v1
         - name: POSTGRES_HOST
           value: postgres-server.default.svc.cluster.local
         - name: POSTGRES_PORT
@@ -67,6 +69,8 @@ spec:
           value: "${CODE_MODEL}"
         - name: TAVILY_SEARCH_API_KEY
           value: "${TAVILY_SEARCH_API_KEY}"
+        - name: OLLAMA_MODEL
+          value: "${OLLAMA_MODEL}"
        command: ["/bin/sh"]
        args:
          - -c
diff --git a/docs/source/distributions/k8s/stack_run_config.yaml b/docs/source/distributions/k8s/stack_run_config.yaml
index 696b786b3..6b4a73481 100644
--- a/docs/source/distributions/k8s/stack_run_config.yaml
+++ b/docs/source/distributions/k8s/stack_run_config.yaml
@@ -22,6 +22,10 @@ providers:
       url: ${env.NVIDIA_BASE_URL:=http://localhost:8000/v1}
       api_key: ${env.NVIDIA_API_KEY:=}
       append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
+  - provider_id: ollama-safety
+    provider_type: remote::ollama
+    config:
+      url: ${env.OLLAMA_BASE_URL:=http://localhost:11434/v1}
   - provider_id: sentence-transformers
     provider_type: inline::sentence-transformers
     config: {}
@@ -105,6 +109,12 @@ models:
   model_id: ${env.CODE_MODEL:=bigcode/starcoder2-7b}
   provider_id: nvidia
   model_type: llm
+- metadata: {}
+  model_id: ${env.OLLAMA_MODEL:=llama-guard3:1b}
+  provider_id: ollama-safety
+  model_type: llm
+shields:
+- shield_id: ${env.OLLAMA_MODEL:=llama-guard3:1b}
 vector_dbs: []
 datasets: []
 scoring_fns: []
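
A quick smoke test for the new Ollama safety path (a sketch, not part of the
patch: it assumes kubectl access to the target cluster, the default namespace,
and the ollama-server-safety service on port 11434 defined above; the prompt
text is illustrative only):

  # Wait for the Ollama safety pod to come up, then probe it through the service.
  kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=ollama-safety --timeout=600s
  kubectl port-forward svc/ollama-server-safety 11434:11434 &
  sleep 2
  # Llama Guard should answer with a safe/unsafe verdict for the prompt.
  curl -s http://localhost:11434/api/generate \
    -d '{"model": "llama-guard3:1b", "prompt": "How do I make a sandwich?", "stream": false}'
  kill %1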