test: benchmark scripts (#3160)

# What does this PR do?

1. Add our own benchmark script instead of locust (doesn't support measuring streaming latency well)
2. Simplify k8s deployment
3. Add a simple profile script for locally running server

## Test Plan

❮ ./run-benchmark.sh --target stack --duration 180 --concurrent 10

============================================================
BENCHMARK RESULTS
============================================================
Total time: 180.00s
Concurrent users: 10
Total requests: 1636
Successful requests: 1636
Failed requests: 0
Success rate: 100.0%
Requests per second: 9.09

Response Time Statistics:
  Mean: 1.095s
  Median: 1.721s
  Min: 0.136s
  Max: 3.218s
  Std Dev: 0.762s

Percentiles:
  P50: 1.721s
  P90: 1.751s
  P95: 1.756s
  P99: 1.796s

Time to First Token (TTFT) Statistics:
  Mean: 0.037s
  Median: 0.037s
  Min: 0.023s
  Max: 0.211s
  Std Dev: 0.011s

TTFT Percentiles:
  P50: 0.037s
  P90: 0.040s
  P95: 0.044s
  P99: 0.055s

Streaming Statistics:
  Mean chunks per response: 64.0
  Total chunks received: 104775
2025-12-05 02:17:31 +00:00 · 2025-08-15 11:24:29 -07:00 · 2025-08-15 11:24:29 -07:00 · 2c06b24c77
commit 2c06b24c77
parent 2114214fe3
13 changed files with 633 additions and 328 deletions
--- a/docs/source/distributions/k8s-benchmark/run-benchmark.sh
+++ b/docs/source/distributions/k8s-benchmark/run-benchmark.sh
@ -0,0 +1,148 @@
+#!/usr/bin/env bash
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+# Strict mode: stop on command failure, on use of an unset variable, and when
+# any stage of a pipeline fails.
+set -o errexit -o nounset -o pipefail
+
+# Built-in defaults; the command-line options parsed further down override them.
+TARGET=stack
+DURATION=60
+CONCURRENT=10
+
+# Parse command line arguments
+usage() {
+    echo "Usage: $0 [options]"
+    echo "Options:"
+    echo "  -t, --target <stack|vllm>     Target to benchmark (default: stack)"
+    echo "  -d, --duration <seconds>      Duration in seconds (default: 60)"
+    echo "  -c, --concurrent <users>      Number of concurrent users (default: 10)"
+    echo "  -h, --help                    Show this help message"
+    echo ""
+    echo "Examples:"
+    echo "  $0 --target vllm              # Benchmark vLLM direct"
+    echo "  $0 --target stack             # Benchmark Llama Stack (default)"
+    echo "  $0 -t vllm -d 120 -c 20       # vLLM with 120s duration, 20 users"
+}
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        -t|--target)
+            TARGET="$2"
+            shift 2
+            ;;
+        -d|--duration)
+            DURATION="$2"
+            shift 2
+            ;;
+        -c|--concurrent)
+            CONCURRENT="$2"
+            shift 2
+            ;;
+        -h|--help)
+            usage
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            usage
+            exit 1
+            ;;
+    esac
+done
+
+# Validate target: only the two known backends are supported.
+if [[ "$TARGET" != "stack" && "$TARGET" != "vllm" ]]; then
+    echo "Error: Target must be 'stack' or 'vllm'" >&2
+    usage >&2
+    exit 1
+fi
+
+# Validate duration and concurrency. Both values are interpolated verbatim into
+# the shell command of the generated Job manifest, so anything other than a
+# plain positive integer is rejected here instead of producing a broken (or
+# unintended) command inside the pod.
+if [[ ! "$DURATION" =~ ^[1-9][0-9]*$ ]]; then
+    echo "Error: Duration must be a positive integer (got '$DURATION')" >&2
+    usage >&2
+    exit 1
+fi
+if [[ ! "$CONCURRENT" =~ ^[1-9][0-9]*$ ]]; then
+    echo "Error: Concurrent users must be a positive integer (got '$CONCURRENT')" >&2
+    usage >&2
+    exit 1
+fi
+
+# Set configuration based on target
+if [[ "$TARGET" == "vllm" ]]; then
+    BASE_URL="http://vllm-server:8000/v1"
+    JOB_NAME="vllm-benchmark-job"
+    echo "Benchmarking vLLM direct..."
+else
+    BASE_URL="http://llama-stack-benchmark-service:8323/v1/openai/v1"
+    JOB_NAME="stack-benchmark-job"
+    echo "Benchmarking Llama Stack..."
+fi
+
+echo "Configuration:"
+echo "  Target: $TARGET"
+echo "  Base URL: $BASE_URL"
+echo "  Duration: ${DURATION}s"
+echo "  Concurrent users: $CONCURRENT"
+echo ""
+
+# Create temporary job yaml.
+# mktemp creates the file atomically under an unpredictable name, unlike a
+# timestamp-based path that can collide with a concurrent run or be
+# pre-created by another user. The template ends in the X's because some
+# mktemp implementations only substitute a trailing run of X characters.
+TEMP_YAML="$(mktemp "${TMPDIR:-/tmp}/benchmark-job-temp-XXXXXX")"
+# Remove the manifest on every exit path, including an abort triggered by
+# `set -e` when a later command fails.
+trap 'rm -f -- "$TEMP_YAML"' EXIT
+
+# The heredoc delimiter is unquoted, so $JOB_NAME, $BASE_URL, $DURATION and
+# $CONCURRENT are expanded now. "\\" becomes a single backslash (a line
+# continuation for the shell inside the pod) and "\${INFERENCE_MODEL}" stays
+# literal so it is resolved from the container's environment at run time.
+cat > "$TEMP_YAML" << EOF
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: $JOB_NAME
+  namespace: default
+spec:
+  template:
+    spec:
+      containers:
+      - name: benchmark
+        image: python:3.11-slim
+        command: ["/bin/bash"]
+        args:
+        - "-c"
+        - |
+          pip install aiohttp &&
+          python3 /benchmark/benchmark.py \\
+            --base-url $BASE_URL \\
+            --model \${INFERENCE_MODEL} \\
+            --duration $DURATION \\
+            --concurrent $CONCURRENT
+        env:
+        - name: INFERENCE_MODEL
+          value: "meta-llama/Llama-3.2-3B-Instruct"
+        volumeMounts:
+        - name: benchmark-script
+          mountPath: /benchmark
+        resources:
+          requests:
+            memory: "256Mi"
+            cpu: "250m"
+          limits:
+            memory: "512Mi"
+            cpu: "500m"
+      volumes:
+      - name: benchmark-script
+        configMap:
+          name: benchmark-script
+      restartPolicy: Never
+  backoffLimit: 3
+EOF
+
+echo "Creating benchmark ConfigMap..."
+kubectl create configmap benchmark-script \
+  --from-file=benchmark.py=benchmark.py \
+  --dry-run=client -o yaml | kubectl apply -f -
+
+echo "Cleaning up any existing benchmark job..."
+kubectl delete job $JOB_NAME 2>/dev/null || true
+
+echo "Deploying benchmark Job..."
+kubectl apply -f "$TEMP_YAML"
+
+echo "Waiting for job to start..."
+kubectl wait --for=condition=Ready pod -l job-name=$JOB_NAME --timeout=60s
+
+echo "Following benchmark logs..."
+kubectl logs -f job/$JOB_NAME
+
+echo "Job completed. Checking final status..."
+kubectl get job $JOB_NAME
+
+# Clean up temporary file
+rm -f "$TEMP_YAML"