llama-stack-mirror/docs/source/distributions/k8s-benchmark/run-benchmark.sh
ehhuang 2c06b24c77
test: benchmark scripts (#3160)
# What does this PR do?
1. Add our own benchmark script instead of locust (doesn't support
measuring streaming latency well)
2. Simplify k8s deployment
3. Add a simple profile script for locally running server

## Test Plan
❮ ./run-benchmark.sh --target stack --duration 180 --concurrent 10

============================================================
BENCHMARK RESULTS
============================================================
Total time: 180.00s
Concurrent users: 10
Total requests: 1636
Successful requests: 1636
Failed requests: 0
Success rate: 100.0%
Requests per second: 9.09

Response Time Statistics:
  Mean: 1.095s
  Median: 1.721s
  Min: 0.136s
  Max: 3.218s
  Std Dev: 0.762s

Percentiles:
  P50: 1.721s
  P90: 1.751s
  P95: 1.756s
  P99: 1.796s

Time to First Token (TTFT) Statistics:
  Mean: 0.037s
  Median: 0.037s
  Min: 0.023s
  Max: 0.211s
  Std Dev: 0.011s

TTFT Percentiles:
  P50: 0.037s
  P90: 0.040s
  P95: 0.044s
  P99: 0.055s

Streaming Statistics:
  Mean chunks per response: 64.0
  Total chunks received: 104775
2025-08-15 11:24:29 -07:00

148 lines
3.7 KiB
Bash
Executable file

#!/usr/bin/env bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
set -euo pipefail
# Default values
TARGET="stack"
DURATION=60
CONCURRENT=10
# Parse command line arguments
usage() {
echo "Usage: $0 [options]"
echo "Options:"
echo " -t, --target <stack|vllm> Target to benchmark (default: stack)"
echo " -d, --duration <seconds> Duration in seconds (default: 60)"
echo " -c, --concurrent <users> Number of concurrent users (default: 10)"
echo " -h, --help Show this help message"
echo ""
echo "Examples:"
echo " $0 --target vllm # Benchmark vLLM direct"
echo " $0 --target stack # Benchmark Llama Stack (default)"
echo " $0 -t vllm -d 120 -c 20 # vLLM with 120s duration, 20 users"
}
while [[ $# -gt 0 ]]; do
case $1 in
-t|--target)
TARGET="$2"
shift 2
;;
-d|--duration)
DURATION="$2"
shift 2
;;
-c|--concurrent)
CONCURRENT="$2"
shift 2
;;
-h|--help)
usage
exit 0
;;
*)
echo "Unknown option: $1"
usage
exit 1
;;
esac
done
# Validate target
if [[ "$TARGET" != "stack" && "$TARGET" != "vllm" ]]; then
echo "Error: Target must be 'stack' or 'vllm'"
usage
exit 1
fi
# Set configuration based on target
if [[ "$TARGET" == "vllm" ]]; then
BASE_URL="http://vllm-server:8000/v1"
JOB_NAME="vllm-benchmark-job"
echo "Benchmarking vLLM direct..."
else
BASE_URL="http://llama-stack-benchmark-service:8323/v1/openai/v1"
JOB_NAME="stack-benchmark-job"
echo "Benchmarking Llama Stack..."
fi
echo "Configuration:"
echo " Target: $TARGET"
echo " Base URL: $BASE_URL"
echo " Duration: ${DURATION}s"
echo " Concurrent users: $CONCURRENT"
echo ""
# Create temporary job yaml
TEMP_YAML="/tmp/benchmark-job-temp-$(date +%s).yaml"
cat > "$TEMP_YAML" << EOF
apiVersion: batch/v1
kind: Job
metadata:
name: $JOB_NAME
namespace: default
spec:
template:
spec:
containers:
- name: benchmark
image: python:3.11-slim
command: ["/bin/bash"]
args:
- "-c"
- |
pip install aiohttp &&
python3 /benchmark/benchmark.py \\
--base-url $BASE_URL \\
--model \${INFERENCE_MODEL} \\
--duration $DURATION \\
--concurrent $CONCURRENT
env:
- name: INFERENCE_MODEL
value: "meta-llama/Llama-3.2-3B-Instruct"
volumeMounts:
- name: benchmark-script
mountPath: /benchmark
resources:
requests:
memory: "256Mi"
cpu: "250m"
limits:
memory: "512Mi"
cpu: "500m"
volumes:
- name: benchmark-script
configMap:
name: benchmark-script
restartPolicy: Never
backoffLimit: 3
EOF
echo "Creating benchmark ConfigMap..."
kubectl create configmap benchmark-script \
--from-file=benchmark.py=benchmark.py \
--dry-run=client -o yaml | kubectl apply -f -
echo "Cleaning up any existing benchmark job..."
kubectl delete job $JOB_NAME 2>/dev/null || true
echo "Deploying benchmark Job..."
kubectl apply -f "$TEMP_YAML"
echo "Waiting for job to start..."
kubectl wait --for=condition=Ready pod -l job-name=$JOB_NAME --timeout=60s
echo "Following benchmark logs..."
kubectl logs -f job/$JOB_NAME
echo "Job completed. Checking final status..."
kubectl get job $JOB_NAME
# Clean up temporary file
rm -f "$TEMP_YAML"