mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-08-21 17:33:12 +00:00
test: benchmark scripts (#3160)
# What does this PR do? 1. Add our own benchmark script instead of locust (doesn't support measuring streaming latency well) 2. Simplify k8s deployment 3. Add a simple profile script for locally running server ## Test Plan ❮ ./run-benchmark.sh --target stack --duration 180 --concurrent 10 ============================================================ BENCHMARK RESULTS ============================================================ Total time: 180.00s Concurrent users: 10 Total requests: 1636 Successful requests: 1636 Failed requests: 0 Success rate: 100.0% Requests per second: 9.09 Response Time Statistics: Mean: 1.095s Median: 1.721s Min: 0.136s Max: 3.218s Std Dev: 0.762s Percentiles: P50: 1.721s P90: 1.751s P95: 1.756s P99: 1.796s Time to First Token (TTFT) Statistics: Mean: 0.037s Median: 0.037s Min: 0.023s Max: 0.211s Std Dev: 0.011s TTFT Percentiles: P50: 0.037s P90: 0.040s P95: 0.044s P99: 0.055s Streaming Statistics: Mean chunks per response: 64.0 Total chunks received: 104775
This commit is contained in:
parent
2114214fe3
commit
2c06b24c77
13 changed files with 633 additions and 328 deletions
148
docs/source/distributions/k8s-benchmark/run-benchmark.sh
Executable file
148
docs/source/distributions/k8s-benchmark/run-benchmark.sh
Executable file
|
@ -0,0 +1,148 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Default values
|
||||
TARGET="stack"
|
||||
DURATION=60
|
||||
CONCURRENT=10
|
||||
|
||||
# Parse command line arguments
|
||||
usage() {
|
||||
echo "Usage: $0 [options]"
|
||||
echo "Options:"
|
||||
echo " -t, --target <stack|vllm> Target to benchmark (default: stack)"
|
||||
echo " -d, --duration <seconds> Duration in seconds (default: 60)"
|
||||
echo " -c, --concurrent <users> Number of concurrent users (default: 10)"
|
||||
echo " -h, --help Show this help message"
|
||||
echo ""
|
||||
echo "Examples:"
|
||||
echo " $0 --target vllm # Benchmark vLLM direct"
|
||||
echo " $0 --target stack # Benchmark Llama Stack (default)"
|
||||
echo " $0 -t vllm -d 120 -c 20 # vLLM with 120s duration, 20 users"
|
||||
}
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
-t|--target)
|
||||
TARGET="$2"
|
||||
shift 2
|
||||
;;
|
||||
-d|--duration)
|
||||
DURATION="$2"
|
||||
shift 2
|
||||
;;
|
||||
-c|--concurrent)
|
||||
CONCURRENT="$2"
|
||||
shift 2
|
||||
;;
|
||||
-h|--help)
|
||||
usage
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
echo "Unknown option: $1"
|
||||
usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Validate target
|
||||
if [[ "$TARGET" != "stack" && "$TARGET" != "vllm" ]]; then
|
||||
echo "Error: Target must be 'stack' or 'vllm'"
|
||||
usage
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Set configuration based on target
|
||||
if [[ "$TARGET" == "vllm" ]]; then
|
||||
BASE_URL="http://vllm-server:8000/v1"
|
||||
JOB_NAME="vllm-benchmark-job"
|
||||
echo "Benchmarking vLLM direct..."
|
||||
else
|
||||
BASE_URL="http://llama-stack-benchmark-service:8323/v1/openai/v1"
|
||||
JOB_NAME="stack-benchmark-job"
|
||||
echo "Benchmarking Llama Stack..."
|
||||
fi
|
||||
|
||||
echo "Configuration:"
|
||||
echo " Target: $TARGET"
|
||||
echo " Base URL: $BASE_URL"
|
||||
echo " Duration: ${DURATION}s"
|
||||
echo " Concurrent users: $CONCURRENT"
|
||||
echo ""
|
||||
|
||||
# Create temporary job yaml
|
||||
TEMP_YAML="/tmp/benchmark-job-temp-$(date +%s).yaml"
|
||||
cat > "$TEMP_YAML" << EOF
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: $JOB_NAME
|
||||
namespace: default
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: benchmark
|
||||
image: python:3.11-slim
|
||||
command: ["/bin/bash"]
|
||||
args:
|
||||
- "-c"
|
||||
- |
|
||||
pip install aiohttp &&
|
||||
python3 /benchmark/benchmark.py \\
|
||||
--base-url $BASE_URL \\
|
||||
--model \${INFERENCE_MODEL} \\
|
||||
--duration $DURATION \\
|
||||
--concurrent $CONCURRENT
|
||||
env:
|
||||
- name: INFERENCE_MODEL
|
||||
value: "meta-llama/Llama-3.2-3B-Instruct"
|
||||
volumeMounts:
|
||||
- name: benchmark-script
|
||||
mountPath: /benchmark
|
||||
resources:
|
||||
requests:
|
||||
memory: "256Mi"
|
||||
cpu: "250m"
|
||||
limits:
|
||||
memory: "512Mi"
|
||||
cpu: "500m"
|
||||
volumes:
|
||||
- name: benchmark-script
|
||||
configMap:
|
||||
name: benchmark-script
|
||||
restartPolicy: Never
|
||||
backoffLimit: 3
|
||||
EOF
|
||||
|
||||
echo "Creating benchmark ConfigMap..."
|
||||
kubectl create configmap benchmark-script \
|
||||
--from-file=benchmark.py=benchmark.py \
|
||||
--dry-run=client -o yaml | kubectl apply -f -
|
||||
|
||||
echo "Cleaning up any existing benchmark job..."
|
||||
kubectl delete job $JOB_NAME 2>/dev/null || true
|
||||
|
||||
echo "Deploying benchmark Job..."
|
||||
kubectl apply -f "$TEMP_YAML"
|
||||
|
||||
echo "Waiting for job to start..."
|
||||
kubectl wait --for=condition=Ready pod -l job-name=$JOB_NAME --timeout=60s
|
||||
|
||||
echo "Following benchmark logs..."
|
||||
kubectl logs -f job/$JOB_NAME
|
||||
|
||||
echo "Job completed. Checking final status..."
|
||||
kubectl get job $JOB_NAME
|
||||
|
||||
# Clean up temporary file
|
||||
rm -f "$TEMP_YAML"
|
Loading…
Add table
Add a link
Reference in a new issue