chore(perf): run guidellm benchmarks (#3421)
Some checks failed
Integration Tests (Replay) / Integration Tests (, , , client=, ) (push) Failing after 3s
Unit Tests / unit-tests (3.13) (push) Failing after 3s
Update ReadTheDocs / update-readthedocs (push) Failing after 3s
Test Llama Stack Build / build (push) Failing after 3s
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 1s
SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 1s
SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 0s
Test External Providers Installed via Module / test-external-providers-from-module (venv) (push) Has been skipped
Test Llama Stack Build / generate-matrix (push) Successful in 3s
Python Package Build Test / build (3.12) (push) Failing after 1s
Python Package Build Test / build (3.13) (push) Failing after 2s
Test Llama Stack Build / build-custom-container-distribution (push) Failing after 3s
Test Llama Stack Build / build-single-provider (push) Failing after 3s
Vector IO Integration Tests / test-matrix (push) Failing after 5s
Test Llama Stack Build / build-ubi9-container-distribution (push) Failing after 4s
API Conformance Tests / check-schema-compatibility (push) Successful in 8s
Test External API and Providers / test-external (venv) (push) Failing after 3s
Unit Tests / unit-tests (3.12) (push) Failing after 4s
UI Tests / ui-tests (22) (push) Successful in 40s
Pre-commit / pre-commit (push) Successful in 1m9s

# What does this PR do?
- Mostly AI-generated scripts to run guidellm
(https://github.com/vllm-project/guidellm) benchmarks on k8s setup
- Stack is using image built from main on 9/11


## Test Plan
See updated README.md
This commit is contained in:
ehhuang 2025-09-24 10:18:33 -07:00 committed by GitHub
parent 2f58d87c22
commit 48a551ecbc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
14 changed files with 1436 additions and 526 deletions

View file

@ -0,0 +1,294 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# /// script
# dependencies = [
# "matplotlib",
# ]
# ///
"""
Script to generate benchmark charts from guidellm text results.
Creates 2x2 grid charts with RPS, Request Latency, TTFT, and ITL metrics against concurrent@x values.
Outputs one chart file per vLLM replica group, with each line representing one benchmark run.
"""
import glob
import os
import re
import matplotlib.pyplot as plt
def extract_setup_name(filename: str) -> str:
    """Derive the setup label (used as the chart legend entry) from a result filename.

    Only the basename is inspected. The layouts below are tried in order and the
    first match wins:

    1. ``guidellm-benchmark-stack-s{S}-sw{W}-v{V}-{YYYYMMDD}-{HHMMSS}.txt``
       -> ``stack-s{S}-sw{W}-v{V}``
    2. ``guidellm-benchmark-vllm-v{V}-{YYYYMMDD}-{HHMMSS}.txt``
       -> ``vllm-v{V}``
    3. ``guidellm-benchmark-{target}-{S}-w{W}-{V}-{date}-{time}.txt``
       -> ``vllm-{V}-w{W}-{V}`` for the vllm target, otherwise
       ``stack-replicas{S}-w{W}-vllm-replicas{V}-{date}-{time}``
    4. ``guidellm-benchmark-{target}-{S}-{V}-{date}-{time}.txt``
       -> ``vllm-{V}-w1-{V}`` for the vllm target, otherwise
       ``stack-replicas{S}-vllm-replicas{V}-{date}-{time}``

    Args:
        filename: Path or bare name of a benchmark result file.

    Returns:
        The label for the matching layout. If nothing matches, the basename with
        the ``guidellm-benchmark-`` prefix and ``.txt`` suffix removed.
    """
    basename = os.path.basename(filename)

    # New stack layout. The timestamp is matched only to validate the name; it is
    # not part of the label.
    match = re.search(r"guidellm-benchmark-stack-s(\d+)-sw(\d+)-v(\d+)-(\d{8})-(\d{6})\.txt", basename)
    if match:
        stack_replicas, workers, vllm_replicas = match.group(1, 2, 3)
        return f"stack-s{stack_replicas}-sw{workers}-v{vllm_replicas}"

    # New vLLM layout; again the timestamp is validated but dropped.
    match = re.search(r"guidellm-benchmark-vllm-v(\d+)-(\d{8})-(\d{6})\.txt", basename)
    if match:
        return f"vllm-v{match.group(1)}"

    # Old layout with an explicit worker count.
    match = re.search(r"guidellm-benchmark-([^-]+)-(\d+)-w(\d+)-(\d+)-(\d+)-(\d+)\.txt", basename)
    if match:
        target, stack_replicas, workers, vllm_replicas, date, time = match.groups()
        if target == "vllm":
            return f"vllm-{vllm_replicas}-w{workers}-{vllm_replicas}"
        return f"stack-replicas{stack_replicas}-w{workers}-vllm-replicas{vllm_replicas}-{date}-{time}"

    # Older layout without a worker count; the vLLM label reports "w1".
    match = re.search(r"guidellm-benchmark-([^-]+)-(\d+)-(\d+)-(\d+)-(\d+)\.txt", basename)
    if match:
        target, stack_replicas, vllm_replicas, date, time = match.groups()
        if target == "vllm":
            return f"vllm-{vllm_replicas}-w1-{vllm_replicas}"
        return f"stack-replicas{stack_replicas}-vllm-replicas{vllm_replicas}-{date}-{time}"

    # Unknown layout: fall back to the basename minus the fixed prefix/suffix.
    return basename.replace("guidellm-benchmark-", "").replace(".txt", "")
def parse_txt_file(filepath: str) -> list[tuple[float, float, float, float, float, str]]:
    """Parse one guidellm text report and extract the per-concurrency metrics.

    Scanning starts at the line containing ``Benchmarks Stats:``. Every following
    row that contains ``concurrent@<N>`` is split on ``|`` and converted to
    numbers, until the closing ``=`` rule of the table is reached.

    Args:
        filepath: Path of the text result file.

    Returns:
        A list of ``(concurrency, rps, ttft_ms, itl_ms, req_latency_ms, setup_name)``
        tuples, one per parsed row, in file order. The list is empty when the
        file cannot be read or contains no parsable rows. Problems are reported
        on stdout rather than raised.
    """
    setup_name = extract_setup_name(filepath)
    data_points: list[tuple[float, float, float, float, float, str]] = []

    # The highest cell index read below is 12, so a usable row needs 13 cells.
    min_columns = 13

    try:
        # Decode explicitly so parsing does not depend on the platform's default
        # encoding; undecodable bytes are replaced instead of aborting the run.
        with open(filepath, encoding="utf-8", errors="replace") as f:
            content = f.read()
    except OSError as e:  # FileNotFoundError is a subclass of OSError
        print(f"Error reading {filepath}: {e}")
        return data_points

    in_stats_table = False
    separator_lines_seen = 0

    for line in content.split("\n"):
        stripped = line.strip()

        # Look for the start of the stats table.
        if "Benchmarks Stats:" in line:
            in_stats_table = True
            continue

        if not in_stats_table:
            continue

        # Rule lines ("====" / "----") frame the table. Once at least three have
        # been seen, an "=" rule is taken as the closing border of the table.
        if stripped.startswith(("=", "-")):
            separator_lines_seen += 1
            if separator_lines_seen >= 3 and stripped.startswith("=") and "concurrent@" not in stripped:
                break
            continue

        # Only concurrent@ rows carry data (they may have leading spaces).
        if "concurrent@" not in line:
            continue

        parts = [part.strip() for part in line.split("|")]
        if len(parts) < min_columns:
            print(
                f"Warning: Could not parse line '{line}' in {filepath}: "
                f"expected at least {min_columns} columns, found {len(parts)}"
            )
            continue

        # Extract concurrency from the benchmark name (e.g. concurrent@1 -> 1).
        concurrent_match = re.search(r"concurrent@(\d+)", parts[0])
        if not concurrent_match:
            continue

        try:
            concurrency = float(concurrent_match.group(1))
            # Column layout as described by the original author:
            # Benchmark | Per Second | Concurrency | Out Tok/sec | Tot Tok/sec |
            # Req Latency (sec) | TTFT (ms) | ITL (ms) | TPOT (ms),
            # with mean | median | p99 sub-columns for the latency metrics.
            # NOTE(review): under that layout indices 6/9/12 would be the *median*
            # sub-columns (means at 5/8/11) - confirm against real guidellm output
            # before relying on the "mean" naming below.
            rps_mean = float(parts[1])  # Per Second (RPS)
            req_latency_mean = float(parts[6]) * 1000  # request latency, sec -> ms
            ttft_mean = float(parts[9])  # TTFT (ms)
            itl_mean = float(parts[12])  # ITL (ms)
        except ValueError as e:
            print(f"Warning: Could not parse line '{line}' in {filepath}: {e}")
            continue

        data_points.append((concurrency, rps_mean, ttft_mean, itl_mean, req_latency_mean, setup_name))

    return data_points
def generate_charts(benchmark_dir: str = "results") -> None:
    """Generate 2x2 grid charts (RPS, Request Latency, TTFT, ITL) from benchmark text files.

    Every ``guidellm-benchmark-*.txt`` file in ``benchmark_dir`` is parsed, the
    resulting setups are grouped by vLLM replica count, and one
    ``vllm_replica{N}_benchmark_results.png`` is written per group into the same
    directory. Progress, warnings and a per-setup summary are printed to stdout.

    Args:
        benchmark_dir: Directory that holds the result files; charts are saved
            there as well.
    """
    # Find all text result files. Sorted so processing order (and therefore the
    # printed output) does not depend on directory listing order.
    txt_pattern = os.path.join(benchmark_dir, "guidellm-benchmark-*.txt")
    txt_files = sorted(glob.glob(txt_pattern))

    if not txt_files:
        print(f"No text files found matching pattern: {txt_pattern}")
        return

    print(f"Found {len(txt_files)} text files")

    # setup_name -> [(concurrency, rps, ttft, itl, req_latency), ...]
    all_data = {}
    for txt_file in txt_files:
        print(f"Processing {txt_file}")
        for concurrency, rps, ttft, itl, req_latency, setup_name in parse_txt_file(txt_file):
            all_data.setdefault(setup_name, []).append((concurrency, rps, ttft, itl, req_latency))

    if not all_data:
        print("No data found to plot")
        return

    # Each setup is drawn as a line, so its points must be ordered by concurrency.
    for points in all_data.values():
        points.sort(key=lambda point: point[0])

    # Group setups by vLLM replica count: vllm_replica_count -> {setup_name: points}
    replica_groups = {}
    for setup_name, points in all_data.items():
        # Setup-name formats handled, newest first:
        #   "stack-s{X}-sw{W}-v{Y}" and "vllm-v{Y}"       (trailing "-v{Y}")
        #   "stack-replicas{X}-w{W}-vllm-replicas{Y}..."  (old stack format)
        #   "vllm-{Y}-w{W}-{Y}"                           (old vLLM format)
        vllm_match = (
            re.search(r"-v(\d+)$", setup_name)
            or re.search(r"vllm-replicas(\d+)", setup_name)
            or re.search(r"vllm-(\d+)-w\d+-\d+", setup_name)
        )
        if not vllm_match:
            print(f"Warning: Could not extract vLLM replica count from setup name: {setup_name}")
            continue
        replica_groups.setdefault(int(vllm_match.group(1)), {})[setup_name] = points

    def create_charts(data_dict, prefix, title_prefix):
        """Create a 2x2 grid with RPS, Request Latency, TTFT, and ITL charts."""
        if not data_dict:
            print(f"No data found for {prefix}")
            return

        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
        try:
            fig.suptitle(f"{title_prefix} Benchmark Results", fontsize=16, fontweight="bold")

            # (axis, index into a data point, subplot title, y-axis label)
            panels = [
                (ax1, 1, "RPS", "Requests Per Second (RPS)"),
                (ax2, 4, "Request Latency", "Request Latency (ms)"),
                (ax3, 2, "TTFT", "Time to First Token (ms)"),
                (ax4, 3, "ITL", "Inter Token Latency (ms)"),
            ]

            # Every concurrency level present in this group gets its own x tick.
            all_concurrency_values = sorted({p[0] for points in data_dict.values() for p in points})

            # One line per setup, in alphabetical order so legends are stable.
            for setup_name in sorted(data_dict):
                points = data_dict[setup_name]
                if not points:
                    continue
                concurrency_values = [p[0] for p in points]
                for ax, value_index, _title, _ylabel in panels:
                    ax.plot(
                        concurrency_values,
                        [p[value_index] for p in points],
                        marker="o",
                        label=setup_name,
                        linewidth=2,
                        markersize=6,
                    )

            # Configure all charts after plotting data.
            for ax, _value_index, title, ylabel in panels:
                ax.set_xlabel("Concurrency", fontsize=12)
                ax.set_ylabel(ylabel, fontsize=12)
                ax.set_title(title, fontsize=14, fontweight="bold")
                ax.set_xscale("log", base=2)
                ax.set_xticks(all_concurrency_values)
                ax.set_xticklabels([str(int(x)) for x in all_concurrency_values])
                ax.grid(True, alpha=0.3)

            # A single legend, anchored just outside the top-right subplot.
            ax2.legend(bbox_to_anchor=(1.05, 1), loc="upper left")

            fig.tight_layout()

            combined_filename = os.path.join(benchmark_dir, f"{prefix}_benchmark_results.png")
            fig.savefig(combined_filename, dpi=300, bbox_inches="tight")
        finally:
            # Release the figure even if plotting or saving failed.
            plt.close(fig)
        print(f"Combined benchmark chart saved to {combined_filename}")

    # Print grouping information, lowest replica count first.
    for replica_count, data_dict in sorted(replica_groups.items()):
        print(f"vLLM Replica {replica_count} setups: {list(data_dict.keys())}")

    # Create a separate chart file for each replica group.
    for replica_count, data_dict in sorted(replica_groups.items()):
        create_charts(data_dict, f"vllm_replica{replica_count}", f"vLLM Replicas={replica_count}")

    # Print summary.
    print("\nSummary:")
    for setup_name, points in all_data.items():
        print(f"{setup_name}: {len(points)} data points")
# Script entry point: when run directly, generate charts using the default
# benchmark directory of generate_charts().
if __name__ == "__main__":
    generate_charts()

View file

@ -0,0 +1,103 @@
#!/usr/bin/env bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Abort on errors, unset variables, and failed pipeline stages. Enabled first so
# that the configuration below is covered as well.
set -euo pipefail

# Benchmark configurations, one per entry, as whitespace-separated fields:
#   target  stack_replicas  vllm_replicas  stack_workers
# A "-" in a field means that setting is left untouched for the run.
configs=(
    "stack 1 1 1"
    "stack 1 1 2"
    "stack 1 1 4"
    "vllm 1 1 -"
)

# Get the directory where this script is located, so the per-run benchmark
# script can be invoked regardless of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

echo "Running comprehensive GuideLLM benchmark suite..."
echo "Start time: $(date)"

# Default deployment names
STACK_DEPLOYMENT="llama-stack-benchmark-server"
VLLM_DEPLOYMENT="vllm-server"
# Scale the vLLM and Llama Stack deployments to the requested sizes and wait
# for every rollout to finish before returning.
#
# Arguments:
#   $1  stack replica count, or "-" to leave it unchanged
#   $2  vLLM replica count, or "-" to leave it unchanged
#   $3  stack worker count (exported as LLAMA_STACK_WORKERS), or "-" to leave it unchanged
#   $4  (optional) benchmark target; falls back to the caller's $target variable
# Stack replicas/workers are only touched when the target is "stack".
scale_deployments() {
    local stack_replicas=$1
    local vllm_replicas=$2
    local workers=$3
    # Prefer an explicit argument over the implicit global so the function does
    # not silently depend on a variable set by its caller.
    local bench_target="${4:-${target:?scale_deployments: no benchmark target given}}"

    echo "Scaling deployments..."

    if [[ "$vllm_replicas" != "-" ]]; then
        echo "Scaling $VLLM_DEPLOYMENT to $vllm_replicas replicas..."
        kubectl scale deployment "$VLLM_DEPLOYMENT" --replicas="$vllm_replicas"
        kubectl rollout status deployment "$VLLM_DEPLOYMENT" --timeout=600s
    fi

    if [[ "$bench_target" == "stack" ]]; then
        if [[ "$stack_replicas" != "-" ]]; then
            echo "Scaling $STACK_DEPLOYMENT to $stack_replicas replicas..."
            kubectl scale deployment "$STACK_DEPLOYMENT" --replicas="$stack_replicas"
            kubectl rollout status deployment "$STACK_DEPLOYMENT" --timeout=600s
        fi

        if [[ "$workers" != "-" ]]; then
            echo "Updating $STACK_DEPLOYMENT to use $workers workers..."
            kubectl set env "deployment/$STACK_DEPLOYMENT" LLAMA_STACK_WORKERS="$workers"
            kubectl rollout status deployment "$STACK_DEPLOYMENT" --timeout=600s
        fi
    fi

    echo "All scaling operations completed. Waiting additional 30s for services to stabilize..."
    sleep 30
}
# Run every configured benchmark in sequence: scale the cluster, run the
# benchmark script, then pause before the next configuration.
for config in "${configs[@]}"; do
    read -r target stack_replicas vllm_replicas workers <<< "$config"

    echo ""
    echo "=========================================="
    if [[ "$workers" != "-" ]]; then
        echo "Running benchmark: $target (stack=$stack_replicas, vllm=$vllm_replicas, workers=$workers)"
    else
        echo "Running benchmark: $target (stack=$stack_replicas, vllm=$vllm_replicas)"
    fi
    echo "Start: $(date)"
    echo "=========================================="

    # Scale deployments before running benchmark. The target is passed
    # explicitly; implementations that only read three arguments ignore it.
    scale_deployments "$stack_replicas" "$vllm_replicas" "$workers" "$target"

    # Generate output filename with setup info
    TIMESTAMP=$(date +%Y%m%d-%H%M%S)
    if [[ "$target" == "stack" ]]; then
        OUTPUT_FILE="results/guidellm-benchmark-${target}-s${stack_replicas}-sw${workers}-v${vllm_replicas}-${TIMESTAMP}.txt"
    else
        OUTPUT_FILE="results/guidellm-benchmark-${target}-v${vllm_replicas}-${TIMESTAMP}.txt"
    fi

    # Run the benchmark with the cluster as configured
    "$SCRIPT_DIR/run-guidellm-benchmark.sh" \
        --target "$target" \
        --output-file "$OUTPUT_FILE"

    echo "Completed: $(date)"
    echo "Waiting 30 seconds before next benchmark..."
    sleep 30
done

echo ""
echo "=========================================="
echo "All benchmarks completed!"
echo "End time: $(date)"
echo "=========================================="
echo ""
echo "Results files generated:"
# Expand the globs with nullglob so a pattern without matches simply vanishes.
# Passing an unmatched pattern to ls makes it fail, which used to print
# "No result files found" even though .txt results were listed.
shopt -s nullglob
result_files=(results/guidellm-*.txt results/guidellm-*.json)
shopt -u nullglob
if (( ${#result_files[@]} > 0 )); then
    ls -la "${result_files[@]}"
else
    echo "No result files found"
fi

View file

@ -0,0 +1,219 @@
#!/usr/bin/env bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Abort on errors, unset variables, and failed pipeline stages.
set -euo pipefail
# Default values. Each one can be overridden by the matching command-line
# option handled in the argument-parsing loop below.
# Benchmark target: "stack" or "vllm" (-t/--target).
TARGET="stack"
# Passed to guidellm as --max-seconds (-s/--max-seconds).
MAX_SECONDS=60
# Used to build guidellm's --data "prompt_tokens=...,output_tokens=..." value.
PROMPT_TOKENS=512
OUTPUT_TOKENS=256
# Passed to guidellm as --rate-type and --rate.
RATE_TYPE="concurrent"
RATE="1,2,4,8,16,32,64,128"
# Stack deployment name; in this script it is only shown in the configuration summary.
STACK_DEPLOYMENT="llama-stack-benchmark-server"
# Endpoint benchmarked when the target is "stack".
STACK_URL="http://llama-stack-benchmark-service:8323/v1/openai"
# Used as the host name of the vLLM endpoint (http://<name>:8000) for the "vllm" target.
VLLM_DEPLOYMENT="vllm-server"
# Empty means "auto-generate a timestamped file name under results/".
OUTPUT_FILE=""
# Parse command line arguments
# Print the command-line help (options, their defaults, and examples) to stdout.
# Called for -h/--help and after an argument error; it does not exit by itself.
usage() {
echo "Usage: $0 [options]"
echo "Options:"
echo " -t, --target <stack|vllm> Target to benchmark (default: stack)"
echo " -s, --max-seconds <seconds> Maximum duration in seconds (default: 60)"
echo " -p, --prompt-tokens <tokens> Number of prompt tokens (default: 512)"
echo " -o, --output-tokens <tokens> Number of output tokens (default: 256)"
echo " -r, --rate-type <type> Rate type (default: concurrent)"
echo " -c, --rate Rate (default: 1,2,4,8,16,32,64,128)"
echo " --output-file <path> Output file path (default: auto-generated)"
echo " --stack-deployment <name> Name of the stack deployment (default: llama-stack-benchmark-server)"
echo " --vllm-deployment <name> Name of the vllm deployment (default: vllm-server)"
echo " --stack-url <url> URL of the stack service (default: http://llama-stack-benchmark-service:8323/v1/openai)"
echo " -h, --help Show this help message"
echo ""
echo "Examples:"
echo " $0 --target vllm # Benchmark vLLM direct"
echo " $0 --target stack # Benchmark Llama Stack (default)"
echo " $0 -t vllm -s 60 -p 512 -o 256 # vLLM with custom parameters"
echo " $0 --output-file results/my-benchmark.txt # Specify custom output file"
echo " $0 --stack-deployment my-stack-server # Use custom stack deployment name"
}
# Parse command-line options. Every option except -h/--help consumes the next
# argument as its value. "${2?...}" aborts with a readable message when that
# value is missing, instead of a bare "unbound variable" error; an explicitly
# empty value ("") is still accepted.
while [[ $# -gt 0 ]]; do
    case "$1" in
        -t|--target)
            TARGET="${2?Error: $1 requires a value}"
            shift 2
            ;;
        -s|--max-seconds)
            MAX_SECONDS="${2?Error: $1 requires a value}"
            shift 2
            ;;
        -p|--prompt-tokens)
            PROMPT_TOKENS="${2?Error: $1 requires a value}"
            shift 2
            ;;
        -o|--output-tokens)
            OUTPUT_TOKENS="${2?Error: $1 requires a value}"
            shift 2
            ;;
        -r|--rate-type)
            RATE_TYPE="${2?Error: $1 requires a value}"
            shift 2
            ;;
        -c|--rate)
            RATE="${2?Error: $1 requires a value}"
            shift 2
            ;;
        --output-file)
            OUTPUT_FILE="${2?Error: $1 requires a value}"
            shift 2
            ;;
        --stack-deployment)
            STACK_DEPLOYMENT="${2?Error: $1 requires a value}"
            shift 2
            ;;
        --vllm-deployment)
            VLLM_DEPLOYMENT="${2?Error: $1 requires a value}"
            shift 2
            ;;
        --stack-url)
            STACK_URL="${2?Error: $1 requires a value}"
            shift 2
            ;;
        -h|--help)
            usage
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            usage
            exit 1
            ;;
    esac
done
# Reject anything other than the two supported targets.
case "$TARGET" in
    stack|vllm)
        ;;
    *)
        echo "Error: Target must be 'stack' or 'vllm'"
        usage
        exit 1
        ;;
esac

# Choose the endpoint, the Job name, and the deployment line of the summary
# according to the target, and announce what is being benchmarked.
case "$TARGET" in
    vllm)
        BASE_URL="http://${VLLM_DEPLOYMENT}:8000"
        JOB_NAME="guidellm-vllm-benchmark-job"
        deployment_summary=" vLLM deployment: $VLLM_DEPLOYMENT"
        echo "Benchmarking vLLM direct with GuideLLM..."
        ;;
    *)
        BASE_URL="$STACK_URL"
        JOB_NAME="guidellm-stack-benchmark-job"
        deployment_summary=" Stack deployment: $STACK_DEPLOYMENT"
        echo "Benchmarking Llama Stack with GuideLLM..."
        ;;
esac

# Print the effective configuration, one setting per line, then a blank line.
printf '%s\n' \
    "Configuration:" \
    " Target: $TARGET" \
    " Base URL: $BASE_URL" \
    " Max seconds: ${MAX_SECONDS}s" \
    " Prompt tokens: $PROMPT_TOKENS" \
    " Output tokens: $OUTPUT_TOKENS" \
    " Rate type: $RATE_TYPE" \
    "$deployment_summary" \
    ""
# Model to benchmark. INFERENCE_MODEL from the caller's environment wins;
# otherwise fall back to the model this script has always put in the Job's env.
# Without the default, expanding the variable in the manifest below aborts the
# script under `set -u` whenever it is not exported.
INFERENCE_MODEL="${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}"

# Render the Job manifest into a private temporary file. mktemp avoids a
# predictable name in /tmp, and the EXIT trap removes the file even when a
# later step fails and `set -e` aborts the script.
TEMP_YAML="$(mktemp "${TMPDIR:-/tmp}/guidellm-benchmark-job-XXXXXX")"
trap 'rm -f "$TEMP_YAML"' EXIT

# The heredoc delimiter is unquoted, so this shell expands $VARIABLES while the
# manifest is written. "\$" defers expansion to the shell inside the container,
# and "\\" is written out as a single backslash (a line continuation there).
cat > "$TEMP_YAML" << EOF
apiVersion: batch/v1
kind: Job
metadata:
  name: $JOB_NAME
  namespace: default
spec:
  template:
    spec:
      containers:
      - name: guidellm-benchmark
        image: python:3.11-slim
        command: ["/bin/bash"]
        args:
        - "-c"
        - |
          # Install uv and guidellm
          pip install uv &&
          uv pip install --system guidellm &&

          # Login to HuggingFace
          uv pip install --system huggingface_hub &&
          python -c "from huggingface_hub import login; login(token='\$HF_TOKEN')" &&

          # Run GuideLLM benchmark and save output
          export COLUMNS=200
          GUIDELLM__PREFERRED_ROUTE="chat_completions" uv run guidellm benchmark run \\
            --target "$BASE_URL" \\
            --rate-type "$RATE_TYPE" \\
            --max-seconds $MAX_SECONDS \\
            --data "prompt_tokens=$PROMPT_TOKENS,output_tokens=$OUTPUT_TOKENS" \\
            --model "$INFERENCE_MODEL" \\
            --rate "$RATE" \\
            --warmup-percent 0.05 \\
            2>&1
        env:
        - name: INFERENCE_MODEL
          value: "$INFERENCE_MODEL"
        - name: HF_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-token-secret
              key: token
        resources:
          requests:
            memory: "4Gi"
            cpu: "500m"
          limits:
            memory: "8Gi"
            cpu: "2000m"
      restartPolicy: Never
  backoffLimit: 3
EOF
# A Job with this name may be left over from an earlier run; remove it first.
# Failure (typically "not found") is deliberately ignored.
echo "Cleaning up any existing GuideLLM benchmark job..."
kubectl delete job "$JOB_NAME" 2>/dev/null || true

echo "Deploying GuideLLM benchmark Job..."
kubectl apply -f "$TEMP_YAML"

echo "Waiting for job to start..."
# `kubectl wait` errors out straight away when the selector matches no pod, so
# give the Job controller up to ~30s to create the pod before waiting on it.
for _ in {1..30}; do
    if [[ -n "$(kubectl get pods -l "job-name=$JOB_NAME" -o name 2>/dev/null)" ]]; then
        break
    fi
    sleep 1
done
kubectl wait --for=condition=Ready pod -l "job-name=$JOB_NAME" --timeout=120s

# Prepare file names and create results directory
mkdir -p results
if [[ -z "$OUTPUT_FILE" ]]; then
    TIMESTAMP=$(date +%Y%m%d-%H%M%S)
    OUTPUT_FILE="results/guidellm-benchmark-${TARGET}-${TIMESTAMP}.txt"
fi
# A caller-supplied --output-file may live outside results/; make sure its
# directory exists so the redirect below cannot fail on a missing path.
mkdir -p "$(dirname "$OUTPUT_FILE")"

# Stream the logs to the terminal while the benchmark runs.
echo "Following GuideLLM benchmark logs..."
kubectl logs -f "job/$JOB_NAME"

echo "Job completed. Checking final status..."
kubectl get job "$JOB_NAME"

# Save benchmark results using kubectl logs (fetched again, now complete).
echo "Saving benchmark results..."
kubectl logs "job/$JOB_NAME" > "$OUTPUT_FILE"

echo "Benchmark output saved to: $OUTPUT_FILE"

# Clean up temporary file
rm -f "$TEMP_YAML"