Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-04 04:04:14 +00:00)
chore(perf): run guidellm benchmarks (#3421)
Some checks failed
Integration Tests (Replay) / Integration Tests (, , , client=, ) (push) Failing after 3s
Unit Tests / unit-tests (3.13) (push) Failing after 3s
Update ReadTheDocs / update-readthedocs (push) Failing after 3s
Test Llama Stack Build / build (push) Failing after 3s
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 1s
SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 1s
SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 0s
Test External Providers Installed via Module / test-external-providers-from-module (venv) (push) Has been skipped
Test Llama Stack Build / generate-matrix (push) Successful in 3s
Python Package Build Test / build (3.12) (push) Failing after 1s
Python Package Build Test / build (3.13) (push) Failing after 2s
Test Llama Stack Build / build-custom-container-distribution (push) Failing after 3s
Test Llama Stack Build / build-single-provider (push) Failing after 3s
Vector IO Integration Tests / test-matrix (push) Failing after 5s
Test Llama Stack Build / build-ubi9-container-distribution (push) Failing after 4s
API Conformance Tests / check-schema-compatibility (push) Successful in 8s
Test External API and Providers / test-external (venv) (push) Failing after 3s
Unit Tests / unit-tests (3.12) (push) Failing after 4s
UI Tests / ui-tests (22) (push) Successful in 40s
Pre-commit / pre-commit (push) Successful in 1m9s
# What does this PR do?
- Mostly AI-generated scripts to run guidellm (https://github.com/vllm-project/guidellm) benchmarks on a k8s setup
- Stack is using an image built from main on 9/11

## Test Plan
See updated README.md
This commit is contained in:
parent 2f58d87c22
commit 48a551ecbc

14 changed files with 1436 additions and 526 deletions
294 benchmarking/k8s-benchmark/scripts/generate_charts.py (Executable file)
@@ -0,0 +1,294 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

# /// script
# dependencies = [
#     "matplotlib",
# ]
# ///
"""
Script to generate benchmark charts from guidellm text results.
Creates 2x2 grid charts with RPS, Request Latency, TTFT, and ITL metrics against concurrent@x values.
Outputs one chart file per vLLM replica group, with each line representing one benchmark run.
"""

import glob
import os
import re

import matplotlib.pyplot as plt


def extract_setup_name(filename: str) -> str:
    """Extract setup name from filename and format legend appropriately."""
    basename = os.path.basename(filename)

    # Try new pattern: guidellm-benchmark-stack-s{stack_replicas}-sw{workers}-v{vllm_replicas}-{timestamp}.txt
    match = re.search(r"guidellm-benchmark-stack-s(\d+)-sw(\d+)-v(\d+)-(\d{8})-(\d{6})\.txt", basename)
    if match:
        stack_replicas = match.group(1)
        workers = match.group(2)
        vllm_replicas = match.group(3)
        date = match.group(4)
        time = match.group(5)
        return f"stack-s{stack_replicas}-sw{workers}-v{vllm_replicas}"

    # Try new vLLM pattern: guidellm-benchmark-vllm-v{vllm_replicas}-{timestamp}.txt
    match = re.search(r"guidellm-benchmark-vllm-v(\d+)-(\d{8})-(\d{6})\.txt", basename)
    if match:
        vllm_replicas = match.group(1)
        date = match.group(2)
        time = match.group(3)
        return f"vllm-v{vllm_replicas}"

    # Fall back to old pattern: guidellm-benchmark-{target}-{stack_replicas}-w{workers}-{vllm_replicas}-{timestamp}.txt
    match = re.search(r"guidellm-benchmark-([^-]+)-(\d+)-w(\d+)-(\d+)-(\d+)-(\d+)\.txt", basename)
    if match:
        target = match.group(1)
        stack_replicas = match.group(2)
        workers = match.group(3)
        vllm_replicas = match.group(4)
        date = match.group(5)
        time = match.group(6)

        if target == "vllm":
            return f"vllm-{vllm_replicas}-w{workers}-{vllm_replicas}"
        else:
            return f"stack-replicas{stack_replicas}-w{workers}-vllm-replicas{vllm_replicas}-{date}-{time}"

    # Fall back to older pattern: guidellm-benchmark-{target}-{stack_replicas}-{vllm_replicas}-{timestamp}.txt
    match = re.search(r"guidellm-benchmark-([^-]+)-(\d+)-(\d+)-(\d+)-(\d+)\.txt", basename)
    if match:
        target = match.group(1)
        stack_replicas = match.group(2)
        vllm_replicas = match.group(3)
        date = match.group(4)
        time = match.group(5)

        if target == "vllm":
            return f"vllm-{vllm_replicas}-w1-{vllm_replicas}"
        else:
            return f"stack-replicas{stack_replicas}-vllm-replicas{vllm_replicas}-{date}-{time}"

    return basename.replace("guidellm-benchmark-", "").replace(".txt", "")
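
# Example (hypothetical filenames): "guidellm-benchmark-stack-s1-sw4-v2-20250911-120000.txt"
# matches the first pattern above and yields the legend label "stack-s1-sw4-v2", while
# "guidellm-benchmark-vllm-v2-20250911-120000.txt" yields "vllm-v2".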


def parse_txt_file(filepath: str) -> list[tuple[float, float, float, float, float, str]]:
    """
    Parse a text benchmark file and extract concurrent@x, RPS, TTFT, ITL, and request latency data.
    Returns list of (concurrency, rps_mean, ttft_mean, itl_mean, req_latency_mean, setup_name) tuples.
    """
    setup_name = extract_setup_name(filepath)
    data_points = []

    try:
        with open(filepath) as f:
            content = f.read()

        # Find the benchmark stats table
        lines = content.split("\n")
        in_stats_table = False
        header_lines_seen = 0

        for line in lines:
            line_stripped = line.strip()

            # Look for the start of the stats table
            if "Benchmarks Stats:" in line:
                in_stats_table = True
                continue

            if in_stats_table:
                # Skip the first few separator/header lines
                if line_stripped.startswith("=") or line_stripped.startswith("-"):
                    header_lines_seen += 1
                    if header_lines_seen >= 3:  # After seeing multiple header lines, look for concurrent@ data
                        if line_stripped.startswith("=") and "concurrent@" not in line_stripped:
                            break
                    continue

            # Parse concurrent@ lines in the stats table (may have leading spaces)
            if in_stats_table and "concurrent@" in line:
                parts = [part.strip() for part in line.split("|")]

                if len(parts) >= 12:  # Make sure we have enough columns for new format
                    try:
                        # Extract concurrency from benchmark name (e.g., concurrent@1 -> 1)
                        concurrent_match = re.search(r"concurrent@(\d+)", parts[0])
                        if not concurrent_match:
                            continue
                        concurrency = float(concurrent_match.group(1))

                        # Extract metrics from the new table format
                        # From your image, the table has these columns with | separators:
                        # Benchmark | Per Second | Concurrency | Out Tok/sec | Tot Tok/sec | Req Latency (sec) | TTFT (ms) | ITL (ms) | TPOT (ms)
                        # Looking at the mean/median/p99 structure, need to find the mean columns
                        # The structure shows: mean | median | p99 for each metric
                        rps_mean = float(parts[1])  # Per Second (RPS)
                        req_latency_mean = float(parts[6]) * 1000  # Request latency mean (convert from sec to ms)
                        ttft_mean = float(parts[9])  # TTFT mean column
                        itl_mean = float(parts[12])  # ITL mean column

                        data_points.append((concurrency, rps_mean, ttft_mean, itl_mean, req_latency_mean, setup_name))

                    except (ValueError, IndexError) as e:
                        print(f"Warning: Could not parse line '{line}' in {filepath}: {e}")
                        continue

    except (OSError, FileNotFoundError) as e:
        print(f"Error reading {filepath}: {e}")

    return data_points


def generate_charts(benchmark_dir: str = "results"):
    """Generate 2x2 grid charts (RPS, Request Latency, TTFT, ITL) from benchmark text files."""
    # Find all text result files instead of JSON
    txt_pattern = os.path.join(benchmark_dir, "guidellm-benchmark-*.txt")
    txt_files = glob.glob(txt_pattern)

    if not txt_files:
        print(f"No text files found matching pattern: {txt_pattern}")
        return

    print(f"Found {len(txt_files)} text files")

    # Parse all files and collect data
    all_data = {}  # setup_name -> [(concurrency, rps, ttft, itl, req_latency), ...]

    for txt_file in txt_files:
        print(f"Processing {txt_file}")
        data_points = parse_txt_file(txt_file)

        for concurrency, rps, ttft, itl, req_latency, setup_name in data_points:
            if setup_name not in all_data:
                all_data[setup_name] = []
            all_data[setup_name].append((concurrency, rps, ttft, itl, req_latency))

    if not all_data:
        print("No data found to plot")
        return

    # Sort data points by concurrency for each setup
    for setup_name in all_data:
        all_data[setup_name].sort(key=lambda x: x[0])  # Sort by concurrency

    # Group setups by vLLM replica number (original approach)
    replica_groups = {}  # vllm_replica_count -> {setup_name: points}

    for setup_name, points in all_data.items():
        # Extract vLLM replica number from setup name
        # Expected formats:
        # - New stack format: "stack-s{X}-sw{W}-v{Y}"
        # - New vLLM format: "vllm-v{Y}"
        # - Old formats: "stack-replicas{X}-w{W}-vllm-replicas{Y}" or "vllm-{Y}-w{W}-{Y}"

        # Try new formats first
        vllm_match = re.search(r"-v(\d+)$", setup_name)  # Matches both "stack-s1-sw2-v3" and "vllm-v1"
        if not vllm_match:
            # Try old stack format
            vllm_match = re.search(r"vllm-replicas(\d+)", setup_name)
        if not vllm_match:
            # Try old vLLM format: "vllm-{Y}-w{W}-{Y}"
            vllm_match = re.search(r"vllm-(\d+)-w\d+-\d+", setup_name)

        if vllm_match:
            vllm_replica_num = int(vllm_match.group(1))
            if vllm_replica_num not in replica_groups:
                replica_groups[vllm_replica_num] = {}
            replica_groups[vllm_replica_num][setup_name] = points
        else:
            print(f"Warning: Could not extract vLLM replica count from setup name: {setup_name}")

    def create_charts(data_dict, prefix, title_prefix):
        """Create a 2x2 grid with RPS, Request Latency, TTFT, and ITL charts."""
        if not data_dict:
            print(f"No data found for {prefix}")
            return

        # Create 2x2 subplot grid
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
        fig.suptitle(f"{title_prefix} Benchmark Results", fontsize=16, fontweight="bold")

        # Collect all unique concurrency values for tick setting
        all_concurrency_values = set()
        for points in data_dict.values():
            all_concurrency_values.update([p[0] for p in points])
        all_concurrency_values = sorted(all_concurrency_values)

        # Plot data for each setup in alphabetical order
        for setup_name in sorted(data_dict.keys()):
            points = data_dict[setup_name]
            if not points:
                continue

            concurrency_values = [p[0] for p in points]
            rps_values = [p[1] for p in points]
            ttft_values = [p[2] for p in points]
            itl_values = [p[3] for p in points]
            req_latency_values = [p[4] for p in points]

            # RPS chart (top-left)
            ax1.plot(concurrency_values, rps_values, marker="o", label=setup_name, linewidth=2, markersize=6)

            # Request Latency chart (top-right)
            ax2.plot(concurrency_values, req_latency_values, marker="o", label=setup_name, linewidth=2, markersize=6)

            # TTFT chart (bottom-left)
            ax3.plot(concurrency_values, ttft_values, marker="o", label=setup_name, linewidth=2, markersize=6)

            # ITL chart (bottom-right)
            ax4.plot(concurrency_values, itl_values, marker="o", label=setup_name, linewidth=2, markersize=6)

        # Configure all charts after plotting data
        axes = [ax1, ax2, ax3, ax4]
        titles = ["RPS", "Request Latency", "TTFT", "ITL"]
        ylabels = [
            "Requests Per Second (RPS)",
            "Request Latency (ms)",
            "Time to First Token (ms)",
            "Inter Token Latency (ms)",
        ]

        for ax, title, ylabel in zip(axes, titles, ylabels, strict=False):
            ax.set_xlabel("Concurrency", fontsize=12)
            ax.set_ylabel(ylabel, fontsize=12)
            ax.set_title(title, fontsize=14, fontweight="bold")
            ax.set_xscale("log", base=2)
            ax.set_xticks(all_concurrency_values)
            ax.set_xticklabels([str(int(x)) for x in all_concurrency_values])
            ax.grid(True, alpha=0.3)

        # Add legend to the right-most subplot (top-right)
        ax2.legend(bbox_to_anchor=(1.05, 1), loc="upper left")

        plt.tight_layout()

        # Save the combined chart
        combined_filename = os.path.join(benchmark_dir, f"{prefix}_benchmark_results.png")
        plt.savefig(combined_filename, dpi=300, bbox_inches="tight")
        plt.close()
        print(f"Combined benchmark chart saved to {combined_filename}")

    # Print grouping information
    for replica_count, data_dict in replica_groups.items():
        print(f"vLLM Replica {replica_count} setups: {list(data_dict.keys())}")

    # Create separate charts for each replica group
    for replica_count, data_dict in replica_groups.items():
        prefix = f"vllm_replica{replica_count}"
        title = f"vLLM Replicas={replica_count}"
        create_charts(data_dict, prefix, title)

    # Print summary
    print("\nSummary:")
    for setup_name, points in all_data.items():
        print(f"{setup_name}: {len(points)} data points")


if __name__ == "__main__":
    generate_charts()
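As a usage note, here is a minimal sketch of driving the script programmatically. It assumes the module is importable as generate_charts (e.g. run from benchmarking/k8s-benchmark/scripts/) and that a results/ directory holds guidellm-benchmark-*.txt files produced by the benchmark run scripts; running the file directly (for example via uv run, which resolves the matplotlib dependency from the inline script metadata) is equivalent to the final call.

# Minimal sketch: generate charts from a directory of guidellm text results.
# Assumes this module is importable as "generate_charts" and that "results/"
# contains guidellm-benchmark-*.txt files.
from generate_charts import generate_charts

# Parses each results/guidellm-benchmark-*.txt file, groups runs by vLLM
# replica count, and writes one vllm_replica{N}_benchmark_results.png per group.
generate_charts("results")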