Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-04 04:04:14 +00:00)
chore(perf): run guidellm benchmarks (#3421)
Some checks failed
Integration Tests (Replay) / Integration Tests (, , , client=, ) (push) Failing after 3s
Unit Tests / unit-tests (3.13) (push) Failing after 3s
Update ReadTheDocs / update-readthedocs (push) Failing after 3s
Test Llama Stack Build / build (push) Failing after 3s
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 1s
SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 1s
SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 0s
Test External Providers Installed via Module / test-external-providers-from-module (venv) (push) Has been skipped
Test Llama Stack Build / generate-matrix (push) Successful in 3s
Python Package Build Test / build (3.12) (push) Failing after 1s
Python Package Build Test / build (3.13) (push) Failing after 2s
Test Llama Stack Build / build-custom-container-distribution (push) Failing after 3s
Test Llama Stack Build / build-single-provider (push) Failing after 3s
Vector IO Integration Tests / test-matrix (push) Failing after 5s
Test Llama Stack Build / build-ubi9-container-distribution (push) Failing after 4s
API Conformance Tests / check-schema-compatibility (push) Successful in 8s
Test External API and Providers / test-external (venv) (push) Failing after 3s
Unit Tests / unit-tests (3.12) (push) Failing after 4s
UI Tests / ui-tests (22) (push) Successful in 40s
Pre-commit / pre-commit (push) Successful in 1m9s
# What does this PR do?
- Mostly AI-generated scripts to run guidellm (https://github.com/vllm-project/guidellm) benchmarks on a k8s setup
- Stack is using an image built from main on 9/11

## Test Plan
See updated README.md
This commit is contained in:
parent 2f58d87c22
commit 48a551ecbc

14 changed files with 1436 additions and 526 deletions
294 benchmarking/k8s-benchmark/scripts/generate_charts.py (Executable file)
@@ -0,0 +1,294 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

# /// script
# dependencies = [
#     "matplotlib",
# ]
# ///
"""
Script to generate benchmark charts from guidellm text results.
Creates 2x2 grid charts with RPS, Request Latency, TTFT, and ITL metrics against concurrent@x values.
Outputs one chart file per vLLM replica group, with each line representing one benchmark run.
"""

import glob
import os
import re

import matplotlib.pyplot as plt


def extract_setup_name(filename: str) -> str:
    """Extract setup name from filename and format legend appropriately."""
    basename = os.path.basename(filename)

    # Try new pattern: guidellm-benchmark-stack-s{stack_replicas}-sw{workers}-v{vllm_replicas}-{timestamp}.txt
    match = re.search(r"guidellm-benchmark-stack-s(\d+)-sw(\d+)-v(\d+)-(\d{8})-(\d{6})\.txt", basename)
    if match:
        stack_replicas = match.group(1)
        workers = match.group(2)
        vllm_replicas = match.group(3)
        date = match.group(4)
        time = match.group(5)
        return f"stack-s{stack_replicas}-sw{workers}-v{vllm_replicas}"

    # Try new vLLM pattern: guidellm-benchmark-vllm-v{vllm_replicas}-{timestamp}.txt
    match = re.search(r"guidellm-benchmark-vllm-v(\d+)-(\d{8})-(\d{6})\.txt", basename)
    if match:
        vllm_replicas = match.group(1)
        date = match.group(2)
        time = match.group(3)
        return f"vllm-v{vllm_replicas}"

    # Fall back to old pattern: guidellm-benchmark-{target}-{stack_replicas}-w{workers}-{vllm_replicas}-{timestamp}.txt
    match = re.search(r"guidellm-benchmark-([^-]+)-(\d+)-w(\d+)-(\d+)-(\d+)-(\d+)\.txt", basename)
    if match:
        target = match.group(1)
        stack_replicas = match.group(2)
        workers = match.group(3)
        vllm_replicas = match.group(4)
        date = match.group(5)
        time = match.group(6)

        if target == "vllm":
            return f"vllm-{vllm_replicas}-w{workers}-{vllm_replicas}"
        else:
            return f"stack-replicas{stack_replicas}-w{workers}-vllm-replicas{vllm_replicas}-{date}-{time}"

    # Fall back to older pattern: guidellm-benchmark-{target}-{stack_replicas}-{vllm_replicas}-{timestamp}.txt
    match = re.search(r"guidellm-benchmark-([^-]+)-(\d+)-(\d+)-(\d+)-(\d+)\.txt", basename)
    if match:
        target = match.group(1)
        stack_replicas = match.group(2)
        vllm_replicas = match.group(3)
        date = match.group(4)
        time = match.group(5)

        if target == "vllm":
            return f"vllm-{vllm_replicas}-w1-{vllm_replicas}"
        else:
            return f"stack-replicas{stack_replicas}-vllm-replicas{vllm_replicas}-{date}-{time}"

    return basename.replace("guidellm-benchmark-", "").replace(".txt", "")
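
# Example (hypothetical filenames): "guidellm-benchmark-stack-s1-sw4-v2-20250911-120000.txt"
# matches the first pattern above and yields the legend label "stack-s1-sw4-v2", while
# "guidellm-benchmark-vllm-v2-20250911-120000.txt" yields "vllm-v2".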


def parse_txt_file(filepath: str) -> list[tuple[float, float, float, float, float, str]]:
    """
    Parse a text benchmark file and extract concurrent@x, RPS, TTFT, ITL, and request latency data.
    Returns list of (concurrency, rps_mean, ttft_mean, itl_mean, req_latency_mean, setup_name) tuples.
    """
    setup_name = extract_setup_name(filepath)
    data_points = []

    try:
        with open(filepath) as f:
            content = f.read()

        # Find the benchmark stats table
        lines = content.split("\n")
        in_stats_table = False
        header_lines_seen = 0

        for line in lines:
            line_stripped = line.strip()

            # Look for the start of the stats table
            if "Benchmarks Stats:" in line:
                in_stats_table = True
                continue

            if in_stats_table:
                # Skip the first few separator/header lines
                if line_stripped.startswith("=") or line_stripped.startswith("-"):
                    header_lines_seen += 1
                    if header_lines_seen >= 3:  # After seeing multiple header lines, look for concurrent@ data
                        if line_stripped.startswith("=") and "concurrent@" not in line_stripped:
                            break
                    continue

            # Parse concurrent@ lines in the stats table (may have leading spaces)
            if in_stats_table and "concurrent@" in line:
                parts = [part.strip() for part in line.split("|")]

                if len(parts) >= 12:  # Make sure we have enough columns for new format
                    try:
                        # Extract concurrency from benchmark name (e.g., concurrent@1 -> 1)
                        concurrent_match = re.search(r"concurrent@(\d+)", parts[0])
                        if not concurrent_match:
                            continue
                        concurrency = float(concurrent_match.group(1))

                        # Extract metrics from the new table format
                        # From your image, the table has these columns with | separators:
                        # Benchmark | Per Second | Concurrency | Out Tok/sec | Tot Tok/sec | Req Latency (sec) | TTFT (ms) | ITL (ms) | TPOT (ms)
                        # Looking at the mean/median/p99 structure, need to find the mean columns
                        # The structure shows: mean | median | p99 for each metric
                        rps_mean = float(parts[1])  # Per Second (RPS)
                        req_latency_mean = float(parts[6]) * 1000  # Request latency mean (convert from sec to ms)
                        ttft_mean = float(parts[9])  # TTFT mean column
                        itl_mean = float(parts[12])  # ITL mean column

                        data_points.append((concurrency, rps_mean, ttft_mean, itl_mean, req_latency_mean, setup_name))

                    except (ValueError, IndexError) as e:
                        print(f"Warning: Could not parse line '{line}' in {filepath}: {e}")
                        continue

    except (OSError, FileNotFoundError) as e:
        print(f"Error reading {filepath}: {e}")

    return data_points


def generate_charts(benchmark_dir: str = "results"):
    """Generate 2x2 grid charts (RPS, Request Latency, TTFT, ITL) from benchmark text files."""
    # Find all text result files instead of JSON
    txt_pattern = os.path.join(benchmark_dir, "guidellm-benchmark-*.txt")
    txt_files = glob.glob(txt_pattern)

    if not txt_files:
        print(f"No text files found matching pattern: {txt_pattern}")
        return

    print(f"Found {len(txt_files)} text files")

    # Parse all files and collect data
    all_data = {}  # setup_name -> [(concurrency, rps, ttft, itl, req_latency), ...]

    for txt_file in txt_files:
        print(f"Processing {txt_file}")
        data_points = parse_txt_file(txt_file)

        for concurrency, rps, ttft, itl, req_latency, setup_name in data_points:
            if setup_name not in all_data:
                all_data[setup_name] = []
            all_data[setup_name].append((concurrency, rps, ttft, itl, req_latency))

    if not all_data:
        print("No data found to plot")
        return

    # Sort data points by concurrency for each setup
    for setup_name in all_data:
        all_data[setup_name].sort(key=lambda x: x[0])  # Sort by concurrency

    # Group setups by vLLM replica number (original approach)
    replica_groups = {}  # vllm_replica_count -> {setup_name: points}

    for setup_name, points in all_data.items():
        # Extract vLLM replica number from setup name
        # Expected formats:
        # - New stack format: "stack-s{X}-sw{W}-v{Y}"
        # - New vLLM format: "vllm-v{Y}"
        # - Old formats: "stack-replicas{X}-w{W}-vllm-replicas{Y}" or "vllm-{Y}-w{W}-{Y}"

        # Try new formats first
        vllm_match = re.search(r"-v(\d+)$", setup_name)  # Matches both "stack-s1-sw2-v3" and "vllm-v1"
        if not vllm_match:
            # Try old stack format
            vllm_match = re.search(r"vllm-replicas(\d+)", setup_name)
        if not vllm_match:
            # Try old vLLM format: "vllm-{Y}-w{W}-{Y}"
            vllm_match = re.search(r"vllm-(\d+)-w\d+-\d+", setup_name)

        if vllm_match:
            vllm_replica_num = int(vllm_match.group(1))
            if vllm_replica_num not in replica_groups:
                replica_groups[vllm_replica_num] = {}
            replica_groups[vllm_replica_num][setup_name] = points
        else:
            print(f"Warning: Could not extract vLLM replica count from setup name: {setup_name}")

    def create_charts(data_dict, prefix, title_prefix):
        """Create a 2x2 grid with RPS, Request Latency, TTFT, and ITL charts."""
        if not data_dict:
            print(f"No data found for {prefix}")
            return

        # Create 2x2 subplot grid
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
        fig.suptitle(f"{title_prefix} Benchmark Results", fontsize=16, fontweight="bold")

        # Collect all unique concurrency values for tick setting
        all_concurrency_values = set()
        for points in data_dict.values():
            all_concurrency_values.update([p[0] for p in points])
        all_concurrency_values = sorted(all_concurrency_values)

        # Plot data for each setup in alphabetical order
        for setup_name in sorted(data_dict.keys()):
            points = data_dict[setup_name]
            if not points:
                continue

            concurrency_values = [p[0] for p in points]
            rps_values = [p[1] for p in points]
            ttft_values = [p[2] for p in points]
            itl_values = [p[3] for p in points]
            req_latency_values = [p[4] for p in points]

            # RPS chart (top-left)
            ax1.plot(concurrency_values, rps_values, marker="o", label=setup_name, linewidth=2, markersize=6)

            # Request Latency chart (top-right)
            ax2.plot(concurrency_values, req_latency_values, marker="o", label=setup_name, linewidth=2, markersize=6)

            # TTFT chart (bottom-left)
            ax3.plot(concurrency_values, ttft_values, marker="o", label=setup_name, linewidth=2, markersize=6)

            # ITL chart (bottom-right)
            ax4.plot(concurrency_values, itl_values, marker="o", label=setup_name, linewidth=2, markersize=6)

        # Configure all charts after plotting data
        axes = [ax1, ax2, ax3, ax4]
        titles = ["RPS", "Request Latency", "TTFT", "ITL"]
        ylabels = [
            "Requests Per Second (RPS)",
            "Request Latency (ms)",
            "Time to First Token (ms)",
            "Inter Token Latency (ms)",
        ]

        for ax, title, ylabel in zip(axes, titles, ylabels, strict=False):
            ax.set_xlabel("Concurrency", fontsize=12)
            ax.set_ylabel(ylabel, fontsize=12)
            ax.set_title(title, fontsize=14, fontweight="bold")
            ax.set_xscale("log", base=2)
            ax.set_xticks(all_concurrency_values)
            ax.set_xticklabels([str(int(x)) for x in all_concurrency_values])
            ax.grid(True, alpha=0.3)

        # Add legend to the right-most subplot (top-right)
        ax2.legend(bbox_to_anchor=(1.05, 1), loc="upper left")

        plt.tight_layout()

        # Save the combined chart
        combined_filename = os.path.join(benchmark_dir, f"{prefix}_benchmark_results.png")
        plt.savefig(combined_filename, dpi=300, bbox_inches="tight")
        plt.close()
        print(f"Combined benchmark chart saved to {combined_filename}")

    # Print grouping information
    for replica_count, data_dict in replica_groups.items():
        print(f"vLLM Replica {replica_count} setups: {list(data_dict.keys())}")

    # Create separate charts for each replica group
    for replica_count, data_dict in replica_groups.items():
        prefix = f"vllm_replica{replica_count}"
        title = f"vLLM Replicas={replica_count}"
        create_charts(data_dict, prefix, title)

    # Print summary
    print("\nSummary:")
    for setup_name, points in all_data.items():
        print(f"{setup_name}: {len(points)} data points")


if __name__ == "__main__":
    generate_charts()
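As a usage note, here is a minimal sketch of driving the script programmatically. It assumes the module is importable as generate_charts (e.g. run from benchmarking/k8s-benchmark/scripts/) and that a results/ directory holds guidellm-benchmark-*.txt files produced by the benchmark run scripts; running the file directly (for example via uv run, which resolves the matplotlib dependency from the inline script metadata) is equivalent to the final call.

# Minimal sketch: generate charts from a directory of guidellm text results.
# Assumes this module is importable as "generate_charts" and that "results/"
# contains guidellm-benchmark-*.txt files.
from generate_charts import generate_charts

# Parses each results/guidellm-benchmark-*.txt file, groups runs by vLLM
# replica count, and writes one vllm_replica{N}_benchmark_results.png per group.
generate_charts("results")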