Mirror of https://github.com/meta-llama/llama-stack.git
Synced 2025-10-04 04:04:14 +00:00
294 lines | 12 KiB | Python | Executable file
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

# PEP 723 inline script metadata: lets `uv run` / `pipx run` install matplotlib.
# /// script
# dependencies = [
#   "matplotlib",
# ]
# ///
"""
Script to generate benchmark charts from guidellm text results.

Creates 2x2 grid charts with RPS, Request Latency, TTFT, and ITL metrics against concurrent@x values.
Outputs one chart file per vLLM replica group, with each line representing one benchmark run.
"""

import glob
import os
import re

import matplotlib.pyplot as plt
def extract_setup_name(filename: str) -> str:
    """Extract a short legend/setup label from a benchmark result filename.

    Tries the known filename patterns from newest to oldest and returns a
    formatted label; falls back to the basename stripped of the common
    prefix/suffix when no pattern matches.
    """
    basename = os.path.basename(filename)

    # New stack pattern: guidellm-benchmark-stack-s{stack_replicas}-sw{workers}-v{vllm_replicas}-{timestamp}.txt
    match = re.search(r"guidellm-benchmark-stack-s(\d+)-sw(\d+)-v(\d+)-(\d{8})-(\d{6})\.txt", basename)
    if match:
        stack_replicas = match.group(1)
        workers = match.group(2)
        vllm_replicas = match.group(3)
        # Timestamp groups (4, 5) are intentionally not part of the label.
        return f"stack-s{stack_replicas}-sw{workers}-v{vllm_replicas}"

    # New vLLM pattern: guidellm-benchmark-vllm-v{vllm_replicas}-{timestamp}.txt
    match = re.search(r"guidellm-benchmark-vllm-v(\d+)-(\d{8})-(\d{6})\.txt", basename)
    if match:
        vllm_replicas = match.group(1)
        return f"vllm-v{vllm_replicas}"

    # Old pattern: guidellm-benchmark-{target}-{stack_replicas}-w{workers}-{vllm_replicas}-{timestamp}.txt
    match = re.search(r"guidellm-benchmark-([^-]+)-(\d+)-w(\d+)-(\d+)-(\d+)-(\d+)\.txt", basename)
    if match:
        target = match.group(1)
        stack_replicas = match.group(2)
        workers = match.group(3)
        vllm_replicas = match.group(4)
        date = match.group(5)
        time = match.group(6)

        if target == "vllm":
            # NOTE(review): repeats vllm_replicas where stack_replicas may have
            # been intended; preserved as-is so existing chart labels match.
            return f"vllm-{vllm_replicas}-w{workers}-{vllm_replicas}"
        else:
            return f"stack-replicas{stack_replicas}-w{workers}-vllm-replicas{vllm_replicas}-{date}-{time}"

    # Older pattern: guidellm-benchmark-{target}-{stack_replicas}-{vllm_replicas}-{timestamp}.txt
    match = re.search(r"guidellm-benchmark-([^-]+)-(\d+)-(\d+)-(\d+)-(\d+)\.txt", basename)
    if match:
        target = match.group(1)
        stack_replicas = match.group(2)
        vllm_replicas = match.group(3)
        date = match.group(4)
        time = match.group(5)

        if target == "vllm":
            # Worker count was not encoded in this format; default shown as w1.
            return f"vllm-{vllm_replicas}-w1-{vllm_replicas}"
        else:
            return f"stack-replicas{stack_replicas}-vllm-replicas{vllm_replicas}-{date}-{time}"

    # Fallback: strip the common prefix and extension from the basename.
    return basename.replace("guidellm-benchmark-", "").replace(".txt", "")
def parse_txt_file(filepath: str) -> list[tuple[float, float, float, float, float, str]]:
    """
    Parse a text benchmark file and extract concurrent@x, RPS, TTFT, ITL, and request latency data.

    Returns list of (concurrency, rps_mean, ttft_mean, itl_mean, req_latency_mean, setup_name) tuples.
    Read/parsing errors are reported to stdout and yield an empty/partial list
    rather than raising.
    """
    setup_name = extract_setup_name(filepath)
    data_points: list[tuple[float, float, float, float, float, str]] = []

    try:
        with open(filepath) as f:
            content = f.read()

        # Scan for the "Benchmarks Stats:" table and parse its rows.
        lines = content.split("\n")
        in_stats_table = False
        header_lines_seen = 0

        for line in lines:
            line_stripped = line.strip()

            # Look for the start of the stats table
            if "Benchmarks Stats:" in line:
                in_stats_table = True
                continue

            if in_stats_table:
                # Skip the separator/header lines ("===" / "---") at the top of the table.
                if line_stripped.startswith("=") or line_stripped.startswith("-"):
                    header_lines_seen += 1
                    # After the header block, a "=" separator with no data marks the table end.
                    if header_lines_seen >= 3 and line_stripped.startswith("=") and "concurrent@" not in line_stripped:
                        break
                    continue

            # Parse concurrent@ lines in the stats table (may have leading spaces)
            if in_stats_table and "concurrent@" in line:
                parts = [part.strip() for part in line.split("|")]

                # The ITL mean lives at index 12, so a valid row needs at least
                # 13 columns (the original ">= 12" check still hit IndexError).
                if len(parts) >= 13:
                    try:
                        # Extract concurrency from benchmark name (e.g., concurrent@1 -> 1)
                        concurrent_match = re.search(r"concurrent@(\d+)", parts[0])
                        if not concurrent_match:
                            continue
                        concurrency = float(concurrent_match.group(1))

                        # Pipe-separated columns (mean | median | p99 per metric):
                        # Benchmark | Per Second | Concurrency | Out Tok/sec | Tot Tok/sec |
                        #   Req Latency (sec) | TTFT (ms) | ITL (ms) | TPOT (ms)
                        # We pick the mean column of each metric.
                        rps_mean = float(parts[1])  # Per Second (RPS)
                        req_latency_mean = float(parts[6]) * 1000  # Request latency mean (sec -> ms)
                        ttft_mean = float(parts[9])  # TTFT mean column
                        itl_mean = float(parts[12])  # ITL mean column

                        data_points.append((concurrency, rps_mean, ttft_mean, itl_mean, req_latency_mean, setup_name))

                    except (ValueError, IndexError) as e:
                        print(f"Warning: Could not parse line '{line}' in {filepath}: {e}")
                        continue

    except OSError as e:
        # FileNotFoundError is a subclass of OSError, so missing files are covered too.
        print(f"Error reading {filepath}: {e}")

    return data_points
def generate_charts(benchmark_dir: str = "results"):
    """Generate 2x2 grid charts (RPS, Request Latency, TTFT, ITL) from benchmark text files.

    Reads every guidellm-benchmark-*.txt in *benchmark_dir*, groups setups by
    their vLLM replica count, and writes one PNG per replica group into the
    same directory.
    """
    # Find all text result files instead of JSON
    txt_pattern = os.path.join(benchmark_dir, "guidellm-benchmark-*.txt")
    txt_files = glob.glob(txt_pattern)

    if not txt_files:
        print(f"No text files found matching pattern: {txt_pattern}")
        return

    print(f"Found {len(txt_files)} text files")

    # Parse all files and collect data
    all_data = {}  # setup_name -> [(concurrency, rps, ttft, itl, req_latency), ...]

    for txt_file in txt_files:
        print(f"Processing {txt_file}")
        data_points = parse_txt_file(txt_file)

        for concurrency, rps, ttft, itl, req_latency, setup_name in data_points:
            if setup_name not in all_data:
                all_data[setup_name] = []
            all_data[setup_name].append((concurrency, rps, ttft, itl, req_latency))

    if not all_data:
        print("No data found to plot")
        return

    # Sort data points by concurrency for each setup
    for setup_name in all_data:
        all_data[setup_name].sort(key=lambda x: x[0])  # Sort by concurrency

    # Group setups by vLLM replica number (original approach)
    replica_groups = {}  # vllm_replica_count -> {setup_name: points}

    for setup_name, points in all_data.items():
        # Extract vLLM replica number from setup name.
        # Expected formats:
        # - New stack format: "stack-s{X}-sw{W}-v{Y}"
        # - New vLLM format: "vllm-v{Y}"
        # - Old formats: "stack-replicas{X}-w{W}-vllm-replicas{Y}" or "vllm-{Y}-w{W}-{Y}"
        vllm_match = re.search(r"-v(\d+)$", setup_name)  # Matches both "stack-s1-sw2-v3" and "vllm-v1"
        if not vllm_match:
            # Try old stack format
            vllm_match = re.search(r"vllm-replicas(\d+)", setup_name)
        if not vllm_match:
            # Try old vLLM format: "vllm-{Y}-w{W}-{Y}"
            vllm_match = re.search(r"vllm-(\d+)-w\d+-\d+", setup_name)

        if vllm_match:
            vllm_replica_num = int(vllm_match.group(1))
            if vllm_replica_num not in replica_groups:
                replica_groups[vllm_replica_num] = {}
            replica_groups[vllm_replica_num][setup_name] = points
        else:
            print(f"Warning: Could not extract vLLM replica count from setup name: {setup_name}")

    def create_charts(data_dict, prefix, title_prefix):
        """Create a 2x2 grid with RPS, Request Latency, TTFT, and ITL charts."""
        if not data_dict:
            print(f"No data found for {prefix}")
            return

        # Create 2x2 subplot grid
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
        fig.suptitle(f"{title_prefix} Benchmark Results", fontsize=16, fontweight="bold")

        # Collect all unique concurrency values for tick setting
        all_concurrency_values = set()
        for points in data_dict.values():
            all_concurrency_values.update([p[0] for p in points])
        all_concurrency_values = sorted(all_concurrency_values)

        # Plot data for each setup in alphabetical order
        for setup_name in sorted(data_dict.keys()):
            points = data_dict[setup_name]
            if not points:
                continue

            concurrency_values = [p[0] for p in points]
            rps_values = [p[1] for p in points]
            ttft_values = [p[2] for p in points]
            itl_values = [p[3] for p in points]
            req_latency_values = [p[4] for p in points]

            # RPS (top-left), Request Latency (top-right),
            # TTFT (bottom-left), ITL (bottom-right)
            ax1.plot(concurrency_values, rps_values, marker="o", label=setup_name, linewidth=2, markersize=6)
            ax2.plot(concurrency_values, req_latency_values, marker="o", label=setup_name, linewidth=2, markersize=6)
            ax3.plot(concurrency_values, ttft_values, marker="o", label=setup_name, linewidth=2, markersize=6)
            ax4.plot(concurrency_values, itl_values, marker="o", label=setup_name, linewidth=2, markersize=6)

        # Configure all charts after plotting data
        axes = [ax1, ax2, ax3, ax4]
        titles = ["RPS", "Request Latency", "TTFT", "ITL"]
        ylabels = [
            "Requests Per Second (RPS)",
            "Request Latency (ms)",
            "Time to First Token (ms)",
            "Inter Token Latency (ms)",
        ]

        for ax, title, ylabel in zip(axes, titles, ylabels, strict=False):
            ax.set_xlabel("Concurrency", fontsize=12)
            ax.set_ylabel(ylabel, fontsize=12)
            ax.set_title(title, fontsize=14, fontweight="bold")
            # Concurrency sweeps are typically powers of two; log2 keeps ticks even.
            ax.set_xscale("log", base=2)
            ax.set_xticks(all_concurrency_values)
            ax.set_xticklabels([str(int(x)) for x in all_concurrency_values])
            ax.grid(True, alpha=0.3)

        # Add legend to the right-most subplot (top-right)
        ax2.legend(bbox_to_anchor=(1.05, 1), loc="upper left")

        plt.tight_layout()

        # Save the combined chart
        combined_filename = os.path.join(benchmark_dir, f"{prefix}_benchmark_results.png")
        plt.savefig(combined_filename, dpi=300, bbox_inches="tight")
        plt.close()
        print(f"Combined benchmark chart saved to {combined_filename}")

    # Print grouping information.
    # (Renamed loop variable: the original reused "data_dict", shadowing the
    # inner function's parameter name.)
    for replica_count, group in replica_groups.items():
        print(f"vLLM Replica {replica_count} setups: {list(group.keys())}")

    # Create separate charts for each replica group
    for replica_count, group in replica_groups.items():
        prefix = f"vllm_replica{replica_count}"
        title = f"vLLM Replicas={replica_count}"
        create_charts(group, prefix, title)

    # Print summary
    print("\nSummary:")
    for setup_name, points in all_data.items():
        print(f"{setup_name}: {len(points)} data points")
if __name__ == "__main__":
    # Uses the default "results" directory when run as a script.
    generate_charts()