chore(perf): run guidellm benchmarks (#3421)

# What does this PR do? - Mostly AI-generated scripts to run guidellm (https://github.com/vllm-project/guidellm) benchmarks on k8s setup - Stack is using image built from main on 9/11 ## Test Plan See updated README.md
2025-12-03 09:53:45 +00:00 · 2025-09-24 10:18:33 -07:00 · 2025-09-24 10:18:33 -07:00 · 48a551ecbc
commit 48a551ecbc
parent 2f58d87c22
14 changed files with 1436 additions and 526 deletions
--- a/benchmarking/k8s-benchmark/scripts/generate_charts.py
+++ b/benchmarking/k8s-benchmark/scripts/generate_charts.py
@ -0,0 +1,294 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+# /// script
+# dependencies = [
+#   "matplotlib",
+# ]
+# ///
+"""
+Script to generate benchmark charts from guidellm text results.
+Creates 2x2 grid charts with RPS, Request Latency, TTFT, and ITL metrics against concurrent@x values.
+Outputs one chart file per vLLM replica group, with each line representing one benchmark run.
+"""
+
+import glob
+import os
+import re
+
+import matplotlib.pyplot as plt
+
+
+def extract_setup_name(filename: str) -> str:
+    """Derive the chart legend label for a benchmark result file.
+
+    Only the basename of ``filename`` is inspected. The known naming schemes are
+    tried from newest to oldest and the first one that matches wins:
+
+    * ``guidellm-benchmark-stack-s{S}-sw{W}-v{V}-{date}-{time}.txt`` -> ``stack-s{S}-sw{W}-v{V}``
+    * ``guidellm-benchmark-vllm-v{V}-{date}-{time}.txt`` -> ``vllm-v{V}``
+    * ``guidellm-benchmark-{target}-{S}-w{W}-{V}-{date}-{time}.txt`` (old)
+    * ``guidellm-benchmark-{target}-{S}-{V}-{date}-{time}.txt`` (older)
+
+    For the two old schemes a ``vllm`` target yields ``vllm-{V}-w{W}-{V}`` (``W`` is
+    fixed to ``1`` for the oldest scheme); any other target yields a
+    ``stack-replicas...`` label that keeps the timestamp.
+
+    If nothing matches, the basename with the ``guidellm-benchmark-`` prefix and
+    ``.txt`` suffix removed is returned.
+    """
+    basename = os.path.basename(filename)
+
+    # New stack pattern. The timestamp must be present for the name to match,
+    # but it is deliberately not part of the label.
+    match = re.search(r"guidellm-benchmark-stack-s(\d+)-sw(\d+)-v(\d+)-(\d{8})-(\d{6})\.txt", basename)
+    if match:
+        stack_replicas, workers, vllm_replicas = match.group(1, 2, 3)
+        return f"stack-s{stack_replicas}-sw{workers}-v{vllm_replicas}"
+
+    # New vLLM pattern; again the timestamp is matched but dropped.
+    match = re.search(r"guidellm-benchmark-vllm-v(\d+)-(\d{8})-(\d{6})\.txt", basename)
+    if match:
+        return f"vllm-v{match.group(1)}"
+
+    # Old pattern with an explicit worker count.
+    match = re.search(r"guidellm-benchmark-([^-]+)-(\d+)-w(\d+)-(\d+)-(\d+)-(\d+)\.txt", basename)
+    if match:
+        target, stack_replicas, workers, vllm_replicas, date, time_of_day = match.groups()
+        if target == "vllm":
+            return f"vllm-{vllm_replicas}-w{workers}-{vllm_replicas}"
+        return f"stack-replicas{stack_replicas}-w{workers}-vllm-replicas{vllm_replicas}-{date}-{time_of_day}"
+
+    # Oldest pattern, which has no worker count at all.
+    match = re.search(r"guidellm-benchmark-([^-]+)-(\d+)-(\d+)-(\d+)-(\d+)\.txt", basename)
+    if match:
+        target, stack_replicas, vllm_replicas, date, time_of_day = match.groups()
+        if target == "vllm":
+            return f"vllm-{vllm_replicas}-w1-{vllm_replicas}"
+        return f"stack-replicas{stack_replicas}-vllm-replicas{vllm_replicas}-{date}-{time_of_day}"
+
+    return basename.replace("guidellm-benchmark-", "").replace(".txt", "")
+
+
+def parse_txt_file(filepath: str) -> list[tuple[float, float, float, float, float, str]]:
+    """
+    Parse a guidellm text result file and extract one data point per ``concurrent@x`` row.
+
+    Only rows that appear after a line containing ``Benchmarks Stats:`` are considered.
+    Scanning stops at the ``=`` separator line that closes the table (the third or later
+    separator line seen).
+
+    Returns a list of (concurrency, rps, ttft_ms, itl_ms, req_latency_ms, setup_name)
+    tuples, where ``setup_name`` comes from ``extract_setup_name(filepath)``. Request
+    latency is converted from seconds to milliseconds. Rows with too few columns are
+    skipped; rows with non-numeric cells are skipped with a warning. If the file cannot
+    be read, an error is printed and an empty list is returned.
+    """
+    setup_name = extract_setup_name(filepath)
+    data_points = []
+
+    try:
+        # Decode explicitly and leniently: the file is a captured log, and a stray
+        # undecodable byte should not abort chart generation.
+        with open(filepath, encoding="utf-8", errors="replace") as f:
+            lines = f.read().split("\n")
+    except OSError as e:
+        print(f"Error reading {filepath}: {e}")
+        return data_points
+
+    in_stats_table = False
+    separator_lines_seen = 0
+
+    for line in lines:
+        line_stripped = line.strip()
+
+        # Look for the start of the stats table
+        if "Benchmarks Stats:" in line:
+            in_stats_table = True
+            continue
+
+        if not in_stats_table:
+            continue
+
+        # Separator lines delimit the header; the third (or later) "=" rule ends the table.
+        if line_stripped.startswith(("=", "-")):
+            separator_lines_seen += 1
+            if separator_lines_seen >= 3 and line_stripped.startswith("=") and "concurrent@" not in line_stripped:
+                break
+            continue
+
+        # Data rows mention the benchmark name, e.g. "concurrent@8" (may have leading spaces)
+        if "concurrent@" not in line:
+            continue
+
+        parts = [part.strip() for part in line.split("|")]
+
+        # The highest index read below is 12, so at least 13 columns are required.
+        if len(parts) < 13:
+            continue
+
+        # Extract concurrency from benchmark name (e.g., concurrent@1 -> 1)
+        concurrent_match = re.search(r"concurrent@(\d+)", parts[0])
+        if not concurrent_match:
+            continue
+
+        try:
+            concurrency = float(concurrent_match.group(1))
+
+            # NOTE(review): these indices are labelled as the "mean" columns. If the table is
+            # Benchmark | Per Second | Concurrency | Out Tok/sec | Tot Tok/sec followed by
+            # mean | median | p99 triples, indices 6/9/12 would be the median columns instead.
+            # Confirm against a real guidellm result file.
+            rps_mean = float(parts[1])  # Per Second (RPS)
+            req_latency_mean = float(parts[6]) * 1000  # Request latency (convert from sec to ms)
+            ttft_mean = float(parts[9])  # TTFT column
+            itl_mean = float(parts[12])  # ITL column
+        except ValueError as e:
+            print(f"Warning: Could not parse line '{line}' in {filepath}: {e}")
+            continue
+
+        data_points.append((concurrency, rps_mean, ttft_mean, itl_mean, req_latency_mean, setup_name))
+
+    return data_points
+
+
+def generate_charts(benchmark_dir: str = "results"):
+    """Generate 2x2 grid charts (RPS, Request Latency, TTFT, ITL) from benchmark text files.
+
+    Reads every ``guidellm-benchmark-*.txt`` file in ``benchmark_dir``, groups the parsed
+    series by vLLM replica count, and writes one ``vllm_replica{N}_benchmark_results.png``
+    per group into ``benchmark_dir``. Progress and a per-setup summary are printed.
+
+    Returns ``None``. Returns early, after printing a message, when no result files are
+    found or when none of them yields a data point.
+    """
+    txt_pattern = os.path.join(benchmark_dir, "guidellm-benchmark-*.txt")
+    # glob returns matches in filesystem-dependent order; sort so that logs, the summary
+    # and the point order of merged series are reproducible between runs.
+    txt_files = sorted(glob.glob(txt_pattern))
+
+    if not txt_files:
+        print(f"No text files found matching pattern: {txt_pattern}")
+        return
+
+    print(f"Found {len(txt_files)} text files")
+
+    # setup_name -> [(concurrency, rps, ttft, itl, req_latency), ...]
+    all_data = {}
+
+    for txt_file in txt_files:
+        print(f"Processing {txt_file}")
+        for concurrency, rps, ttft, itl, req_latency, setup_name in parse_txt_file(txt_file):
+            all_data.setdefault(setup_name, []).append((concurrency, rps, ttft, itl, req_latency))
+
+    if not all_data:
+        print("No data found to plot")
+        return
+
+    # Sort data points by concurrency for each setup
+    for points in all_data.values():
+        points.sort(key=lambda point: point[0])
+
+    # vllm_replica_count -> {setup_name: points}
+    replica_groups = {}
+
+    for setup_name, points in all_data.items():
+        # Extract the vLLM replica count from the setup name. Formats, tried in order:
+        # - New: "stack-s{X}-sw{W}-v{Y}" and "vllm-v{Y}" (trailing "-v{Y}")
+        # - Old stack: "stack-replicas{X}-w{W}-vllm-replicas{Y}..."
+        # - Old vLLM: "vllm-{Y}-w{W}-{Y}"
+        vllm_match = (
+            re.search(r"-v(\d+)$", setup_name)
+            or re.search(r"vllm-replicas(\d+)", setup_name)
+            or re.search(r"vllm-(\d+)-w\d+-\d+", setup_name)
+        )
+
+        if vllm_match:
+            replica_groups.setdefault(int(vllm_match.group(1)), {})[setup_name] = points
+        else:
+            print(f"Warning: Could not extract vLLM replica count from setup name: {setup_name}")
+
+    def create_charts(data_dict, prefix, title_prefix):
+        """Create a 2x2 grid with RPS, Request Latency, TTFT, and ITL charts and save it as a PNG."""
+        if not data_dict:
+            print(f"No data found for {prefix}")
+            return
+
+        # Create 2x2 subplot grid
+        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
+        fig.suptitle(f"{title_prefix} Benchmark Results", fontsize=16, fontweight="bold")
+
+        # Collect all unique concurrency values so every subplot shares the same ticks
+        all_concurrency_values = sorted({p[0] for points in data_dict.values() for p in points})
+
+        # Each point is (concurrency, rps, ttft, itl, req_latency); map each subplot to its metric index:
+        # top-left RPS, top-right request latency, bottom-left TTFT, bottom-right ITL.
+        metric_axes = ((ax1, 1), (ax2, 4), (ax3, 2), (ax4, 3))
+
+        # Plot data for each setup in alphabetical order
+        for setup_name in sorted(data_dict):
+            points = data_dict[setup_name]
+            if not points:
+                continue
+
+            concurrency_values = [p[0] for p in points]
+            for ax, metric_index in metric_axes:
+                metric_values = [p[metric_index] for p in points]
+                ax.plot(concurrency_values, metric_values, marker="o", label=setup_name, linewidth=2, markersize=6)
+
+        # Configure all charts after plotting data
+        axes = [ax1, ax2, ax3, ax4]
+        titles = ["RPS", "Request Latency", "TTFT", "ITL"]
+        ylabels = [
+            "Requests Per Second (RPS)",
+            "Request Latency (ms)",
+            "Time to First Token (ms)",
+            "Inter Token Latency (ms)",
+        ]
+
+        for ax, title, ylabel in zip(axes, titles, ylabels, strict=False):
+            ax.set_xlabel("Concurrency", fontsize=12)
+            ax.set_ylabel(ylabel, fontsize=12)
+            ax.set_title(title, fontsize=14, fontweight="bold")
+            ax.set_xscale("log", base=2)
+            ax.set_xticks(all_concurrency_values)
+            ax.set_xticklabels([str(int(x)) for x in all_concurrency_values])
+            ax.grid(True, alpha=0.3)
+
+        # Add legend to the right-most subplot (top-right)
+        ax2.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
+
+        fig.tight_layout()
+
+        # Save and close this specific figure rather than relying on the implicit current figure
+        combined_filename = os.path.join(benchmark_dir, f"{prefix}_benchmark_results.png")
+        fig.savefig(combined_filename, dpi=300, bbox_inches="tight")
+        plt.close(fig)
+        print(f"Combined benchmark chart saved to {combined_filename}")
+
+    # Print grouping information (ordered by replica count)
+    for replica_count, data_dict in sorted(replica_groups.items()):
+        print(f"vLLM Replica {replica_count} setups: {list(data_dict.keys())}")
+
+    # Create separate charts for each replica group
+    for replica_count, data_dict in sorted(replica_groups.items()):
+        prefix = f"vllm_replica{replica_count}"
+        title = f"vLLM Replicas={replica_count}"
+        create_charts(data_dict, prefix, title)
+
+    # Print summary
+    print("\nSummary:")
+    for setup_name, points in all_data.items():
+        print(f"{setup_name}: {len(points)} data points")
+
+if __name__ == "__main__":
+    generate_charts()
--- a/benchmarking/k8s-benchmark/scripts/run-all-benchmarks.sh
+++ b/benchmarking/k8s-benchmark/scripts/run-all-benchmarks.sh
@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+# Fail fast: abort on command errors, unset variables, and failures inside pipelines.
+# Enabled before anything else so the whole script runs under the same rules.
+set -euo pipefail
+
+# Define benchmark configurations, one per entry: "target stack_replicas vllm_replicas stack_workers"
+# A "-" field is left untouched by scale_deployments.
+configs=(
+    "stack 1 1 1"
+    "stack 1 1 2"
+    "stack 1 1 4"
+    "vllm 1 1 -"
+)
+
+# Get the directory where this script is located
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+echo "Running comprehensive GuideLLM benchmark suite..."
+echo "Start time: $(date)"
+
+# Default deployment names
+STACK_DEPLOYMENT="llama-stack-benchmark-server"
+VLLM_DEPLOYMENT="vllm-server"
+
+# Scale the vLLM deployment and, for the "stack" target, the Llama Stack deployment,
+# waiting for each rollout to finish before returning.
+# Arguments:
+#   $1 - stack replica count, or "-" to leave it unchanged
+#   $2 - vLLM replica count, or "-" to leave it unchanged
+#   $3 - LLAMA_STACK_WORKERS value, or "-" to leave it unchanged
+#   $4 - (optional) benchmark target; defaults to the caller's $target variable
+scale_deployments() {
+    local stack_replicas=$1
+    local vllm_replicas=$2
+    local workers=$3
+    # Prefer an explicit argument; fall back to the global set by the caller's `read`
+    # so existing three-argument calls keep working.
+    local bench_target=${4:-${target:-}}
+
+    echo "Scaling deployments..."
+
+    if [[ "$vllm_replicas" != "-" ]]; then
+        echo "Scaling $VLLM_DEPLOYMENT to $vllm_replicas replicas..."
+        kubectl scale deployment "$VLLM_DEPLOYMENT" --replicas="$vllm_replicas"
+        kubectl rollout status deployment "$VLLM_DEPLOYMENT" --timeout=600s
+    fi
+
+    if [[ "$bench_target" == "stack" ]]; then
+        if [[ "$stack_replicas" != "-" ]]; then
+            echo "Scaling $STACK_DEPLOYMENT to $stack_replicas replicas..."
+            kubectl scale deployment "$STACK_DEPLOYMENT" --replicas="$stack_replicas"
+            kubectl rollout status deployment "$STACK_DEPLOYMENT" --timeout=600s
+        fi
+
+        if [[ "$workers" != "-" ]]; then
+            echo "Updating $STACK_DEPLOYMENT to use $workers workers..."
+            kubectl set env "deployment/$STACK_DEPLOYMENT" "LLAMA_STACK_WORKERS=$workers"
+            kubectl rollout status deployment "$STACK_DEPLOYMENT" --timeout=600s
+        fi
+    fi
+
+    echo "All scaling operations completed. Waiting additional 30s for services to stabilize..."
+    sleep 30
+}
+
+
+# Run every configured benchmark in sequence: scale the cluster, then run guidellm.
+for config in "${configs[@]}"; do
+    read -r target stack_replicas vllm_replicas workers <<< "$config"
+
+    echo ""
+    echo "=========================================="
+    if [[ "$workers" != "-" ]]; then
+        echo "Running benchmark: $target (stack=$stack_replicas, vllm=$vllm_replicas, workers=$workers)"
+    else
+        echo "Running benchmark: $target (stack=$stack_replicas, vllm=$vllm_replicas)"
+    fi
+    echo "Start: $(date)"
+    echo "=========================================="
+
+    # Scale deployments before running benchmark
+    scale_deployments "$stack_replicas" "$vllm_replicas" "$workers" "$target"
+
+    # Generate output filename with setup info; generate_charts.py parses these names
+    TIMESTAMP=$(date +%Y%m%d-%H%M%S)
+    if [[ "$target" == "stack" ]]; then
+        OUTPUT_FILE="results/guidellm-benchmark-${target}-s${stack_replicas}-sw${workers}-v${vllm_replicas}-${TIMESTAMP}.txt"
+    else
+        OUTPUT_FILE="results/guidellm-benchmark-${target}-v${vllm_replicas}-${TIMESTAMP}.txt"
+    fi
+
+    # Run the benchmark with the cluster as configured
+    "$SCRIPT_DIR/run-guidellm-benchmark.sh" \
+        --target "$target" \
+        --output-file "$OUTPUT_FILE"
+
+    echo "Completed: $(date)"
+    echo "Waiting 30 seconds before next benchmark..."
+    sleep 30
+done
+
+echo ""
+echo "=========================================="
+echo "All benchmarks completed!"
+echo "End time: $(date)"
+echo "=========================================="
+echo ""
+echo "Results files generated:"
+# Expand the globs with nullglob so an unmatched pattern (e.g. no .json files)
+# disappears instead of making `ls` fail and falsely reporting that nothing was found.
+shopt -s nullglob
+result_files=(results/guidellm-*.txt results/guidellm-*.json)
+shopt -u nullglob
+if (( ${#result_files[@]} > 0 )); then
+    ls -la "${result_files[@]}"
+else
+    echo "No result files found"
+fi
--- a/benchmarking/k8s-benchmark/scripts/run-guidellm-benchmark.sh
+++ b/benchmarking/k8s-benchmark/scripts/run-guidellm-benchmark.sh
@ -0,0 +1,219 @@
+#!/usr/bin/env bash
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+set -euo pipefail
+
+# Default values (each can be overridden by the command line options parsed below)
+TARGET="stack"            # which endpoint to benchmark: "stack" or "vllm"
+MAX_SECONDS=60            # passed to guidellm as --max-seconds
+PROMPT_TOKENS=512         # prompt_tokens in guidellm's --data spec
+OUTPUT_TOKENS=256         # output_tokens in guidellm's --data spec
+RATE_TYPE="concurrent"    # passed to guidellm as --rate-type
+RATE="1,2,4,8,16,32,64,128"  # passed to guidellm as --rate
+STACK_DEPLOYMENT="llama-stack-benchmark-server"
+# Base URL benchmarked when TARGET is "stack"
+STACK_URL="http://llama-stack-benchmark-service:8323/v1/openai"
+# Used as the host name of the base URL (port 8000) when TARGET is "vllm"
+VLLM_DEPLOYMENT="vllm-server"
+# Empty means a timestamped path under results/ is generated later
+OUTPUT_FILE=""
+
+# Parse command line arguments
+# Print the command line help text to stdout.
+usage() {
+    echo "Usage: $0 [options]"
+    echo "Options:"
+    echo "  -t, --target <stack|vllm>     Target to benchmark (default: stack)"
+    echo "  -s, --max-seconds <seconds>   Maximum duration in seconds (default: 60)"
+    echo "  -p, --prompt-tokens <tokens>  Number of prompt tokens (default: 512)"
+    echo "  -o, --output-tokens <tokens>  Number of output tokens (default: 256)"
+    echo "  -r, --rate-type <type>        Rate type (default: concurrent)"
+    # The parser consumes a value for --rate, so show the placeholder like every other option.
+    echo "  -c, --rate <rates>            Rate (default: 1,2,4,8,16,32,64,128)"
+    echo "  --output-file <path>          Output file path (default: auto-generated)"
+    echo "  --stack-deployment <name>     Name of the stack deployment (default: llama-stack-benchmark-server)"
+    echo "  --vllm-deployment <name>      Name of the vllm deployment (default: vllm-server)"
+    echo "  --stack-url <url>             URL of the stack service (default: http://llama-stack-benchmark-service:8323/v1/openai)"
+    echo "  -h, --help                    Show this help message"
+    echo ""
+    echo "Examples:"
+    echo "  $0 --target vllm                              # Benchmark vLLM direct"
+    echo "  $0 --target stack                             # Benchmark Llama Stack (default)"
+    echo "  $0 -t vllm -s 60 -p 512 -o 256               # vLLM with custom parameters"
+    echo "  $0 --output-file results/my-benchmark.txt     # Specify custom output file"
+    echo "  $0 --stack-deployment my-stack-server         # Use custom stack deployment name"
+}
+
+while [[ $# -gt 0 ]]; do
+    # Every option except -h/--help takes a value. Check for it up front so a trailing
+    # option reports a usage error instead of tripping `set -u` on "$2".
+    case $1 in
+        -t|--target|-s|--max-seconds|-p|--prompt-tokens|-o|--output-tokens|-r|--rate-type|-c|--rate|--output-file|--stack-deployment|--vllm-deployment|--stack-url)
+            if [[ $# -lt 2 ]]; then
+                echo "Error: Option $1 requires a value"
+                usage
+                exit 1
+            fi
+            ;;
+    esac
+
+    case $1 in
+        -t|--target)
+            TARGET="$2"
+            shift 2
+            ;;
+        -s|--max-seconds)
+            MAX_SECONDS="$2"
+            shift 2
+            ;;
+        -p|--prompt-tokens)
+            PROMPT_TOKENS="$2"
+            shift 2
+            ;;
+        -o|--output-tokens)
+            OUTPUT_TOKENS="$2"
+            shift 2
+            ;;
+        -r|--rate-type)
+            RATE_TYPE="$2"
+            shift 2
+            ;;
+        -c|--rate)
+            RATE="$2"
+            shift 2
+            ;;
+        --output-file)
+            OUTPUT_FILE="$2"
+            shift 2
+            ;;
+        --stack-deployment)
+            STACK_DEPLOYMENT="$2"
+            shift 2
+            ;;
+        --vllm-deployment)
+            VLLM_DEPLOYMENT="$2"
+            shift 2
+            ;;
+        --stack-url)
+            STACK_URL="$2"
+            shift 2
+            ;;
+        -h|--help)
+            usage
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            usage
+            exit 1
+            ;;
+    esac
+done
+
+# Validate target
+if [[ "$TARGET" != "stack" && "$TARGET" != "vllm" ]]; then
+    echo "Error: Target must be 'stack' or 'vllm'"
+    usage
+    exit 1
+fi
+
+# Set configuration based on target: the URL guidellm talks to and the Job name
+if [[ "$TARGET" == "vllm" ]]; then
+    BASE_URL="http://${VLLM_DEPLOYMENT}:8000"
+    JOB_NAME="guidellm-vllm-benchmark-job"
+    echo "Benchmarking vLLM direct with GuideLLM..."
+else
+    BASE_URL="$STACK_URL"
+    JOB_NAME="guidellm-stack-benchmark-job"
+    echo "Benchmarking Llama Stack with GuideLLM..."
+fi
+
+
+# Echo the effective settings so they are captured alongside the run output
+echo "Configuration:"
+echo "  Target: $TARGET"
+echo "  Base URL: $BASE_URL"
+echo "  Max seconds: ${MAX_SECONDS}s"
+echo "  Prompt tokens: $PROMPT_TOKENS"
+echo "  Output tokens: $OUTPUT_TOKENS"
+echo "  Rate type: $RATE_TYPE"
+echo "  Rate: $RATE"
+if [[ "$TARGET" == "vllm" ]]; then
+    echo "  vLLM deployment: $VLLM_DEPLOYMENT"
+else
+    echo "  Stack deployment: $STACK_DEPLOYMENT"
+fi
+echo ""
+
+# Create temporary job yaml; remove it on any exit so a failed run does not leave it behind
+TEMP_YAML="/tmp/guidellm-benchmark-job-temp-$(date +%s).yaml"
+trap 'rm -f "$TEMP_YAML"' EXIT
+
+# The heredoc delimiter is unquoted, so unescaped \$VARS are expanded here, on the host.
+# Variables that must be resolved inside the container (HF_TOKEN, INFERENCE_MODEL) are
+# escaped. INFERENCE_MODEL may be overridden from the host environment; otherwise the
+# default model is written into the Job's env, which keeps this working under `set -u`.
+cat > "$TEMP_YAML" << EOF
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: $JOB_NAME
+  namespace: default
+spec:
+  template:
+    spec:
+      containers:
+      - name: guidellm-benchmark
+        image: python:3.11-slim
+        command: ["/bin/bash"]
+        args:
+        - "-c"
+        - |
+          # Install uv and guidellm
+          pip install uv &&
+          uv pip install --system guidellm &&
+
+          # Login to HuggingFace
+          uv pip install --system huggingface_hub &&
+          python -c "from huggingface_hub import login; login(token='\$HF_TOKEN')" &&
+
+          # Run GuideLLM benchmark and save output
+          export COLUMNS=200
+          GUIDELLM__PREFERRED_ROUTE="chat_completions" uv run guidellm benchmark run \\
+            --target "$BASE_URL" \\
+            --rate-type "$RATE_TYPE" \\
+            --max-seconds $MAX_SECONDS \\
+            --data "prompt_tokens=$PROMPT_TOKENS,output_tokens=$OUTPUT_TOKENS" \\
+            --model "\$INFERENCE_MODEL" \\
+            --rate "$RATE" \\
+            --warmup-percent 0.05 \\
+            2>&1
+        env:
+        - name: INFERENCE_MODEL
+          value: "${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}"
+        - name: HF_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: hf-token-secret
+              key: token
+        resources:
+          requests:
+            memory: "4Gi"
+            cpu: "500m"
+          limits:
+            memory: "8Gi"
+            cpu: "2000m"
+      restartPolicy: Never
+  backoffLimit: 3
+EOF
+
+echo "Cleaning up any existing GuideLLM benchmark job..."
+kubectl delete job "$JOB_NAME" 2>/dev/null || true
+
+echo "Deploying GuideLLM benchmark Job..."
+kubectl apply -f "$TEMP_YAML"
+
+echo "Waiting for job to start..."
+# `kubectl wait` errors out immediately when no pod matches the selector yet, so first
+# poll (up to ~60s) until the Job controller has created the pod.
+for _ in $(seq 1 30); do
+    if [[ -n "$(kubectl get pod -l "job-name=$JOB_NAME" -o name 2>/dev/null)" ]]; then
+        break
+    fi
+    sleep 2
+done
+kubectl wait --for=condition=Ready pod -l "job-name=$JOB_NAME" --timeout=120s
+
+# Prepare the output file name and make sure its directory exists
+# (a custom --output-file may point outside results/)
+if [[ -z "$OUTPUT_FILE" ]]; then
+    TIMESTAMP=$(date +%Y%m%d-%H%M%S)
+    OUTPUT_FILE="results/guidellm-benchmark-${TARGET}-${TIMESTAMP}.txt"
+fi
+mkdir -p "$(dirname "$OUTPUT_FILE")"
+
+echo "Following GuideLLM benchmark logs..."
+kubectl logs -f "job/$JOB_NAME"
+
+echo "Job completed. Checking final status..."
+kubectl get job "$JOB_NAME"
+
+# Save benchmark results using kubectl logs
+echo "Saving benchmark results..."
+kubectl logs "job/$JOB_NAME" > "$OUTPUT_FILE"
+
+echo "Benchmark output saved to: $OUTPUT_FILE"
+
+# Clean up temporary file
+rm -f "$TEMP_YAML"