# What does this PR do?

- Mostly AI-generated scripts to run guidellm (https://github.com/vllm-project/guidellm) benchmarks on a k8s setup
- Stack is using an image built from main on 9/11

## Test Plan

See updated README.md.
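One note on prerequisites: the benchmark Job below pulls a HuggingFace token from a Kubernetes secret named `hf-token-secret` (key `token`). If that secret does not already exist in the cluster, a minimal sketch of creating it (assuming the `HF_TOKEN` variable in your shell holds a valid token) is:

```bash
# Create the secret that the Job's HF_TOKEN env var reads from
kubectl create secret generic hf-token-secret \
  --from-literal=token="$HF_TOKEN"
```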
219 lines · 6.4 KiB · Bash · Executable file
#!/usr/bin/env bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

set -euo pipefail

# Default values
TARGET="stack"
MAX_SECONDS=60
PROMPT_TOKENS=512
OUTPUT_TOKENS=256
RATE_TYPE="concurrent"
RATE="1,2,4,8,16,32,64,128"
STACK_DEPLOYMENT="llama-stack-benchmark-server"
STACK_URL="http://llama-stack-benchmark-service:8323/v1/openai"
VLLM_DEPLOYMENT="vllm-server"
OUTPUT_FILE=""

# Parse command line arguments
usage() {
    echo "Usage: $0 [options]"
    echo "Options:"
    echo "  -t, --target <stack|vllm>     Target to benchmark (default: stack)"
    echo "  -s, --max-seconds <seconds>   Maximum duration in seconds (default: 60)"
    echo "  -p, --prompt-tokens <tokens>  Number of prompt tokens (default: 512)"
    echo "  -o, --output-tokens <tokens>  Number of output tokens (default: 256)"
    echo "  -r, --rate-type <type>        Rate type (default: concurrent)"
    echo "  -c, --rate                    Rate (default: 1,2,4,8,16,32,64,128)"
    echo "  --output-file <path>          Output file path (default: auto-generated)"
    echo "  --stack-deployment <name>     Name of the stack deployment (default: llama-stack-benchmark-server)"
    echo "  --vllm-deployment <name>      Name of the vllm deployment (default: vllm-server)"
    echo "  --stack-url <url>             URL of the stack service (default: http://llama-stack-benchmark-service:8323/v1/openai)"
    echo "  -h, --help                    Show this help message"
    echo ""
    echo "Examples:"
    echo "  $0 --target vllm                             # Benchmark vLLM direct"
    echo "  $0 --target stack                            # Benchmark Llama Stack (default)"
    echo "  $0 -t vllm -s 60 -p 512 -o 256               # vLLM with custom parameters"
    echo "  $0 --output-file results/my-benchmark.txt    # Specify custom output file"
    echo "  $0 --stack-deployment my-stack-server        # Use custom stack deployment name"
}

while [[ $# -gt 0 ]]; do
    case $1 in
        -t|--target)
            TARGET="$2"
            shift 2
            ;;
        -s|--max-seconds)
            MAX_SECONDS="$2"
            shift 2
            ;;
        -p|--prompt-tokens)
            PROMPT_TOKENS="$2"
            shift 2
            ;;
        -o|--output-tokens)
            OUTPUT_TOKENS="$2"
            shift 2
            ;;
        -r|--rate-type)
            RATE_TYPE="$2"
            shift 2
            ;;
        -c|--rate)
            RATE="$2"
            shift 2
            ;;
        --output-file)
            OUTPUT_FILE="$2"
            shift 2
            ;;
        --stack-deployment)
            STACK_DEPLOYMENT="$2"
            shift 2
            ;;
        --vllm-deployment)
            VLLM_DEPLOYMENT="$2"
            shift 2
            ;;
        --stack-url)
            STACK_URL="$2"
            shift 2
            ;;
        -h|--help)
            usage
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            usage
            exit 1
            ;;
    esac
done

# Validate target
if [[ "$TARGET" != "stack" && "$TARGET" != "vllm" ]]; then
    echo "Error: Target must be 'stack' or 'vllm'"
    usage
    exit 1
fi

# Set configuration based on target
if [[ "$TARGET" == "vllm" ]]; then
    BASE_URL="http://${VLLM_DEPLOYMENT}:8000"
    JOB_NAME="guidellm-vllm-benchmark-job"
    echo "Benchmarking vLLM direct with GuideLLM..."
else
    BASE_URL="$STACK_URL"
    JOB_NAME="guidellm-stack-benchmark-job"
    echo "Benchmarking Llama Stack with GuideLLM..."
fi

echo "Configuration:"
echo "  Target: $TARGET"
echo "  Base URL: $BASE_URL"
echo "  Max seconds: ${MAX_SECONDS}s"
echo "  Prompt tokens: $PROMPT_TOKENS"
echo "  Output tokens: $OUTPUT_TOKENS"
echo "  Rate type: $RATE_TYPE"
if [[ "$TARGET" == "vllm" ]]; then
    echo "  vLLM deployment: $VLLM_DEPLOYMENT"
else
    echo "  Stack deployment: $STACK_DEPLOYMENT"
fi
echo ""

# Create temporary job yaml
TEMP_YAML="/tmp/guidellm-benchmark-job-temp-$(date +%s).yaml"
cat > "$TEMP_YAML" << EOF
apiVersion: batch/v1
kind: Job
metadata:
  name: $JOB_NAME
  namespace: default
spec:
  template:
    spec:
      containers:
      - name: guidellm-benchmark
        image: python:3.11-slim
        command: ["/bin/bash"]
        args:
        - "-c"
        - |
          # Install uv and guidellm
          pip install uv &&
          uv pip install --system guidellm &&

          # Login to HuggingFace
          uv pip install --system huggingface_hub &&
          python -c "from huggingface_hub import login; login(token='\$HF_TOKEN')" &&

          # Run GuideLLM benchmark and save output
          export COLUMNS=200
          GUIDELLM__PREFERRED_ROUTE="chat_completions" uv run guidellm benchmark run \\
            --target "$BASE_URL" \\
            --rate-type "$RATE_TYPE" \\
            --max-seconds $MAX_SECONDS \\
            --data "prompt_tokens=$PROMPT_TOKENS,output_tokens=$OUTPUT_TOKENS" \\
            --model "\$INFERENCE_MODEL" \\
            --rate "$RATE" \\
            --warmup-percent 0.05 \\
            2>&1
        env:
        - name: INFERENCE_MODEL
          value: "meta-llama/Llama-3.2-3B-Instruct"
        - name: HF_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-token-secret
              key: token
        resources:
          requests:
            memory: "4Gi"
            cpu: "500m"
          limits:
            memory: "8Gi"
            cpu: "2000m"
      restartPolicy: Never
  backoffLimit: 3
EOF

echo "Cleaning up any existing GuideLLM benchmark job..."
kubectl delete job $JOB_NAME 2>/dev/null || true

echo "Deploying GuideLLM benchmark Job..."
kubectl apply -f "$TEMP_YAML"

echo "Waiting for job to start..."
kubectl wait --for=condition=Ready pod -l job-name=$JOB_NAME --timeout=120s

# Prepare file names and create results directory
mkdir -p results
if [[ -z "$OUTPUT_FILE" ]]; then
    TIMESTAMP=$(date +%Y%m%d-%H%M%S)
    OUTPUT_FILE="results/guidellm-benchmark-${TARGET}-${TIMESTAMP}.txt"
fi

echo "Following GuideLLM benchmark logs..."
kubectl logs -f job/$JOB_NAME

echo "Job completed. Checking final status..."
kubectl get job $JOB_NAME

# Save benchmark results using kubectl logs
echo "Saving benchmark results..."
kubectl logs job/$JOB_NAME > "$OUTPUT_FILE"

echo "Benchmark output saved to: $OUTPUT_FILE"

# Clean up temporary file
rm -f "$TEMP_YAML"
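For reference, a typical invocation from a workstation with `kubectl` access to the cluster might look like the sketch below. The script filename `run-guidellm-benchmark.sh` is an assumption for illustration; substitute the actual path in your checkout.

```bash
# Benchmark the Llama Stack deployment with a longer 120-second run
./run-guidellm-benchmark.sh --target stack --max-seconds 120

# Benchmark vLLM directly with larger prompts and an explicit output path
./run-guidellm-benchmark.sh -t vllm -p 1024 -o 512 --output-file results/vllm-direct.txt

# Inspect the most recent auto-named results file afterwards
less results/guidellm-benchmark-stack-*.txt
```

Results land in the `results/` directory next to wherever the script is run, either at the `--output-file` path or under an auto-generated `guidellm-benchmark-<target>-<timestamp>.txt` name.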