# What does this PR do?

1. Add our own benchmark script instead of locust (doesn't support measuring streaming latency well)
2. Simplify k8s deployment
3. Add a simple profile script for locally running server

## Test Plan

```
❮ ./run-benchmark.sh --target stack --duration 180 --concurrent 10

============================================================
BENCHMARK RESULTS
============================================================
Total time: 180.00s
Concurrent users: 10
Total requests: 1636
Successful requests: 1636
Failed requests: 0
Success rate: 100.0%
Requests per second: 9.09

Response Time Statistics:
  Mean: 1.095s
  Median: 1.721s
  Min: 0.136s
  Max: 3.218s
  Std Dev: 0.762s

Percentiles:
  P50: 1.721s
  P90: 1.751s
  P95: 1.756s
  P99: 1.796s

Time to First Token (TTFT) Statistics:
  Mean: 0.037s
  Median: 0.037s
  Min: 0.023s
  Max: 0.211s
  Std Dev: 0.011s

TTFT Percentiles:
  P50: 0.037s
  P90: 0.040s
  P95: 0.044s
  P99: 0.055s

Streaming Statistics:
  Mean chunks per response: 64.0
  Total chunks received: 104775
```
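The streaming metrics above are the motivation for replacing locust: TTFT and chunk counts require timing individual SSE chunks rather than whole responses. Below is a minimal sketch of that measurement idea — it is not the PR's benchmark script, and the endpoint URL, port, and model id are placeholder assumptions for an OpenAI-compatible streaming chat-completions API.

```python
import statistics
import time

import requests

BASE_URL = "http://localhost:8321/v1/openai/v1/chat/completions"  # placeholder: adjust to your deployment
MODEL = "llama-guard3:1b"  # placeholder model id


def measure_streaming_request(prompt: str) -> dict:
    """Send one streaming chat completion; record TTFT, total time, chunk count."""
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "stream": True,
    }
    start = time.perf_counter()
    ttft = None
    chunks = 0
    with requests.post(BASE_URL, json=payload, stream=True, timeout=60) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            # SSE frames arrive as b"data: {...}"; the stream ends with b"data: [DONE]".
            if line == b"data: [DONE]":
                break
            if not line.startswith(b"data: "):
                continue  # skip blank separator lines and keep-alives
            if ttft is None:
                ttft = time.perf_counter() - start  # first content chunk = TTFT
            chunks += 1
    return {"total_s": time.perf_counter() - start, "ttft_s": ttft, "chunks": chunks}


if __name__ == "__main__":
    # Aggregate a handful of sequential requests into the same kind of percentile
    # summary the test plan reports (a real benchmark would run requests
    # concurrently, e.g. with asyncio or a thread pool).
    runs = [measure_streaming_request("Tell me a short story.") for _ in range(20)]
    ttfts = sorted(r["ttft_s"] for r in runs if r["ttft_s"] is not None)
    print(f"TTFT P50: {statistics.median(ttfts):.3f}s")
    print(f"TTFT P95: {statistics.quantiles(ttfts, n=100)[94]:.3f}s")
```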
`profile_running_server.sh` (Bash, executable file · 52 lines · 1.7 KiB) — the profile script from item 3, which wraps `py-spy record` around server PID discovery:
```bash
#!/bin/bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

# Script to profile an already running Llama Stack server
# Usage: ./profile_running_server.sh [duration_seconds] [output_file]

DURATION=${1:-60}  # Default 60 seconds
OUTPUT_FILE=${2:-"llama_stack_profile"} # Default output file

echo "Looking for running Llama Stack server..."

# Find the server PID
SERVER_PID=$(ps aux | grep "llama_stack.core.server.server" | grep -v grep | awk '{print $2}' | head -1)

if [ -z "$SERVER_PID" ]; then
    echo "Error: No running Llama Stack server found"
    echo "Please start your server first with:"
    echo "LLAMA_STACK_LOGGING=\"all=ERROR\" MOCK_INFERENCE_URL=http://localhost:8080 SAFETY_MODEL=llama-guard3:1b uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml"
    exit 1
fi

echo "Found Llama Stack server with PID: $SERVER_PID"

# Start py-spy profiling
echo "Starting py-spy profiling for ${DURATION} seconds..."
echo "Output will be saved to: ${OUTPUT_FILE}.svg"
echo ""
echo "You can now run your load test..."
echo ""

# Get the full path to py-spy
PYSPY_PATH=$(which py-spy)

# Check if running as root; if not, use sudo
if [ "$EUID" -ne 0 ]; then
    echo "py-spy requires root permissions on macOS. Running with sudo..."
    sudo "$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
else
    "$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
fi

echo ""
echo "Profiling completed! Results saved to: ${OUTPUT_FILE}.svg"
echo ""
echo "To view the flame graph:"
echo "open ${OUTPUT_FILE}.svg"
```
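For example, with the server already running and the load test started in another terminal, `./profile_running_server.sh 120 my_profile` samples the server for 120 seconds and writes `my_profile.svg`, which can be opened in a browser to explore the flame graph. The sudo branch exists because py-spy reads another process's memory, which macOS only allows with elevated privileges.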