mirror of
				https://github.com/meta-llama/llama-stack.git
				synced 2025-10-26 01:12:59 +00:00 
			
		
		
		
# What does this PR do?

1. Add our own benchmark script instead of locust (which doesn't support measuring streaming latency well)
2. Simplify the k8s deployment
3. Add a simple profile script for a locally running server

## Test Plan

```
❮ ./run-benchmark.sh --target stack --duration 180 --concurrent 10

============================================================
BENCHMARK RESULTS
============================================================
Total time: 180.00s
Concurrent users: 10
Total requests: 1636
Successful requests: 1636
Failed requests: 0
Success rate: 100.0%
Requests per second: 9.09

Response Time Statistics:
  Mean: 1.095s
  Median: 1.721s
  Min: 0.136s
  Max: 3.218s
  Std Dev: 0.762s

Percentiles:
  P50: 1.721s
  P90: 1.751s
  P95: 1.756s
  P99: 1.796s

Time to First Token (TTFT) Statistics:
  Mean: 0.037s
  Median: 0.037s
  Min: 0.023s
  Max: 0.211s
  Std Dev: 0.011s

TTFT Percentiles:
  P50: 0.037s
  P90: 0.040s
  P95: 0.044s
  P99: 0.055s

Streaming Statistics:
  Mean chunks per response: 64.0
  Total chunks received: 104775
```
		
			
				
	
	
		
			52 lines
		
	
	
	
		
			1.7 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable file
		
	
	
	
	
			
		
		
	
	
			52 lines
		
	
	
	
		
			1.7 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable file
		
	
	
	
	
#!/bin/bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

# Script to profile an already running Llama Stack server
# Usage: ./profile_running_server.sh [duration_seconds] [output_file]
#
#   duration_seconds  How long py-spy samples the server, in whole seconds
#                     (default: 60).
#   output_file       Output path without extension; ".svg" is appended
#                     (default: llama_stack_profile).
#
# Exits 1 when the duration is not a whole number, no server process is found,
# or py-spy is not installed; otherwise exits with py-spy's own exit status.

set -uo pipefail

# Print each argument on its own line to stderr, then abort with status 1.
die() {
    printf '%s\n' "$@" >&2
    exit 1
}

DURATION=${1:-60}                        # Default 60 seconds
OUTPUT_FILE=${2:-"llama_stack_profile"}  # Default output file

# Reject anything that is not a plain non-negative integer before it reaches
# py-spy, so the failure is reported up front with a usage hint.
if [[ ! "$DURATION" =~ ^[0-9]+$ ]]; then
    die "Error: duration_seconds must be a whole number, got: $DURATION" \
        "Usage: $0 [duration_seconds] [output_file]"
fi

echo "Looking for running Llama Stack server..."

# Find the server PID. pgrep -f matches against the full command line and never
# reports itself, so no "grep -v grep" filtering is needed. pgrep exits
# non-zero when nothing matches; that case is handled by the empty check below.
MATCHING_PIDS=$(pgrep -f -- "llama_stack.core.server.server" || true)
# Keep only the first PID when several processes match.
SERVER_PID=${MATCHING_PIDS%%$'\n'*}

if [[ -z "$SERVER_PID" ]]; then
    die "Error: No running Llama Stack server found" \
        "Please start your server first with:" \
        "LLAMA_STACK_LOGGING=\"all=ERROR\" MOCK_INFERENCE_URL=http://localhost:8080 SAFETY_MODEL=llama-guard3:1b uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml"
fi

echo "Found Llama Stack server with PID: $SERVER_PID"

# Get the full path to py-spy. The absolute path is resolved here because the
# command may be re-run through sudo below; fail early if py-spy is missing.
if ! PYSPY_PATH=$(command -v py-spy); then
    die "Error: py-spy not found in PATH" \
        "Please install py-spy and make sure it is on your PATH"
fi

# Start py-spy profiling
echo "Starting py-spy profiling for ${DURATION} seconds..."
echo "Output will be saved to: ${OUTPUT_FILE}.svg"
echo ""
echo "You can now run your load test..."
echo ""

# Build the command once as an array so every argument stays a single word
# even if the output path contains spaces.
RECORD_CMD=("$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d "$DURATION" -p "$SERVER_PID")

# Check if running as root, if not, use sudo
if [[ "$EUID" -ne 0 ]]; then
    echo "py-spy requires root permissions on macOS. Running with sudo..."
    RECORD_CMD=(sudo "${RECORD_CMD[@]}")
fi

# Capture the profiler's status so a failed run is not reported as a success.
PYSPY_STATUS=0
"${RECORD_CMD[@]}" || PYSPY_STATUS=$?

if [[ "$PYSPY_STATUS" -ne 0 ]]; then
    echo "" >&2
    echo "Error: py-spy exited with status ${PYSPY_STATUS}; profile may not have been written" >&2
    exit "$PYSPY_STATUS"
fi

echo ""
echo "Profiling completed! Results saved to: ${OUTPUT_FILE}.svg"
echo ""
echo "To view the flame graph:"
echo "open ${OUTPUT_FILE}.svg"