test: benchmark scripts (#3160)

# What does this PR do?
1. Add our own benchmark script instead of locust (doesn't support
measuring streaming latency well)
2. Simplify k8s deployment
3. Add a simple profile script for locally running server

## Test Plan
❮ ./run-benchmark.sh --target stack --duration 180 --concurrent 10

============================================================
BENCHMARK RESULTS
============================================================
Total time: 180.00s
Concurrent users: 10
Total requests: 1636
Successful requests: 1636
Failed requests: 0
Success rate: 100.0%
Requests per second: 9.09

Response Time Statistics:
  Mean: 1.095s
  Median: 1.721s
  Min: 0.136s
  Max: 3.218s
  Std Dev: 0.762s

Percentiles:
  P50: 1.721s
  P90: 1.751s
  P95: 1.756s
  P99: 1.796s

Time to First Token (TTFT) Statistics:
  Mean: 0.037s
  Median: 0.037s
  Min: 0.023s
  Max: 0.211s
  Std Dev: 0.011s

TTFT Percentiles:
  P50: 0.037s
  P90: 0.040s
  P95: 0.044s
  P99: 0.055s

Streaming Statistics:
  Mean chunks per response: 64.0
  Total chunks received: 104775
This commit is contained in:
ehhuang 2025-08-15 11:24:29 -07:00 committed by GitHub
parent 2114214fe3
commit 2c06b24c77
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 633 additions and 328 deletions

View file

@ -3,7 +3,6 @@ image_name: kubernetes-benchmark-demo
apis:
- agents
- inference
- safety
- telemetry
- tool_runtime
- vector_io
@ -16,20 +15,6 @@ providers:
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: vllm-safety
provider_type: remote::vllm
config:
url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: mock-vllm-inference
provider_type: remote::vllm
config:
url: http://openai-mock-service:${env.MOCK_INFERENCE_PORT}
max_tokens: 4096
api_token: fake
tls_verify: false
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
config: {}
@ -45,11 +30,6 @@ providers:
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config:
excluded_categories: []
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
@ -115,14 +95,6 @@ models:
- model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
model_type: llm
- model_id: ${env.SAFETY_MODEL}
provider_id: vllm-safety
model_type: llm
- model_id: ${env.MOCK_INFERENCE_MODEL}
provider_id: mock-vllm-inference
model_type: llm
shields:
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
vector_dbs: []
datasets: []
scoring_fns: []