mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-08-16 14:38:00 +00:00
chore: setup for performance benchmarking (#3096)
# What does this PR do? 1. Added a simple mock openai-compat server that serves chat/completion 2. Added a benchmark server in EKS that includes the mock inference server 3. Added a locust (https://locust.io/) file for load testing ## Test Plan bash apply.sh kubectl port-forward service/locust-web-ui 8089:8089 Go to localhost:8089 to start a load test <img width="1392" height="334" alt="image" src="https://github.com/user-attachments/assets/d6aa3deb-583a-42ed-889b-751262b8e91c" /> <img width="1362" height="881" alt="image" src="https://github.com/user-attachments/assets/6a28b9b4-05e6-44e2-b504-07e60c12d35e" />
This commit is contained in:
parent
2f51273215
commit
d6ae54723d
11 changed files with 1234 additions and 3 deletions
78
docs/source/distributions/k8s-benchmark/locustfile.py
Normal file
78
docs/source/distributions/k8s-benchmark/locustfile.py
Normal file
|
@ -0,0 +1,78 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
"""
|
||||
Locust load testing script for Llama Stack with Prism mock OpenAI provider.
|
||||
"""
|
||||
|
||||
# Standard library imports grouped before third-party per PEP 8.
import os
import random

from locust import HttpUser, task, between

# Base path of the OpenAI-compatible API; overridable via LOCUST_BASE_PATH
# for deployments that mount the API somewhere other than the default.
base_path = os.getenv("LOCUST_BASE_PATH", "/v1/openai/v1")

# Model identifier sent in every request payload.
# NOTE(review): this is None when INFERENCE_MODEL is unset, which produces
# `"model": null` in the request body — export INFERENCE_MODEL before
# starting the load test.
MODEL_ID = os.getenv("INFERENCE_MODEL")
|
||||
|
||||
class LlamaStackUser(HttpUser):
    """Locust user that hammers the streaming chat-completions endpoint."""

    # Near-zero wait time so each simulated user fires requests back to
    # back, keeping maximum pressure on the server under test.
    wait_time = between(0.0, 0.0001)

    def on_start(self):
        """Setup authentication and test data."""
        # No auth required for benchmark server
        self.headers = {
            "Content-Type": "application/json"
        }

        # Test messages of varying lengths
        self.test_messages = [
            [{"role": "user", "content": "Hi"}],
            [{"role": "user", "content": "What is the capital of France?"}],
            [{"role": "user", "content": "Explain quantum physics in simple terms."}],
            [{"role": "user", "content": "Write a short story about a robot learning to paint."}],
            [
                {"role": "user", "content": "What is machine learning?"},
                {"role": "assistant", "content": "Machine learning is a subset of AI..."},
                {"role": "user", "content": "Can you give me a practical example?"}
            ]
        ]

    @task(weight=100)
    def chat_completion_streaming(self):
        """Issue one streaming chat completion and count SSE chunks.

        This is the only ``@task`` on the class, so it receives 100% of
        the generated load. (The earlier "(20% of requests)" note was a
        stale leftover from a different task mix.)
        """
        messages = random.choice(self.test_messages)
        payload = {
            "model": MODEL_ID,
            "messages": messages,
            "stream": True,
            "max_tokens": 100
        }

        # catch_response lets us decide success/failure manually after
        # inspecting the streamed body rather than trusting the status alone.
        with self.client.post(
            f"{base_path}/chat/completions",
            headers=self.headers,
            json=payload,
            stream=True,
            catch_response=True
        ) as response:
            if response.status_code == 200:
                chunks_received = 0
                try:
                    # Server-sent events: every payload line is prefixed
                    # with "data: "; the stream ends with "data: [DONE]".
                    for line in response.iter_lines():
                        if line:
                            line_str = line.decode('utf-8')
                            if line_str.startswith('data: '):
                                chunks_received += 1
                                if line_str.strip() == 'data: [DONE]':
                                    break

                    # A 200 with zero chunks means the stream was empty —
                    # treat that as a failure, not a success.
                    if chunks_received > 0:
                        response.success()
                    else:
                        response.failure("No streaming chunks received")
                except Exception as e:
                    response.failure(f"Streaming error: {e}")
            else:
                response.failure(f"HTTP {response.status_code}: {response.text}")
|
Loading…
Add table
Add a link
Reference in a new issue