llama-stack-mirror/docs/source/distributions/k8s-benchmark/locustfile.py
Eric Huang 9db924adae openai mock server, Split of "k8s bench, locust"
# What does this PR do?


## Test Plan
# What does this PR do?


## Test Plan
# What does this PR do?


## Test Plan
2025-08-11 16:34:51 -07:00

78 lines
2.8 KiB
Python

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""
Locust load testing script for Llama Stack with Prism mock OpenAI provider.
"""
import random
from locust import HttpUser, task, between
import os
# URL prefix prepended to every endpoint path; overridable through the
# LOCUST_BASE_PATH environment variable.
base_path = os.environ.get("LOCUST_BASE_PATH", "/v1/openai/v1")
# Model identifier placed in each request payload; None when the
# INFERENCE_MODEL environment variable is not set.
MODEL_ID = os.environ.get("INFERENCE_MODEL")
class LlamaStackUser(HttpUser):
    """Simulated client that issues streaming chat-completion requests.

    Every task run POSTs to ``{base_path}/chat/completions`` with
    ``"stream": True`` and reports success or failure to Locust manually,
    based on the server-sent events found in the response body.
    """

    # Near-zero pause between tasks, so each simulated user fires requests
    # practically back to back.
    wait_time = between(0.0, 0.0001)

    def on_start(self):
        """Set up the request headers and the pool of test conversations."""
        # No auth required for benchmark server
        self.headers = {"Content-Type": "application/json"}

        # Test messages of varying lengths
        self.test_messages = [
            [{"role": "user", "content": "Hi"}],
            [{"role": "user", "content": "What is the capital of France?"}],
            [{"role": "user", "content": "Explain quantum physics in simple terms."}],
            [{"role": "user", "content": "Write a short story about a robot learning to paint."}],
            [
                {"role": "user", "content": "What is machine learning?"},
                {"role": "assistant", "content": "Machine learning is a subset of AI..."},
                {"role": "user", "content": "Can you give me a practical example?"},
            ],
        ]

    @task(weight=100)
    def chat_completion_streaming(self):
        """Send one streaming chat completion and validate the event stream.

        This is the only task defined in this class, so every request this
        user generates is a streaming chat completion.

        The request is marked as:

        * failed, with the status code and body, when the status is not 200;
        * failed when no ``data: `` event arrives before the stream ends or
          before the ``data: [DONE]`` sentinel;
        * failed when reading or decoding the stream raises;
        * successful otherwise.
        """
        messages = random.choice(self.test_messages)
        payload = {
            "model": MODEL_ID,
            "messages": messages,
            "stream": True,
            "max_tokens": 100,
        }

        with self.client.post(
            f"{base_path}/chat/completions",
            headers=self.headers,
            json=payload,
            stream=True,
            catch_response=True,
        ) as response:
            if response.status_code != 200:
                response.failure(f"HTTP {response.status_code}: {response.text}")
                return

            chunks_received = 0
            try:
                for line in response.iter_lines():
                    if not line:
                        continue
                    line_str = line.decode("utf-8")
                    if not line_str.startswith("data: "):
                        continue
                    # The terminator is not a content chunk: stop before
                    # counting it, so a stream holding nothing but [DONE]
                    # is reported as empty.
                    if line_str.strip() == "data: [DONE]":
                        break
                    chunks_received += 1

                if chunks_received > 0:
                    response.success()
                else:
                    response.failure("No streaming chunks received")
            except Exception as e:
                # Any read/decode problem is recorded as a failed request
                # instead of aborting the simulated user.
                response.failure(f"Streaming error: {e}")
            finally:
                # With stream=True the connection is only released once the
                # body is fully consumed or the response is closed; the loop
                # above may stop early at [DONE], so close explicitly.
                response.close()