Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-08-21 17:33:12 +00:00)
# What does this PR do?

1. Add our own benchmark script instead of locust (locust doesn't measure streaming latency well)
2. Simplify the k8s deployment
3. Add a simple profiling script for a locally running server

## Test Plan

```
❮ ./run-benchmark.sh --target stack --duration 180 --concurrent 10

============================================================
BENCHMARK RESULTS
============================================================
Total time: 180.00s
Concurrent users: 10
Total requests: 1636
Successful requests: 1636
Failed requests: 0
Success rate: 100.0%
Requests per second: 9.09

Response Time Statistics:
  Mean: 1.095s
  Median: 1.721s
  Min: 0.136s
  Max: 3.218s
  Std Dev: 0.762s

Percentiles:
  P50: 1.721s
  P90: 1.751s
  P95: 1.756s
  P99: 1.796s

Time to First Token (TTFT) Statistics:
  Mean: 0.037s
  Median: 0.037s
  Min: 0.023s
  Max: 0.211s
  Std Dev: 0.011s

TTFT Percentiles:
  P50: 0.037s
  P90: 0.040s
  P95: 0.044s
  P99: 0.055s

Streaming Statistics:
  Mean chunks per response: 64.0
  Total chunks received: 104775
```
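For context on what "measuring streaming latency" involves (the part locust handles poorly), a minimal TTFT-measurement loop looks roughly like the sketch below. This is illustrative only, not the `run-benchmark.sh` script itself: the base URL, model name, and use of the `requests` library are assumptions.

```python
# Minimal TTFT-measurement sketch (illustrative, not the repo's benchmark script).
# Assumes an OpenAI-compatible endpoint at BASE_URL -- e.g. the mock server in this
# PR on its default port -- and that the `requests` package is installed.
import json
import time

import requests

BASE_URL = "http://localhost:8081/v1"       # assumption: mock server default port
MODEL = "meta-llama/Llama-3.2-3B-Instruct"  # assumption: default MOCK_MODELS value


def measure_streaming_request(prompt):
    """Return (ttft_seconds, total_seconds, chunk_count) for one streaming request."""
    start = time.perf_counter()
    ttft = None
    chunks = 0
    with requests.post(
        f"{BASE_URL}/chat/completions",
        json={"model": MODEL, "messages": [{"role": "user", "content": prompt}], "stream": True},
        stream=True,
        timeout=60,
    ) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line or not line.startswith(b"data: "):
                continue
            payload = line[len(b"data: "):]
            if payload == b"[DONE]":
                break
            json.loads(payload)  # each SSE event carries one chat.completion.chunk
            if ttft is None:
                ttft = time.perf_counter() - start  # time to first token
            chunks += 1
    return ttft, time.perf_counter() - start, chunks


if __name__ == "__main__":
    ttft, total, chunks = measure_streaming_request("Hello!")
    print(f"TTFT: {ttft:.3f}s  total: {total:.3f}s  chunks: {chunks}")
```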
190 lines
6.2 KiB
Python
Executable file
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""
OpenAI-compatible mock server that returns:
- Hardcoded /models response for consistent validation
- Valid OpenAI-formatted chat completion responses with dynamic content
"""

from flask import Flask, request, jsonify, Response
import time
import random
import uuid
import json
import argparse
import os

app = Flask(__name__)


# Models from environment variables
def get_models():
    models_str = os.getenv("MOCK_MODELS", "meta-llama/Llama-3.2-3B-Instruct")
    model_ids = [m.strip() for m in models_str.split(",") if m.strip()]

    return {
        "object": "list",
        "data": [
            {
                "id": model_id,
                "object": "model",
                "created": 1234567890,
                "owned_by": "vllm"
            }
            for model_id in model_ids
        ]
    }


def generate_random_text(length=50):
    """Generate random but coherent text for responses."""
    words = [
        "Hello", "there", "I'm", "an", "AI", "assistant", "ready", "to", "help", "you",
        "with", "your", "questions", "and", "tasks", "today", "Let", "me", "know", "what",
        "you'd", "like", "to", "discuss", "or", "explore", "together", "I", "can", "assist",
        "with", "various", "topics", "including", "coding", "writing", "analysis", "and", "more"
    ]
    return " ".join(random.choices(words, k=length))


@app.route('/v1/models', methods=['GET'])
def list_models():
    models = get_models()
    print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}")
    return jsonify(models)


@app.route('/v1/chat/completions', methods=['POST'])
def chat_completions():
    """Return OpenAI-formatted chat completion responses."""
    data = request.get_json()
    default_model = get_models()['data'][0]['id']
    model = data.get('model', default_model)
    messages = data.get('messages', [])
    stream = data.get('stream', False)

    print(f"[MOCK] Chat completion request - model: {model}, stream: {stream}")

    if stream:
        return handle_streaming_completion(model, messages)
    else:
        return handle_non_streaming_completion(model, messages)


def handle_non_streaming_completion(model, messages):
    response_text = generate_random_text(random.randint(20, 80))

    # Calculate realistic token counts
    prompt_tokens = sum(len(str(msg.get('content', '')).split()) for msg in messages)
    completion_tokens = len(response_text.split())

    response = {
        "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model,
        "choices": [
            {
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": response_text
                },
                "finish_reason": "stop"
            }
        ],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens
        }
    }

    return jsonify(response)


def handle_streaming_completion(model, messages):
    def generate_stream():
        # Generate response text
        full_response = generate_random_text(random.randint(30, 100))
        words = full_response.split()

        # Send initial chunk
        initial_chunk = {
            "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
            "object": "chat.completion.chunk",
            "created": int(time.time()),
            "model": model,
            "choices": [
                {
                    "index": 0,
                    "delta": {"role": "assistant", "content": ""}
                }
            ]
        }
        yield f"data: {json.dumps(initial_chunk)}\n\n"

        # Send word by word
        for i, word in enumerate(words):
            chunk = {
                "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
                "object": "chat.completion.chunk",
                "created": int(time.time()),
                "model": model,
                "choices": [
                    {
                        "index": 0,
                        "delta": {"content": f"{word} " if i < len(words) - 1 else word}
                    }
                ]
            }
            yield f"data: {json.dumps(chunk)}\n\n"
            # Configurable delay to simulate realistic streaming
            stream_delay = float(os.getenv("STREAM_DELAY_SECONDS", "0.005"))
            time.sleep(stream_delay)

        # Send final chunk
        final_chunk = {
            "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
            "object": "chat.completion.chunk",
            "created": int(time.time()),
            "model": model,
            "choices": [
                {
                    "index": 0,
                    "delta": {"content": ""},
                    "finish_reason": "stop"
                }
            ]
        }
        yield f"data: {json.dumps(final_chunk)}\n\n"
        yield "data: [DONE]\n\n"

    return Response(
        generate_stream(),
        mimetype='text/event-stream',
        headers={
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Access-Control-Allow-Origin': '*',
        }
    )


@app.route('/health', methods=['GET'])
def health():
    return jsonify({"status": "healthy", "type": "openai-mock"})


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='OpenAI-compatible mock server')
    parser.add_argument('--port', type=int, default=8081,
                        help='Port to run the server on (default: 8081)')
    args = parser.parse_args()

    port = args.port

    models = get_models()
    print("Starting OpenAI-compatible mock server...")
    print(f"- /models endpoint with: {[m['id'] for m in models['data']]}")
    print("- OpenAI-formatted chat/completion responses with dynamic content")
    print("- Streaming support with valid SSE format")
    print(f"- Listening on: http://0.0.0.0:{port}")
    app.run(host='0.0.0.0', port=port, debug=False)
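For a quick end-to-end check of the mock, something like the sketch below should work; it assumes the `openai` Python client (v1.x) is installed and the server is running on its default port 8081, which are not part of this file.

```python
# Minimal sketch: pointing the openai Python client (v1.x) at the mock server.
# Assumes the server above is running on its default port 8081; the API key is
# not checked by the mock, so any placeholder value works.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8081/v1", api_key="not-needed")

# Models advertised via the MOCK_MODELS environment variable
print([m.id for m in client.models.list().data])

# Stream a chat completion and print chunks as they arrive
stream = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Say hello"}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()
```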