Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-04 04:04:14 +00:00)
chore: move benchmarking related code (#3406)
# What does this PR do?

Moves benchmarking-related code under `benchmarking/` and applies some formatting changes.

## Test Plan
This commit is contained in:
parent: d2f88a10fb
commit: c04f1c1e8c

10 changed files with 156 additions and 149 deletions
202  benchmarking/k8s-benchmark/openai-mock-server.py  (Executable file)
@@ -0,0 +1,202 @@

#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""
OpenAI-compatible mock server that returns:
- Hardcoded /models response for consistent validation
- Valid OpenAI-formatted chat completion responses with dynamic content
"""

import argparse
import json
import os
import random
import time
import uuid

from flask import Flask, Response, jsonify, request

app = Flask(__name__)


# Models from environment variables
def get_models():
    models_str = os.getenv("MOCK_MODELS", "meta-llama/Llama-3.2-3B-Instruct")
    model_ids = [m.strip() for m in models_str.split(",") if m.strip()]

    return {
        "object": "list",
        "data": [
            {"id": model_id, "object": "model", "created": 1234567890, "owned_by": "vllm"} for model_id in model_ids
        ],
    }


def generate_random_text(length=50):
    """Generate random but coherent text for responses."""
    words = [
        "Hello", "there", "I'm", "an", "AI", "assistant", "ready", "to", "help", "you",
        "with", "your", "questions", "and", "tasks", "today", "Let", "me", "know", "what",
        "you'd", "like", "to", "discuss", "or", "explore", "together", "I", "can", "assist",
        "with", "various", "topics", "including", "coding", "writing", "analysis", "and", "more",
    ]
    return " ".join(random.choices(words, k=length))


@app.route("/v1/models", methods=["GET"])
def list_models():
    models = get_models()
    print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}")
    return jsonify(models)


@app.route("/v1/chat/completions", methods=["POST"])
def chat_completions():
    """Return OpenAI-formatted chat completion responses."""
    data = request.get_json()
    default_model = get_models()["data"][0]["id"]
    model = data.get("model", default_model)
    messages = data.get("messages", [])
    stream = data.get("stream", False)

    print(f"[MOCK] Chat completion request - model: {model}, stream: {stream}")

    if stream:
        return handle_streaming_completion(model, messages)
    else:
        return handle_non_streaming_completion(model, messages)


def handle_non_streaming_completion(model, messages):
    response_text = generate_random_text(random.randint(20, 80))

    # Calculate realistic token counts
    prompt_tokens = sum(len(str(msg.get("content", "")).split()) for msg in messages)
    completion_tokens = len(response_text.split())

    response = {
        "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model,
        "choices": [{"index": 0, "message": {"role": "assistant", "content": response_text}, "finish_reason": "stop"}],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        },
    }

    return jsonify(response)


def handle_streaming_completion(model, messages):
    def generate_stream():
        # Generate response text
        full_response = generate_random_text(random.randint(30, 100))
        words = full_response.split()

        # Send initial chunk
        initial_chunk = {
            "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
            "object": "chat.completion.chunk",
            "created": int(time.time()),
            "model": model,
            "choices": [{"index": 0, "delta": {"role": "assistant", "content": ""}}],
        }
        yield f"data: {json.dumps(initial_chunk)}\n\n"

        # Send word by word
        for i, word in enumerate(words):
            chunk = {
                "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
                "object": "chat.completion.chunk",
                "created": int(time.time()),
                "model": model,
                "choices": [{"index": 0, "delta": {"content": f"{word} " if i < len(words) - 1 else word}}],
            }
            yield f"data: {json.dumps(chunk)}\n\n"
            # Configurable delay to simulate realistic streaming
            stream_delay = float(os.getenv("STREAM_DELAY_SECONDS", "0.005"))
            time.sleep(stream_delay)

        # Send final chunk
        final_chunk = {
            "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
            "object": "chat.completion.chunk",
            "created": int(time.time()),
            "model": model,
            "choices": [{"index": 0, "delta": {"content": ""}, "finish_reason": "stop"}],
        }
        yield f"data: {json.dumps(final_chunk)}\n\n"
        yield "data: [DONE]\n\n"

    return Response(
        generate_stream(),
        mimetype="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Access-Control-Allow-Origin": "*",
        },
    )


@app.route("/health", methods=["GET"])
def health():
    return jsonify({"status": "healthy", "type": "openai-mock"})


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="OpenAI-compatible mock server")
    parser.add_argument("--port", type=int, default=8081, help="Port to run the server on (default: 8081)")
    args = parser.parse_args()

    port = args.port

    models = get_models()
    print("Starting OpenAI-compatible mock server...")
    print(f"- /models endpoint with: {[m['id'] for m in models['data']]}")
    print("- OpenAI-formatted chat/completion responses with dynamic content")
    print("- Streaming support with valid SSE format")
    print(f"- Listening on: http://0.0.0.0:{port}")
    app.run(host="0.0.0.0", port=port, debug=False)