chore: move benchmarking related code (#3406)

# What does this PR do?
- Moves benchmarking-related code to its own location and applies some formatting changes.


## Test Plan
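A minimal smoke test against the mock server added below (a sketch, assuming the server is running locally on its default port 8081 and that the third-party `requests` package is available):

```python
import requests

BASE = "http://localhost:8081"

# The mock should report healthy and list the models from MOCK_MODELS.
assert requests.get(f"{BASE}/health").json()["status"] == "healthy"
models = requests.get(f"{BASE}/v1/models").json()["data"]
print([m["id"] for m in models])

# A non-streaming chat completion should come back in OpenAI format.
resp = requests.post(
    f"{BASE}/v1/chat/completions",
    json={"model": models[0]["id"], "messages": [{"role": "user", "content": "hi"}]},
).json()
print(resp["choices"][0]["message"]["content"])
```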
ehhuang · 2025-09-10 13:19:44 -07:00 · committed by GitHub
parent d2f88a10fb · commit c04f1c1e8c
10 changed files with 156 additions and 149 deletions


@@ -0,0 +1,202 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""
OpenAI-compatible mock server that returns:
- Hardcoded /models response for consistent validation
- Valid OpenAI-formatted chat completion responses with dynamic content
"""
import argparse
import json
import os
import random
import time
import uuid

from flask import Flask, Response, jsonify, request

app = Flask(__name__)


# Model IDs come from the MOCK_MODELS env var (comma-separated)
def get_models():
models_str = os.getenv("MOCK_MODELS", "meta-llama/Llama-3.2-3B-Instruct")
model_ids = [m.strip() for m in models_str.split(",") if m.strip()]
return {
"object": "list",
"data": [
{"id": model_id, "object": "model", "created": 1234567890, "owned_by": "vllm"} for model_id in model_ids
],
    }


def generate_random_text(length=50):
"""Generate random but coherent text for responses."""
words = [
"Hello",
"there",
"I'm",
"an",
"AI",
"assistant",
"ready",
"to",
"help",
"you",
"with",
"your",
"questions",
"and",
"tasks",
"today",
"Let",
"me",
"know",
"what",
"you'd",
"like",
"to",
"discuss",
"or",
"explore",
"together",
"I",
"can",
"assist",
"with",
"various",
"topics",
"including",
"coding",
"writing",
"analysis",
"and",
"more",
]
return " ".join(random.choices(words, k=length))
@app.route("/v1/models", methods=["GET"])
def list_models():
models = get_models()
print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}")
    return jsonify(models)


@app.route("/v1/chat/completions", methods=["POST"])
def chat_completions():
"""Return OpenAI-formatted chat completion responses."""
data = request.get_json()
default_model = get_models()["data"][0]["id"]
model = data.get("model", default_model)
messages = data.get("messages", [])
    stream = data.get("stream", False)

    print(f"[MOCK] Chat completion request - model: {model}, stream: {stream}")

    if stream:
return handle_streaming_completion(model, messages)
else:
        return handle_non_streaming_completion(model, messages)


def handle_non_streaming_completion(model, messages):
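    """Build a complete (non-streaming) OpenAI-format chat completion response."""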
    response_text = generate_random_text(random.randint(20, 80))

    # Calculate realistic token counts
prompt_tokens = sum(len(str(msg.get("content", "")).split()) for msg in messages)
    completion_tokens = len(response_text.split())

    response = {
"id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
"object": "chat.completion",
"created": int(time.time()),
"model": model,
"choices": [{"index": 0, "message": {"role": "assistant", "content": response_text}, "finish_reason": "stop"}],
"usage": {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"total_tokens": prompt_tokens + completion_tokens,
},
}
    return jsonify(response)


def handle_streaming_completion(model, messages):
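    """Stream an OpenAI-format completion as server-sent events, word by word."""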
def generate_stream():
# Generate response text
full_response = generate_random_text(random.randint(30, 100))
        words = full_response.split()

        # Send initial chunk
initial_chunk = {
"id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
"object": "chat.completion.chunk",
"created": int(time.time()),
"model": model,
"choices": [{"index": 0, "delta": {"role": "assistant", "content": ""}}],
}
yield f"data: {json.dumps(initial_chunk)}\n\n"
# Send word by word
for i, word in enumerate(words):
chunk = {
"id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
"object": "chat.completion.chunk",
"created": int(time.time()),
"model": model,
"choices": [{"index": 0, "delta": {"content": f"{word} " if i < len(words) - 1 else word}}],
}
yield f"data: {json.dumps(chunk)}\n\n"
# Configurable delay to simulate realistic streaming
stream_delay = float(os.getenv("STREAM_DELAY_SECONDS", "0.005"))
                time.sleep(stream_delay)

        # Send final chunk
final_chunk = {
"id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
"object": "chat.completion.chunk",
"created": int(time.time()),
"model": model,
"choices": [{"index": 0, "delta": {"content": ""}, "finish_reason": "stop"}],
}
yield f"data: {json.dumps(final_chunk)}\n\n"
yield "data: [DONE]\n\n"
return Response(
generate_stream(),
mimetype="text/event-stream",
headers={
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"Access-Control-Allow-Origin": "*",
},
    )


@app.route("/health", methods=["GET"])
def health():
    return jsonify({"status": "healthy", "type": "openai-mock"})


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="OpenAI-compatible mock server")
parser.add_argument("--port", type=int, default=8081, help="Port to run the server on (default: 8081)")
    args = parser.parse_args()
    port = args.port

    models = get_models()
    print("Starting OpenAI-compatible mock server...")
print(f"- /models endpoint with: {[m['id'] for m in models['data']]}")
print("- OpenAI-formatted chat/completion responses with dynamic content")
print("- Streaming support with valid SSE format")
print(f"- Listening on: http://0.0.0.0:{port}")
app.run(host="0.0.0.0", port=port, debug=False)
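
For the streaming path, a client can read the SSE response line by line. A rough sketch of consuming it, again assuming a local server on port 8081, the default `MOCK_MODELS` model ID, and the third-party `requests` package:

```python
import json

import requests

resp = requests.post(
    "http://localhost:8081/v1/chat/completions",
    json={
        "model": "meta-llama/Llama-3.2-3B-Instruct",
        "messages": [{"role": "user", "content": "hi"}],
        "stream": True,
    },
    stream=True,
)
for line in resp.iter_lines(decode_unicode=True):
    # SSE payload lines look like "data: {...}"; skip keep-alives and blanks.
    if not line or not line.startswith("data: "):
        continue
    payload = line[len("data: "):]
    if payload == "[DONE]":
        break
    chunk = json.loads(payload)
    print(chunk["choices"][0]["delta"].get("content", ""), end="", flush=True)
print()
```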