Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-04 04:04:14 +00:00)
chore: move benchmarking related code (#3406)
# What does this PR do?

Moves benchmarking-related code under `benchmarking/` and applies some formatting changes.

## Test Plan
This commit is contained in:
parent: d2f88a10fb
commit: c04f1c1e8c

10 changed files with 156 additions and 149 deletions
202  benchmarking/k8s-benchmark/openai-mock-server.py  (Executable file)
@@ -0,0 +1,202 @@

#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""
OpenAI-compatible mock server that returns:
- Hardcoded /models response for consistent validation
- Valid OpenAI-formatted chat completion responses with dynamic content
"""

import argparse
import json
import os
import random
import time
import uuid

from flask import Flask, Response, jsonify, request

app = Flask(__name__)


# Models from environment variables
def get_models():
    models_str = os.getenv("MOCK_MODELS", "meta-llama/Llama-3.2-3B-Instruct")
    model_ids = [m.strip() for m in models_str.split(",") if m.strip()]

    return {
        "object": "list",
        "data": [
            {"id": model_id, "object": "model", "created": 1234567890, "owned_by": "vllm"} for model_id in model_ids
        ],
    }


def generate_random_text(length=50):
    """Generate random but coherent text for responses."""
    words = [
        "Hello", "there", "I'm", "an", "AI", "assistant", "ready", "to", "help", "you",
        "with", "your", "questions", "and", "tasks", "today", "Let", "me", "know", "what",
        "you'd", "like", "to", "discuss", "or", "explore", "together", "I", "can", "assist",
        "with", "various", "topics", "including", "coding", "writing", "analysis", "and", "more",
    ]
    return " ".join(random.choices(words, k=length))


@app.route("/v1/models", methods=["GET"])
def list_models():
    models = get_models()
    print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}")
    return jsonify(models)


@app.route("/v1/chat/completions", methods=["POST"])
def chat_completions():
    """Return OpenAI-formatted chat completion responses."""
    data = request.get_json()
    default_model = get_models()["data"][0]["id"]
    model = data.get("model", default_model)
    messages = data.get("messages", [])
    stream = data.get("stream", False)

    print(f"[MOCK] Chat completion request - model: {model}, stream: {stream}")

    if stream:
        return handle_streaming_completion(model, messages)
    else:
        return handle_non_streaming_completion(model, messages)


def handle_non_streaming_completion(model, messages):
    response_text = generate_random_text(random.randint(20, 80))

    # Calculate realistic token counts
    prompt_tokens = sum(len(str(msg.get("content", "")).split()) for msg in messages)
    completion_tokens = len(response_text.split())

    response = {
        "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model,
        "choices": [{"index": 0, "message": {"role": "assistant", "content": response_text}, "finish_reason": "stop"}],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        },
    }

    return jsonify(response)


def handle_streaming_completion(model, messages):
    def generate_stream():
        # Generate response text
        full_response = generate_random_text(random.randint(30, 100))
        words = full_response.split()

        # Send initial chunk
        initial_chunk = {
            "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
            "object": "chat.completion.chunk",
            "created": int(time.time()),
            "model": model,
            "choices": [{"index": 0, "delta": {"role": "assistant", "content": ""}}],
        }
        yield f"data: {json.dumps(initial_chunk)}\n\n"

        # Send word by word
        for i, word in enumerate(words):
            chunk = {
                "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
                "object": "chat.completion.chunk",
                "created": int(time.time()),
                "model": model,
                "choices": [{"index": 0, "delta": {"content": f"{word} " if i < len(words) - 1 else word}}],
            }
            yield f"data: {json.dumps(chunk)}\n\n"
            # Configurable delay to simulate realistic streaming
            stream_delay = float(os.getenv("STREAM_DELAY_SECONDS", "0.005"))
            time.sleep(stream_delay)

        # Send final chunk
        final_chunk = {
            "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
            "object": "chat.completion.chunk",
            "created": int(time.time()),
            "model": model,
            "choices": [{"index": 0, "delta": {"content": ""}, "finish_reason": "stop"}],
        }
        yield f"data: {json.dumps(final_chunk)}\n\n"
        yield "data: [DONE]\n\n"

    return Response(
        generate_stream(),
        mimetype="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Access-Control-Allow-Origin": "*",
        },
    )


@app.route("/health", methods=["GET"])
def health():
    return jsonify({"status": "healthy", "type": "openai-mock"})


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="OpenAI-compatible mock server")
    parser.add_argument("--port", type=int, default=8081, help="Port to run the server on (default: 8081)")
    args = parser.parse_args()

    port = args.port

    models = get_models()
    print("Starting OpenAI-compatible mock server...")
    print(f"- /models endpoint with: {[m['id'] for m in models['data']]}")
    print("- OpenAI-formatted chat/completion responses with dynamic content")
    print("- Streaming support with valid SSE format")
    print(f"- Listening on: http://0.0.0.0:{port}")
    app.run(host="0.0.0.0", port=port, debug=False)