chore: move benchmarking related code (#3406)

# What does this PR do?
- Moves the k8s benchmark assets from `docs/source/distributions/k8s-benchmark/` to `benchmarking/k8s-benchmark/` and applies formatting cleanups to the benchmark and mock-server scripts.


## Test Plan
Author: ehhuang, 2025-09-10 13:19:44 -07:00, committed by GitHub
Commit: c04f1c1e8c, parent: d2f88a10fb
10 changed files with 156 additions and 149 deletions


@@ -34,13 +34,12 @@ This data enables data-driven architectural decisions and performance optimizati
 **1. Deploy base k8s infrastructure:**
 ```bash
-cd ../k8s
+cd ../../docs/source/distributions/k8s
 ./apply.sh
 ```
 
 **2. Deploy benchmark components:**
 ```bash
-cd ../k8s-benchmark
 ./apply.sh
 ```
@@ -56,7 +55,6 @@ kubectl get pods
 **Benchmark Llama Stack (default):**
 ```bash
-cd docs/source/distributions/k8s-benchmark/
 ./run-benchmark.sh
 ```
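Taken together, the path changes above imply the following end-to-end flow. This is a sketch only, assuming `benchmarking/k8s-benchmark/` sits at the repository root and the script names are unchanged:

```bash
# Sketch of the relocated workflow; paths follow the updated README above.
# Start from the new benchmark directory at the repo root.
cd benchmarking/k8s-benchmark

# 1. Deploy the base k8s infrastructure (still lives under docs/).
(cd ../../docs/source/distributions/k8s && ./apply.sh)

# 2. Deploy the benchmark components from their new location.
./apply.sh

# 3. Check that the pods are up, then benchmark Llama Stack.
kubectl get pods
./run-benchmark.sh
```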


@@ -14,7 +14,7 @@ import os
 import random
 import statistics
 import time
-from typing import Tuple
 
 import aiohttp
@@ -57,9 +57,9 @@ class BenchmarkStats:
         success_rate = (self.success_count / self.total_requests) * 100
         print(f"\n{'=' * 60}")
-        print(f"BENCHMARK RESULTS")
-        print(f"\nResponse Time Statistics:")
+        print("BENCHMARK RESULTS")
+        print("\nResponse Time Statistics:")
         print(f" Mean: {statistics.mean(self.response_times):.3f}s")
         print(f" Median: {statistics.median(self.response_times):.3f}s")
         print(f" Min: {min(self.response_times):.3f}s")
@@ -70,14 +70,14 @@ class BenchmarkStats:
         percentiles = [50, 90, 95, 99]
         sorted_times = sorted(self.response_times)
-        print(f"\nPercentiles:")
+        print("\nPercentiles:")
         for p in percentiles:
             idx = int(len(sorted_times) * p / 100) - 1
             idx = max(0, min(idx, len(sorted_times) - 1))
             print(f" P{p}: {sorted_times[idx]:.3f}s")
 
         if self.ttft_times:
-            print(f"\nTime to First Token (TTFT) Statistics:")
+            print("\nTime to First Token (TTFT) Statistics:")
             print(f" Mean: {statistics.mean(self.ttft_times):.3f}s")
             print(f" Median: {statistics.median(self.ttft_times):.3f}s")
             print(f" Min: {min(self.ttft_times):.3f}s")
@@ -87,14 +87,14 @@ class BenchmarkStats:
             print(f" Std Dev: {statistics.stdev(self.ttft_times):.3f}s")
             sorted_ttft = sorted(self.ttft_times)
-            print(f"\nTTFT Percentiles:")
+            print("\nTTFT Percentiles:")
             for p in percentiles:
                 idx = int(len(sorted_ttft) * p / 100) - 1
                 idx = max(0, min(idx, len(sorted_ttft) - 1))
                 print(f" P{p}: {sorted_ttft[idx]:.3f}s")
 
         if self.chunks_received:
-            print(f"\nStreaming Statistics:")
+            print("\nStreaming Statistics:")
             print(f" Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
             print(f" Total chunks received: {sum(self.chunks_received)}")
@@ -108,14 +108,14 @@ class BenchmarkStats:
         print(f"Requests per second: {self.success_count / total_time:.2f}")
 
         if self.errors:
-            print(f"\nErrors (showing first 5):")
+            print("\nErrors (showing first 5):")
             for error in self.errors[:5]:
                 print(f" {error}")
 
 
 class LlamaStackBenchmark:
     def __init__(self, base_url: str, model_id: str):
-        self.base_url = base_url.rstrip('/')
+        self.base_url = base_url.rstrip("/")
         self.model_id = model_id
         self.headers = {"Content-Type": "application/json"}
         self.test_messages = [
@@ -126,20 +126,14 @@ class LlamaStackBenchmark:
             [
                 {"role": "user", "content": "What is machine learning?"},
                 {"role": "assistant", "content": "Machine learning is a subset of AI..."},
-                {"role": "user", "content": "Can you give me a practical example?"}
-            ]
+                {"role": "user", "content": "Can you give me a practical example?"},
+            ],
         ]
 
-    async def make_async_streaming_request(self) -> Tuple[float, int, float | None, str | None]:
+    async def make_async_streaming_request(self) -> tuple[float, int, float | None, str | None]:
         """Make a single async streaming chat completion request."""
         messages = random.choice(self.test_messages)
-        payload = {
-            "model": self.model_id,
-            "messages": messages,
-            "stream": True,
-            "max_tokens": 100
-        }
+        payload = {"model": self.model_id, "messages": messages, "stream": True, "max_tokens": 100}
 
         start_time = time.time()
         chunks_received = 0
@@ -153,17 +147,17 @@ class LlamaStackBenchmark:
                     f"{self.base_url}/chat/completions",
                     headers=self.headers,
                     json=payload,
-                    timeout=aiohttp.ClientTimeout(total=30)
+                    timeout=aiohttp.ClientTimeout(total=30),
                 ) as response:
                     if response.status == 200:
                         async for line in response.content:
                             if line:
-                                line_str = line.decode('utf-8').strip()
-                                if line_str.startswith('data: '):
+                                line_str = line.decode("utf-8").strip()
+                                if line_str.startswith("data: "):
                                     chunks_received += 1
                                     if ttft is None:
                                         ttft = time.time() - start_time
-                                    if line_str == 'data: [DONE]':
+                                    if line_str == "data: [DONE]":
                                         break
 
                         if chunks_received == 0:
@@ -180,7 +174,6 @@ class LlamaStackBenchmark:
         response_time = time.time() - start_time
         return response_time, chunks_received, ttft, error
 
-
     async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats:
         """Run benchmark using async requests for specified duration."""
         stats = BenchmarkStats()
@@ -192,7 +185,7 @@ class LlamaStackBenchmark:
         print(f"Model: {self.model_id}")
 
         connector = aiohttp.TCPConnector(limit=concurrent_users)
-        async with aiohttp.ClientSession(connector=connector) as session:
+        async with aiohttp.ClientSession(connector=connector):
 
             async def worker(worker_id: int):
                 """Worker that sends requests sequentially until canceled."""
@@ -216,7 +209,9 @@ class LlamaStackBenchmark:
                         await asyncio.sleep(1)  # Report every second
                         if time.time() >= last_report_time + 10:  # Report every 10 seconds
                             elapsed = time.time() - stats.start_time
-                            print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s, RPS: {stats.total_requests / elapsed:.1f}")
+                            print(
+                                f"Completed: {stats.total_requests} requests in {elapsed:.1f}s, RPS: {stats.total_requests / elapsed:.1f}"
+                            )
                             last_report_time = time.time()
                     except asyncio.CancelledError:
                         break
@@ -241,14 +236,16 @@ class LlamaStackBenchmark:
 
 def main():
     parser = argparse.ArgumentParser(description="Llama Stack Benchmark Tool")
-    parser.add_argument("--base-url", default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
-                        help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)")
-    parser.add_argument("--model", default=os.getenv("INFERENCE_MODEL", "test-model"),
-                        help="Model ID to use for requests")
-    parser.add_argument("--duration", type=int, default=60,
-                        help="Duration in seconds to run benchmark (default: 60)")
-    parser.add_argument("--concurrent", type=int, default=10,
-                        help="Number of concurrent users (default: 10)")
+    parser.add_argument(
+        "--base-url",
+        default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
+        help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)",
+    )
+    parser.add_argument(
+        "--model", default=os.getenv("INFERENCE_MODEL", "test-model"), help="Model ID to use for requests"
+    )
+    parser.add_argument("--duration", type=int, default=60, help="Duration in seconds to run benchmark (default: 60)")
+    parser.add_argument("--concurrent", type=int, default=10, help="Number of concurrent users (default: 10)")
 
     args = parser.parse_args()
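The reformatted argparse block keeps the same CLI surface. As a usage sketch (the script filename `benchmark.py` is assumed here; it is not shown in the diff), the tool can be driven either through environment variables or flags:

```bash
# Sketch only: the script filename (benchmark.py) is assumed; the flags and
# environment-variable defaults come from the argparse block above.
export BENCHMARK_BASE_URL="http://localhost:8000/v1/openai/v1"
export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"

# Defaults: 60s duration, 10 concurrent users.
python benchmark.py

# Or set everything explicitly on the command line.
python benchmark.py \
  --base-url "$BENCHMARK_BASE_URL" \
  --model "$INFERENCE_MODEL" \
  --duration 120 \
  --concurrent 20
```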


@@ -11,16 +11,18 @@ OpenAI-compatible mock server that returns:
 - Valid OpenAI-formatted chat completion responses with dynamic content
 """
 
-from flask import Flask, request, jsonify, Response
-import time
-import random
-import uuid
-import json
 import argparse
+import json
 import os
+import random
+import time
+import uuid
+
+from flask import Flask, Response, jsonify, request
 
 app = Flask(__name__)
 
+
 # Models from environment variables
 def get_models():
     models_str = os.getenv("MOCK_MODELS", "meta-llama/Llama-3.2-3B-Instruct")
@@ -29,40 +31,72 @@ def get_models():
     return {
         "object": "list",
         "data": [
-            {
-                "id": model_id,
-                "object": "model",
-                "created": 1234567890,
-                "owned_by": "vllm"
-            }
-            for model_id in model_ids
-        ]
+            {"id": model_id, "object": "model", "created": 1234567890, "owned_by": "vllm"} for model_id in model_ids
+        ],
     }
 
 
 def generate_random_text(length=50):
     """Generate random but coherent text for responses."""
     words = [
-        "Hello", "there", "I'm", "an", "AI", "assistant", "ready", "to", "help", "you",
-        "with", "your", "questions", "and", "tasks", "today", "Let", "me","know", "what",
-        "you'd", "like", "to", "discuss", "or", "explore", "together", "I", "can", "assist",
-        "with", "various", "topics", "including", "coding", "writing", "analysis", "and", "more"
+        "Hello",
+        "there",
+        "I'm",
+        "an",
+        "AI",
+        "assistant",
+        "ready",
+        "to",
+        "help",
+        "you",
+        "with",
+        "your",
+        "questions",
+        "and",
+        "tasks",
+        "today",
+        "Let",
+        "me",
+        "know",
+        "what",
+        "you'd",
+        "like",
+        "to",
+        "discuss",
+        "or",
+        "explore",
+        "together",
+        "I",
+        "can",
+        "assist",
+        "with",
+        "various",
+        "topics",
+        "including",
+        "coding",
+        "writing",
+        "analysis",
+        "and",
+        "more",
     ]
     return " ".join(random.choices(words, k=length))
 
 
-@app.route('/v1/models', methods=['GET'])
+@app.route("/v1/models", methods=["GET"])
 def list_models():
     models = get_models()
     print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}")
     return jsonify(models)
 
 
-@app.route('/v1/chat/completions', methods=['POST'])
+@app.route("/v1/chat/completions", methods=["POST"])
 def chat_completions():
     """Return OpenAI-formatted chat completion responses."""
     data = request.get_json()
-    default_model = get_models()['data'][0]['id']
-    model = data.get('model', default_model)
-    messages = data.get('messages', [])
-    stream = data.get('stream', False)
+    default_model = get_models()["data"][0]["id"]
+    model = data.get("model", default_model)
+    messages = data.get("messages", [])
+    stream = data.get("stream", False)
 
     print(f"[MOCK] Chat completion request - model: {model}, stream: {stream}")
@@ -71,11 +105,12 @@ def chat_completions():
     else:
         return handle_non_streaming_completion(model, messages)
 
+
 def handle_non_streaming_completion(model, messages):
     response_text = generate_random_text(random.randint(20, 80))
 
     # Calculate realistic token counts
-    prompt_tokens = sum(len(str(msg.get('content', '')).split()) for msg in messages)
+    prompt_tokens = sum(len(str(msg.get("content", "")).split()) for msg in messages)
     completion_tokens = len(response_text.split())
 
     response = {
@@ -83,25 +118,17 @@ def handle_non_streaming_completion(model, messages):
         "object": "chat.completion",
         "created": int(time.time()),
         "model": model,
-        "choices": [
-            {
-                "index": 0,
-                "message": {
-                    "role": "assistant",
-                    "content": response_text
-                },
-                "finish_reason": "stop"
-            }
-        ],
+        "choices": [{"index": 0, "message": {"role": "assistant", "content": response_text}, "finish_reason": "stop"}],
         "usage": {
             "prompt_tokens": prompt_tokens,
             "completion_tokens": completion_tokens,
-            "total_tokens": prompt_tokens + completion_tokens
-        }
+            "total_tokens": prompt_tokens + completion_tokens,
+        },
     }
 
     return jsonify(response)
 
 
 def handle_streaming_completion(model, messages):
     def generate_stream():
         # Generate response text
@@ -114,12 +141,7 @@ def handle_streaming_completion(model, messages):
             "object": "chat.completion.chunk",
             "created": int(time.time()),
             "model": model,
-            "choices": [
-                {
-                    "index": 0,
-                    "delta": {"role": "assistant", "content": ""}
-                }
-            ]
+            "choices": [{"index": 0, "delta": {"role": "assistant", "content": ""}}],
         }
 
         yield f"data: {json.dumps(initial_chunk)}\n\n"
@@ -130,12 +152,7 @@ def handle_streaming_completion(model, messages):
                 "object": "chat.completion.chunk",
                 "created": int(time.time()),
                 "model": model,
-                "choices": [
-                    {
-                        "index": 0,
-                        "delta": {"content": f"{word} " if i < len(words) - 1 else word}
-                    }
-                ]
+                "choices": [{"index": 0, "delta": {"content": f"{word} " if i < len(words) - 1 else word}}],
             }
 
             yield f"data: {json.dumps(chunk)}\n\n"
             # Configurable delay to simulate realistic streaming
@@ -148,35 +165,30 @@ def handle_streaming_completion(model, messages):
             "object": "chat.completion.chunk",
             "created": int(time.time()),
             "model": model,
-            "choices": [
-                {
-                    "index": 0,
-                    "delta": {"content": ""},
-                    "finish_reason": "stop"
-                }
-            ]
+            "choices": [{"index": 0, "delta": {"content": ""}, "finish_reason": "stop"}],
         }
 
         yield f"data: {json.dumps(final_chunk)}\n\n"
         yield "data: [DONE]\n\n"
 
     return Response(
         generate_stream(),
-        mimetype='text/event-stream',
+        mimetype="text/event-stream",
         headers={
-            'Cache-Control': 'no-cache',
-            'Connection': 'keep-alive',
-            'Access-Control-Allow-Origin': '*',
-        }
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "Access-Control-Allow-Origin": "*",
+        },
     )
 
 
-@app.route('/health', methods=['GET'])
+@app.route("/health", methods=["GET"])
 def health():
     return jsonify({"status": "healthy", "type": "openai-mock"})
 
 
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='OpenAI-compatible mock server')
-    parser.add_argument('--port', type=int, default=8081,
-                        help='Port to run the server on (default: 8081)')
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="OpenAI-compatible mock server")
+    parser.add_argument("--port", type=int, default=8081, help="Port to run the server on (default: 8081)")
     args = parser.parse_args()
     port = args.port
@@ -187,4 +199,4 @@ if __name__ == '__main__':
     print("- OpenAI-formatted chat/completion responses with dynamic content")
     print("- Streaming support with valid SSE format")
     print(f"- Listening on: http://0.0.0.0:{port}")
-    app.run(host='0.0.0.0', port=port, debug=False)
+    app.run(host="0.0.0.0", port=port, debug=False)
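For reference, the mock server's endpoints shown above can be exercised directly. This is a sketch assuming the server is already running on its default port 8081 with the default `MOCK_MODELS`:

```bash
# Sketch only: assumes the mock server from the diff above is running on
# its default port (8081) and MOCK_MODELS was left at its default value.
curl http://localhost:8081/health
curl http://localhost:8081/v1/models

# Non-streaming chat completion.
curl http://localhost:8081/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "meta-llama/Llama-3.2-3B-Instruct", "messages": [{"role": "user", "content": "Hello"}], "stream": false}'

# Streaming (SSE) chat completion, terminated by "data: [DONE]".
curl -N http://localhost:8081/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "meta-llama/Llama-3.2-3B-Instruct", "messages": [{"role": "user", "content": "Hello"}], "stream": true}'
```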


@@ -35,5 +35,5 @@ testing/record-replay
 ### Benchmarking
 
-```{include} ../../../docs/source/distributions/k8s-benchmark/README.md
+```{include} ../../../benchmarking/k8s-benchmark/README.md
 ```