openai mock server, Split of "k8s bench, locust"

# What does this PR do? ## Test Plan # What does this PR do? ## Test Plan # What does this PR do? ## Test Plan
2025-12-23 03:12:25 +00:00 · 2025-08-11 16:34:50 -07:00 · 2025-08-11 16:34:50 -07:00 · 9db924adae
commit 9db924adae
parent 78a59a4dbe
11 changed files with 1234 additions and 3 deletions
--- a/docs/source/distributions/k8s-benchmark/locustfile.py
+++ b/docs/source/distributions/k8s-benchmark/locustfile.py
@ -0,0 +1,78 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""
+Locust load testing script for Llama Stack with Prism mock OpenAI provider.
+"""
+
+import random
+from locust import HttpUser, task, between
+import os
+
+base_path = os.getenv("LOCUST_BASE_PATH", "/v1/openai/v1")
+
+MODEL_ID = os.getenv("INFERENCE_MODEL")
+
+class LlamaStackUser(HttpUser):
+    wait_time = between(0.0, 0.0001)
+    
+    def on_start(self):
+        """Setup authentication and test data."""
+        # No auth required for benchmark server
+        self.headers = {
+            "Content-Type": "application/json"
+        }
+        
+        # Test messages of varying lengths
+        self.test_messages = [
+            [{"role": "user", "content": "Hi"}],
+            [{"role": "user", "content": "What is the capital of France?"}],
+            [{"role": "user", "content": "Explain quantum physics in simple terms."}],
+            [{"role": "user", "content": "Write a short story about a robot learning to paint."}],
+            [
+                {"role": "user", "content": "What is machine learning?"},
+                {"role": "assistant", "content": "Machine learning is a subset of AI..."},
+                {"role": "user", "content": "Can you give me a practical example?"}
+            ]
+        ]
+
+    @task(weight=100)
+    def chat_completion_streaming(self):
+        """Test streaming chat completion (20% of requests)."""
+        messages = random.choice(self.test_messages)
+        payload = {
+            "model": MODEL_ID, 
+            "messages": messages,
+            "stream": True,
+            "max_tokens": 100
+        }
+        
+        with self.client.post(
+            f"{base_path}/chat/completions",
+            headers=self.headers,
+            json=payload,
+            stream=True,
+            catch_response=True
+        ) as response:
+            if response.status_code == 200:
+                chunks_received = 0
+                try:
+                    for line in response.iter_lines():
+                        if line:
+                            line_str = line.decode('utf-8')
+                            if line_str.startswith('data: '):
+                                chunks_received += 1
+                                if line_str.strip() == 'data: [DONE]':
+                                    break
+                    
+                    if chunks_received > 0:
+                        response.success()
+                    else:
+                        response.failure("No streaming chunks received")
+                except Exception as e:
+                    response.failure(f"Streaming error: {e}")
+            else:
+                response.failure(f"HTTP {response.status_code}: {response.text}")