mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-04 04:04:14 +00:00)
chore: move benchmarking related code (#3406)

# What does this PR do?

Moving things and some formatting changes.

## Test Plan

parent d2f88a10fb
commit c04f1c1e8c

10 changed files with 156 additions and 149 deletions
@@ -34,13 +34,12 @@ This data enables data-driven architectural decisions and performance optimizati
 
 **1. Deploy base k8s infrastructure:**
 ```bash
-cd ../k8s
+cd ../../docs/source/distributions/k8s
 ./apply.sh
 ```
 
 **2. Deploy benchmark components:**
 ```bash
-cd ../k8s-benchmark
 ./apply.sh
 ```
 
@@ -56,7 +55,6 @@ kubectl get pods
 
 **Benchmark Llama Stack (default):**
 ```bash
-cd docs/source/distributions/k8s-benchmark/
 ./run-benchmark.sh
 ```
 
@@ -14,7 +14,7 @@ import os
 import random
 import statistics
 import time
-from typing import Tuple
+
 import aiohttp
 
 
@@ -56,10 +56,10 @@ class BenchmarkStats:
 total_time = self.end_time - self.start_time
 success_rate = (self.success_count / self.total_requests) * 100
 
-print(f"\n{'='*60}")
+print(f"\n{'=' * 60}")
-print(f"BENCHMARK RESULTS")
+print("BENCHMARK RESULTS")
 
-print(f"\nResponse Time Statistics:")
+print("\nResponse Time Statistics:")
 print(f" Mean: {statistics.mean(self.response_times):.3f}s")
 print(f" Median: {statistics.median(self.response_times):.3f}s")
 print(f" Min: {min(self.response_times):.3f}s")
@@ -70,14 +70,14 @@ class BenchmarkStats:
 
 percentiles = [50, 90, 95, 99]
 sorted_times = sorted(self.response_times)
-print(f"\nPercentiles:")
+print("\nPercentiles:")
 for p in percentiles:
 idx = int(len(sorted_times) * p / 100) - 1
 idx = max(0, min(idx, len(sorted_times) - 1))
 print(f" P{p}: {sorted_times[idx]:.3f}s")
 
 if self.ttft_times:
-print(f"\nTime to First Token (TTFT) Statistics:")
+print("\nTime to First Token (TTFT) Statistics:")
 print(f" Mean: {statistics.mean(self.ttft_times):.3f}s")
 print(f" Median: {statistics.median(self.ttft_times):.3f}s")
 print(f" Min: {min(self.ttft_times):.3f}s")
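The percentile report above uses a nearest-rank index into the sorted samples rather than interpolation. A small standalone sketch of that indexing; the sample response times are invented for illustration:

```python
# Nearest-rank percentile lookup, mirroring the index math in the hunk above.
# The sample response times are illustrative, not from a real run.
response_times = [0.21, 0.25, 0.30, 0.34, 0.41, 0.55, 0.62, 0.80, 1.10, 2.40]

sorted_times = sorted(response_times)
for p in [50, 90, 95, 99]:
    idx = int(len(sorted_times) * p / 100) - 1      # nearest-rank index
    idx = max(0, min(idx, len(sorted_times) - 1))   # clamp to a valid position
    print(f" P{p}: {sorted_times[idx]:.3f}s")
```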
@@ -87,18 +87,18 @@ class BenchmarkStats:
 print(f" Std Dev: {statistics.stdev(self.ttft_times):.3f}s")
 
 sorted_ttft = sorted(self.ttft_times)
-print(f"\nTTFT Percentiles:")
+print("\nTTFT Percentiles:")
 for p in percentiles:
 idx = int(len(sorted_ttft) * p / 100) - 1
 idx = max(0, min(idx, len(sorted_ttft) - 1))
 print(f" P{p}: {sorted_ttft[idx]:.3f}s")
 
 if self.chunks_received:
-print(f"\nStreaming Statistics:")
+print("\nStreaming Statistics:")
 print(f" Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
 print(f" Total chunks received: {sum(self.chunks_received)}")
 
-print(f"{'='*60}")
+print(f"{'=' * 60}")
 print(f"Total time: {total_time:.2f}s")
 print(f"Concurrent users: {self.concurrent_users}")
 print(f"Total requests: {self.total_requests}")
@@ -108,14 +108,14 @@ class BenchmarkStats:
 print(f"Requests per second: {self.success_count / total_time:.2f}")
 
 if self.errors:
-print(f"\nErrors (showing first 5):")
+print("\nErrors (showing first 5):")
 for error in self.errors[:5]:
 print(f" {error}")
 
 
 class LlamaStackBenchmark:
 def __init__(self, base_url: str, model_id: str):
-self.base_url = base_url.rstrip('/')
+self.base_url = base_url.rstrip("/")
 self.model_id = model_id
 self.headers = {"Content-Type": "application/json"}
 self.test_messages = [
@@ -126,20 +126,14 @@ class LlamaStackBenchmark:
 [
 {"role": "user", "content": "What is machine learning?"},
 {"role": "assistant", "content": "Machine learning is a subset of AI..."},
-{"role": "user", "content": "Can you give me a practical example?"}
+{"role": "user", "content": "Can you give me a practical example?"},
-]
+],
 ]
 
-async def make_async_streaming_request(self) -> Tuple[float, int, float | None, str | None]:
+async def make_async_streaming_request(self) -> tuple[float, int, float | None, str | None]:
 """Make a single async streaming chat completion request."""
 messages = random.choice(self.test_messages)
-payload = {
-"model": self.model_id,
-"messages": messages,
-"stream": True,
-"max_tokens": 100
-}
+payload = {"model": self.model_id, "messages": messages, "stream": True, "max_tokens": 100}
 
 start_time = time.time()
 chunks_received = 0
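The coroutine above returns a `(response_time, chunks_received, ttft, error)` tuple. A hedged sketch of how a caller might consume it; the class is assumed importable from the moved module, and the URL and model come from the defaults shown later in this diff:

```python
import asyncio


async def one_request() -> None:
    # Hypothetical driver; assumes a stack (or mock) is serving at the default URL.
    bench = LlamaStackBenchmark("http://localhost:8000/v1/openai/v1", "test-model")
    response_time, chunks, ttft, error = await bench.make_async_streaming_request()
    if error is None:
        print(f"{chunks} chunks in {response_time:.3f}s (TTFT {ttft:.3f}s)")
    else:
        print(f"request failed: {error}")


asyncio.run(one_request())
```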
@@ -153,17 +147,17 @@ class LlamaStackBenchmark:
 f"{self.base_url}/chat/completions",
 headers=self.headers,
 json=payload,
-timeout=aiohttp.ClientTimeout(total=30)
+timeout=aiohttp.ClientTimeout(total=30),
 ) as response:
 if response.status == 200:
 async for line in response.content:
 if line:
-line_str = line.decode('utf-8').strip()
+line_str = line.decode("utf-8").strip()
-if line_str.startswith('data: '):
+if line_str.startswith("data: "):
 chunks_received += 1
 if ttft is None:
 ttft = time.time() - start_time
-if line_str == 'data: [DONE]':
+if line_str == "data: [DONE]":
 break
 
 if chunks_received == 0:
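The streaming loop above measures time-to-first-token by timestamping the first `data: ` line of the SSE response. A self-contained sketch of the same parsing over a canned byte stream; the sample lines are invented for illustration:

```python
import time

# Canned SSE lines standing in for response.content; real chunks carry larger JSON payloads.
fake_stream = [
    b'data: {"choices": [{"delta": {"content": "Hello"}}]}\n',
    b'data: {"choices": [{"delta": {"content": " world"}}]}\n',
    b"data: [DONE]\n",
]

start_time = time.time()
ttft = None
chunks_received = 0
for line in fake_stream:
    line_str = line.decode("utf-8").strip()
    if line_str.startswith("data: "):
        chunks_received += 1
        if ttft is None:
            ttft = time.time() - start_time  # first-token latency
        if line_str == "data: [DONE]":
            break

print(chunks_received, ttft)
```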
@@ -180,7 +174,6 @@ class LlamaStackBenchmark:
 response_time = time.time() - start_time
 return response_time, chunks_received, ttft, error
 
 
 async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats:
 """Run benchmark using async requests for specified duration."""
 stats = BenchmarkStats()
@@ -192,7 +185,7 @@ class LlamaStackBenchmark:
 print(f"Model: {self.model_id}")
 
 connector = aiohttp.TCPConnector(limit=concurrent_users)
-async with aiohttp.ClientSession(connector=connector) as session:
+async with aiohttp.ClientSession(connector=connector):
 
 async def worker(worker_id: int):
 """Worker that sends requests sequentially until canceled."""
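The `as session` binding is dropped here, leaving `TCPConnector(limit=...)` purely as a cap on connection concurrency for the enclosing block. A minimal sketch of that connector-limited pattern; the URL and task count are invented for illustration:

```python
import asyncio

import aiohttp


async def fetch(session: aiohttp.ClientSession, url: str) -> int:
    async with session.get(url) as resp:
        return resp.status


async def main() -> None:
    # At most 10 TCP connections are open at once, analogous to concurrent_users above.
    connector = aiohttp.TCPConnector(limit=10)
    async with aiohttp.ClientSession(connector=connector) as session:
        # Hypothetical local endpoint; 50 requests share the 10-connection budget.
        statuses = await asyncio.gather(*[fetch(session, "http://localhost:8000/health") for _ in range(50)])
        print(statuses[:5])


asyncio.run(main())
```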
@@ -216,7 +209,9 @@ class LlamaStackBenchmark:
 await asyncio.sleep(1)  # Report every second
 if time.time() >= last_report_time + 10:  # Report every 10 seconds
 elapsed = time.time() - stats.start_time
-print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s, RPS: {stats.total_requests / elapsed:.1f}")
+print(
+f"Completed: {stats.total_requests} requests in {elapsed:.1f}s, RPS: {stats.total_requests / elapsed:.1f}"
+)
 last_report_time = time.time()
 except asyncio.CancelledError:
 break
@@ -241,14 +236,16 @@ class LlamaStackBenchmark:
 
 def main():
 parser = argparse.ArgumentParser(description="Llama Stack Benchmark Tool")
-parser.add_argument("--base-url", default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
-help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)")
-parser.add_argument("--model", default=os.getenv("INFERENCE_MODEL", "test-model"),
-help="Model ID to use for requests")
-parser.add_argument("--duration", type=int, default=60,
-help="Duration in seconds to run benchmark (default: 60)")
-parser.add_argument("--concurrent", type=int, default=10,
-help="Number of concurrent users (default: 10)")
+parser.add_argument(
+"--base-url",
+default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
+help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)",
+)
+parser.add_argument(
+"--model", default=os.getenv("INFERENCE_MODEL", "test-model"), help="Model ID to use for requests"
+)
+parser.add_argument("--duration", type=int, default=60, help="Duration in seconds to run benchmark (default: 60)")
+parser.add_argument("--concurrent", type=int, default=10, help="Number of concurrent users (default: 10)")
 
 args = parser.parse_args()
 
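The flags above take their defaults from the BENCHMARK_BASE_URL and INFERENCE_MODEL environment variables. A small sketch of that behavior with the parser exercised programmatically; the env-var values and argv list are examples only:

```python
import argparse
import os

# Setting the env vars before building the parser changes its defaults, as in the code above.
os.environ["BENCHMARK_BASE_URL"] = "http://my-stack:8000/v1/openai/v1"  # hypothetical in-cluster URL
os.environ["INFERENCE_MODEL"] = "meta-llama/Llama-3.2-3B-Instruct"

parser = argparse.ArgumentParser(description="Llama Stack Benchmark Tool")
parser.add_argument("--base-url", default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"))
parser.add_argument("--model", default=os.getenv("INFERENCE_MODEL", "test-model"))
parser.add_argument("--duration", type=int, default=60)
parser.add_argument("--concurrent", type=int, default=10)

# Example: a 2-minute run with 20 concurrent users.
args = parser.parse_args(["--duration", "120", "--concurrent", "20"])
print(args.base_url, args.model, args.duration, args.concurrent)
```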
@@ -11,16 +11,18 @@ OpenAI-compatible mock server that returns:
 - Valid OpenAI-formatted chat completion responses with dynamic content
 """
 
-from flask import Flask, request, jsonify, Response
-import time
-import random
-import uuid
-import json
 import argparse
+import json
 import os
+import random
+import time
+import uuid
 
+from flask import Flask, Response, jsonify, request
 
 app = Flask(__name__)
 
 
 # Models from environment variables
 def get_models():
 models_str = os.getenv("MOCK_MODELS", "meta-llama/Llama-3.2-3B-Instruct")
@@ -29,40 +31,72 @@ def get_models():
 return {
 "object": "list",
 "data": [
-{
-"id": model_id,
-"object": "model",
-"created": 1234567890,
-"owned_by": "vllm"
-}
-for model_id in model_ids
-]
+{"id": model_id, "object": "model", "created": 1234567890, "owned_by": "vllm"} for model_id in model_ids
+],
 }
 
 
 def generate_random_text(length=50):
 """Generate random but coherent text for responses."""
 words = [
-"Hello", "there", "I'm", "an", "AI", "assistant", "ready", "to", "help", "you",
-"with", "your", "questions", "and", "tasks", "today", "Let", "me","know", "what",
-"you'd", "like", "to", "discuss", "or", "explore", "together", "I", "can", "assist",
-"with", "various", "topics", "including", "coding", "writing", "analysis", "and", "more"
+"Hello",
+"there",
+"I'm",
+"an",
+"AI",
+"assistant",
+"ready",
+"to",
+"help",
+"you",
+"with",
+"your",
+"questions",
+"and",
+"tasks",
+"today",
+"Let",
+"me",
+"know",
+"what",
+"you'd",
+"like",
+"to",
+"discuss",
+"or",
+"explore",
+"together",
+"I",
+"can",
+"assist",
+"with",
+"various",
+"topics",
+"including",
+"coding",
+"writing",
+"analysis",
+"and",
+"more",
 ]
 return " ".join(random.choices(words, k=length))
 
-@app.route('/v1/models', methods=['GET'])
+@app.route("/v1/models", methods=["GET"])
 def list_models():
 models = get_models()
 print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}")
 return jsonify(models)
 
-@app.route('/v1/chat/completions', methods=['POST'])
+@app.route("/v1/chat/completions", methods=["POST"])
 def chat_completions():
 """Return OpenAI-formatted chat completion responses."""
 data = request.get_json()
-default_model = get_models()['data'][0]['id']
+default_model = get_models()["data"][0]["id"]
-model = data.get('model', default_model)
+model = data.get("model", default_model)
-messages = data.get('messages', [])
+messages = data.get("messages", [])
-stream = data.get('stream', False)
+stream = data.get("stream", False)
 
 print(f"[MOCK] Chat completion request - model: {model}, stream: {stream}")
 
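get_models() builds the /v1/models payload from the MOCK_MODELS environment variable. A hedged sketch of the resulting shape; the comma-split is an assumption, since the line that derives model_ids falls outside the hunk shown above:

```python
import os

# Example value; the default in the code above is a single Llama model id.
os.environ["MOCK_MODELS"] = "meta-llama/Llama-3.2-3B-Instruct,meta-llama/Llama-3.1-8B-Instruct"

model_ids = os.environ["MOCK_MODELS"].split(",")  # assumed parsing, not shown in this hunk
models = {
    "object": "list",
    "data": [
        {"id": model_id, "object": "model", "created": 1234567890, "owned_by": "vllm"} for model_id in model_ids
    ],
}
print([m["id"] for m in models["data"]])
```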
@@ -71,11 +105,12 @@ def chat_completions():
 else:
 return handle_non_streaming_completion(model, messages)
 
 
 def handle_non_streaming_completion(model, messages):
 response_text = generate_random_text(random.randint(20, 80))
 
 # Calculate realistic token counts
-prompt_tokens = sum(len(str(msg.get('content', '')).split()) for msg in messages)
+prompt_tokens = sum(len(str(msg.get("content", "")).split()) for msg in messages)
 completion_tokens = len(response_text.split())
 
 response = {
@@ -83,25 +118,17 @@ def handle_non_streaming_completion(model, messages):
 "object": "chat.completion",
 "created": int(time.time()),
 "model": model,
-"choices": [
-{
-"index": 0,
-"message": {
-"role": "assistant",
-"content": response_text
-},
-"finish_reason": "stop"
-}
-],
+"choices": [{"index": 0, "message": {"role": "assistant", "content": response_text}, "finish_reason": "stop"}],
 "usage": {
 "prompt_tokens": prompt_tokens,
 "completion_tokens": completion_tokens,
-"total_tokens": prompt_tokens + completion_tokens
+"total_tokens": prompt_tokens + completion_tokens,
-}
+},
 }
 
 return jsonify(response)
 
 
 def handle_streaming_completion(model, messages):
 def generate_stream():
 # Generate response text
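The mock's usage block estimates token counts by whitespace-splitting message content, as in the prompt_tokens expression above. A tiny worked example; the messages are the sample conversation used elsewhere in this diff and the response text is invented:

```python
# Whitespace-based token estimate, matching the prompt_tokens expression above.
messages = [
    {"role": "user", "content": "What is machine learning?"},
    {"role": "assistant", "content": "Machine learning is a subset of AI..."},
]
response_text = "A practical example is spam filtering."

prompt_tokens = sum(len(str(msg.get("content", "")).split()) for msg in messages)
completion_tokens = len(response_text.split())
print(prompt_tokens, completion_tokens, prompt_tokens + completion_tokens)  # 11 6 17
```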
@@ -114,12 +141,7 @@ def handle_streaming_completion(model, messages):
 "object": "chat.completion.chunk",
 "created": int(time.time()),
 "model": model,
-"choices": [
-{
-"index": 0,
-"delta": {"role": "assistant", "content": ""}
-}
-]
+"choices": [{"index": 0, "delta": {"role": "assistant", "content": ""}}],
 }
 yield f"data: {json.dumps(initial_chunk)}\n\n"
 
@@ -130,12 +152,7 @@ def handle_streaming_completion(model, messages):
 "object": "chat.completion.chunk",
 "created": int(time.time()),
 "model": model,
-"choices": [
-{
-"index": 0,
-"delta": {"content": f"{word} " if i < len(words) - 1 else word}
-}
-]
+"choices": [{"index": 0, "delta": {"content": f"{word} " if i < len(words) - 1 else word}}],
 }
 yield f"data: {json.dumps(chunk)}\n\n"
 # Configurable delay to simulate realistic streaming
@@ -148,35 +165,30 @@ def handle_streaming_completion(model, messages):
 "object": "chat.completion.chunk",
 "created": int(time.time()),
 "model": model,
-"choices": [
-{
-"index": 0,
-"delta": {"content": ""},
-"finish_reason": "stop"
-}
-]
+"choices": [{"index": 0, "delta": {"content": ""}, "finish_reason": "stop"}],
 }
 yield f"data: {json.dumps(final_chunk)}\n\n"
 yield "data: [DONE]\n\n"
 
 return Response(
 generate_stream(),
-mimetype='text/event-stream',
+mimetype="text/event-stream",
 headers={
-'Cache-Control': 'no-cache',
+"Cache-Control": "no-cache",
-'Connection': 'keep-alive',
+"Connection": "keep-alive",
-'Access-Control-Allow-Origin': '*',
+"Access-Control-Allow-Origin": "*",
-}
+},
 )
 
-@app.route('/health', methods=['GET'])
+@app.route("/health", methods=["GET"])
 def health():
 return jsonify({"status": "healthy", "type": "openai-mock"})
 
-if __name__ == '__main__':
-parser = argparse.ArgumentParser(description='OpenAI-compatible mock server')
-parser.add_argument('--port', type=int, default=8081,
-help='Port to run the server on (default: 8081)')
+if __name__ == "__main__":
+parser = argparse.ArgumentParser(description="OpenAI-compatible mock server")
+parser.add_argument("--port", type=int, default=8081, help="Port to run the server on (default: 8081)")
 args = parser.parse_args()
 
 port = args.port
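The stream above frames each chunk as a `data: <json>` line, closes the message with an empty-delta chunk carrying `finish_reason: "stop"`, and terminates with `data: [DONE]`. A hedged client-side sketch that reassembles the full text from such a stream; it assumes the requests library is available and the mock server is running locally on its default port 8081:

```python
import json

import requests

resp = requests.post(
    "http://localhost:8081/v1/chat/completions",  # assumed local mock server
    json={
        "model": "meta-llama/Llama-3.2-3B-Instruct",
        "messages": [{"role": "user", "content": "Hi"}],
        "stream": True,
    },
    stream=True,
)

text = ""
for raw in resp.iter_lines():
    if not raw:
        continue
    line = raw.decode("utf-8")
    if not line.startswith("data: "):
        continue
    if line == "data: [DONE]":
        break
    chunk = json.loads(line[len("data: "):])
    # Deltas may omit "content" (e.g. the initial role-only chunk), so default to "".
    text += chunk["choices"][0]["delta"].get("content", "")

print(text)
```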
@@ -187,4 +199,4 @@ if __name__ == '__main__':
 print("- OpenAI-formatted chat/completion responses with dynamic content")
 print("- Streaming support with valid SSE format")
 print(f"- Listening on: http://0.0.0.0:{port}")
-app.run(host='0.0.0.0', port=port, debug=False)
+app.run(host="0.0.0.0", port=port, debug=False)
@@ -35,5 +35,5 @@ testing/record-replay
 
 ### Benchmarking
 
-```{include} ../../../docs/source/distributions/k8s-benchmark/README.md
+```{include} ../../../benchmarking/k8s-benchmark/README.md
 ```
 