diff --git a/docs/source/distributions/k8s-benchmark/README.md b/docs/source/distributions/k8s-benchmark/README.md new file mode 100644 index 000000000..94cac817e --- /dev/null +++ b/docs/source/distributions/k8s-benchmark/README.md @@ -0,0 +1,128 @@ +# Kubernetes Benchmark Suite for Llama Stack + +Benchmark performance between Llama Stack and vLLM direct inference on Kubernetes. + +## Setup + +**1. Deploy base k8s infrastructure:** +```bash +cd ../k8s +./apply.sh +``` + +**2. Deploy benchmark components:** +```bash +cd ../k8s-benchmark +./apply.sh +``` + +**3. Verify deployment:** +```bash +kubectl get pods +# Should see: llama-stack-benchmark-server, vllm-server, etc. +``` + +## Quick Start + +### Basic Benchmarks + +**Benchmark Llama Stack (default):** +```bash +cd docs/source/distributions/k8s-benchmark/ +./run-benchmark.sh +``` + +**Benchmark vLLM direct:** +```bash +./run-benchmark.sh --target vllm +``` + +### Custom Configuration + +**Extended benchmark with high concurrency:** +```bash +./run-benchmark.sh --target vllm --duration 120 --concurrent 20 +``` + +**Short test run:** +```bash +./run-benchmark.sh --target stack --duration 30 --concurrent 5 +``` + +## Command Reference + +### run-benchmark.sh Options + +```bash +./run-benchmark.sh [options] + +Options: + -t, --target Target to benchmark (default: stack) + -d, --duration Duration in seconds (default: 60) + -c, --concurrent Number of concurrent users (default: 10) + -h, --help Show help message + +Examples: + ./run-benchmark.sh --target vllm # Benchmark vLLM direct + ./run-benchmark.sh --target stack # Benchmark Llama Stack + ./run-benchmark.sh -t vllm -d 120 -c 20 # vLLM with 120s, 20 users +``` + +## Local Testing + +### Running Benchmark Locally + +For local development without Kubernetes: + +**1. Start OpenAI mock server:** +```bash +uv run python openai-mock-server.py --port 8080 +``` + +**2. Run benchmark against mock server:** +```bash +uv run python benchmark.py \ + --base-url http://localhost:8080/v1 \ + --model mock-inference \ + --duration 30 \ + --concurrent 5 +``` + +**3. Test against local vLLM server:** +```bash +# If you have vLLM running locally on port 8000 +uv run python benchmark.py \ + --base-url http://localhost:8000/v1 \ + --model meta-llama/Llama-3.2-3B-Instruct \ + --duration 30 \ + --concurrent 5 +``` + +**4. Profile the running server:** +```bash +./profile_running_server.sh +``` + + + +### OpenAI Mock Server + +The `openai-mock-server.py` provides: +- **OpenAI-compatible API** for testing without real models +- **Configurable streaming delay** via `STREAM_DELAY_SECONDS` env var +- **Consistent responses** for reproducible benchmarks +- **Lightweight testing** without GPU requirements + +**Mock server usage:** +```bash +uv run python openai-mock-server.py --port 8080 +``` + +The mock server is also deployed in k8s as `openai-mock-service:8080` and can be used by changing the Llama Stack configuration to use the `mock-vllm-inference` provider. 
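+
+**Quick sanity check (a minimal sketch, assuming the mock server is running locally on port 8080 with its default model list):**
+```bash
+# List the models the mock server advertises (driven by the MOCK_MODELS env var)
+curl http://localhost:8080/v1/models
+
+# Request a short streaming chat completion; adjust "model" to match MOCK_MODELS
+curl -N http://localhost:8080/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{"model": "meta-llama/Llama-3.2-3B-Instruct", "messages": [{"role": "user", "content": "Hi"}], "stream": true, "max_tokens": 50}'
+```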
+ +## Files in this Directory + +- `benchmark.py` - Core benchmark script with async streaming support +- `run-benchmark.sh` - Main script with target selection and configuration +- `openai-mock-server.py` - Mock OpenAI API server for local testing +- `README.md` - This documentation file diff --git a/docs/source/distributions/k8s-benchmark/apply.sh b/docs/source/distributions/k8s-benchmark/apply.sh index 119a1c849..4f2270da8 100755 --- a/docs/source/distributions/k8s-benchmark/apply.sh +++ b/docs/source/distributions/k8s-benchmark/apply.sh @@ -8,7 +8,6 @@ # Deploys the benchmark-specific components on top of the base k8s deployment (../k8s/apply.sh). -export MOCK_INFERENCE_PORT=8080 export STREAM_DELAY_SECONDS=0.005 export POSTGRES_USER=llamastack @@ -20,14 +19,7 @@ export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B export MOCK_INFERENCE_MODEL=mock-inference -# Use llama-stack-benchmark-service as the benchmark server -export LOCUST_HOST=http://llama-stack-benchmark-service:8323 -export LOCUST_BASE_PATH=/v1/openai/v1 - -# Use vllm-service as the benchmark server -# export LOCUST_HOST=http://vllm-server:8000 -# export LOCUST_BASE_PATH=/v1 - +export MOCK_INFERENCE_URL=openai-mock-service:8080 export BENCHMARK_INFERENCE_MODEL=$INFERENCE_MODEL @@ -35,13 +27,6 @@ set -euo pipefail set -x # Deploy benchmark-specific components -# Deploy OpenAI mock server -kubectl create configmap openai-mock --from-file=openai-mock-server.py \ - --dry-run=client -o yaml | kubectl apply --validate=false -f - - -envsubst < openai-mock-deployment.yaml | kubectl apply --validate=false -f - - -# Create configmap with our custom stack config kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \ --dry-run=client -o yaml > stack-configmap.yaml @@ -49,9 +34,3 @@ kubectl apply --validate=false -f stack-configmap.yaml # Deploy our custom llama stack server (overriding the base one) envsubst < stack-k8s.yaml.template | kubectl apply --validate=false -f - - -# Deploy Locust load testing -kubectl create configmap locust-script --from-file=locustfile.py \ - --dry-run=client -o yaml | kubectl apply --validate=false -f - - -envsubst < locust-k8s.yaml | kubectl apply --validate=false -f - diff --git a/docs/source/distributions/k8s-benchmark/benchmark.py b/docs/source/distributions/k8s-benchmark/benchmark.py new file mode 100644 index 000000000..0e7368431 --- /dev/null +++ b/docs/source/distributions/k8s-benchmark/benchmark.py @@ -0,0 +1,268 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +""" +Simple benchmark script for Llama Stack with OpenAI API compatibility. 
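+
+Example invocation (a sketch; adjust the base URL and model to match your deployment):
+
+    # 60-second run with 10 concurrent users against a local Llama Stack server
+    python benchmark.py \
+        --base-url http://localhost:8000/v1/openai/v1 \
+        --model meta-llama/Llama-3.2-3B-Instruct \
+        --duration 60 \
+        --concurrent 10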
+""" + +import argparse +import asyncio +import os +import random +import statistics +import time +from typing import Tuple +import aiohttp + + +class BenchmarkStats: + def __init__(self): + self.response_times = [] + self.ttft_times = [] + self.chunks_received = [] + self.errors = [] + self.success_count = 0 + self.total_requests = 0 + self.concurrent_users = 0 + self.start_time = None + self.end_time = None + self._lock = asyncio.Lock() + + async def add_result(self, response_time: float, chunks: int, ttft: float = None, error: str = None): + async with self._lock: + self.total_requests += 1 + if error: + self.errors.append(error) + else: + self.success_count += 1 + self.response_times.append(response_time) + self.chunks_received.append(chunks) + if ttft is not None: + self.ttft_times.append(ttft) + + def print_summary(self): + if not self.response_times: + print("No successful requests to report") + if self.errors: + print(f"Total errors: {len(self.errors)}") + print("First 5 errors:") + for error in self.errors[:5]: + print(f" {error}") + return + + total_time = self.end_time - self.start_time + success_rate = (self.success_count / self.total_requests) * 100 + + print(f"\n{'='*60}") + print(f"BENCHMARK RESULTS") + print(f"{'='*60}") + print(f"Total time: {total_time:.2f}s") + print(f"Concurrent users: {self.concurrent_users}") + print(f"Total requests: {self.total_requests}") + print(f"Successful requests: {self.success_count}") + print(f"Failed requests: {len(self.errors)}") + print(f"Success rate: {success_rate:.1f}%") + print(f"Requests per second: {self.success_count / total_time:.2f}") + + print(f"\nResponse Time Statistics:") + print(f" Mean: {statistics.mean(self.response_times):.3f}s") + print(f" Median: {statistics.median(self.response_times):.3f}s") + print(f" Min: {min(self.response_times):.3f}s") + print(f" Max: {max(self.response_times):.3f}s") + + if len(self.response_times) > 1: + print(f" Std Dev: {statistics.stdev(self.response_times):.3f}s") + + percentiles = [50, 90, 95, 99] + sorted_times = sorted(self.response_times) + print(f"\nPercentiles:") + for p in percentiles: + idx = int(len(sorted_times) * p / 100) - 1 + idx = max(0, min(idx, len(sorted_times) - 1)) + print(f" P{p}: {sorted_times[idx]:.3f}s") + + if self.ttft_times: + print(f"\nTime to First Token (TTFT) Statistics:") + print(f" Mean: {statistics.mean(self.ttft_times):.3f}s") + print(f" Median: {statistics.median(self.ttft_times):.3f}s") + print(f" Min: {min(self.ttft_times):.3f}s") + print(f" Max: {max(self.ttft_times):.3f}s") + + if len(self.ttft_times) > 1: + print(f" Std Dev: {statistics.stdev(self.ttft_times):.3f}s") + + sorted_ttft = sorted(self.ttft_times) + print(f"\nTTFT Percentiles:") + for p in percentiles: + idx = int(len(sorted_ttft) * p / 100) - 1 + idx = max(0, min(idx, len(sorted_ttft) - 1)) + print(f" P{p}: {sorted_ttft[idx]:.3f}s") + + if self.chunks_received: + print(f"\nStreaming Statistics:") + print(f" Mean chunks per response: {statistics.mean(self.chunks_received):.1f}") + print(f" Total chunks received: {sum(self.chunks_received)}") + + if self.errors: + print(f"\nErrors (showing first 5):") + for error in self.errors[:5]: + print(f" {error}") + + +class LlamaStackBenchmark: + def __init__(self, base_url: str, model_id: str): + self.base_url = base_url.rstrip('/') + self.model_id = model_id + self.headers = {"Content-Type": "application/json"} + self.test_messages = [ + [{"role": "user", "content": "Hi"}], + [{"role": "user", "content": "What is the capital of France?"}], + 
[{"role": "user", "content": "Explain quantum physics in simple terms."}], + [{"role": "user", "content": "Write a short story about a robot learning to paint."}], + [ + {"role": "user", "content": "What is machine learning?"}, + {"role": "assistant", "content": "Machine learning is a subset of AI..."}, + {"role": "user", "content": "Can you give me a practical example?"} + ] + ] + + + async def make_async_streaming_request(self) -> Tuple[float, int, float | None, str | None]: + """Make a single async streaming chat completion request.""" + messages = random.choice(self.test_messages) + payload = { + "model": self.model_id, + "messages": messages, + "stream": True, + "max_tokens": 100 + } + + start_time = time.time() + chunks_received = 0 + ttft = None + error = None + + session = aiohttp.ClientSession() + + try: + async with session.post( + f"{self.base_url}/chat/completions", + headers=self.headers, + json=payload, + timeout=aiohttp.ClientTimeout(total=30) + ) as response: + if response.status == 200: + async for line in response.content: + if line: + line_str = line.decode('utf-8').strip() + if line_str.startswith('data: '): + chunks_received += 1 + if ttft is None: + ttft = time.time() - start_time + if line_str == 'data: [DONE]': + break + + if chunks_received == 0: + error = "No streaming chunks received" + else: + text = await response.text() + error = f"HTTP {response.status}: {text[:100]}" + + except Exception as e: + error = f"Request error: {str(e)}" + finally: + await session.close() + + response_time = time.time() - start_time + return response_time, chunks_received, ttft, error + + + async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats: + """Run benchmark using async requests for specified duration.""" + stats = BenchmarkStats() + stats.concurrent_users = concurrent_users + stats.start_time = time.time() + + print(f"Starting benchmark: {duration}s duration, {concurrent_users} concurrent users") + print(f"Target URL: {self.base_url}/chat/completions") + print(f"Model: {self.model_id}") + + connector = aiohttp.TCPConnector(limit=concurrent_users) + async with aiohttp.ClientSession(connector=connector) as session: + + async def worker(worker_id: int): + """Worker that sends requests sequentially until canceled.""" + request_count = 0 + while True: + try: + response_time, chunks, ttft, error = await self.make_async_streaming_request() + await stats.add_result(response_time, chunks, ttft, error) + request_count += 1 + + except asyncio.CancelledError: + break + except Exception as e: + await stats.add_result(0, 0, None, f"Worker {worker_id} error: {str(e)}") + + # Progress reporting task + async def progress_reporter(): + last_report_time = time.time() + while True: + try: + await asyncio.sleep(1) # Report every second + if time.time() >= last_report_time + 10: # Report every 10 seconds + elapsed = time.time() - stats.start_time + print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s") + last_report_time = time.time() + except asyncio.CancelledError: + break + + # Spawn concurrent workers + tasks = [asyncio.create_task(worker(i)) for i in range(concurrent_users)] + progress_task = asyncio.create_task(progress_reporter()) + tasks.append(progress_task) + + # Wait for duration then cancel all tasks + await asyncio.sleep(duration) + + for task in tasks: + task.cancel() + + # Wait for all tasks to complete + await asyncio.gather(*tasks, return_exceptions=True) + + stats.end_time = time.time() + return stats + + +def main(): + parser = 
argparse.ArgumentParser(description="Llama Stack Benchmark Tool") + parser.add_argument("--base-url", default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"), + help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)") + parser.add_argument("--model", default=os.getenv("INFERENCE_MODEL", "test-model"), + help="Model ID to use for requests") + parser.add_argument("--duration", type=int, default=60, + help="Duration in seconds to run benchmark (default: 60)") + parser.add_argument("--concurrent", type=int, default=10, + help="Number of concurrent users (default: 10)") + + args = parser.parse_args() + + benchmark = LlamaStackBenchmark(args.base_url, args.model) + + try: + stats = asyncio.run(benchmark.run_benchmark(args.duration, args.concurrent)) + stats.print_summary() + + except KeyboardInterrupt: + print("\nBenchmark interrupted by user") + except Exception as e: + print(f"Benchmark failed: {e}") + + +if __name__ == "__main__": + main() diff --git a/docs/source/distributions/k8s-benchmark/locust-k8s.yaml b/docs/source/distributions/k8s-benchmark/locust-k8s.yaml deleted file mode 100644 index f20a01b2d..000000000 --- a/docs/source/distributions/k8s-benchmark/locust-k8s.yaml +++ /dev/null @@ -1,131 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: locust-master - labels: - app: locust - role: master -spec: - replicas: 1 - selector: - matchLabels: - app: locust - role: master - template: - metadata: - labels: - app: locust - role: master - spec: - containers: - - name: locust-master - image: locustio/locust:2.31.8 - ports: - - containerPort: 8089 # Web UI - - containerPort: 5557 # Master communication - env: - - name: LOCUST_HOST - value: "${LOCUST_HOST}" - - name: LOCUST_LOCUSTFILE - value: "/locust/locustfile.py" - - name: LOCUST_WEB_HOST - value: "0.0.0.0" - - name: LOCUST_MASTER - value: "true" - - name: LOCUST_BASE_PATH - value: "${LOCUST_BASE_PATH}" - - name: INFERENCE_MODEL - value: "${BENCHMARK_INFERENCE_MODEL}" - volumeMounts: - - name: locust-script - mountPath: /locust - command: ["locust"] - args: - - "--master" - - "--web-host=0.0.0.0" - - "--web-port=8089" - - "--host=${LOCUST_HOST}" - - "--locustfile=/locust/locustfile.py" - volumes: - - name: locust-script - configMap: - name: locust-script ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: locust-worker - labels: - app: locust - role: worker -spec: - replicas: 2 # Start with 2 workers, can be scaled up - selector: - matchLabels: - app: locust - role: worker - template: - metadata: - labels: - app: locust - role: worker - spec: - containers: - - name: locust-worker - image: locustio/locust:2.31.8 - env: - - name: LOCUST_HOST - value: "${LOCUST_HOST}" - - name: LOCUST_LOCUSTFILE - value: "/locust/locustfile.py" - - name: LOCUST_MASTER_HOST - value: "locust-master-service" - - name: LOCUST_MASTER_PORT - value: "5557" - - name: INFERENCE_MODEL - value: "${BENCHMARK_INFERENCE_MODEL}" - - name: LOCUST_BASE_PATH - value: "${LOCUST_BASE_PATH}" - volumeMounts: - - name: locust-script - mountPath: /locust - command: ["locust"] - args: - - "--worker" - - "--master-host=locust-master-service" - - "--master-port=5557" - - "--locustfile=/locust/locustfile.py" - volumes: - - name: locust-script - configMap: - name: locust-script ---- -apiVersion: v1 -kind: Service -metadata: - name: locust-master-service -spec: - selector: - app: locust - role: master - ports: - - name: web-ui - port: 8089 - targetPort: 8089 - - name: master-comm - port: 5557 - targetPort: 5557 - type: 
ClusterIP ---- -apiVersion: v1 -kind: Service -metadata: - name: locust-web-ui -spec: - selector: - app: locust - role: master - ports: - - port: 8089 - targetPort: 8089 - type: ClusterIP # Keep internal, use port-forward to access diff --git a/docs/source/distributions/k8s-benchmark/locustfile.py b/docs/source/distributions/k8s-benchmark/locustfile.py deleted file mode 100644 index 8e511fa95..000000000 --- a/docs/source/distributions/k8s-benchmark/locustfile.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -""" -Locust load testing script for Llama Stack with Prism mock OpenAI provider. -""" - -import random -from locust import HttpUser, task, between -import os - -base_path = os.getenv("LOCUST_BASE_PATH", "/v1/openai/v1") - -MODEL_ID = os.getenv("INFERENCE_MODEL") - -class LlamaStackUser(HttpUser): - wait_time = between(0.0, 0.0001) - - def on_start(self): - """Setup authentication and test data.""" - # No auth required for benchmark server - self.headers = { - "Content-Type": "application/json" - } - - # Test messages of varying lengths - self.test_messages = [ - [{"role": "user", "content": "Hi"}], - [{"role": "user", "content": "What is the capital of France?"}], - [{"role": "user", "content": "Explain quantum physics in simple terms."}], - [{"role": "user", "content": "Write a short story about a robot learning to paint."}], - [ - {"role": "user", "content": "What is machine learning?"}, - {"role": "assistant", "content": "Machine learning is a subset of AI..."}, - {"role": "user", "content": "Can you give me a practical example?"} - ] - ] - - @task(weight=100) - def chat_completion_streaming(self): - """Test streaming chat completion (20% of requests).""" - messages = random.choice(self.test_messages) - payload = { - "model": MODEL_ID, - "messages": messages, - "stream": True, - "max_tokens": 100 - } - - with self.client.post( - f"{base_path}/chat/completions", - headers=self.headers, - json=payload, - stream=True, - catch_response=True - ) as response: - if response.status_code == 200: - chunks_received = 0 - try: - for line in response.iter_lines(): - if line: - line_str = line.decode('utf-8') - if line_str.startswith('data: '): - chunks_received += 1 - if line_str.strip() == 'data: [DONE]': - break - - if chunks_received > 0: - response.success() - else: - response.failure("No streaming chunks received") - except Exception as e: - response.failure(f"Streaming error: {e}") - else: - response.failure(f"HTTP {response.status_code}: {response.text}") diff --git a/docs/source/distributions/k8s-benchmark/openai-mock-deployment.yaml b/docs/source/distributions/k8s-benchmark/openai-mock-deployment.yaml deleted file mode 100644 index c72921281..000000000 --- a/docs/source/distributions/k8s-benchmark/openai-mock-deployment.yaml +++ /dev/null @@ -1,52 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: openai-mock - labels: - app: openai-mock -spec: - replicas: 1 - selector: - matchLabels: - app: openai-mock - template: - metadata: - labels: - app: openai-mock - spec: - containers: - - name: openai-mock - image: python:3.12-slim - ports: - - containerPort: ${MOCK_INFERENCE_PORT} - env: - - name: PORT - value: "${MOCK_INFERENCE_PORT}" - - name: MOCK_MODELS - value: "${MOCK_INFERENCE_MODEL}" - - name: STREAM_DELAY_SECONDS - value: "${STREAM_DELAY_SECONDS}" - command: ["sh", "-c"] - args: - 
- | - pip install flask && - python /app/openai-mock-server.py --port ${MOCK_INFERENCE_PORT} - volumeMounts: - - name: openai-mock-script - mountPath: /app - volumes: - - name: openai-mock-script - configMap: - name: openai-mock ---- -apiVersion: v1 -kind: Service -metadata: - name: openai-mock-service -spec: - selector: - app: openai-mock - ports: - - port: 8080 - targetPort: 8080 - type: ClusterIP diff --git a/docs/source/distributions/k8s-benchmark/openai-mock-server.py b/docs/source/distributions/k8s-benchmark/openai-mock-server.py index 46c923b60..de0680842 100644 --- a/docs/source/distributions/k8s-benchmark/openai-mock-server.py +++ b/docs/source/distributions/k8s-benchmark/openai-mock-server.py @@ -23,7 +23,7 @@ app = Flask(__name__) # Models from environment variables def get_models(): - models_str = os.getenv("MOCK_MODELS", "mock-inference") + models_str = os.getenv("MOCK_MODELS", "meta-llama/Llama-3.2-3B-Instruct") model_ids = [m.strip() for m in models_str.split(",") if m.strip()] return { @@ -49,13 +49,13 @@ def generate_random_text(length=50): ] return " ".join(random.choices(words, k=length)) -@app.route('/models', methods=['GET']) +@app.route('/v1/models', methods=['GET']) def list_models(): models = get_models() print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}") return jsonify(models) -@app.route('/chat/completions', methods=['POST']) +@app.route('/v1/chat/completions', methods=['POST']) def chat_completions(): """Return OpenAI-formatted chat completion responses.""" data = request.get_json() diff --git a/docs/source/distributions/k8s-benchmark/profile_running_server.sh b/docs/source/distributions/k8s-benchmark/profile_running_server.sh new file mode 100755 index 000000000..65d620583 --- /dev/null +++ b/docs/source/distributions/k8s-benchmark/profile_running_server.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +# Script to profile an already running Llama Stack server +# Usage: ./profile_running_server.sh [duration_seconds] [output_file] + +DURATION=${1:-60} # Default 60 seconds +OUTPUT_FILE=${2:-"llama_stack_profile"} # Default output file + +echo "Looking for running Llama Stack server..." + +# Find the server PID +SERVER_PID=$(ps aux | grep "llama_stack.core.server.server" | grep -v grep | awk '{print $2}' | head -1) + + +if [ -z "$SERVER_PID" ]; then + echo "Error: No running Llama Stack server found" + echo "Please start your server first with:" + echo "LLAMA_STACK_LOGGING=\"all=ERROR\" MOCK_INFERENCE_URL=http://localhost:8080 SAFETY_MODEL=llama-guard3:1b uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml" + exit 1 +fi + +echo "Found Llama Stack server with PID: $SERVER_PID" + +# Start py-spy profiling +echo "Starting py-spy profiling for ${DURATION} seconds..." +echo "Output will be saved to: ${OUTPUT_FILE}.svg" +echo "" +echo "You can now run your load test..." +echo "" + +# Get the full path to py-spy +PYSPY_PATH=$(which py-spy) + +# Check if running as root, if not, use sudo +if [ "$EUID" -ne 0 ]; then + echo "py-spy requires root permissions on macOS. Running with sudo..." 
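+  # py-spy record attaches to the running server process and writes a flame graph:
+  # -o sets the output SVG, -d the sampling duration in seconds, -p the target PID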
+ sudo "$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID +else + "$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID +fi + +echo "" +echo "Profiling completed! Results saved to: ${OUTPUT_FILE}.svg" +echo "" +echo "To view the flame graph:" +echo "open ${OUTPUT_FILE}.svg" diff --git a/docs/source/distributions/k8s-benchmark/run-benchmark.sh b/docs/source/distributions/k8s-benchmark/run-benchmark.sh new file mode 100755 index 000000000..e1c826143 --- /dev/null +++ b/docs/source/distributions/k8s-benchmark/run-benchmark.sh @@ -0,0 +1,148 @@ +#!/usr/bin/env bash + +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +set -euo pipefail + +# Default values +TARGET="stack" +DURATION=60 +CONCURRENT=10 + +# Parse command line arguments +usage() { + echo "Usage: $0 [options]" + echo "Options:" + echo " -t, --target Target to benchmark (default: stack)" + echo " -d, --duration Duration in seconds (default: 60)" + echo " -c, --concurrent Number of concurrent users (default: 10)" + echo " -h, --help Show this help message" + echo "" + echo "Examples:" + echo " $0 --target vllm # Benchmark vLLM direct" + echo " $0 --target stack # Benchmark Llama Stack (default)" + echo " $0 -t vllm -d 120 -c 20 # vLLM with 120s duration, 20 users" +} + +while [[ $# -gt 0 ]]; do + case $1 in + -t|--target) + TARGET="$2" + shift 2 + ;; + -d|--duration) + DURATION="$2" + shift 2 + ;; + -c|--concurrent) + CONCURRENT="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" + usage + exit 1 + ;; + esac +done + +# Validate target +if [[ "$TARGET" != "stack" && "$TARGET" != "vllm" ]]; then + echo "Error: Target must be 'stack' or 'vllm'" + usage + exit 1 +fi + +# Set configuration based on target +if [[ "$TARGET" == "vllm" ]]; then + BASE_URL="http://vllm-server:8000/v1" + JOB_NAME="vllm-benchmark-job" + echo "Benchmarking vLLM direct..." +else + BASE_URL="http://llama-stack-benchmark-service:8323/v1/openai/v1" + JOB_NAME="stack-benchmark-job" + echo "Benchmarking Llama Stack..." +fi + +echo "Configuration:" +echo " Target: $TARGET" +echo " Base URL: $BASE_URL" +echo " Duration: ${DURATION}s" +echo " Concurrent users: $CONCURRENT" +echo "" + +# Create temporary job yaml +TEMP_YAML="/tmp/benchmark-job-temp-$(date +%s).yaml" +cat > "$TEMP_YAML" << EOF +apiVersion: batch/v1 +kind: Job +metadata: + name: $JOB_NAME + namespace: default +spec: + template: + spec: + containers: + - name: benchmark + image: python:3.11-slim + command: ["/bin/bash"] + args: + - "-c" + - | + pip install aiohttp && + python3 /benchmark/benchmark.py \\ + --base-url $BASE_URL \\ + --model \${INFERENCE_MODEL} \\ + --duration $DURATION \\ + --concurrent $CONCURRENT + env: + - name: INFERENCE_MODEL + value: "meta-llama/Llama-3.2-3B-Instruct" + volumeMounts: + - name: benchmark-script + mountPath: /benchmark + resources: + requests: + memory: "256Mi" + cpu: "250m" + limits: + memory: "512Mi" + cpu: "500m" + volumes: + - name: benchmark-script + configMap: + name: benchmark-script + restartPolicy: Never + backoffLimit: 3 +EOF + +echo "Creating benchmark ConfigMap..." +kubectl create configmap benchmark-script \ + --from-file=benchmark.py=benchmark.py \ + --dry-run=client -o yaml | kubectl apply -f - + +echo "Cleaning up any existing benchmark job..." 
+kubectl delete job $JOB_NAME 2>/dev/null || true + +echo "Deploying benchmark Job..." +kubectl apply -f "$TEMP_YAML" + +echo "Waiting for job to start..." +kubectl wait --for=condition=Ready pod -l job-name=$JOB_NAME --timeout=60s + +echo "Following benchmark logs..." +kubectl logs -f job/$JOB_NAME + +echo "Job completed. Checking final status..." +kubectl get job $JOB_NAME + +# Clean up temporary file +rm -f "$TEMP_YAML" diff --git a/docs/source/distributions/k8s-benchmark/stack-configmap.yaml b/docs/source/distributions/k8s-benchmark/stack-configmap.yaml index 653e66756..edf4ebd75 100644 --- a/docs/source/distributions/k8s-benchmark/stack-configmap.yaml +++ b/docs/source/distributions/k8s-benchmark/stack-configmap.yaml @@ -26,13 +26,6 @@ data: max_tokens: ${env.VLLM_MAX_TOKENS:=4096} api_token: ${env.VLLM_API_TOKEN:=fake} tls_verify: ${env.VLLM_TLS_VERIFY:=true} - - provider_id: mock-vllm-inference - provider_type: remote::vllm - config: - url: http://openai-mock-service:${env.MOCK_INFERENCE_PORT} - max_tokens: 4096 - api_token: fake - tls_verify: false - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} @@ -121,9 +114,6 @@ data: - model_id: ${env.SAFETY_MODEL} provider_id: vllm-safety model_type: llm - - model_id: ${env.MOCK_INFERENCE_MODEL} - provider_id: mock-vllm-inference - model_type: llm shields: - shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B} vector_dbs: [] diff --git a/docs/source/distributions/k8s-benchmark/stack-k8s.yaml.template b/docs/source/distributions/k8s-benchmark/stack-k8s.yaml.template index bc14d5124..9cb1e5be3 100644 --- a/docs/source/distributions/k8s-benchmark/stack-k8s.yaml.template +++ b/docs/source/distributions/k8s-benchmark/stack-k8s.yaml.template @@ -44,8 +44,6 @@ spec: value: "${SAFETY_MODEL}" - name: TAVILY_SEARCH_API_KEY value: "${TAVILY_SEARCH_API_KEY}" - - name: MOCK_INFERENCE_PORT - value: "${MOCK_INFERENCE_PORT}" - name: VLLM_URL value: http://vllm-server.default.svc.cluster.local:8000/v1 - name: VLLM_MAX_TOKENS @@ -54,8 +52,6 @@ spec: value: http://vllm-server-safety.default.svc.cluster.local:8001/v1 - name: VLLM_TLS_VERIFY value: "false" - - name: MOCK_INFERENCE_MODEL - value: "${MOCK_INFERENCE_MODEL}" command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8323"] ports: - containerPort: 8323 diff --git a/docs/source/distributions/k8s-benchmark/stack_run_config.yaml b/docs/source/distributions/k8s-benchmark/stack_run_config.yaml index ad56be047..ceb1ba2d9 100644 --- a/docs/source/distributions/k8s-benchmark/stack_run_config.yaml +++ b/docs/source/distributions/k8s-benchmark/stack_run_config.yaml @@ -3,7 +3,6 @@ image_name: kubernetes-benchmark-demo apis: - agents - inference -- safety - telemetry - tool_runtime - vector_io @@ -16,20 +15,6 @@ providers: max_tokens: ${env.VLLM_MAX_TOKENS:=4096} api_token: ${env.VLLM_API_TOKEN:=fake} tls_verify: ${env.VLLM_TLS_VERIFY:=true} - - provider_id: vllm-safety - provider_type: remote::vllm - config: - url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1} - max_tokens: ${env.VLLM_MAX_TOKENS:=4096} - api_token: ${env.VLLM_API_TOKEN:=fake} - tls_verify: ${env.VLLM_TLS_VERIFY:=true} - - provider_id: mock-vllm-inference - provider_type: remote::vllm - config: - url: http://openai-mock-service:${env.MOCK_INFERENCE_PORT} - max_tokens: 4096 - api_token: fake - tls_verify: false - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} @@ -45,11 +30,6 @@ providers: db: 
${env.POSTGRES_DB:=llamastack} user: ${env.POSTGRES_USER:=llamastack} password: ${env.POSTGRES_PASSWORD:=llamastack} - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: - excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -115,14 +95,6 @@ models: - model_id: ${env.INFERENCE_MODEL} provider_id: vllm-inference model_type: llm -- model_id: ${env.SAFETY_MODEL} - provider_id: vllm-safety - model_type: llm -- model_id: ${env.MOCK_INFERENCE_MODEL} - provider_id: mock-vllm-inference - model_type: llm -shields: -- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B} vector_dbs: [] datasets: [] scoring_fns: []