mirror of https://github.com/meta-llama/llama-stack.git
synced 2025-08-15 06:00:48 +00:00
benchmark, local test, ttft, duration
# What does this PR do?

## Test Plan
This commit is contained in:
parent d6ae54723d
commit 2bc1483424
12 changed files with 600 additions and 328 deletions
128  docs/source/distributions/k8s-benchmark/README.md  Normal file
@@ -0,0 +1,128 @@
# Kubernetes Benchmark Suite for Llama Stack

Benchmark performance between Llama Stack and vLLM direct inference on Kubernetes.

## Setup

**1. Deploy base k8s infrastructure:**
```bash
cd ../k8s
./apply.sh
```

**2. Deploy benchmark components:**
```bash
cd ../k8s-benchmark
./apply.sh
```

**3. Verify deployment:**
```bash
kubectl get pods
# Should see: llama-stack-benchmark-server, vllm-server, etc.
```

## Quick Start

### Basic Benchmarks

**Benchmark Llama Stack (default):**
```bash
cd docs/source/distributions/k8s-benchmark/
./run-benchmark.sh
```

**Benchmark vLLM direct:**
```bash
./run-benchmark.sh --target vllm
```

### Custom Configuration

**Extended benchmark with high concurrency:**
```bash
./run-benchmark.sh --target vllm --duration 120 --concurrent 20
```

**Short test run:**
```bash
./run-benchmark.sh --target stack --duration 30 --concurrent 5
```

## Command Reference

### run-benchmark.sh Options

```bash
./run-benchmark.sh [options]

Options:
  -t, --target <stack|vllm>     Target to benchmark (default: stack)
  -d, --duration <seconds>      Duration in seconds (default: 60)
  -c, --concurrent <users>      Number of concurrent users (default: 10)
  -h, --help                    Show help message

Examples:
  ./run-benchmark.sh --target vllm        # Benchmark vLLM direct
  ./run-benchmark.sh --target stack       # Benchmark Llama Stack
  ./run-benchmark.sh -t vllm -d 120 -c 20 # vLLM with 120s, 20 users
```

## Local Testing

### Running Benchmark Locally

For local development without Kubernetes:

**1. Start OpenAI mock server:**
```bash
uv run python openai-mock-server.py --port 8080
```

**2. Run benchmark against mock server:**
```bash
uv run python benchmark.py \
  --base-url http://localhost:8080/v1 \
  --model mock-inference \
  --duration 30 \
  --concurrent 5
```

**3. Test against local vLLM server:**
```bash
# If you have vLLM running locally on port 8000
uv run python benchmark.py \
  --base-url http://localhost:8000/v1 \
  --model meta-llama/Llama-3.2-3B-Instruct \
  --duration 30 \
  --concurrent 5
```

**4. Profile the running server:**
```bash
./profile_running_server.sh
```

### OpenAI Mock Server

The `openai-mock-server.py` provides:

- **OpenAI-compatible API** for testing without real models
- **Configurable streaming delay** via `STREAM_DELAY_SECONDS` env var
- **Consistent responses** for reproducible benchmarks
- **Lightweight testing** without GPU requirements

**Mock server usage:**
```bash
uv run python openai-mock-server.py --port 8080
```
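The streaming delay can also be overridden per run to make chunks arrive faster or slower; the value below is only an illustration (the k8s setup exports `STREAM_DELAY_SECONDS=0.005` via `apply.sh`):
```bash
# Illustrative value: roughly 20 ms between streamed chunks
STREAM_DELAY_SECONDS=0.02 uv run python openai-mock-server.py --port 8080
```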

The mock server is also deployed in k8s as `openai-mock-service:8080` and can be used by changing the Llama Stack configuration to use the `mock-vllm-inference` provider.
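For reference, a provider entry for the mock service would look roughly like the sketch below. It is adapted from the `mock-vllm-inference` block that this PR removes from `stack_run_config.yaml` (with the port hard-coded to 8080), so treat it as an illustration to adapt rather than configuration that ships with this change:
```yaml
# Sketch only: a remote::vllm provider pointed at the in-cluster mock service
- provider_id: mock-vllm-inference
  provider_type: remote::vllm
  config:
    url: http://openai-mock-service:8080
    max_tokens: 4096
    api_token: fake
    tls_verify: false
```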

## Files in this Directory

- `benchmark.py` - Core benchmark script with async streaming support
- `run-benchmark.sh` - Main script with target selection and configuration
- `openai-mock-server.py` - Mock OpenAI API server for local testing
- `README.md` - This documentation file
docs/source/distributions/k8s-benchmark/apply.sh

@@ -8,7 +8,6 @@
 # Deploys the benchmark-specific components on top of the base k8s deployment (../k8s/apply.sh).

-export MOCK_INFERENCE_PORT=8080
 export STREAM_DELAY_SECONDS=0.005

 export POSTGRES_USER=llamastack
@@ -20,14 +19,7 @@ export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
 export MOCK_INFERENCE_MODEL=mock-inference

-# Use llama-stack-benchmark-service as the benchmark server
-export MOCK_INFERENCE_URL=openai-mock-service:8080
-export LOCUST_HOST=http://llama-stack-benchmark-service:8323
-export LOCUST_BASE_PATH=/v1/openai/v1
-
-# Use vllm-service as the benchmark server
-# export LOCUST_HOST=http://vllm-server:8000
-# export LOCUST_BASE_PATH=/v1
 export BENCHMARK_INFERENCE_MODEL=$INFERENCE_MODEL

@@ -35,13 +27,6 @@ set -euo pipefail
 set -x

 # Deploy benchmark-specific components
-# Deploy OpenAI mock server
-kubectl create configmap openai-mock --from-file=openai-mock-server.py \
-  --dry-run=client -o yaml | kubectl apply --validate=false -f -
-
-envsubst < openai-mock-deployment.yaml | kubectl apply --validate=false -f -
-
-# Create configmap with our custom stack config
 kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
   --dry-run=client -o yaml > stack-configmap.yaml

@@ -49,9 +34,3 @@ kubectl apply --validate=false -f stack-configmap.yaml
 # Deploy our custom llama stack server (overriding the base one)
 envsubst < stack-k8s.yaml.template | kubectl apply --validate=false -f -
-
-# Deploy Locust load testing
-kubectl create configmap locust-script --from-file=locustfile.py \
-  --dry-run=client -o yaml | kubectl apply --validate=false -f -
-
-envsubst < locust-k8s.yaml | kubectl apply --validate=false -f -
268  docs/source/distributions/k8s-benchmark/benchmark.py  Normal file
@@ -0,0 +1,268 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""
Simple benchmark script for Llama Stack with OpenAI API compatibility.
"""

import argparse
import asyncio
import os
import random
import statistics
import time
from typing import Tuple

import aiohttp


class BenchmarkStats:
    def __init__(self):
        self.response_times = []
        self.ttft_times = []
        self.chunks_received = []
        self.errors = []
        self.success_count = 0
        self.total_requests = 0
        self.concurrent_users = 0
        self.start_time = None
        self.end_time = None
        self._lock = asyncio.Lock()

    async def add_result(self, response_time: float, chunks: int, ttft: float = None, error: str = None):
        async with self._lock:
            self.total_requests += 1
            if error:
                self.errors.append(error)
            else:
                self.success_count += 1
                self.response_times.append(response_time)
                self.chunks_received.append(chunks)
                if ttft is not None:
                    self.ttft_times.append(ttft)

    def print_summary(self):
        if not self.response_times:
            print("No successful requests to report")
            if self.errors:
                print(f"Total errors: {len(self.errors)}")
                print("First 5 errors:")
                for error in self.errors[:5]:
                    print(f"  {error}")
            return

        total_time = self.end_time - self.start_time
        success_rate = (self.success_count / self.total_requests) * 100

        print(f"\n{'='*60}")
        print(f"BENCHMARK RESULTS")
        print(f"{'='*60}")
        print(f"Total time: {total_time:.2f}s")
        print(f"Concurrent users: {self.concurrent_users}")
        print(f"Total requests: {self.total_requests}")
        print(f"Successful requests: {self.success_count}")
        print(f"Failed requests: {len(self.errors)}")
        print(f"Success rate: {success_rate:.1f}%")
        print(f"Requests per second: {self.success_count / total_time:.2f}")

        print(f"\nResponse Time Statistics:")
        print(f"  Mean: {statistics.mean(self.response_times):.3f}s")
        print(f"  Median: {statistics.median(self.response_times):.3f}s")
        print(f"  Min: {min(self.response_times):.3f}s")
        print(f"  Max: {max(self.response_times):.3f}s")

        if len(self.response_times) > 1:
            print(f"  Std Dev: {statistics.stdev(self.response_times):.3f}s")

        percentiles = [50, 90, 95, 99]
        sorted_times = sorted(self.response_times)
        print(f"\nPercentiles:")
        for p in percentiles:
            idx = int(len(sorted_times) * p / 100) - 1
            idx = max(0, min(idx, len(sorted_times) - 1))
            print(f"  P{p}: {sorted_times[idx]:.3f}s")

        if self.ttft_times:
            print(f"\nTime to First Token (TTFT) Statistics:")
            print(f"  Mean: {statistics.mean(self.ttft_times):.3f}s")
            print(f"  Median: {statistics.median(self.ttft_times):.3f}s")
            print(f"  Min: {min(self.ttft_times):.3f}s")
            print(f"  Max: {max(self.ttft_times):.3f}s")

            if len(self.ttft_times) > 1:
                print(f"  Std Dev: {statistics.stdev(self.ttft_times):.3f}s")

            sorted_ttft = sorted(self.ttft_times)
            print(f"\nTTFT Percentiles:")
            for p in percentiles:
                idx = int(len(sorted_ttft) * p / 100) - 1
                idx = max(0, min(idx, len(sorted_ttft) - 1))
                print(f"  P{p}: {sorted_ttft[idx]:.3f}s")

        if self.chunks_received:
            print(f"\nStreaming Statistics:")
            print(f"  Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
            print(f"  Total chunks received: {sum(self.chunks_received)}")

        if self.errors:
            print(f"\nErrors (showing first 5):")
            for error in self.errors[:5]:
                print(f"  {error}")


class LlamaStackBenchmark:
    def __init__(self, base_url: str, model_id: str):
        self.base_url = base_url.rstrip('/')
        self.model_id = model_id
        self.headers = {"Content-Type": "application/json"}
        self.test_messages = [
            [{"role": "user", "content": "Hi"}],
            [{"role": "user", "content": "What is the capital of France?"}],
            [{"role": "user", "content": "Explain quantum physics in simple terms."}],
            [{"role": "user", "content": "Write a short story about a robot learning to paint."}],
            [
                {"role": "user", "content": "What is machine learning?"},
                {"role": "assistant", "content": "Machine learning is a subset of AI..."},
                {"role": "user", "content": "Can you give me a practical example?"}
            ]
        ]

    async def make_async_streaming_request(self) -> Tuple[float, int, float | None, str | None]:
        """Make a single async streaming chat completion request."""
        messages = random.choice(self.test_messages)
        payload = {
            "model": self.model_id,
            "messages": messages,
            "stream": True,
            "max_tokens": 100
        }

        start_time = time.time()
        chunks_received = 0
        ttft = None
        error = None

        session = aiohttp.ClientSession()

        try:
            async with session.post(
                f"{self.base_url}/chat/completions",
                headers=self.headers,
                json=payload,
                timeout=aiohttp.ClientTimeout(total=30)
            ) as response:
                if response.status == 200:
                    async for line in response.content:
                        if line:
                            line_str = line.decode('utf-8').strip()
                            if line_str.startswith('data: '):
                                chunks_received += 1
                                if ttft is None:
                                    ttft = time.time() - start_time
                                if line_str == 'data: [DONE]':
                                    break

                    if chunks_received == 0:
                        error = "No streaming chunks received"
                else:
                    text = await response.text()
                    error = f"HTTP {response.status}: {text[:100]}"

        except Exception as e:
            error = f"Request error: {str(e)}"
        finally:
            await session.close()

        response_time = time.time() - start_time
        return response_time, chunks_received, ttft, error

    async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats:
        """Run benchmark using async requests for specified duration."""
        stats = BenchmarkStats()
        stats.concurrent_users = concurrent_users
        stats.start_time = time.time()

        print(f"Starting benchmark: {duration}s duration, {concurrent_users} concurrent users")
        print(f"Target URL: {self.base_url}/chat/completions")
        print(f"Model: {self.model_id}")

        connector = aiohttp.TCPConnector(limit=concurrent_users)
        async with aiohttp.ClientSession(connector=connector) as session:

            async def worker(worker_id: int):
                """Worker that sends requests sequentially until canceled."""
                request_count = 0
                while True:
                    try:
                        response_time, chunks, ttft, error = await self.make_async_streaming_request()
                        await stats.add_result(response_time, chunks, ttft, error)
                        request_count += 1

                    except asyncio.CancelledError:
                        break
                    except Exception as e:
                        await stats.add_result(0, 0, None, f"Worker {worker_id} error: {str(e)}")

            # Progress reporting task
            async def progress_reporter():
                last_report_time = time.time()
                while True:
                    try:
                        await asyncio.sleep(1)  # Report every second
                        if time.time() >= last_report_time + 10:  # Report every 10 seconds
                            elapsed = time.time() - stats.start_time
                            print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s")
                            last_report_time = time.time()
                    except asyncio.CancelledError:
                        break

            # Spawn concurrent workers
            tasks = [asyncio.create_task(worker(i)) for i in range(concurrent_users)]
            progress_task = asyncio.create_task(progress_reporter())
            tasks.append(progress_task)

            # Wait for duration then cancel all tasks
            await asyncio.sleep(duration)

            for task in tasks:
                task.cancel()

            # Wait for all tasks to complete
            await asyncio.gather(*tasks, return_exceptions=True)

        stats.end_time = time.time()
        return stats


def main():
    parser = argparse.ArgumentParser(description="Llama Stack Benchmark Tool")
    parser.add_argument("--base-url", default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
                        help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)")
    parser.add_argument("--model", default=os.getenv("INFERENCE_MODEL", "test-model"),
                        help="Model ID to use for requests")
    parser.add_argument("--duration", type=int, default=60,
                        help="Duration in seconds to run benchmark (default: 60)")
    parser.add_argument("--concurrent", type=int, default=10,
                        help="Number of concurrent users (default: 10)")

    args = parser.parse_args()

    benchmark = LlamaStackBenchmark(args.base_url, args.model)

    try:
        stats = asyncio.run(benchmark.run_benchmark(args.duration, args.concurrent))
        stats.print_summary()

    except KeyboardInterrupt:
        print("\nBenchmark interrupted by user")
    except Exception as e:
        print(f"Benchmark failed: {e}")


if __name__ == "__main__":
    main()
docs/source/distributions/k8s-benchmark/locust-k8s.yaml  Deleted file
@@ -1,131 +0,0 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: locust-master
  labels:
    app: locust
    role: master
spec:
  replicas: 1
  selector:
    matchLabels:
      app: locust
      role: master
  template:
    metadata:
      labels:
        app: locust
        role: master
    spec:
      containers:
      - name: locust-master
        image: locustio/locust:2.31.8
        ports:
        - containerPort: 8089  # Web UI
        - containerPort: 5557  # Master communication
        env:
        - name: LOCUST_HOST
          value: "${LOCUST_HOST}"
        - name: LOCUST_LOCUSTFILE
          value: "/locust/locustfile.py"
        - name: LOCUST_WEB_HOST
          value: "0.0.0.0"
        - name: LOCUST_MASTER
          value: "true"
        - name: LOCUST_BASE_PATH
          value: "${LOCUST_BASE_PATH}"
        - name: INFERENCE_MODEL
          value: "${BENCHMARK_INFERENCE_MODEL}"
        volumeMounts:
        - name: locust-script
          mountPath: /locust
        command: ["locust"]
        args:
        - "--master"
        - "--web-host=0.0.0.0"
        - "--web-port=8089"
        - "--host=${LOCUST_HOST}"
        - "--locustfile=/locust/locustfile.py"
      volumes:
      - name: locust-script
        configMap:
          name: locust-script
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: locust-worker
  labels:
    app: locust
    role: worker
spec:
  replicas: 2  # Start with 2 workers, can be scaled up
  selector:
    matchLabels:
      app: locust
      role: worker
  template:
    metadata:
      labels:
        app: locust
        role: worker
    spec:
      containers:
      - name: locust-worker
        image: locustio/locust:2.31.8
        env:
        - name: LOCUST_HOST
          value: "${LOCUST_HOST}"
        - name: LOCUST_LOCUSTFILE
          value: "/locust/locustfile.py"
        - name: LOCUST_MASTER_HOST
          value: "locust-master-service"
        - name: LOCUST_MASTER_PORT
          value: "5557"
        - name: INFERENCE_MODEL
          value: "${BENCHMARK_INFERENCE_MODEL}"
        - name: LOCUST_BASE_PATH
          value: "${LOCUST_BASE_PATH}"
        volumeMounts:
        - name: locust-script
          mountPath: /locust
        command: ["locust"]
        args:
        - "--worker"
        - "--master-host=locust-master-service"
        - "--master-port=5557"
        - "--locustfile=/locust/locustfile.py"
      volumes:
      - name: locust-script
        configMap:
          name: locust-script
---
apiVersion: v1
kind: Service
metadata:
  name: locust-master-service
spec:
  selector:
    app: locust
    role: master
  ports:
  - name: web-ui
    port: 8089
    targetPort: 8089
  - name: master-comm
    port: 5557
    targetPort: 5557
  type: ClusterIP
---
apiVersion: v1
kind: Service
metadata:
  name: locust-web-ui
spec:
  selector:
    app: locust
    role: master
  ports:
  - port: 8089
    targetPort: 8089
  type: ClusterIP  # Keep internal, use port-forward to access
docs/source/distributions/k8s-benchmark/locustfile.py  Deleted file
@@ -1,78 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""
Locust load testing script for Llama Stack with Prism mock OpenAI provider.
"""

import random
from locust import HttpUser, task, between
import os

base_path = os.getenv("LOCUST_BASE_PATH", "/v1/openai/v1")

MODEL_ID = os.getenv("INFERENCE_MODEL")


class LlamaStackUser(HttpUser):
    wait_time = between(0.0, 0.0001)

    def on_start(self):
        """Setup authentication and test data."""
        # No auth required for benchmark server
        self.headers = {
            "Content-Type": "application/json"
        }

        # Test messages of varying lengths
        self.test_messages = [
            [{"role": "user", "content": "Hi"}],
            [{"role": "user", "content": "What is the capital of France?"}],
            [{"role": "user", "content": "Explain quantum physics in simple terms."}],
            [{"role": "user", "content": "Write a short story about a robot learning to paint."}],
            [
                {"role": "user", "content": "What is machine learning?"},
                {"role": "assistant", "content": "Machine learning is a subset of AI..."},
                {"role": "user", "content": "Can you give me a practical example?"}
            ]
        ]

    @task(weight=100)
    def chat_completion_streaming(self):
        """Test streaming chat completion (20% of requests)."""
        messages = random.choice(self.test_messages)
        payload = {
            "model": MODEL_ID,
            "messages": messages,
            "stream": True,
            "max_tokens": 100
        }

        with self.client.post(
            f"{base_path}/chat/completions",
            headers=self.headers,
            json=payload,
            stream=True,
            catch_response=True
        ) as response:
            if response.status_code == 200:
                chunks_received = 0
                try:
                    for line in response.iter_lines():
                        if line:
                            line_str = line.decode('utf-8')
                            if line_str.startswith('data: '):
                                chunks_received += 1
                                if line_str.strip() == 'data: [DONE]':
                                    break

                    if chunks_received > 0:
                        response.success()
                    else:
                        response.failure("No streaming chunks received")
                except Exception as e:
                    response.failure(f"Streaming error: {e}")
            else:
                response.failure(f"HTTP {response.status_code}: {response.text}")
docs/source/distributions/k8s-benchmark/openai-mock-deployment.yaml  Deleted file
@@ -1,52 +0,0 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: openai-mock
  labels:
    app: openai-mock
spec:
  replicas: 1
  selector:
    matchLabels:
      app: openai-mock
  template:
    metadata:
      labels:
        app: openai-mock
    spec:
      containers:
      - name: openai-mock
        image: python:3.12-slim
        ports:
        - containerPort: ${MOCK_INFERENCE_PORT}
        env:
        - name: PORT
          value: "${MOCK_INFERENCE_PORT}"
        - name: MOCK_MODELS
          value: "${MOCK_INFERENCE_MODEL}"
        - name: STREAM_DELAY_SECONDS
          value: "${STREAM_DELAY_SECONDS}"
        command: ["sh", "-c"]
        args:
        - |
          pip install flask &&
          python /app/openai-mock-server.py --port ${MOCK_INFERENCE_PORT}
        volumeMounts:
        - name: openai-mock-script
          mountPath: /app
      volumes:
      - name: openai-mock-script
        configMap:
          name: openai-mock
---
apiVersion: v1
kind: Service
metadata:
  name: openai-mock-service
spec:
  selector:
    app: openai-mock
  ports:
  - port: 8080
    targetPort: 8080
  type: ClusterIP
docs/source/distributions/k8s-benchmark/openai-mock-server.py

@@ -23,7 +23,7 @@ app = Flask(__name__)

 # Models from environment variables
 def get_models():
-    models_str = os.getenv("MOCK_MODELS", "mock-inference")
+    models_str = os.getenv("MOCK_MODELS", "meta-llama/Llama-3.2-3B-Instruct")
     model_ids = [m.strip() for m in models_str.split(",") if m.strip()]

     return {
@@ -49,13 +49,13 @@ def generate_random_text(length=50):
     ]
     return " ".join(random.choices(words, k=length))

-@app.route('/models', methods=['GET'])
+@app.route('/v1/models', methods=['GET'])
 def list_models():
     models = get_models()
     print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}")
     return jsonify(models)

-@app.route('/chat/completions', methods=['POST'])
+@app.route('/v1/chat/completions', methods=['POST'])
 def chat_completions():
     """Return OpenAI-formatted chat completion responses."""
     data = request.get_json()
52  docs/source/distributions/k8s-benchmark/profile_running_server.sh  Executable file
@@ -0,0 +1,52 @@
#!/bin/bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

# Script to profile an already running Llama Stack server
# Usage: ./profile_running_server.sh [duration_seconds] [output_file]

DURATION=${1:-60}  # Default 60 seconds
OUTPUT_FILE=${2:-"llama_stack_profile"}  # Default output file

echo "Looking for running Llama Stack server..."

# Find the server PID
SERVER_PID=$(ps aux | grep "llama_stack.core.server.server" | grep -v grep | awk '{print $2}' | head -1)


if [ -z "$SERVER_PID" ]; then
    echo "Error: No running Llama Stack server found"
    echo "Please start your server first with:"
    echo "LLAMA_STACK_LOGGING=\"all=ERROR\" MOCK_INFERENCE_URL=http://localhost:8080 SAFETY_MODEL=llama-guard3:1b uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml"
    exit 1
fi

echo "Found Llama Stack server with PID: $SERVER_PID"

# Start py-spy profiling
echo "Starting py-spy profiling for ${DURATION} seconds..."
echo "Output will be saved to: ${OUTPUT_FILE}.svg"
echo ""
echo "You can now run your load test..."
echo ""

# Get the full path to py-spy
PYSPY_PATH=$(which py-spy)

# Check if running as root, if not, use sudo
if [ "$EUID" -ne 0 ]; then
    echo "py-spy requires root permissions on macOS. Running with sudo..."
    sudo "$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
else
    "$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
fi

echo ""
echo "Profiling completed! Results saved to: ${OUTPUT_FILE}.svg"
echo ""
echo "To view the flame graph:"
echo "open ${OUTPUT_FILE}.svg"
148  docs/source/distributions/k8s-benchmark/run-benchmark.sh  Executable file
@@ -0,0 +1,148 @@
#!/usr/bin/env bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

set -euo pipefail

# Default values
TARGET="stack"
DURATION=60
CONCURRENT=10

# Parse command line arguments
usage() {
    echo "Usage: $0 [options]"
    echo "Options:"
    echo "  -t, --target <stack|vllm>     Target to benchmark (default: stack)"
    echo "  -d, --duration <seconds>      Duration in seconds (default: 60)"
    echo "  -c, --concurrent <users>      Number of concurrent users (default: 10)"
    echo "  -h, --help                    Show this help message"
    echo ""
    echo "Examples:"
    echo "  $0 --target vllm              # Benchmark vLLM direct"
    echo "  $0 --target stack             # Benchmark Llama Stack (default)"
    echo "  $0 -t vllm -d 120 -c 20       # vLLM with 120s duration, 20 users"
}

while [[ $# -gt 0 ]]; do
    case $1 in
        -t|--target)
            TARGET="$2"
            shift 2
            ;;
        -d|--duration)
            DURATION="$2"
            shift 2
            ;;
        -c|--concurrent)
            CONCURRENT="$2"
            shift 2
            ;;
        -h|--help)
            usage
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            usage
            exit 1
            ;;
    esac
done

# Validate target
if [[ "$TARGET" != "stack" && "$TARGET" != "vllm" ]]; then
    echo "Error: Target must be 'stack' or 'vllm'"
    usage
    exit 1
fi

# Set configuration based on target
if [[ "$TARGET" == "vllm" ]]; then
    BASE_URL="http://vllm-server:8000/v1"
    JOB_NAME="vllm-benchmark-job"
    echo "Benchmarking vLLM direct..."
else
    BASE_URL="http://llama-stack-benchmark-service:8323/v1/openai/v1"
    JOB_NAME="stack-benchmark-job"
    echo "Benchmarking Llama Stack..."
fi

echo "Configuration:"
echo "  Target: $TARGET"
echo "  Base URL: $BASE_URL"
echo "  Duration: ${DURATION}s"
echo "  Concurrent users: $CONCURRENT"
echo ""

# Create temporary job yaml
TEMP_YAML="/tmp/benchmark-job-temp-$(date +%s).yaml"
cat > "$TEMP_YAML" << EOF
apiVersion: batch/v1
kind: Job
metadata:
  name: $JOB_NAME
  namespace: default
spec:
  template:
    spec:
      containers:
      - name: benchmark
        image: python:3.11-slim
        command: ["/bin/bash"]
        args:
        - "-c"
        - |
          pip install aiohttp &&
          python3 /benchmark/benchmark.py \\
            --base-url $BASE_URL \\
            --model \${INFERENCE_MODEL} \\
            --duration $DURATION \\
            --concurrent $CONCURRENT
        env:
        - name: INFERENCE_MODEL
          value: "meta-llama/Llama-3.2-3B-Instruct"
        volumeMounts:
        - name: benchmark-script
          mountPath: /benchmark
        resources:
          requests:
            memory: "256Mi"
            cpu: "250m"
          limits:
            memory: "512Mi"
            cpu: "500m"
      volumes:
      - name: benchmark-script
        configMap:
          name: benchmark-script
      restartPolicy: Never
  backoffLimit: 3
EOF

echo "Creating benchmark ConfigMap..."
kubectl create configmap benchmark-script \
  --from-file=benchmark.py=benchmark.py \
  --dry-run=client -o yaml | kubectl apply -f -

echo "Cleaning up any existing benchmark job..."
kubectl delete job $JOB_NAME 2>/dev/null || true

echo "Deploying benchmark Job..."
kubectl apply -f "$TEMP_YAML"

echo "Waiting for job to start..."
kubectl wait --for=condition=Ready pod -l job-name=$JOB_NAME --timeout=60s

echo "Following benchmark logs..."
kubectl logs -f job/$JOB_NAME

echo "Job completed. Checking final status..."
kubectl get job $JOB_NAME

# Clean up temporary file
rm -f "$TEMP_YAML"
docs/source/distributions/k8s-benchmark/stack-configmap.yaml

@@ -26,13 +26,6 @@ data:
         max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
         api_token: ${env.VLLM_API_TOKEN:=fake}
         tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-      - provider_id: mock-vllm-inference
-        provider_type: remote::vllm
-        config:
-          url: http://openai-mock-service:${env.MOCK_INFERENCE_PORT}
-          max_tokens: 4096
-          api_token: fake
-          tls_verify: false
       - provider_id: sentence-transformers
         provider_type: inline::sentence-transformers
         config: {}
@@ -121,9 +114,6 @@ data:
     - model_id: ${env.SAFETY_MODEL}
       provider_id: vllm-safety
       model_type: llm
-    - model_id: ${env.MOCK_INFERENCE_MODEL}
-      provider_id: mock-vllm-inference
-      model_type: llm
     shields:
     - shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
     vector_dbs: []
docs/source/distributions/k8s-benchmark/stack-k8s.yaml.template

@@ -44,8 +44,6 @@ spec:
           value: "${SAFETY_MODEL}"
         - name: TAVILY_SEARCH_API_KEY
           value: "${TAVILY_SEARCH_API_KEY}"
-        - name: MOCK_INFERENCE_PORT
-          value: "${MOCK_INFERENCE_PORT}"
         - name: VLLM_URL
           value: http://vllm-server.default.svc.cluster.local:8000/v1
         - name: VLLM_MAX_TOKENS
@@ -54,8 +52,6 @@ spec:
           value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
         - name: VLLM_TLS_VERIFY
           value: "false"
-        - name: MOCK_INFERENCE_MODEL
-          value: "${MOCK_INFERENCE_MODEL}"
         command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8323"]
         ports:
         - containerPort: 8323
docs/source/distributions/k8s-benchmark/stack_run_config.yaml

@@ -3,7 +3,6 @@ image_name: kubernetes-benchmark-demo
 apis:
 - agents
 - inference
-- safety
 - telemetry
 - tool_runtime
 - vector_io
@@ -16,20 +15,6 @@ providers:
       max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
       api_token: ${env.VLLM_API_TOKEN:=fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-  - provider_id: vllm-safety
-    provider_type: remote::vllm
-    config:
-      url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
-      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
-      api_token: ${env.VLLM_API_TOKEN:=fake}
-      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-  - provider_id: mock-vllm-inference
-    provider_type: remote::vllm
-    config:
-      url: http://openai-mock-service:${env.MOCK_INFERENCE_PORT}
-      max_tokens: 4096
-      api_token: fake
-      tls_verify: false
   - provider_id: sentence-transformers
     provider_type: inline::sentence-transformers
     config: {}
@@ -45,11 +30,6 @@ providers:
       db: ${env.POSTGRES_DB:=llamastack}
       user: ${env.POSTGRES_USER:=llamastack}
       password: ${env.POSTGRES_PASSWORD:=llamastack}
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config:
-      excluded_categories: []
   agents:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -115,14 +95,6 @@ models:
 - model_id: ${env.INFERENCE_MODEL}
   provider_id: vllm-inference
   model_type: llm
-- model_id: ${env.SAFETY_MODEL}
-  provider_id: vllm-safety
-  model_type: llm
-- model_id: ${env.MOCK_INFERENCE_MODEL}
-  provider_id: mock-vllm-inference
-  model_type: llm
-shields:
-- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
 vector_dbs: []
 datasets: []
 scoring_fns: []