chore: setup for performance benchmarking (#3096)
# What does this PR do?

1. Added a simple mock OpenAI-compatible server that serves chat completions.
2. Added a benchmark server setup in EKS that includes the mock inference server.
3. Added a Locust (https://locust.io/) file for load testing.

## Test Plan

bash apply.sh
kubectl port-forward service/locust-web-ui 8089:8089

Go to localhost:8089 to start a load test.

(Screenshots of the Locust web UI.)
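A quick way to confirm the benchmark components are up before port-forwarding (a sketch; the label selectors come from the manifests in this PR, and the default namespace is assumed):

# Check that Locust and the benchmark stack server are running
kubectl get pods -l app=locust
kubectl get pods -l app.kubernetes.io/name=llama-stack-benchmark

# Then expose the Locust web UI locally
kubectl port-forward service/locust-web-ui 8089:8089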
Commit d6ae54723d (parent 2f51273215): 11 changed files with 1234 additions and 3 deletions.
docs/source/distributions/k8s-benchmark/apply.sh (new executable file, 57 lines):

#!/usr/bin/env bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

# Deploys the benchmark-specific components on top of the base k8s deployment (../k8s/apply.sh).

export MOCK_INFERENCE_PORT=8080
export STREAM_DELAY_SECONDS=0.005

export POSTGRES_USER=llamastack
export POSTGRES_DB=llamastack
export POSTGRES_PASSWORD=llamastack

export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

export MOCK_INFERENCE_MODEL=mock-inference

# Use llama-stack-benchmark-service as the benchmark server
export LOCUST_HOST=http://llama-stack-benchmark-service:8323
export LOCUST_BASE_PATH=/v1/openai/v1

# Use vllm-service as the benchmark server
# export LOCUST_HOST=http://vllm-server:8000
# export LOCUST_BASE_PATH=/v1

export BENCHMARK_INFERENCE_MODEL=$INFERENCE_MODEL

set -euo pipefail
set -x

# Deploy benchmark-specific components
# Deploy OpenAI mock server
kubectl create configmap openai-mock --from-file=openai-mock-server.py \
  --dry-run=client -o yaml | kubectl apply --validate=false -f -

envsubst < openai-mock-deployment.yaml | kubectl apply --validate=false -f -

# Create configmap with our custom stack config
kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
  --dry-run=client -o yaml > stack-configmap.yaml

kubectl apply --validate=false -f stack-configmap.yaml

# Deploy our custom llama stack server (overriding the base one)
envsubst < stack-k8s.yaml.template | kubectl apply --validate=false -f -

# Deploy Locust load testing
kubectl create configmap locust-script --from-file=locustfile.py \
  --dry-run=client -o yaml | kubectl apply --validate=false -f -

envsubst < locust-k8s.yaml | kubectl apply --validate=false -f -
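A minimal sketch of running the script and waiting for the pieces to come up (deployment names taken from the manifests in this PR):

cd docs/source/distributions/k8s-benchmark
bash apply.sh

# Wait for the benchmark stack server and the Locust deployments to become available
kubectl rollout status deployment/llama-stack-benchmark-server
kubectl rollout status deployment/locust-master
kubectl rollout status deployment/locust-worker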
docs/source/distributions/k8s-benchmark/locust-k8s.yaml (new file, 131 lines):

apiVersion: apps/v1
kind: Deployment
metadata:
  name: locust-master
  labels:
    app: locust
    role: master
spec:
  replicas: 1
  selector:
    matchLabels:
      app: locust
      role: master
  template:
    metadata:
      labels:
        app: locust
        role: master
    spec:
      containers:
      - name: locust-master
        image: locustio/locust:2.31.8
        ports:
        - containerPort: 8089  # Web UI
        - containerPort: 5557  # Master communication
        env:
        - name: LOCUST_HOST
          value: "${LOCUST_HOST}"
        - name: LOCUST_LOCUSTFILE
          value: "/locust/locustfile.py"
        - name: LOCUST_WEB_HOST
          value: "0.0.0.0"
        - name: LOCUST_MASTER
          value: "true"
        - name: LOCUST_BASE_PATH
          value: "${LOCUST_BASE_PATH}"
        - name: INFERENCE_MODEL
          value: "${BENCHMARK_INFERENCE_MODEL}"
        volumeMounts:
        - name: locust-script
          mountPath: /locust
        command: ["locust"]
        args:
        - "--master"
        - "--web-host=0.0.0.0"
        - "--web-port=8089"
        - "--host=${LOCUST_HOST}"
        - "--locustfile=/locust/locustfile.py"
      volumes:
      - name: locust-script
        configMap:
          name: locust-script
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: locust-worker
  labels:
    app: locust
    role: worker
spec:
  replicas: 2  # Start with 2 workers, can be scaled up
  selector:
    matchLabels:
      app: locust
      role: worker
  template:
    metadata:
      labels:
        app: locust
        role: worker
    spec:
      containers:
      - name: locust-worker
        image: locustio/locust:2.31.8
        env:
        - name: LOCUST_HOST
          value: "${LOCUST_HOST}"
        - name: LOCUST_LOCUSTFILE
          value: "/locust/locustfile.py"
        - name: LOCUST_MASTER_HOST
          value: "locust-master-service"
        - name: LOCUST_MASTER_PORT
          value: "5557"
        - name: INFERENCE_MODEL
          value: "${BENCHMARK_INFERENCE_MODEL}"
        - name: LOCUST_BASE_PATH
          value: "${LOCUST_BASE_PATH}"
        volumeMounts:
        - name: locust-script
          mountPath: /locust
        command: ["locust"]
        args:
        - "--worker"
        - "--master-host=locust-master-service"
        - "--master-port=5557"
        - "--locustfile=/locust/locustfile.py"
      volumes:
      - name: locust-script
        configMap:
          name: locust-script
---
apiVersion: v1
kind: Service
metadata:
  name: locust-master-service
spec:
  selector:
    app: locust
    role: master
  ports:
  - name: web-ui
    port: 8089
    targetPort: 8089
  - name: master-comm
    port: 5557
    targetPort: 5557
  type: ClusterIP
---
apiVersion: v1
kind: Service
metadata:
  name: locust-web-ui
spec:
  selector:
    app: locust
    role: master
  ports:
  - port: 8089
    targetPort: 8089
  type: ClusterIP  # Keep internal, use port-forward to access
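The worker Deployment is sized for a small test; a sketch of adjusting generated load at runtime (deployment and service names from the manifest above):

# Scale Locust workers to increase generated load
kubectl scale deployment/locust-worker --replicas=4

# The web UI stays internal; reach it through a port-forward
kubectl port-forward service/locust-web-ui 8089:8089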
docs/source/distributions/k8s-benchmark/locustfile.py (new file, 78 lines):

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""
Locust load testing script for Llama Stack with the OpenAI-compatible mock provider.
"""

import os
import random

from locust import HttpUser, task, between

base_path = os.getenv("LOCUST_BASE_PATH", "/v1/openai/v1")

MODEL_ID = os.getenv("INFERENCE_MODEL")


class LlamaStackUser(HttpUser):
    wait_time = between(0.0, 0.0001)

    def on_start(self):
        """Set up request headers and test data."""
        # No auth required for benchmark server
        self.headers = {
            "Content-Type": "application/json"
        }

        # Test messages of varying lengths
        self.test_messages = [
            [{"role": "user", "content": "Hi"}],
            [{"role": "user", "content": "What is the capital of France?"}],
            [{"role": "user", "content": "Explain quantum physics in simple terms."}],
            [{"role": "user", "content": "Write a short story about a robot learning to paint."}],
            [
                {"role": "user", "content": "What is machine learning?"},
                {"role": "assistant", "content": "Machine learning is a subset of AI..."},
                {"role": "user", "content": "Can you give me a practical example?"}
            ]
        ]

    @task(weight=100)
    def chat_completion_streaming(self):
        """Test streaming chat completions (the only task, so it accounts for all requests)."""
        messages = random.choice(self.test_messages)
        payload = {
            "model": MODEL_ID,
            "messages": messages,
            "stream": True,
            "max_tokens": 100
        }

        with self.client.post(
            f"{base_path}/chat/completions",
            headers=self.headers,
            json=payload,
            stream=True,
            catch_response=True
        ) as response:
            if response.status_code == 200:
                chunks_received = 0
                try:
                    for line in response.iter_lines():
                        if line:
                            line_str = line.decode('utf-8')
                            if line_str.startswith('data: '):
                                chunks_received += 1
                                if line_str.strip() == 'data: [DONE]':
                                    break

                    if chunks_received > 0:
                        response.success()
                    else:
                        response.failure("No streaming chunks received")
                except Exception as e:
                    response.failure(f"Streaming error: {e}")
            else:
                response.failure(f"HTTP {response.status_code}: {response.text}")
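The locustfile only depends on two environment variables, so it can also be run outside the cluster; a sketch of a headless local run, assuming the benchmark service has been port-forwarded to localhost:8323 and Locust is installed locally:

kubectl port-forward service/llama-stack-benchmark-service 8323:8323 &

# Drive the same scenario headlessly: 10 users, spawn rate 2/s, for 1 minute
LOCUST_BASE_PATH=/v1/openai/v1 \
INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
locust -f docs/source/distributions/k8s-benchmark/locustfile.py \
  --headless -u 10 -r 2 -t 1m --host=http://localhost:8323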
docs/source/distributions/k8s-benchmark/openai-mock-deployment.yaml (new file, 52 lines):

apiVersion: apps/v1
kind: Deployment
metadata:
  name: openai-mock
  labels:
    app: openai-mock
spec:
  replicas: 1
  selector:
    matchLabels:
      app: openai-mock
  template:
    metadata:
      labels:
        app: openai-mock
    spec:
      containers:
      - name: openai-mock
        image: python:3.12-slim
        ports:
        - containerPort: ${MOCK_INFERENCE_PORT}
        env:
        - name: PORT
          value: "${MOCK_INFERENCE_PORT}"
        - name: MOCK_MODELS
          value: "${MOCK_INFERENCE_MODEL}"
        - name: STREAM_DELAY_SECONDS
          value: "${STREAM_DELAY_SECONDS}"
        command: ["sh", "-c"]
        args:
        - |
          pip install flask &&
          python /app/openai-mock-server.py --port ${MOCK_INFERENCE_PORT}
        volumeMounts:
        - name: openai-mock-script
          mountPath: /app
      volumes:
      - name: openai-mock-script
        configMap:
          name: openai-mock
---
apiVersion: v1
kind: Service
metadata:
  name: openai-mock-service
spec:
  selector:
    app: openai-mock
  ports:
  - port: 8080
    targetPort: 8080
  type: ClusterIP
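A quick in-cluster smoke test of the mock service (a sketch; uses a throwaway curl pod and the default 8080 Service port above):

# List the models the mock server advertises
kubectl run curl-test --rm -it --restart=Never --image=curlimages/curl -- \
  curl -s http://openai-mock-service:8080/models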
docs/source/distributions/k8s-benchmark/openai-mock-server.py (new file, 190 lines):

#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""
OpenAI-compatible mock server that returns:
- Hardcoded /models response for consistent validation
- Valid OpenAI-formatted chat completion responses with dynamic content
"""

import argparse
import json
import os
import random
import time
import uuid

from flask import Flask, request, jsonify, Response

app = Flask(__name__)

# Models from environment variables
def get_models():
    models_str = os.getenv("MOCK_MODELS", "mock-inference")
    model_ids = [m.strip() for m in models_str.split(",") if m.strip()]

    return {
        "object": "list",
        "data": [
            {
                "id": model_id,
                "object": "model",
                "created": 1234567890,
                "owned_by": "vllm"
            }
            for model_id in model_ids
        ]
    }

def generate_random_text(length=50):
    """Generate random but coherent text for responses."""
    words = [
        "Hello", "there", "I'm", "an", "AI", "assistant", "ready", "to", "help", "you",
        "with", "your", "questions", "and", "tasks", "today", "Let", "me", "know", "what",
        "you'd", "like", "to", "discuss", "or", "explore", "together", "I", "can", "assist",
        "with", "various", "topics", "including", "coding", "writing", "analysis", "and", "more"
    ]
    return " ".join(random.choices(words, k=length))

@app.route('/models', methods=['GET'])
def list_models():
    models = get_models()
    print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}")
    return jsonify(models)

@app.route('/chat/completions', methods=['POST'])
def chat_completions():
    """Return OpenAI-formatted chat completion responses."""
    data = request.get_json()
    default_model = get_models()['data'][0]['id']
    model = data.get('model', default_model)
    messages = data.get('messages', [])
    stream = data.get('stream', False)

    print(f"[MOCK] Chat completion request - model: {model}, stream: {stream}")

    if stream:
        return handle_streaming_completion(model, messages)
    else:
        return handle_non_streaming_completion(model, messages)

def handle_non_streaming_completion(model, messages):
    response_text = generate_random_text(random.randint(20, 80))

    # Calculate realistic token counts
    prompt_tokens = sum(len(str(msg.get('content', '')).split()) for msg in messages)
    completion_tokens = len(response_text.split())

    response = {
        "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model,
        "choices": [
            {
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": response_text
                },
                "finish_reason": "stop"
            }
        ],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens
        }
    }

    return jsonify(response)

def handle_streaming_completion(model, messages):
    def generate_stream():
        # Generate response text
        full_response = generate_random_text(random.randint(30, 100))
        words = full_response.split()

        # Send initial chunk
        initial_chunk = {
            "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
            "object": "chat.completion.chunk",
            "created": int(time.time()),
            "model": model,
            "choices": [
                {
                    "index": 0,
                    "delta": {"role": "assistant", "content": ""}
                }
            ]
        }
        yield f"data: {json.dumps(initial_chunk)}\n\n"

        # Send word by word
        for i, word in enumerate(words):
            chunk = {
                "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
                "object": "chat.completion.chunk",
                "created": int(time.time()),
                "model": model,
                "choices": [
                    {
                        "index": 0,
                        "delta": {"content": f"{word} " if i < len(words) - 1 else word}
                    }
                ]
            }
            yield f"data: {json.dumps(chunk)}\n\n"
            # Configurable delay to simulate realistic streaming
            stream_delay = float(os.getenv("STREAM_DELAY_SECONDS", "0.005"))
            time.sleep(stream_delay)

        # Send final chunk
        final_chunk = {
            "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
            "object": "chat.completion.chunk",
            "created": int(time.time()),
            "model": model,
            "choices": [
                {
                    "index": 0,
                    "delta": {"content": ""},
                    "finish_reason": "stop"
                }
            ]
        }
        yield f"data: {json.dumps(final_chunk)}\n\n"
        yield "data: [DONE]\n\n"

    return Response(
        generate_stream(),
        mimetype='text/event-stream',
        headers={
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Access-Control-Allow-Origin': '*',
        }
    )

@app.route('/health', methods=['GET'])
def health():
    return jsonify({"status": "healthy", "type": "openai-mock"})

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='OpenAI-compatible mock server')
    parser.add_argument('--port', type=int, default=8081,
                        help='Port to run the server on (default: 8081)')
    args = parser.parse_args()

    port = args.port

    models = get_models()
    print("Starting OpenAI-compatible mock server...")
    print(f"- /models endpoint with: {[m['id'] for m in models['data']]}")
    print("- OpenAI-formatted chat/completion responses with dynamic content")
    print("- Streaming support with valid SSE format")
    print(f"- Listening on: http://0.0.0.0:{port}")
    app.run(host='0.0.0.0', port=port, debug=False)
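The mock server can also be exercised locally without Kubernetes; a sketch, assuming Flask is installed and port 8080 is free:

pip install flask
python docs/source/distributions/k8s-benchmark/openai-mock-server.py --port 8080

# In another terminal: request a non-streaming chat completion from the mock
curl -s http://localhost:8080/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "mock-inference", "messages": [{"role": "user", "content": "Hi"}], "stream": false}'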
docs/source/distributions/k8s-benchmark/stack-configmap.yaml (new file, 143 lines):

apiVersion: v1
data:
  stack_run_config.yaml: |
    version: '2'
    image_name: kubernetes-benchmark-demo
    apis:
    - agents
    - inference
    - safety
    - telemetry
    - tool_runtime
    - vector_io
    providers:
      inference:
      - provider_id: vllm-inference
        provider_type: remote::vllm
        config:
          url: ${env.VLLM_URL:=http://localhost:8000/v1}
          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
          api_token: ${env.VLLM_API_TOKEN:=fake}
          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
      - provider_id: vllm-safety
        provider_type: remote::vllm
        config:
          url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
          api_token: ${env.VLLM_API_TOKEN:=fake}
          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
      - provider_id: mock-vllm-inference
        provider_type: remote::vllm
        config:
          url: http://openai-mock-service:${env.MOCK_INFERENCE_PORT}
          max_tokens: 4096
          api_token: fake
          tls_verify: false
      - provider_id: sentence-transformers
        provider_type: inline::sentence-transformers
        config: {}
      vector_io:
      - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
        provider_type: remote::chromadb
        config:
          url: ${env.CHROMADB_URL:=}
          kvstore:
            type: postgres
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
      safety:
      - provider_id: llama-guard
        provider_type: inline::llama-guard
        config:
          excluded_categories: []
      agents:
      - provider_id: meta-reference
        provider_type: inline::meta-reference
        config:
          persistence_store:
            type: postgres
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
          responses_store:
            type: postgres
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
      telemetry:
      - provider_id: meta-reference
        provider_type: inline::meta-reference
        config:
          service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
          sinks: ${env.TELEMETRY_SINKS:=console}
      tool_runtime:
      - provider_id: brave-search
        provider_type: remote::brave-search
        config:
          api_key: ${env.BRAVE_SEARCH_API_KEY:+}
          max_results: 3
      - provider_id: tavily-search
        provider_type: remote::tavily-search
        config:
          api_key: ${env.TAVILY_SEARCH_API_KEY:+}
          max_results: 3
      - provider_id: rag-runtime
        provider_type: inline::rag-runtime
        config: {}
      - provider_id: model-context-protocol
        provider_type: remote::model-context-protocol
        config: {}
    metadata_store:
      type: postgres
      host: ${env.POSTGRES_HOST:=localhost}
      port: ${env.POSTGRES_PORT:=5432}
      db: ${env.POSTGRES_DB:=llamastack}
      user: ${env.POSTGRES_USER:=llamastack}
      password: ${env.POSTGRES_PASSWORD:=llamastack}
      table_name: llamastack_kvstore
    inference_store:
      type: postgres
      host: ${env.POSTGRES_HOST:=localhost}
      port: ${env.POSTGRES_PORT:=5432}
      db: ${env.POSTGRES_DB:=llamastack}
      user: ${env.POSTGRES_USER:=llamastack}
      password: ${env.POSTGRES_PASSWORD:=llamastack}
    models:
    - metadata:
        embedding_dimension: 384
      model_id: all-MiniLM-L6-v2
      provider_id: sentence-transformers
      model_type: embedding
    - model_id: ${env.INFERENCE_MODEL}
      provider_id: vllm-inference
      model_type: llm
    - model_id: ${env.SAFETY_MODEL}
      provider_id: vllm-safety
      model_type: llm
    - model_id: ${env.MOCK_INFERENCE_MODEL}
      provider_id: mock-vllm-inference
      model_type: llm
    shields:
    - shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
    vector_dbs: []
    datasets: []
    scoring_fns: []
    benchmarks: []
    tool_groups:
    - toolgroup_id: builtin::websearch
      provider_id: tavily-search
    - toolgroup_id: builtin::rag
      provider_id: rag-runtime
    server:
      port: 8323
kind: ConfigMap
metadata:
  creationTimestamp: null
  name: llama-stack-config
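This file is generated; if stack_run_config.yaml changes, it is regenerated with the same command apply.sh uses:

# Regenerate the checked-in ConfigMap from the canonical run config
kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
  --dry-run=client -o yaml > stack-configmap.yaml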
docs/source/distributions/k8s-benchmark/stack-k8s.yaml.template (new file, 87 lines):

apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: llama-benchmark-pvc
spec:
  accessModes:
  - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-stack-benchmark-server
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: llama-stack-benchmark
      app.kubernetes.io/component: server
  template:
    metadata:
      labels:
        app.kubernetes.io/name: llama-stack-benchmark
        app.kubernetes.io/component: server
    spec:
      containers:
      - name: llama-stack-benchmark
        image: llamastack/distribution-starter:latest
        imagePullPolicy: Always  # since we have specified latest instead of a version
        env:
        - name: ENABLE_CHROMADB
          value: "true"
        - name: CHROMADB_URL
          value: http://chromadb.default.svc.cluster.local:6000
        - name: POSTGRES_HOST
          value: postgres-server.default.svc.cluster.local
        - name: POSTGRES_PORT
          value: "5432"
        - name: INFERENCE_MODEL
          value: "${INFERENCE_MODEL}"
        - name: SAFETY_MODEL
          value: "${SAFETY_MODEL}"
        - name: TAVILY_SEARCH_API_KEY
          value: "${TAVILY_SEARCH_API_KEY}"
        - name: MOCK_INFERENCE_PORT
          value: "${MOCK_INFERENCE_PORT}"
        - name: VLLM_URL
          value: http://vllm-server.default.svc.cluster.local:8000/v1
        - name: VLLM_MAX_TOKENS
          value: "3072"
        - name: VLLM_SAFETY_URL
          value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
        - name: VLLM_TLS_VERIFY
          value: "false"
        - name: MOCK_INFERENCE_MODEL
          value: "${MOCK_INFERENCE_MODEL}"
        command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8323"]
        ports:
        - containerPort: 8323
        volumeMounts:
        - name: llama-storage
          mountPath: /root/.llama
        - name: llama-config
          mountPath: /etc/config
      volumes:
      - name: llama-storage
        persistentVolumeClaim:
          claimName: llama-benchmark-pvc
      - name: llama-config
        configMap:
          name: llama-stack-config
---
apiVersion: v1
kind: Service
metadata:
  name: llama-stack-benchmark-service
spec:
  selector:
    app.kubernetes.io/name: llama-stack-benchmark
    app.kubernetes.io/component: server
  ports:
  - name: http
    port: 8323
    targetPort: 8323
  type: ClusterIP
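Because this template is rendered with envsubst, it can be rendered and validated client-side without touching the cluster; a sketch, assuming the same variable values apply.sh exports:

# Render the template and validate it without applying anything
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
export MOCK_INFERENCE_MODEL=mock-inference
export MOCK_INFERENCE_PORT=8080
envsubst < stack-k8s.yaml.template | kubectl apply --dry-run=client -f -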
docs/source/distributions/k8s-benchmark/stack_run_config.yaml (new file, 136 lines):

version: '2'
image_name: kubernetes-benchmark-demo
apis:
- agents
- inference
- safety
- telemetry
- tool_runtime
- vector_io
providers:
  inference:
  - provider_id: vllm-inference
    provider_type: remote::vllm
    config:
      url: ${env.VLLM_URL:=http://localhost:8000/v1}
      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
      api_token: ${env.VLLM_API_TOKEN:=fake}
      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
  - provider_id: vllm-safety
    provider_type: remote::vllm
    config:
      url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
      api_token: ${env.VLLM_API_TOKEN:=fake}
      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
  - provider_id: mock-vllm-inference
    provider_type: remote::vllm
    config:
      url: http://openai-mock-service:${env.MOCK_INFERENCE_PORT}
      max_tokens: 4096
      api_token: fake
      tls_verify: false
  - provider_id: sentence-transformers
    provider_type: inline::sentence-transformers
    config: {}
  vector_io:
  - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
    provider_type: remote::chromadb
    config:
      url: ${env.CHROMADB_URL:=}
      kvstore:
        type: postgres
        host: ${env.POSTGRES_HOST:=localhost}
        port: ${env.POSTGRES_PORT:=5432}
        db: ${env.POSTGRES_DB:=llamastack}
        user: ${env.POSTGRES_USER:=llamastack}
        password: ${env.POSTGRES_PASSWORD:=llamastack}
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config:
      excluded_categories: []
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: postgres
        host: ${env.POSTGRES_HOST:=localhost}
        port: ${env.POSTGRES_PORT:=5432}
        db: ${env.POSTGRES_DB:=llamastack}
        user: ${env.POSTGRES_USER:=llamastack}
        password: ${env.POSTGRES_PASSWORD:=llamastack}
      responses_store:
        type: postgres
        host: ${env.POSTGRES_HOST:=localhost}
        port: ${env.POSTGRES_PORT:=5432}
        db: ${env.POSTGRES_DB:=llamastack}
        user: ${env.POSTGRES_USER:=llamastack}
        password: ${env.POSTGRES_PASSWORD:=llamastack}
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
      sinks: ${env.TELEMETRY_SINKS:=console}
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
    config:
      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
      max_results: 3
  - provider_id: tavily-search
    provider_type: remote::tavily-search
    config:
      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
      max_results: 3
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
    config: {}
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
    config: {}
metadata_store:
  type: postgres
  host: ${env.POSTGRES_HOST:=localhost}
  port: ${env.POSTGRES_PORT:=5432}
  db: ${env.POSTGRES_DB:=llamastack}
  user: ${env.POSTGRES_USER:=llamastack}
  password: ${env.POSTGRES_PASSWORD:=llamastack}
  table_name: llamastack_kvstore
inference_store:
  type: postgres
  host: ${env.POSTGRES_HOST:=localhost}
  port: ${env.POSTGRES_PORT:=5432}
  db: ${env.POSTGRES_DB:=llamastack}
  user: ${env.POSTGRES_USER:=llamastack}
  password: ${env.POSTGRES_PASSWORD:=llamastack}
models:
- metadata:
    embedding_dimension: 384
  model_id: all-MiniLM-L6-v2
  provider_id: sentence-transformers
  model_type: embedding
- model_id: ${env.INFERENCE_MODEL}
  provider_id: vllm-inference
  model_type: llm
- model_id: ${env.SAFETY_MODEL}
  provider_id: vllm-safety
  model_type: llm
- model_id: ${env.MOCK_INFERENCE_MODEL}
  provider_id: mock-vllm-inference
  model_type: llm
shields:
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
  provider_id: tavily-search
- toolgroup_id: builtin::rag
  provider_id: rag-runtime
server:
  port: 8323