mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-10-09 13:14:39 +00:00
base: 35 RPS; safety, 75 RPS
# What does this PR do? ## Test Plan
This commit is contained in:
parent
faf891b40c
commit
c3fa3e6333
5 changed files with 41 additions and 4 deletions
|
@@ -0,0 +1,19 @@
version: '2'
image_name: perf-test-demo
apis:
- inference
providers:
  inference:
  - provider_id: vllm-inference
    provider_type: remote::vllm
    config:
      url: ${env.VLLM_URL:=http://localhost:8001/v1}
      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
      api_token: ${env.VLLM_API_TOKEN:=fake}
      tls_verify: ${env.VLLM_TLS_VERIFY:=false}
models:
- model_id: ${env.INFERENCE_MODEL}
  provider_id: vllm-inference
  model_type: llm
server:
  port: 8322
Loading…
Add table
Add a link
Reference in a new issue