Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-10-07 04:45:44 +00:00

base: 35 RPS; safety: 75 RPS

# What does this PR do?

## Test Plan
This commit is contained in:
parent faf891b40c
commit c3fa3e6333
5 changed files with 41 additions and 4 deletions
@@ -0,0 +1,19 @@
+version: '2'
+image_name: perf-test-demo
+apis:
+- inference
+providers:
+  inference:
+  - provider_id: vllm-inference
+    provider_type: remote::vllm
+    config:
+      url: ${env.VLLM_URL:=http://localhost:8001/v1}
+      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
+      api_token: ${env.VLLM_API_TOKEN:=fake}
+      tls_verify: ${env.VLLM_TLS_VERIFY:=false}
+models:
+- model_id: ${env.INFERENCE_MODEL}
+  provider_id: vllm-inference
+  model_type: llm
+server:
+  port: 8322

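The `${env.NAME:=default}` placeholders in this run config resolve to the named environment variable and fall back to the value after `:=` when the variable is unset. A minimal sketch of that substitution logic (illustrative only, not the actual llama-stack resolver):

```python
import os
import re

# Matches ${env.NAME:=default} and plain ${env.NAME} placeholders (illustrative pattern).
_ENV_PATTERN = re.compile(r"\$\{env\.(?P<name>[A-Za-z_][A-Za-z0-9_]*)(?::=(?P<default>[^}]*))?\}")


def resolve_env_placeholders(value: str) -> str:
    """Replace ${env.NAME:=default} placeholders with the environment value or the default."""

    def _sub(match: re.Match) -> str:
        name = match.group("name")
        default = match.group("default")
        resolved = os.environ.get(name, default)
        if resolved is None:
            raise ValueError(f"environment variable {name} is not set and has no default")
        return resolved

    return _ENV_PATTERN.sub(_sub, value)


# With no VLLM_URL set, this prints the localhost default from the config above.
print(resolve_env_placeholders("${env.VLLM_URL:=http://localhost:8001/v1}"))
```
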
@@ -3,6 +3,7 @@ image_name: kubernetes-benchmark-demo
 apis:
 - agents
 - inference
+- safety
 - telemetry
 - tool_runtime
 - vector_io

@@ -30,6 +31,11 @@ providers:
       db: ${env.POSTGRES_DB:=llamastack}
       user: ${env.POSTGRES_USER:=llamastack}
       password: ${env.POSTGRES_PASSWORD:=llamastack}
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config:
+      excluded_categories: []
   agents:
   - provider_id: meta-reference
     provider_type: inline::meta-reference

@ -95,6 +101,8 @@ models:
|
||||||
- model_id: ${env.INFERENCE_MODEL}
|
- model_id: ${env.INFERENCE_MODEL}
|
||||||
provider_id: vllm-inference
|
provider_id: vllm-inference
|
||||||
model_type: llm
|
model_type: llm
|
||||||
|
shields:
|
||||||
|
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
|
||||||
vector_dbs: []
|
vector_dbs: []
|
||||||
datasets: []
|
datasets: []
|
||||||
scoring_fns: []
|
scoring_fns: []
|
||||||
|
|
|
@@ -525,9 +525,8 @@ class InferenceRouter(Inference):
 
         response = await self._nonstream_openai_chat_completion(provider, params)
 
-        # Store the response with the ID that will be returned to the client
         if self.store:
-            await self.store.store_chat_completion(response, messages)
+            asyncio.create_task(self.store.store_chat_completion(response, messages))
 
         if self.telemetry:
             metrics = self._construct_metrics(

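The change above moves the chat-completion store write off the request path: instead of being awaited, it is scheduled as a background task so the response can return immediately. A self-contained sketch of the same fire-and-forget pattern (`store_chat_completion` and `handle_request` here are stand-ins, not llama-stack code); note the usual caveat of keeping a reference to the task so it is not garbage-collected before it finishes:

```python
import asyncio

_background_tasks: set[asyncio.Task] = set()


async def store_chat_completion(response: dict, messages: list[dict]) -> None:
    # Stand-in for a database/KV write that the client does not need to wait for.
    await asyncio.sleep(0.05)
    print(f"stored completion {response['id']} with {len(messages)} message(s)")


async def handle_request() -> dict:
    response = {"id": "chatcmpl-123", "choices": []}
    messages = [{"role": "user", "content": "hello"}]

    # Schedule the write without blocking the response path.
    task = asyncio.create_task(store_chat_completion(response, messages))
    # Keep a reference so the task cannot be garbage-collected mid-flight,
    # then drop it automatically once it completes.
    _background_tasks.add(task)
    task.add_done_callback(_background_tasks.discard)

    return response  # returned immediately; the write finishes in the background


async def main() -> None:
    await handle_request()
    await asyncio.sleep(0.1)  # demo only: give the background write time to finish


asyncio.run(main())
```
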
@@ -855,4 +854,4 @@ class InferenceRouter(Inference):
             object="chat.completion",
         )
         logger.debug(f"InferenceRouter.completion_response: {final_response}")
-        await self.store.store_chat_completion(final_response, messages)
+        asyncio.create_task(self.store.store_chat_completion(final_response, messages))

@@ -73,6 +73,7 @@ from llama_stack.providers.inline.telemetry.meta_reference.telemetry import (
     TelemetryAdapter,
 )
 from llama_stack.providers.utils.telemetry.tracing import (
+    BACKGROUND_LOGGER,
     CURRENT_TRACE_CONTEXT,
     end_trace,
     setup_logger,

@@ -204,6 +205,10 @@ async def sse_generator(event_gen_coroutine):
 
 
 async def log_request_pre_validation(request: Request):
+    # Skip expensive body parsing if debug logging is disabled
+    if not logger.isEnabledFor(logging.DEBUG):
+        return
+
     if request.method in ("POST", "PUT", "PATCH"):
         try:
             body_bytes = await request.body()

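The added guard skips reading and serializing the request body entirely unless DEBUG logging is on, since `logging.Logger.isEnabledFor` is a cheap level check. The same pattern in a self-contained form (the logger name and payload here are placeholders):

```python
import json
import logging

logger = logging.getLogger("request-logging-demo")
logging.basicConfig(level=logging.INFO)  # DEBUG disabled, so the body is never serialized


def log_payload(payload: dict) -> None:
    # Guard first: isEnabledFor() is a cheap level check, so the expensive
    # json.dumps() below only runs when DEBUG logging is actually enabled.
    if not logger.isEnabledFor(logging.DEBUG):
        return
    logger.debug("payload: %s", json.dumps(payload, indent=2))


log_payload({"model": "example", "messages": [{"role": "user", "content": "hi"}]})
```
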
@@ -305,6 +310,10 @@ class TracingMiddleware:
             logger.debug(f"Bypassing custom routing for FastAPI built-in path: {path}")
             return await self.app(scope, receive, send)
 
+        # Quick exit if telemetry is disabled - skip expensive route matching and tracing
+        if BACKGROUND_LOGGER is None:
+            return await self.app(scope, receive, send)
+
         if not hasattr(self, "route_impls"):
             self.route_impls = initialize_route_impls(self.impls, self.external_apis)
 

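The middleware change applies the same idea at the ASGI level: when telemetry is off (`BACKGROUND_LOGGER is None`), the request is handed straight to the wrapped app before any route matching or span creation happens. A minimal ASGI-style sketch of that quick-exit shape (illustrative, not the actual `TracingMiddleware`; `FEATURE_ENABLED` is a stand-in flag):

```python
FEATURE_ENABLED = False  # stand-in for "is telemetry configured?"


class QuickExitMiddleware:
    def __init__(self, app):
        self.app = app

    async def __call__(self, scope, receive, send):
        if scope["type"] != "http" or not FEATURE_ENABLED:
            # Disabled (or not an HTTP request): no route matching, no tracing, no overhead.
            return await self.app(scope, receive, send)
        # ... per-request work (route matching, span creation) would go here ...
        return await self.app(scope, receive, send)
```
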
@@ -362,7 +362,9 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         return AsyncOpenAI(
             base_url=self.config.url,
             api_key=self.config.api_token,
-            http_client=httpx.AsyncClient(verify=self.config.tls_verify),
+            http_client=httpx.AsyncClient(
+                verify=self.config.tls_verify, limits=httpx.Limits(max_connections=1000, max_keepalive_connections=1000)
+            ),
         )
 
     async def completion(

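The adapter change raises the connection-pool limits on the `httpx.AsyncClient` handed to the OpenAI-compatible client, so many concurrent requests to the same vLLM server are not queued behind httpx's much smaller defaults. A sketch of the same construction (the URL, token, and pool sizes are placeholder values):

```python
import httpx
from openai import AsyncOpenAI


def make_client(base_url: str = "http://localhost:8001/v1", api_token: str = "fake") -> AsyncOpenAI:
    http_client = httpx.AsyncClient(
        verify=False,  # mirrors tls_verify=false in the perf-test config above
        # httpx's default pool allows far fewer concurrent connections; raising the
        # limits avoids queueing when many requests target the same vLLM server.
        limits=httpx.Limits(max_connections=1000, max_keepalive_connections=1000),
    )
    return AsyncOpenAI(base_url=base_url, api_key=api_token, http_client=http_client)
```
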