# What does this PR do?


## Test Plan

base: 35 RPS; safety: 75 RPS
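
To reproduce throughput numbers like these, one option is a small concurrency loop against the stack's OpenAI-compatible chat completions endpoint. A minimal sketch, assuming the perf-test-demo stack added below is running on port 8322, that its OpenAI-compatible API is served under `/v1/openai/v1`, and that `INFERENCE_MODEL` is exported; the request count and prompt are illustrative, not part of this change:

```python
import asyncio
import os
import time

from openai import AsyncOpenAI

# Assumptions: base path and api_key handling may differ in your deployment.
client = AsyncOpenAI(base_url="http://localhost:8322/v1/openai/v1", api_key="fake")
MODEL = os.environ["INFERENCE_MODEL"]


async def one_request() -> None:
    await client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": "Say hi in one word."}],
        max_tokens=8,
    )


async def main(n: int = 200) -> None:
    start = time.perf_counter()
    await asyncio.gather(*(one_request() for _ in range(n)))
    elapsed = time.perf_counter() - start
    print(f"{n} requests in {elapsed:.1f}s -> {n / elapsed:.1f} RPS")


asyncio.run(main())
```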
Eric Huang, 2025-09-02 14:00:23 -07:00
parent faf891b40c, commit c3fa3e6333
5 changed files with 41 additions and 4 deletions


@@ -0,0 +1,19 @@
version: '2'
image_name: perf-test-demo
apis:
- inference
providers:
  inference:
  - provider_id: vllm-inference
    provider_type: remote::vllm
    config:
      url: ${env.VLLM_URL:=http://localhost:8001/v1}
      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
      api_token: ${env.VLLM_API_TOKEN:=fake}
      tls_verify: ${env.VLLM_TLS_VERIFY:=false}
models:
- model_id: ${env.INFERENCE_MODEL}
  provider_id: vllm-inference
  model_type: llm
server:
  port: 8322


@@ -3,6 +3,7 @@ image_name: kubernetes-benchmark-demo
 apis:
 - agents
 - inference
+- safety
 - telemetry
 - tool_runtime
 - vector_io
@@ -30,6 +31,11 @@ providers:
         db: ${env.POSTGRES_DB:=llamastack}
         user: ${env.POSTGRES_USER:=llamastack}
         password: ${env.POSTGRES_PASSWORD:=llamastack}
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config:
+      excluded_categories: []
   agents:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -95,6 +101,8 @@ models:
 - model_id: ${env.INFERENCE_MODEL}
   provider_id: vllm-inference
   model_type: llm
+shields:
+- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
 vector_dbs: []
 datasets: []
 scoring_fns: []
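
A quick way to confirm the new llama-guard provider and shield registration took effect is to list the registered shields from the running stack. A minimal sketch, assuming the benchmark stack is reachable on localhost:8321 and lists shields at `/v1/shields`; the host, port, path, and response shape are all assumptions here, not taken from this PR:

```python
import httpx

# Assumption: shields are listed at /v1/shields and returned under a "data" key;
# adjust host, port, and parsing to match your deployment.
resp = httpx.get("http://localhost:8321/v1/shields", timeout=10.0)
resp.raise_for_status()
for shield in resp.json().get("data", []):
    print(shield.get("identifier"), "->", shield.get("provider_id"))
```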


@@ -525,9 +525,8 @@ class InferenceRouter(Inference):
         response = await self._nonstream_openai_chat_completion(provider, params)
-        # Store the response with the ID that will be returned to the client
         if self.store:
-            await self.store.store_chat_completion(response, messages)
+            asyncio.create_task(self.store.store_chat_completion(response, messages))
         if self.telemetry:
             metrics = self._construct_metrics(
@@ -855,4 +854,4 @@ class InferenceRouter(Inference):
             object="chat.completion",
         )
         logger.debug(f"InferenceRouter.completion_response: {final_response}")
-        await self.store.store_chat_completion(final_response, messages)
+        asyncio.create_task(self.store.store_chat_completion(final_response, messages))
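
The router change above swaps an awaited store write for a fire-and-forget `asyncio.create_task`, so the response is returned without waiting on the inference store. A general caveat with that pattern (not specific to this diff) is that unreferenced tasks can be garbage-collected and their exceptions dropped silently; a common mitigation, sketched here with illustrative names, is to keep a reference and log failures from a done callback:

```python
import asyncio
import logging

logger = logging.getLogger(__name__)
_background_tasks: set[asyncio.Task] = set()  # strong references keep tasks alive


def fire_and_forget(coro) -> asyncio.Task:
    """Schedule a coroutine without awaiting it; log failures instead of dropping them."""
    task = asyncio.create_task(coro)
    _background_tasks.add(task)

    def _on_done(t: asyncio.Task) -> None:
        _background_tasks.discard(t)
        if not t.cancelled() and t.exception() is not None:
            logger.warning("background task failed", exc_info=t.exception())

    task.add_done_callback(_on_done)
    return task
```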


@@ -73,6 +73,7 @@ from llama_stack.providers.inline.telemetry.meta_reference.telemetry import (
     TelemetryAdapter,
 )
 from llama_stack.providers.utils.telemetry.tracing import (
+    BACKGROUND_LOGGER,
     CURRENT_TRACE_CONTEXT,
     end_trace,
     setup_logger,
@@ -204,6 +205,10 @@ async def sse_generator(event_gen_coroutine):
 async def log_request_pre_validation(request: Request):
+    # Skip expensive body parsing if debug logging is disabled
+    if not logger.isEnabledFor(logging.DEBUG):
+        return
+
     if request.method in ("POST", "PUT", "PATCH"):
         try:
             body_bytes = await request.body()
@@ -305,6 +310,10 @@ class TracingMiddleware:
             logger.debug(f"Bypassing custom routing for FastAPI built-in path: {path}")
             return await self.app(scope, receive, send)
+        # Quick exit if telemetry is disabled - skip expensive route matching and tracing
+        if BACKGROUND_LOGGER is None:
+            return await self.app(scope, receive, send)
+
         if not hasattr(self, "route_impls"):
             self.route_impls = initialize_route_impls(self.impls, self.external_apis)
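
Both server-side guards above follow the same idea: return early before doing work whose output would be discarded. `Logger.isEnabledFor` is a cheap level comparison, so the body read and formatting only happen when DEBUG logging is actually on. A standalone sketch of the pattern, with illustrative names rather than the PR's own:

```python
import logging

logger = logging.getLogger("request_logger")


def log_request_body(body: bytes) -> None:
    # isEnabledFor only compares levels, so the decode/format below is
    # skipped entirely unless DEBUG logging is enabled.
    if not logger.isEnabledFor(logging.DEBUG):
        return
    logger.debug("request body: %s", body.decode("utf-8", errors="replace"))
```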


@@ -362,7 +362,9 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         return AsyncOpenAI(
             base_url=self.config.url,
             api_key=self.config.api_token,
-            http_client=httpx.AsyncClient(verify=self.config.tls_verify),
+            http_client=httpx.AsyncClient(
+                verify=self.config.tls_verify, limits=httpx.Limits(max_connections=1000, max_keepalive_connections=1000)
+            ),
         )

     async def completion(
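
The pool change raises httpx's defaults (100 max connections, 20 keep-alive), which otherwise cap concurrency and force requests to queue behind the pool under benchmark load. A standalone sketch of constructing a client with the larger limits; `verify=False` mirrors the demo config's `tls_verify=false` default but is illustrative here:

```python
import httpx

# httpx defaults to max_connections=100 and max_keepalive_connections=20;
# raising both lets many concurrent inference requests share one client.
client = httpx.AsyncClient(
    verify=False,  # mirrors tls_verify=false in the demo config
    limits=httpx.Limits(max_connections=1000, max_keepalive_connections=1000),
)
```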