diff --git a/docs/source/distributions/k8s-benchmark/perf_test_config.yaml b/docs/source/distributions/k8s-benchmark/perf_test_config.yaml
new file mode 100644
index 000000000..a7150c972
--- /dev/null
+++ b/docs/source/distributions/k8s-benchmark/perf_test_config.yaml
@@ -0,0 +1,19 @@
+version: '2'
+image_name: perf-test-demo
+apis:
+- inference
+providers:
+  inference:
+  - provider_id: vllm-inference
+    provider_type: remote::vllm
+    config:
+      url: ${env.VLLM_URL:=http://localhost:8001/v1}
+      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
+      api_token: ${env.VLLM_API_TOKEN:=fake}
+      tls_verify: ${env.VLLM_TLS_VERIFY:=false}
+models:
+- model_id: ${env.INFERENCE_MODEL}
+  provider_id: vllm-inference
+  model_type: llm
+server:
+  port: 8322
\ No newline at end of file
diff --git a/docs/source/distributions/k8s-benchmark/stack_run_config.yaml b/docs/source/distributions/k8s-benchmark/stack_run_config.yaml
index ceb1ba2d9..5a810639e 100644
--- a/docs/source/distributions/k8s-benchmark/stack_run_config.yaml
+++ b/docs/source/distributions/k8s-benchmark/stack_run_config.yaml
@@ -3,6 +3,7 @@ image_name: kubernetes-benchmark-demo
 apis:
 - agents
 - inference
+- safety
 - telemetry
 - tool_runtime
 - vector_io
@@ -30,6 +31,11 @@ providers:
         db: ${env.POSTGRES_DB:=llamastack}
         user: ${env.POSTGRES_USER:=llamastack}
         password: ${env.POSTGRES_PASSWORD:=llamastack}
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config:
+      excluded_categories: []
   agents:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -95,6 +101,8 @@ models:
 - model_id: ${env.INFERENCE_MODEL}
   provider_id: vllm-inference
   model_type: llm
+shields:
+- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
 vector_dbs: []
 datasets: []
 scoring_fns: []
diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py
index 4b66601bb..8a0693910 100644
--- a/llama_stack/core/routers/inference.py
+++ b/llama_stack/core/routers/inference.py
@@ -525,9 +525,8 @@ class InferenceRouter(Inference):
 
         response = await self._nonstream_openai_chat_completion(provider, params)
 
-        # Store the response with the ID that will be returned to the client
         if self.store:
-            await self.store.store_chat_completion(response, messages)
+            asyncio.create_task(self.store.store_chat_completion(response, messages))
 
         if self.telemetry:
             metrics = self._construct_metrics(
@@ -855,4 +854,4 @@ class InferenceRouter(Inference):
             object="chat.completion",
         )
         logger.debug(f"InferenceRouter.completion_response: {final_response}")
-        await self.store.store_chat_completion(final_response, messages)
+        asyncio.create_task(self.store.store_chat_completion(final_response, messages))
diff --git a/llama_stack/core/server/server.py b/llama_stack/core/server/server.py
index d6dfc3435..0ef3e72af 100644
--- a/llama_stack/core/server/server.py
+++ b/llama_stack/core/server/server.py
@@ -73,6 +73,7 @@ from llama_stack.providers.inline.telemetry.meta_reference.telemetry import (
     TelemetryAdapter,
 )
 from llama_stack.providers.utils.telemetry.tracing import (
+    BACKGROUND_LOGGER,
     CURRENT_TRACE_CONTEXT,
     end_trace,
     setup_logger,
@@ -204,6 +205,10 @@ async def sse_generator(event_gen_coroutine):
 
 
 async def log_request_pre_validation(request: Request):
+    # Skip expensive body parsing if debug logging is disabled
+    if not logger.isEnabledFor(logging.DEBUG):
+        return
+
     if request.method in ("POST", "PUT", "PATCH"):
         try:
             body_bytes = await request.body()
@@ -305,6 +310,10 @@ class TracingMiddleware:
             logger.debug(f"Bypassing custom routing for FastAPI built-in path: {path}")
             return await self.app(scope, receive, send)
 
+        # Quick exit if telemetry is disabled - skip expensive route matching and tracing
+        if BACKGROUND_LOGGER is None:
+            return await self.app(scope, receive, send)
+
         if not hasattr(self, "route_impls"):
             self.route_impls = initialize_route_impls(self.impls, self.external_apis)
 
diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index 9e9a80ca5..0242bf2b8 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -362,7 +362,9 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         return AsyncOpenAI(
             base_url=self.config.url,
             api_key=self.config.api_token,
-            http_client=httpx.AsyncClient(verify=self.config.tls_verify),
+            http_client=httpx.AsyncClient(
+                verify=self.config.tls_verify, limits=httpx.Limits(max_connections=1000, max_keepalive_connections=1000)
+            ),
        )
 
     async def completion(