From 5bbca56cfc24cd1a1d4d5aff6a9c1c4ad12a741a Mon Sep 17 00:00:00 2001 From: Derek Higgins Date: Thu, 4 Sep 2025 18:58:41 +0100 Subject: [PATCH 1/5] fix: Make SentenceTransformer embedding operations non-blocking (#3335) - Wrap model loading with asyncio.to_thread() to prevent blocking during model download/initialization - Wrap encoding operations with asyncio.to_thread() to run in background thread - Convert _load_sentence_transformer_model() to async method This ensures the async event loop remains responsive during embedding operations. Closes: #3332 Signed-off-by: Derek Higgins Co-authored-by: Francisco Arceo --- .../utils/inference/embedding_mixin.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/llama_stack/providers/utils/inference/embedding_mixin.py b/llama_stack/providers/utils/inference/embedding_mixin.py index 65ba2854b..9bd0aa8ce 100644 --- a/llama_stack/providers/utils/inference/embedding_mixin.py +++ b/llama_stack/providers/utils/inference/embedding_mixin.py @@ -4,6 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +import asyncio import base64 import struct from typing import TYPE_CHECKING @@ -43,9 +44,11 @@ class SentenceTransformerEmbeddingMixin: task_type: EmbeddingTaskType | None = None, ) -> EmbeddingsResponse: model = await self.model_store.get_model(model_id) - embedding_model = self._load_sentence_transformer_model(model.provider_resource_id) - embeddings = embedding_model.encode( - [interleaved_content_as_str(content) for content in contents], show_progress_bar=False + embedding_model = await self._load_sentence_transformer_model(model.provider_resource_id) + embeddings = await asyncio.to_thread( + embedding_model.encode, + [interleaved_content_as_str(content) for content in contents], + show_progress_bar=False, ) return EmbeddingsResponse(embeddings=embeddings) @@ -64,8 +67,8 @@ class SentenceTransformerEmbeddingMixin: # Get the model and generate embeddings model_obj = await self.model_store.get_model(model) - embedding_model = self._load_sentence_transformer_model(model_obj.provider_resource_id) - embeddings = embedding_model.encode(input_list, show_progress_bar=False) + embedding_model = await self._load_sentence_transformer_model(model_obj.provider_resource_id) + embeddings = await asyncio.to_thread(embedding_model.encode, input_list, show_progress_bar=False) # Convert embeddings to the requested format data = [] @@ -93,7 +96,7 @@ class SentenceTransformerEmbeddingMixin: usage=usage, ) - def _load_sentence_transformer_model(self, model: str) -> "SentenceTransformer": + async def _load_sentence_transformer_model(self, model: str) -> "SentenceTransformer": global EMBEDDING_MODELS loaded_model = EMBEDDING_MODELS.get(model) @@ -101,8 +104,12 @@ class SentenceTransformerEmbeddingMixin: return loaded_model log.info(f"Loading sentence transformer for {model}...") - from sentence_transformers import SentenceTransformer - loaded_model = SentenceTransformer(model) + def _load_model(): + from sentence_transformers import SentenceTransformer + + return SentenceTransformer(model) + + loaded_model = await asyncio.to_thread(_load_model) EMBEDDING_MODELS[model] = loaded_model return loaded_model From bcc7f2c7d0a24a4671e98134e6c28f74e6acff26 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Thu, 4 Sep 2025 11:37:46 -0700 Subject: [PATCH 2/5] chore: async inference store write (#3318) # What does this PR do? 
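This PR makes the inference-store write non-blocking: instead of awaiting `store_chat_completion` on the request path, the router schedules it with `asyncio.create_task`, so the response is returned without waiting for persistence. A minimal sketch of the pattern (the class and surrounding wiring are simplified; only `asyncio.create_task` and `store_chat_completion` come from the actual change):

```python
import asyncio


class InferenceRouterSketch:
    """Illustrative stand-in for InferenceRouter; only the store write is shown."""

    def __init__(self, store):
        self.store = store

    async def handle_chat_completion(self, response, messages):
        if self.store:
            # Before: await self.store.store_chat_completion(response, messages)
            # After: schedule the write in the background and return immediately.
            asyncio.create_task(self.store.store_chat_completion(response, messages))
        return response
```

One consequence of the fire-and-forget write is that readers of the store can briefly race ahead of it; the integration tests are adjusted for this in PATCH 5/5.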
## Test Plan ``` cd /docs/source/distributions/k8s-benchmark # start mock server python openai-mock-server.py --port 8000 # start stack server uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml # run benchmark script uv run python3 benchmark.py --duration 30 --concurrent 50 --base-url=http://localhost:8321/v1/openai/v1 --model=vllm-inference/meta-llama/Llama-3.2-3B-Instruct ``` Before: ============================================================ BENCHMARK RESULTS ============================================================ Total time: 30.00s Concurrent users: 50 Total requests: 1267 Successful requests: 1267 Failed requests: 0 Success rate: 100.0% Requests per second: 42.23 After: ============================================================ BENCHMARK RESULTS ============================================================ Total time: 30.00s Concurrent users: 50 Total requests: 1449 Successful requests: 1449 Failed requests: 0 Success rate: 100.0% Requests per second: 48.30 --- .../distributions/k8s-benchmark/stack_run_config.yaml | 8 ++++++++ llama_stack/core/routers/inference.py | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/source/distributions/k8s-benchmark/stack_run_config.yaml b/docs/source/distributions/k8s-benchmark/stack_run_config.yaml index ceb1ba2d9..5a810639e 100644 --- a/docs/source/distributions/k8s-benchmark/stack_run_config.yaml +++ b/docs/source/distributions/k8s-benchmark/stack_run_config.yaml @@ -3,6 +3,7 @@ image_name: kubernetes-benchmark-demo apis: - agents - inference +- safety - telemetry - tool_runtime - vector_io @@ -30,6 +31,11 @@ providers: db: ${env.POSTGRES_DB:=llamastack} user: ${env.POSTGRES_USER:=llamastack} password: ${env.POSTGRES_PASSWORD:=llamastack} + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -95,6 +101,8 @@ models: - model_id: ${env.INFERENCE_MODEL} provider_id: vllm-inference model_type: llm +shields: +- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B} vector_dbs: [] datasets: [] scoring_fns: [] diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py index 8dcad85e3..045093fe0 100644 --- a/llama_stack/core/routers/inference.py +++ b/llama_stack/core/routers/inference.py @@ -527,7 +527,7 @@ class InferenceRouter(Inference): # Store the response with the ID that will be returned to the client if self.store: - await self.store.store_chat_completion(response, messages) + asyncio.create_task(self.store.store_chat_completion(response, messages)) if self.telemetry: metrics = self._construct_metrics( @@ -855,4 +855,4 @@ class InferenceRouter(Inference): object="chat.completion", ) logger.debug(f"InferenceRouter.completion_response: {final_response}") - await self.store.store_chat_completion(final_response, messages) + asyncio.create_task(self.store.store_chat_completion(final_response, messages)) From 561d2fc6b8226f167bb9782e6619d86a092af8e8 Mon Sep 17 00:00:00 2001 From: slekkala1 Date: Thu, 4 Sep 2025 11:47:46 -0700 Subject: [PATCH 3/5] fix: Move to older version for docker container failure [fireworks-ai] (#3338) # What does this PR do? Noticed the test https://github.com/llamastack/llama-stack-ops/actions/workflows/test-maybe-cut.yaml are still failing randomly. 
Earlier, this was fixed with fireworks-ai 0.18.0 in https://github.com/llamastack/llama-stack/pull/3267, but local testing may have inadvertently picked up a lower version because of the `<=` constraint, which I had assumed would resolve to the latest version. I have now tested with `==` to find the version where it broke, and am pinning (`<=`) to the last version that was passing.
## Test Plan
Tested locally with the following commands to start a container
Build container `llama stack build --distro starter --image-type container`
start container `docker run -d -p 8321:8321 --name llama-stack-test distribution-starter:0.2.20`
check health `http://localhost:8321/v1/health`
The steps above fail without the fix.
Tested with `==` to ensure the same version is picked in local testing instead of anything lower.
Tracking the upstream fix from `fireworks-ai` here: https://discord.com/channels/1137072072808472616/1410674695597981778/1410674695597981778 https://github.com/llamastack/llama-stack/issues/3273
--- llama_stack/providers/registry/inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py index fb841afdf..50956f58c 100644 --- a/llama_stack/providers/registry/inference.py +++ b/llama_stack/providers/registry/inference.py @@ -116,7 +116,7 @@ def available_providers() -> list[ProviderSpec]: adapter=AdapterSpec( adapter_type="fireworks", pip_packages=[ - "fireworks-ai<=0.18.0", + "fireworks-ai<=0.17.16", ], module="llama_stack.providers.remote.inference.fireworks", config_class="llama_stack.providers.remote.inference.fireworks.FireworksImplConfig",
From 55a8c5f439b710cfbb1256e3d6c41a6d82601970 Mon Sep 17 00:00:00 2001 From: Sumanth Kamenani Date: Thu, 4 Sep 2025 16:25:02 -0400 Subject: [PATCH 4/5] fix: show descriptive MCP server connection errors instead of generic 500s (#3256) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
What does this PR do?
Fixes error handling when MCP server connections fail. Instead of returning generic 500 errors, now provides descriptive error messages with proper HTTP status codes.
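Condensed, the server-side change maps connection failures to 502 and timeouts to 504 in `translate_exception`; a sketch of that mapping (the actual function in `llama_stack/core/server/server.py` handles more exception types):

```python
import asyncio

import httpx
from fastapi import HTTPException


def translate_exception_sketch(exc: Exception) -> HTTPException:
    # Connection refused, server down, network unreachable -> 502 Bad Gateway
    if isinstance(exc, ConnectionError | httpx.ConnectError):
        return HTTPException(status_code=httpx.codes.BAD_GATEWAY, detail=str(exc))
    # Upstream too slow -> 504 Gateway Timeout
    if isinstance(exc, asyncio.TimeoutError | TimeoutError):
        return HTTPException(status_code=httpx.codes.GATEWAY_TIMEOUT, detail=f"Operation timed out: {str(exc)}")
    # Anything else stays a generic 500
    return HTTPException(status_code=httpx.codes.INTERNAL_SERVER_ERROR, detail="Internal server error: An unexpected error occurred.")
```

On the provider side, `client_wrapper` in `llama_stack/providers/utils/tools/mcp.py` catches the underlying `httpx` connect, timeout, and request errors and re-raises them as `ConnectionError`/`TimeoutError` with endpoint-specific messages, which is what makes the translated detail strings descriptive.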
Closes #3107 Test Plan Before fix: curl -X GET "http://localhost:8321/v1/tool-runtime/list-tools?tool_group_id=bad-mcp-server" Returns: {"detail": "Internal server error: An unexpected error occurred."} (500) After fix: curl -X GET "http://localhost:8321/v1/tool-runtime/list-tools?tool_group_id=bad-mcp-server" Returns: {"error": {"detail": "Failed to connect to MCP server at http://localhost:9999/sse: Connection refused"}} (502) Tests: - Added unit test for ConnectionError → 502 translation - Manually tested with unreachable MCP servers (connection refused) --- llama_stack/core/server/server.py | 2 ++ llama_stack/providers/utils/tools/mcp.py | 32 ++++++++++++++++++++++++ tests/unit/server/test_server.py | 9 +++++++ 3 files changed, 43 insertions(+) diff --git a/llama_stack/core/server/server.py b/llama_stack/core/server/server.py index b247a610d..288bf46e1 100644 --- a/llama_stack/core/server/server.py +++ b/llama_stack/core/server/server.py @@ -141,6 +141,8 @@ def translate_exception(exc: Exception) -> HTTPException | RequestValidationErro return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=str(exc)) elif isinstance(exc, PermissionError | AccessDeniedError): return HTTPException(status_code=httpx.codes.FORBIDDEN, detail=f"Permission denied: {str(exc)}") + elif isinstance(exc, ConnectionError | httpx.ConnectError): + return HTTPException(status_code=httpx.codes.BAD_GATEWAY, detail=str(exc)) elif isinstance(exc, asyncio.TimeoutError | TimeoutError): return HTTPException(status_code=httpx.codes.GATEWAY_TIMEOUT, detail=f"Operation timed out: {str(exc)}") elif isinstance(exc, NotImplementedError): diff --git a/llama_stack/providers/utils/tools/mcp.py b/llama_stack/providers/utils/tools/mcp.py index 02f7aaf8a..fc8e2f377 100644 --- a/llama_stack/providers/utils/tools/mcp.py +++ b/llama_stack/providers/utils/tools/mcp.py @@ -67,6 +67,38 @@ async def client_wrapper(endpoint: str, headers: dict[str, str]) -> AsyncGenerat raise AuthenticationRequiredError(exc) from exc if i == len(connection_strategies) - 1: raise + except* httpx.ConnectError as eg: + # Connection refused, server down, network unreachable + if i == len(connection_strategies) - 1: + error_msg = f"Failed to connect to MCP server at {endpoint}: Connection refused" + logger.error(f"MCP connection error: {error_msg}") + raise ConnectionError(error_msg) from eg + else: + logger.warning( + f"failed to connect to MCP server at {endpoint} via {strategy.name}, falling back to {connection_strategies[i + 1].name}" + ) + except* httpx.TimeoutException as eg: + # Request timeout, server too slow + if i == len(connection_strategies) - 1: + error_msg = f"MCP server at {endpoint} timed out" + logger.error(f"MCP timeout error: {error_msg}") + raise TimeoutError(error_msg) from eg + else: + logger.warning( + f"MCP server at {endpoint} timed out via {strategy.name}, falling back to {connection_strategies[i + 1].name}" + ) + except* httpx.RequestError as eg: + # DNS resolution failures, network errors, invalid URLs + if i == len(connection_strategies) - 1: + # Get the first exception's message for the error string + exc_msg = str(eg.exceptions[0]) if eg.exceptions else "Unknown error" + error_msg = f"Network error connecting to MCP server at {endpoint}: {exc_msg}" + logger.error(f"MCP network error: {error_msg}") + raise ConnectionError(error_msg) from eg + else: + logger.warning( + f"network error connecting to MCP server at {endpoint} via {strategy.name}, falling back to {connection_strategies[i + 1].name}" + ) except* McpError: if i < 
len(connection_strategies) - 1: logger.warning( diff --git a/tests/unit/server/test_server.py b/tests/unit/server/test_server.py index 803111fc7..f21bbdd67 100644 --- a/tests/unit/server/test_server.py +++ b/tests/unit/server/test_server.py @@ -113,6 +113,15 @@ class TestTranslateException: assert result.status_code == 504 assert result.detail == "Operation timed out: " + def test_translate_connection_error(self): + """Test that ConnectionError is translated to 502 HTTP status.""" + exc = ConnectionError("Failed to connect to MCP server at http://localhost:9999/sse: Connection refused") + result = translate_exception(exc) + + assert isinstance(result, HTTPException) + assert result.status_code == 502 + assert result.detail == "Failed to connect to MCP server at http://localhost:9999/sse: Connection refused" + def test_translate_not_implemented_error(self): """Test that NotImplementedError is translated to 501 HTTP status.""" exc = NotImplementedError("Not implemented") From 3a7ac4227d4a2c0a74e3a7e5eda459ae19761c03 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Thu, 4 Sep 2025 15:13:31 -0700 Subject: [PATCH 5/5] chore: unbreak inference store test (#3340) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? The inference store writes were moved to asyncio.create_task and not await anymore ## Test Plan ❯ OLLAMA_URL=http://localhost:11434 LLAMA_STACK_CONFIG=server:starter uv run --with pytest-repeat pytest tests/integration/inference --text-model="ollama/llama3.2:3b-instruct-fp16" -vvs -k "test_inference_store_tool_calls and 3b-instruct-fp16-True" --count=10 Uninstalled 2 packages in 102ms Installed 2 packages in 138ms INFO 2025-09-04 14:10:17,775 tests.integration.conftest:66 tests: Setting DISABLE_CODE_SANDBOX=1 for macOS ========================================================================================================== test session starts =========================================================================================================== platform darwin -- Python 3.12.3, pytest-8.4.1, pluggy-1.6.0 -- /Users/erichuang/.cache/uv/builds-v0/.tmpSGMlgt/bin/python cachedir: .pytest_cache metadata: {'Python': '3.12.3', 'Platform': 'macOS-15.6.1-arm64-arm-64bit', 'Packages': {'pytest': '8.4.1', 'pluggy': '1.6.0'}, 'Plugins': {'repeat': '0.9.4', 'anyio': '4.9.0', 'html': '4.1.1', 'socket': '0.7.0', 'asyncio': '1.1.0', 'json-report': '1.5.0', 'timeout': '2.4.0', 'metadata': '3.1.1', 'cov': '6.2.1', 'nbval': '0.11.0'}} rootdir: /Users/erichuang/projects/llama-stack-git configfile: pyproject.toml plugins: repeat-0.9.4, anyio-4.9.0, html-4.1.1, socket-0.7.0, asyncio-1.1.0, json-report-1.5.0, timeout-2.4.0, metadata-3.1.1, cov-6.2.1, nbval-0.11.0 asyncio: mode=Mode.AUTO, asyncio_default_fixture_loop_scope=None, asyncio_default_test_loop_scope=function collected 970 items / 950 deselected / 20 selected tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[openai_client-txt=ollama/llama3.2:3b-instruct-fp16-True-1-10] instantiating llama_stack_client Starting llama stack server with config 'starter' on port 8321... Waiting for server at http://localhost:8321... (0.0s elapsed) Waiting for server at http://localhost:8321... (0.5s elapsed) Waiting for server at http://localhost:8321... (5.1s elapsed) Waiting for server at http://localhost:8321... (5.6s elapsed) Waiting for server at http://localhost:8321... (10.1s elapsed) Waiting for server at http://localhost:8321... 
(10.6s elapsed) Waiting for server at http://localhost:8321... (15.2s elapsed) Waiting for server at http://localhost:8321... (15.7s elapsed) Server is ready at http://localhost:8321 llama_stack_client instantiated in 20.583s PASSED tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[openai_client-txt=ollama/llama3.2:3b-instruct-fp16-True-2-10] PASSED tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[openai_client-txt=ollama/llama3.2:3b-instruct-fp16-True-3-10] PASSED tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[openai_client-txt=ollama/llama3.2:3b-instruct-fp16-True-4-10] PASSED tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[openai_client-txt=ollama/llama3.2:3b-instruct-fp16-True-5-10] PASSED tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[openai_client-txt=ollama/llama3.2:3b-instruct-fp16-True-6-10] PASSED tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[openai_client-txt=ollama/llama3.2:3b-instruct-fp16-True-7-10] PASSED tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[openai_client-txt=ollama/llama3.2:3b-instruct-fp16-True-8-10] PASSED tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[openai_client-txt=ollama/llama3.2:3b-instruct-fp16-True-9-10] PASSED tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[openai_client-txt=ollama/llama3.2:3b-instruct-fp16-True-10-10] PASSED tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=ollama/llama3.2:3b-instruct-fp16-True-1-10] PASSED tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=ollama/llama3.2:3b-instruct-fp16-True-2-10] PASSED tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=ollama/llama3.2:3b-instruct-fp16-True-3-10] PASSED tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=ollama/llama3.2:3b-instruct-fp16-True-4-10] PASSED tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=ollama/llama3.2:3b-instruct-fp16-True-5-10] PASSED tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=ollama/llama3.2:3b-instruct-fp16-True-6-10] PASSED tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=ollama/llama3.2:3b-instruct-fp16-True-7-10] PASSED tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=ollama/llama3.2:3b-instruct-fp16-True-8-10] PASSED tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=ollama/llama3.2:3b-instruct-fp16-True-9-10] PASSED tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=ollama/llama3.2:3b-instruct-fp16-True-10-10] PASSEDTerminating llama stack server process... Terminating process 53307 and its group... 
Server process and children terminated gracefully --- .../inference/test_openai_completion.py | 25 +++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/tests/integration/inference/test_openai_completion.py b/tests/integration/inference/test_openai_completion.py index 72137662d..62185e470 100644 --- a/tests/integration/inference/test_openai_completion.py +++ b/tests/integration/inference/test_openai_completion.py @@ -5,6 +5,8 @@ # the root directory of this source tree. +import time + import pytest from ..test_cases.test_case import TestCase @@ -323,8 +325,15 @@ def test_inference_store(compat_client, client_with_models, text_model_id, strea response_id = response.id content = response.choices[0].message.content - responses = client.chat.completions.list(limit=1000) - assert response_id in [r.id for r in responses.data] + tries = 0 + while tries < 10: + responses = client.chat.completions.list(limit=1000) + if response_id in [r.id for r in responses.data]: + break + else: + tries += 1 + time.sleep(0.1) + assert tries < 10, f"Response {response_id} not found after 1 second" retrieved_response = client.chat.completions.retrieve(response_id) assert retrieved_response.id == response_id @@ -388,6 +397,18 @@ def test_inference_store_tool_calls(compat_client, client_with_models, text_mode response_id = response.id content = response.choices[0].message.content + # wait for the response to be stored + tries = 0 + while tries < 10: + responses = client.chat.completions.list(limit=1000) + if response_id in [r.id for r in responses.data]: + break + else: + tries += 1 + time.sleep(0.1) + + assert tries < 10, f"Response {response_id} not found after 1 second" + responses = client.chat.completions.list(limit=1000) assert response_id in [r.id for r in responses.data]
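Since the store write now happens in a background task, both tests poll until the record becomes visible instead of assuming it is stored by the time the response returns. If this pattern spreads to more tests, the inline loop could be factored into a small helper; a sketch (the helper name, timeout, and polling interval are illustrative, not part of the patch):

```python
import time


def wait_for_stored_completion(client, response_id: str, timeout: float = 1.0, interval: float = 0.1) -> bool:
    """Poll the chat-completions listing until response_id appears or the timeout elapses."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        responses = client.chat.completions.list(limit=1000)
        if response_id in [r.id for r in responses.data]:
            return True
        time.sleep(interval)
    return False
```

Usage would mirror the assertions added above, e.g. `assert wait_for_stored_completion(client, response_id), f"Response {response_id} not found after 1 second"`.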