From a46fe682dcaf17e76e7ddbcd83fe0824273be96d Mon Sep 17 00:00:00 2001
From: Swapna Lekkala
Date: Tue, 21 Oct 2025 12:02:45 -0700
Subject: [PATCH] fix segfault in load model

On macOS (Darwin), limit torch to a single intra-op and a single inter-op
thread before the SentenceTransformer model is loaded in a worker thread.
torch is imported lazily inside the Darwin branch so that importing
embedding_mixin does not itself require torch.

---
 .../providers/utils/inference/embedding_mixin.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/llama_stack/providers/utils/inference/embedding_mixin.py b/llama_stack/providers/utils/inference/embedding_mixin.py
index 67ce8b532..eafa91cff 100644
--- a/llama_stack/providers/utils/inference/embedding_mixin.py
+++ b/llama_stack/providers/utils/inference/embedding_mixin.py
@@ -6,9 +6,10 @@
 
 import asyncio
 import base64
+import platform
 import struct
 from typing import TYPE_CHECKING
 
 from llama_stack.log import get_logger
 
 if TYPE_CHECKING:
@@ -24,6 +25,8 @@ from llama_stack.apis.inference import (
 
 EMBEDDING_MODELS = {}
 
+DARWIN = "Darwin"
+
 
 log = get_logger(name=__name__, category="providers::utils")
 
@@ -83,6 +86,23 @@ class SentenceTransformerEmbeddingMixin:
         def _load_model():
             from sentence_transformers import SentenceTransformer
 
+            platform_name = platform.system()
+            if platform_name == DARWIN:
+                # Import torch lazily (like sentence_transformers above) so that importing
+                # this module does not itself require torch.
+                import torch
+
+                # PyTorch's OpenMP kernels can segfault on macOS when spawned from background
+                # threads with the default parallel settings, so force a single-threaded CPU run.
+                log.debug(f"Constraining torch threads on {platform_name} to a single worker")
+                # Best effort: apply each limit on its own so that one failing setter does not
+                # prevent the other from being applied.
+                for set_thread_limit in (torch.set_num_threads, torch.set_num_interop_threads):
+                    try:
+                        set_thread_limit(1)
+                    except Exception:
+                        log.debug(f"Failed to adjust torch thread counts on {platform_name}", exc_info=True)
+
             return SentenceTransformer(model, trust_remote_code=True)
 
         loaded_model = await asyncio.to_thread(_load_model)