From cb6a5e2687a739bcb1b7fd14a416d015446851f9 Mon Sep 17 00:00:00 2001
From: slekkala1
Date: Tue, 21 Oct 2025 12:21:06 -0700
Subject: [PATCH] fix: fix segfault in load model (#3879)

# What does this PR do?
Fix a segfault when loading the embedding model.

The cc-vec integration segfaulted when used with the default embedding
model on macOS (`model_id: nomic-ai/nomic-embed-text-v1.5`,
`provider_id: sentence-transformers`). The crash report points to
torch's OpenMP settings; constraining torch to a single thread avoids
the crash.

## Test Plan
Tested with the cc-vec integration:
1. Start the server: `llama stack run starter`
2. Follow the setup in https://github.com/raghotham/cc-vec to set the
   environment variables, then run
   `uv run cc-vec index --url-patterns "%.github.io" --vector-store-name "ml-research" --limit 50 --chunk-size 800 --overlap 400`
---
 .../providers/utils/inference/embedding_mixin.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/llama_stack/providers/utils/inference/embedding_mixin.py b/llama_stack/providers/utils/inference/embedding_mixin.py
index 67ce8b532..c959b9c19 100644
--- a/llama_stack/providers/utils/inference/embedding_mixin.py
+++ b/llama_stack/providers/utils/inference/embedding_mixin.py
@@ -6,9 +6,12 @@
 
 import asyncio
 import base64
+import platform
 import struct
 from typing import TYPE_CHECKING
 
+import torch
+
 from llama_stack.log import get_logger
 
 if TYPE_CHECKING:
@@ -24,6 +27,8 @@ from llama_stack.apis.inference import (
 
 EMBEDDING_MODELS = {}
 
+DARWIN = "Darwin"
+
 log = get_logger(name=__name__, category="providers::utils")
 
 
@@ -83,6 +88,13 @@ class SentenceTransformerEmbeddingMixin:
         def _load_model():
             from sentence_transformers import SentenceTransformer
 
+            platform_name = platform.system()
+            if platform_name == DARWIN:
+                # PyTorch's OpenMP kernels can segfault on macOS when spawned from background
+                # threads with the default parallel settings, so force a single-threaded CPU run.
+                log.debug(f"Constraining torch threads on {platform_name} to a single worker")
+                torch.set_num_threads(1)
+
             return SentenceTransformer(model, trust_remote_code=True)
 
         loaded_model = await asyncio.to_thread(_load_model)
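
For context, a minimal standalone sketch of the workaround outside the mixin. This assumes `torch` and `sentence-transformers` are installed; the model id is the one from the bug report, and the `main` driver is hypothetical, mirroring the mixin's `asyncio.to_thread` loading path rather than reproducing it exactly:

```python
import asyncio
import platform

import torch


async def main() -> None:
    def _load_model():
        from sentence_transformers import SentenceTransformer

        if platform.system() == "Darwin":
            # Avoid the OpenMP segfault observed when the model is loaded
            # from a background thread on macOS.
            torch.set_num_threads(1)

        return SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)

    # Load off the event loop, matching the mixin's asyncio.to_thread call.
    model = await asyncio.to_thread(_load_model)
    print(model.encode(["hello world"]).shape)


if __name__ == "__main__":
    asyncio.run(main())
```

Gating `torch.set_num_threads(1)` on `platform.system() == "Darwin"` keeps CPU parallelism intact on Linux and Windows, where the default OpenMP settings have not been observed to crash.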