fix segfault in load model

This commit is contained in:
Swapna Lekkala 2025-10-21 12:02:45 -07:00
parent 1ec7216c3f
commit a46fe682dc

View file

@@ -6,9 +6,12 @@
import asyncio
import base64
import platform
import struct
from typing import TYPE_CHECKING
import torch
from llama_stack.log import get_logger
if TYPE_CHECKING:
@@ -24,6 +27,8 @@ from llama_stack.apis.inference import (
EMBEDDING_MODELS = {}
DARWIN = "Darwin"
log = get_logger(name=__name__, category="providers::utils")
@@ -83,6 +88,17 @@ class SentenceTransformerEmbeddingMixin:
def _load_model():
    """Synchronously build the SentenceTransformer; intended to run in a worker thread.

    Reads `model` from the enclosing scope (presumably the model id/path of the
    enclosing method — confirm against caller). On macOS, torch's thread counts
    are pinned to 1 first, since its OpenMP kernels can segfault when invoked
    from background threads with the default parallel settings.
    """
    from sentence_transformers import SentenceTransformer

    platform_name = platform.system()
    if platform_name != DARWIN:
        # Non-macOS: no thread-count workaround needed.
        return SentenceTransformer(model, trust_remote_code=True)

    # PyTorch's OpenMP kernels can segfault on macOS when spawned from background
    # threads with the default parallel settings, so force a single-threaded CPU run.
    log.debug(f"Constraining torch threads on {platform_name} to a single worker")
    try:
        torch.set_num_threads(1)
        torch.set_num_interop_threads(1)
    except Exception:
        # Best-effort: torch raises if thread settings were already fixed
        # (e.g. parallel work has started); proceed with the load regardless.
        log.debug(f"Failed to adjust torch thread counts on {platform_name}", exc_info=True)
    return SentenceTransformer(model, trust_remote_code=True)
loaded_model = await asyncio.to_thread(_load_model)