fix segfault in load model

This commit is contained in:
Swapna Lekkala 2025-10-21 12:02:45 -07:00
parent 1ec7216c3f
commit a46fe682dc

View file

@@ -6,9 +6,12 @@
import asyncio
import base64
import platform
import struct
from typing import TYPE_CHECKING
import torch
from llama_stack.log import get_logger
if TYPE_CHECKING:
@@ -24,6 +27,8 @@ from llama_stack.apis.inference import (
EMBEDDING_MODELS = {}
DARWIN = "Darwin"
log = get_logger(name=__name__, category="providers::utils")
@@ -83,6 +88,17 @@ class SentenceTransformerEmbeddingMixin:
def _load_model():
    """Synchronously build the SentenceTransformer; intended to run in a worker thread.

    Reads `model` from the enclosing scope (presumably the model id/path of the
    enclosing method — confirm against caller). On macOS, torch's thread counts
    are pinned to 1 first, since its OpenMP kernels can segfault when invoked
    from background threads with the default parallel settings.
    """
    from sentence_transformers import SentenceTransformer

    platform_name = platform.system()
    if platform_name != DARWIN:
        # Non-macOS: no thread-count workaround needed.
        return SentenceTransformer(model, trust_remote_code=True)

    # PyTorch's OpenMP kernels can segfault on macOS when spawned from background
    # threads with the default parallel settings, so force a single-threaded CPU run.
    log.debug(f"Constraining torch threads on {platform_name} to a single worker")
    try:
        torch.set_num_threads(1)
        torch.set_num_interop_threads(1)
    except Exception:
        # Best-effort: torch raises if thread settings were already fixed
        # (e.g. parallel work has started); proceed with the load regardless.
        log.debug(f"Failed to adjust torch thread counts on {platform_name}", exc_info=True)
    return SentenceTransformer(model, trust_remote_code=True)
loaded_model = await asyncio.to_thread(_load_model)