Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-12 12:06:04 +00:00)
fix segfault in load model

parent 1ec7216c3f
commit a46fe682dc

1 changed file with 16 additions and 0 deletions
@@ -6,9 +6,12 @@
 import asyncio
 import base64
+import platform
 import struct
 from typing import TYPE_CHECKING
 
+import torch
+
 from llama_stack.log import get_logger
 
 if TYPE_CHECKING:
     from sentence_transformers import SentenceTransformer
@@ -24,6 +27,8 @@ from llama_stack.apis.inference import (
 
 EMBEDDING_MODELS = {}
 
+DARWIN = "Darwin"
+
 log = get_logger(name=__name__, category="providers::utils")
 
 
@@ -83,6 +88,17 @@ class SentenceTransformerEmbeddingMixin:
         def _load_model():
             from sentence_transformers import SentenceTransformer
 
+            platform_name = platform.system()
+            if platform_name == DARWIN:
+                # PyTorch's OpenMP kernels can segfault on macOS when spawned from background
+                # threads with the default parallel settings, so force a single-threaded CPU run.
+                log.debug(f"Constraining torch threads on {platform_name} to a single worker")
+                try:
+                    torch.set_num_threads(1)
+                    torch.set_num_interop_threads(1)
+                except Exception:
+                    log.debug(f"Failed to adjust torch thread counts on {platform_name}", exc_info=True)
+
             return SentenceTransformer(model, trust_remote_code=True)
 
         loaded_model = await asyncio.to_thread(_load_model)
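
Taken out of the diff's context, the fix boils down to one pattern: pin PyTorch to a single thread on Darwin before constructing the SentenceTransformer in a worker thread. Below is a minimal, self-contained sketch of that pattern; the load_embedding_model helper and the all-MiniLM-L6-v2 model id are illustrative assumptions, not names from this commit.

import asyncio
import platform

import torch
from sentence_transformers import SentenceTransformer

DARWIN = "Darwin"


def _constrain_torch_threads() -> None:
    # On macOS, PyTorch's OpenMP kernels can segfault when a model is loaded
    # from a background thread, so pin torch to a single thread first.
    if platform.system() == DARWIN:
        try:
            torch.set_num_threads(1)
            torch.set_num_interop_threads(1)
        except Exception:
            # set_num_interop_threads() raises if interop parallelism has
            # already been initialized; keep the defaults rather than crash.
            pass


async def load_embedding_model(model_id: str) -> SentenceTransformer:
    # Hypothetical helper mirroring the commit's asyncio.to_thread(_load_model)
    # call: the blocking load runs off the event loop in a worker thread.
    def _load() -> SentenceTransformer:
        _constrain_torch_threads()
        return SentenceTransformer(model_id, trust_remote_code=True)

    return await asyncio.to_thread(_load)


if __name__ == "__main__":
    model = asyncio.run(load_embedding_model("all-MiniLM-L6-v2"))
    print(model.encode(["hello world"]).shape)

The try/except mirrors the commit's defensive handling: torch.set_num_interop_threads() raises a RuntimeError once interop parallelism has started, and a failed thread-count tweak should be logged or ignored rather than abort the model load.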