Merge remote-tracking branch 'mattf/add-nvidia-inference-adapter' into cdgamarose/add_nvidia_distro

merging matt's changes
2025-12-16 20:22:37 +00:00 · 2024-11-19 17:59:50 +00:00 · 2024-11-19 17:59:50 +00:00 · 18e8f18749
commit 18e8f18749
parent 5e4ac1b7c1 2980a18920
7 changed files with 747 additions and 1 deletions
--- a/llama_stack/providers/registry/inference.py
+++ b/llama_stack/providers/registry/inference.py
@ -150,4 +150,15 @@ def available_providers() -> List[ProviderSpec]:
                config_class="llama_stack.providers.remote.inference.databricks.DatabricksImplConfig",
            ),
        ),
        remote_provider_spec(
            api=Api.inference,
            adapter=AdapterSpec(
                adapter_type="nvidia",
                pip_packages=[
                    "openai",
                ],
                module="llama_stack.providers.remote.inference.nvidia",
                config_class="llama_stack.providers.remote.inference.nvidia.NVIDIAConfig",
            ),
        ),
    ]
--- a/llama_stack/providers/remote/inference/nvidia/init.py
+++ b/llama_stack/providers/remote/inference/nvidia/init.py
@ -0,0 +1,18 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 from ._config import NVIDIAConfig
 from ._nvidia import NVIDIAInferenceAdapter
 async def get_adapter_impl(config: NVIDIAConfig, _deps) -> NVIDIAInferenceAdapter:
    """
    Create the NVIDIA inference adapter for a provider configuration.

    Args:
        config (NVIDIAConfig): configuration used to construct the adapter
        _deps: accepted for the provider factory signature, not used here
    Returns:
        NVIDIAInferenceAdapter: adapter built from config
    Raises:
        RuntimeError: If config is not an NVIDIAConfig instance
    """
    if isinstance(config, NVIDIAConfig):
        return NVIDIAInferenceAdapter(config)
    raise RuntimeError(f"Unexpected config type: {type(config)}")
 __all__ = ["get_adapter_impl", "NVIDIAConfig"]
--- a/llama_stack/providers/remote/inference/nvidia/_config.py
+++ b/llama_stack/providers/remote/inference/nvidia/_config.py
@ -0,0 +1,52 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import os
 from typing import Optional
 from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field
@json_schema_type
class NVIDIAConfig(BaseModel):
    """
    Configuration for the NVIDIA NIM inference endpoint.

    Attributes:
        base_url (str): A base url for accessing the NVIDIA NIM, e.g. http://localhost:8000
        api_key (str): The access key for the hosted NIM endpoints
        timeout (int): Timeout for the HTTP requests

    There are two ways to access NVIDIA NIMs -
     0. Hosted: Preview APIs hosted at https://integrate.api.nvidia.com
     1. Self-hosted: You can run NVIDIA NIMs on your own infrastructure

    By default the configuration is set to use the hosted APIs. This requires
    an API key which can be obtained from https://ngc.nvidia.com/.

    By default the configuration will attempt to read the NVIDIA_API_KEY environment
    variable to set the api_key. Please do not put your API key in code.

    If you are using a self-hosted NVIDIA NIM, you can set the base_url to the
    URL of your running NVIDIA NIM and do not need to set the api_key.
    """

    base_url: str = Field(
        default="https://integrate.api.nvidia.com",
        description="A base url for accessing the NVIDIA NIM",
    )
    # read from the environment when the config object is created, so the
    # key never has to be written into code or configuration files
    api_key: Optional[str] = Field(
        default_factory=lambda: os.getenv("NVIDIA_API_KEY"),
        description="The NVIDIA API key, only needed if using the hosted service",
    )
    # NOTE(review): units are presumably seconds - confirm against the HTTP client
    timeout: int = Field(
        default=60,
        description="Timeout for the HTTP requests",
    )

    @property
    def is_hosted(self) -> bool:
        """
        Tell whether base_url points at the hosted preview APIs.

        This is a substring test for the hosted host name anywhere in base_url.
        """
        return "integrate.api.nvidia.com" in self.base_url
--- a/llama_stack/providers/remote/inference/nvidia/_nvidia.py
+++ b/llama_stack/providers/remote/inference/nvidia/_nvidia.py
@ -0,0 +1,182 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import warnings
 from typing import AsyncIterator, List, Optional, Union
 from llama_models.datatypes import SamplingParams
 from llama_models.llama3.api.datatypes import (
    InterleavedTextMedia,
    Message,
    ToolChoice,
    ToolDefinition,
    ToolPromptFormat,
 )
 from llama_models.sku_list import CoreModelId
 from openai import APIConnectionError, AsyncOpenAI
 from llama_stack.apis.inference import (
    ChatCompletionRequest,
    ChatCompletionResponse,
    ChatCompletionResponseStreamChunk,
    CompletionResponse,
    CompletionResponseStreamChunk,
    EmbeddingsResponse,
    Inference,
    LogProbConfig,
    ResponseFormat,
 )
 from llama_stack.providers.utils.inference.model_registry import (
    build_model_alias_with_just_provider_model_id,
    ModelRegistryHelper,
 )
 from ._config import NVIDIAConfig
 from ._openai_utils import (
    convert_chat_completion_request,
    convert_openai_chat_completion_choice,
    convert_openai_chat_completion_stream,
 )
 from ._utils import check_health
 _MODEL_ALIASES = [
    # one alias per (provider model id, core model) pair, in table order
    build_model_alias_with_just_provider_model_id(nim_model_id, core_model.value)
    for nim_model_id, core_model in (
        ("meta/llama3-8b-instruct", CoreModelId.llama3_8b_instruct),
        ("meta/llama3-70b-instruct", CoreModelId.llama3_70b_instruct),
        ("meta/llama-3.1-8b-instruct", CoreModelId.llama3_1_8b_instruct),
        ("meta/llama-3.1-70b-instruct", CoreModelId.llama3_1_70b_instruct),
        ("meta/llama-3.1-405b-instruct", CoreModelId.llama3_1_405b_instruct),
        ("meta/llama-3.2-1b-instruct", CoreModelId.llama3_2_1b_instruct),
        ("meta/llama-3.2-3b-instruct", CoreModelId.llama3_2_3b_instruct),
        (
            "meta/llama-3.2-11b-vision-instruct",
            CoreModelId.llama3_2_11b_vision_instruct,
        ),
        (
            "meta/llama-3.2-90b-vision-instruct",
            CoreModelId.llama3_2_90b_vision_instruct,
        ),
        # TODO(mf): how do we handle Nemotron models?
        # "Llama3.1-Nemotron-51B-Instruct" -> "meta/llama-3.1-nemotron-51b-instruct",
    )
 ]
 class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
    """
    Inference adapter that sends chat completion requests to an NVIDIA NIM
    using the OpenAI client pointed at ``{base_url}/v1``.

    Only chat_completion is implemented; completion and embeddings raise
    NotImplementedError.
    """

    def __init__(self, config: NVIDIAConfig) -> None:
        """
        Register the known model aliases and create the API client.

        Args:
            config (NVIDIAConfig): endpoint, credentials and timeout to use
        Raises:
            RuntimeError: If the hosted endpoint is selected without an API key
        """
        # TODO(mf): filter by available models
        ModelRegistryHelper.__init__(self, model_aliases=_MODEL_ALIASES)

        print(f"Initializing NVIDIAInferenceAdapter({config.base_url})...")

        if config.is_hosted and not config.api_key:
            raise RuntimeError(
                "API key is required for hosted NVIDIA NIM. "
                "Either provide an API key or use a self-hosted NIM."
            )
        # NOTE: an api_key supplied together with a self-hosted base_url is
        # accepted silently. we do not warn about it because a user may have
        # deployed their self-hosted NIM with an API key requirement.

        self._config = config

        # a trailing slash on base_url would otherwise produce a "//v1" path
        api_root = self._config.base_url.rstrip("/")

        # make sure the client lives longer than any async calls
        self._client = AsyncOpenAI(
            base_url=f"{api_root}/v1",
            # placeholder value used when no key is configured
            api_key=self._config.api_key or "NO KEY",
            timeout=self._config.timeout,
        )

    def completion(
        self,
        model_id: str,
        content: InterleavedTextMedia,
        sampling_params: Optional[SamplingParams] = SamplingParams(),
        response_format: Optional[ResponseFormat] = None,
        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
    ) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]:
        """Not implemented by this adapter; always raises NotImplementedError."""
        raise NotImplementedError()

    async def embeddings(
        self,
        model_id: str,
        contents: List[InterleavedTextMedia],
    ) -> EmbeddingsResponse:
        """Not implemented by this adapter; always raises NotImplementedError."""
        raise NotImplementedError()

    async def chat_completion(
        self,
        model_id: str,
        messages: List[Message],
        sampling_params: Optional[SamplingParams] = SamplingParams(),
        response_format: Optional[ResponseFormat] = None,
        tools: Optional[List[ToolDefinition]] = None,
        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
        tool_prompt_format: Optional[
            ToolPromptFormat
        ] = None,  # API default is ToolPromptFormat.json, we default to None to detect user input
        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
    ) -> Union[
        ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]
    ]:
        """
        Run a chat completion against the configured NIM.

        The server health is checked first, the request is converted to the
        OpenAI chat completion format and sent with n=1, and the result is
        converted back.

        Args:
            model_id (str): model identifier, resolved to the provider model id
            messages (List[Message]): conversation to complete
            sampling_params (Optional[SamplingParams]): sampling configuration
            response_format (Optional[ResponseFormat]): accepted for interface
                compatibility; it is not forwarded to the request built here
            tools (Optional[List[ToolDefinition]]): tools offered to the model
            tool_choice (Optional[ToolChoice]): tool selection policy
            tool_prompt_format (Optional[ToolPromptFormat]): not supported, a
                warning is emitted when it is set
            stream (Optional[bool]): whether to return a stream of chunks
            logprobs (Optional[LogProbConfig]): log probability configuration
        Returns:
            A ChatCompletionResponse, or an async iterator of
            ChatCompletionResponseStreamChunk when stream is set
        Raises:
            ConnectionError: If the NIM cannot be reached, or the health check fails
        """
        if tool_prompt_format:
            warnings.warn("tool_prompt_format is not supported by NVIDIA NIM, ignoring")

        await check_health(self._config)  # this raises errors

        request = convert_chat_completion_request(
            request=ChatCompletionRequest(
                model=self.get_provider_model_id(model_id),
                messages=messages,
                sampling_params=sampling_params,
                tools=tools,
                tool_choice=tool_choice,
                tool_prompt_format=tool_prompt_format,
                stream=stream,
                logprobs=logprobs,
            ),
            n=1,
        )

        try:
            response = await self._client.chat.completions.create(**request)
        except APIConnectionError as e:
            raise ConnectionError(
                f"Failed to connect to NVIDIA NIM at {self._config.base_url}: {e}"
            ) from e

        if stream:
            return convert_openai_chat_completion_stream(response)
        else:
            # we pass n=1 to get only one completion
            return convert_openai_chat_completion_choice(response.choices[0])
--- a/llama_stack/providers/remote/inference/nvidia/_openai_utils.py
+++ b/llama_stack/providers/remote/inference/nvidia/_openai_utils.py
@ -0,0 +1,430 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import json
 import warnings
 from typing import Any, AsyncGenerator, Dict, Generator, List, Optional
 from llama_models.llama3.api.datatypes import (
    CompletionMessage,
    StopReason,
    TokenLogProbs,
    ToolCall,
 )
 from openai import AsyncStream
 from openai.types.chat import ChatCompletionChunk as OpenAIChatCompletionChunk
 from openai.types.chat.chat_completion import (
    Choice as OpenAIChoice,
    ChoiceLogprobs as OpenAIChoiceLogprobs,  # same as chat_completion_chunk ChoiceLogprobs
 )
 from openai.types.chat.chat_completion_message_tool_call import (
    ChatCompletionMessageToolCall as OpenAIChatCompletionMessageToolCall,
 )
 from llama_stack.apis.inference import (
    ChatCompletionRequest,
    ChatCompletionResponse,
    ChatCompletionResponseEvent,
    ChatCompletionResponseEventType,
    ChatCompletionResponseStreamChunk,
    Message,
    ToolCallDelta,
    ToolCallParseStatus,
 )
 def _convert_message(message: Message) -> Dict:
    """
    Turn a Message into a dictionary shaped for the OpenAI API.

    The dictionary comes from message.dict(); the role and, when present,
    the stop_reason entries are rewritten.
    """
    converted = message.dict()

    # Llama Stack uses role="ipython" for tool call messages, OpenAI uses "tool"
    if converted["role"] == "ipython":
        converted["role"] = "tool"

    # replace the stop_reason object by its underlying value
    if "stop_reason" in converted:
        converted["stop_reason"] = converted["stop_reason"].value

    # TODO(mf): tool_calls

    return converted
 def convert_chat_completion_request(
    request: ChatCompletionRequest,
    n: int = 1,
 ) -> dict:
    """
    Convert a ChatCompletionRequest to an OpenAI API-compatible dictionary.

    The result is meant to be expanded as keyword arguments of the OpenAI
    client's chat completion call; NIM-specific parameters travel in
    extra_body["nvext"].

    Args:
        request (ChatCompletionRequest): the request to convert
        n (int): number of completions to ask for
    Returns:
        dict: keyword arguments for the chat completion call
    """
    # model -> model
    # messages -> messages
    # sampling_params  TODO(mattf): review strategy
    #  strategy=greedy -> nvext.top_k = -1, temperature = temperature
    #  strategy=top_p -> nvext.top_k = -1, top_p = top_p
    #  strategy=top_k -> nvext.top_k = top_k
    #  temperature -> temperature
    #  top_p -> top_p
    #  top_k -> nvext.top_k
    #  max_tokens -> max_tokens
    #  repetition_penalty -> nvext.repetition_penalty
    # tools -> tools
    # tool_choice ("auto", "required") -> tool_choice
    # tool_prompt_format -> TBD
    # stream -> stream
    # logprobs -> logprobs

    # nvext is shared with the payload below, later updates show up in extra_body
    nvext = {}
    payload: Dict[str, Any] = dict(
        model=request.model,
        messages=[_convert_message(message) for message in request.messages],
        stream=request.stream,
        n=n,
        extra_body=dict(nvext=nvext),
        extra_headers={
            b"User-Agent": b"llama-stack: nvidia-inference-adapter",
        },
    )

    if request.tools:
        # NOTE(review): tools are forwarded as-is - confirm the client can
        # serialize these objects
        payload.update(tools=request.tools)
        if request.tool_choice:
            payload.update(
                tool_choice=request.tool_choice.value
            )  # we cannot include tool_choice w/o tools, server will complain

    if request.logprobs:
        payload.update(logprobs=True)
        payload.update(top_logprobs=request.logprobs.top_k)

    sampling = request.sampling_params
    if sampling:
        nvext.update(repetition_penalty=sampling.repetition_penalty)

        if sampling.max_tokens:
            payload.update(max_tokens=sampling.max_tokens)

        # the strategy may arrive as an enum member or as its plain string
        # value; compare on the underlying value so both forms select a branch
        strategy = getattr(sampling.strategy, "value", sampling.strategy)

        if strategy == "top_p":
            nvext.update(top_k=-1)
            payload.update(top_p=sampling.top_p)
        elif strategy == "top_k":
            if sampling.top_k != -1 and sampling.top_k < 1:
                warnings.warn("top_k must be -1 or >= 1")
            nvext.update(top_k=sampling.top_k)
        elif strategy == "greedy":
            nvext.update(top_k=-1)
            payload.update(temperature=sampling.temperature)

    return payload
 def _convert_openai_finish_reason(finish_reason: str) -> StopReason:
    """
    Map an OpenAI chat completion finish_reason onto a StopReason.

    finish_reason: Literal["stop", "length", "tool_calls", ...]
        - stop: model hit a natural stop point or a provided stop sequence
        - length: maximum number of tokens specified in the request was reached
        - tool_calls: model called a tool

    ->

    class StopReason(Enum):
        end_of_turn = "end_of_turn"
        end_of_message = "end_of_message"
        out_of_tokens = "out_of_tokens"

    Any value other than "length" and "tool_calls" maps to end_of_turn.
    """
    # TODO(mf): are end_of_turn and end_of_message semantics correct?
    if finish_reason == "length":
        return StopReason.out_of_tokens
    if finish_reason == "tool_calls":
        return StopReason.end_of_message
    # "stop" and every unrecognized value share the same result
    return StopReason.end_of_turn
 def _convert_openai_tool_calls(
    tool_calls: List[OpenAIChatCompletionMessageToolCall],
 ) -> List[ToolCall]:
    """
    Build a list of ToolCall from a list of OpenAI ChatCompletionMessageToolCall.

    OpenAI ChatCompletionMessageToolCall:
        id: str
        function: Function
        type: Literal["function"]

    OpenAI Function:
        arguments: str
        name: str

    ->

    ToolCall:
        call_id: str
        tool_name: str
        arguments: Dict[str, ...]

    The arguments string is decoded with json.loads. A missing or empty
    input gives an empty list.
    """
    # CompletionMessage tool_calls is not optional, so "nothing" becomes []
    converted = []
    for openai_call in tool_calls or []:
        function = openai_call.function
        converted.append(
            ToolCall(
                call_id=openai_call.id,
                tool_name=function.name,
                arguments=json.loads(function.arguments),
            )
        )
    return converted
 def _convert_openai_logprobs(
    logprobs: OpenAIChoiceLogprobs,
 ) -> Optional[List[TokenLogProbs]]:
    """
    Convert an OpenAI ChoiceLogprobs into a list of TokenLogProbs.

    OpenAI ChoiceLogprobs:
        content: Optional[List[ChatCompletionTokenLogprob]]

    OpenAI ChatCompletionTokenLogprob:
        token: str
        logprob: float
        top_logprobs: List[TopLogprob]

    OpenAI TopLogprob:
        token: str
        logprob: float

    ->

    TokenLogProbs:
        logprobs_by_token: Dict[str, float]
         - token, logprob

    Returns:
        None when there are no logprobs or their content is None, otherwise
        one TokenLogProbs per content entry built from its top_logprobs
    """
    # content is optional; iterating None would raise a TypeError
    if not logprobs or logprobs.content is None:
        return None

    return [
        TokenLogProbs(
            logprobs_by_token={
                candidate.token: candidate.logprob
                for candidate in token_entry.top_logprobs
            }
        )
        for token_entry in logprobs.content
    ]
 def convert_openai_chat_completion_choice(
    choice: OpenAIChoice,
 ) -> ChatCompletionResponse:
    """
    Build a ChatCompletionResponse from an OpenAI Choice.

    OpenAI Choice:
        message: ChatCompletionMessage
        finish_reason: str
        logprobs: Optional[ChoiceLogprobs]

    OpenAI ChatCompletionMessage:
        role: Literal["assistant"]
        content: Optional[str]
        tool_calls: Optional[List[ChatCompletionMessageToolCall]]

    ->

    ChatCompletionResponse:
        completion_message: CompletionMessage
        logprobs: Optional[List[TokenLogProbs]]

    CompletionMessage:
        role: Literal["assistant"]
        content: str | ImageMedia | List[str | ImageMedia]
        stop_reason: StopReason
        tool_calls: List[ToolCall]

    class StopReason(Enum):
        end_of_turn = "end_of_turn"
        end_of_message = "end_of_message"
        out_of_tokens = "out_of_tokens"

    A choice without a message or without a finish_reason fails an assertion.
    """
    openai_message = getattr(choice, "message", None)
    assert openai_message, "error in server response: message not found"

    finish_reason = getattr(choice, "finish_reason", None)
    assert finish_reason, "error in server response: finish_reason not found"

    # CompletionMessage content is not optional
    text = openai_message.content or ""
    completion_message = CompletionMessage(
        content=text,
        stop_reason=_convert_openai_finish_reason(finish_reason),
        tool_calls=_convert_openai_tool_calls(openai_message.tool_calls),
    )

    return ChatCompletionResponse(
        completion_message=completion_message,
        logprobs=_convert_openai_logprobs(choice.logprobs),
    )
 async def convert_openai_chat_completion_stream(
    stream: AsyncStream[OpenAIChatCompletionChunk],
 ) -> AsyncGenerator[ChatCompletionResponseStreamChunk, None]:
    """
    Convert a stream of OpenAI chat completion chunks into a stream
    of ChatCompletionResponseStreamChunk.

    OpenAI ChatCompletionChunk:
        choices: List[Choice]

    OpenAI Choice:  # different from the non-streamed Choice
        delta: ChoiceDelta
        finish_reason: Optional[Literal["stop", "length", "tool_calls", "content_filter", "function_call"]]
        logprobs: Optional[ChoiceLogprobs]

    OpenAI ChoiceDelta:
        content: Optional[str]
        role: Optional[Literal["system", "user", "assistant", "tool"]]
        tool_calls: Optional[List[ChoiceDeltaToolCall]]

    OpenAI ChoiceDeltaToolCall:
        index: int
        id: Optional[str]
        function: Optional[ChoiceDeltaToolCallFunction]
        type: Optional[Literal["function"]]

    OpenAI ChoiceDeltaToolCallFunction:
        name: Optional[str]
        arguments: Optional[str]

    ->

    ChatCompletionResponseStreamChunk:
        event: ChatCompletionResponseEvent

    ChatCompletionResponseEvent:
        event_type: ChatCompletionResponseEventType
        delta: Union[str, ToolCallDelta]
        logprobs: Optional[List[TokenLogProbs]]
        stop_reason: Optional[StopReason]

    ChatCompletionResponseEventType:
        start = "start"
        progress = "progress"
        complete = "complete"

    ToolCallDelta:
        content: Union[str, ToolCall]
        parse_status: ToolCallParseStatus

    ToolCall:
        call_id: str
        tool_name: str
        arguments: Dict[str, ...]

    ToolCallParseStatus:
        started = "started"
        in_progress = "in_progress"
        failure = "failure"
        success = "success"

    TokenLogProbs:
        logprobs_by_token: Dict[str, float]
         - token, logprob

    StopReason:
        end_of_turn = "end_of_turn"
        end_of_message = "end_of_message"
        out_of_tokens = "out_of_tokens"

    The final event is always a complete event. Its stop_reason comes from
    the last finish_reason reported in the stream, and is None when the
    stream carried no choice at all.
    """

    # generate a stream of ChatCompletionResponseEventType: start -> progress -> progress -> ...
    def _event_type_generator() -> (
        Generator[ChatCompletionResponseEventType, None, None]
    ):
        yield ChatCompletionResponseEventType.start
        while True:
            yield ChatCompletionResponseEventType.progress

    event_type = _event_type_generator()

    # we implement NIM specific semantics, the main difference from OpenAI
    # is that tool_calls are always produced as a complete call. there is no
    # intermediate / partial tool call streamed. because of this, we can
    # simplify the logic and not concern ourselves with parse_status of
    # started/in_progress/failed. we can always assume success.
    #
    # a stream of ChatCompletionResponseStreamChunk consists of
    #  0. a start event
    #  1. zero or more progress events
    #   - each progress event has a delta
    #   - each progress event may have a stop_reason
    #   - each progress event may have logprobs
    #   - each progress event may have tool_calls
    #     if a progress event has tool_calls,
    #      it is fully formed and
    #      can be emitted with a parse_status of success
    #  2. a complete event

    finish_reason = None  # last finish_reason reported by the server
    saw_choice = False

    async for chunk in stream:
        if not chunk.choices:
            # nothing to convert in a chunk that carries no choice
            continue
        choice = chunk.choices[0]  # assuming only one choice per chunk
        saw_choice = True

        # we assume there's only one finish_reason in the stream. keep it once
        # seen: the converter maps a missing finish_reason to a default, so
        # converting every chunk would let a later chunk overwrite the real one
        finish_reason = choice.finish_reason or finish_reason

        # if there's a tool call and content, emit both separately
        if choice.delta.tool_calls:
            # the call may have content and a tool call. ChatCompletionResponseEvent
            # does not support both, so we emit the content first
            if choice.delta.content:
                yield ChatCompletionResponseStreamChunk(
                    event=ChatCompletionResponseEvent(
                        event_type=next(event_type),
                        delta=choice.delta.content,
                        logprobs=_convert_openai_logprobs(choice.logprobs),
                    )
                )

            # it is possible to have parallel tool calls in a delta, but only
            # the first one is emitted here
            if len(choice.delta.tool_calls) > 1:
                warnings.warn(
                    "multiple tool calls found in a single delta, using the first, ignoring the rest"
                )

            # NIM only produces fully formed tool calls, so we can assume success
            yield ChatCompletionResponseStreamChunk(
                event=ChatCompletionResponseEvent(
                    event_type=next(event_type),
                    delta=ToolCallDelta(
                        content=_convert_openai_tool_calls(choice.delta.tool_calls)[0],
                        parse_status=ToolCallParseStatus.success,
                    ),
                    logprobs=_convert_openai_logprobs(choice.logprobs),
                )
            )
        else:
            yield ChatCompletionResponseStreamChunk(
                event=ChatCompletionResponseEvent(
                    event_type=next(event_type),
                    delta=choice.delta.content or "",  # content is not optional
                    logprobs=_convert_openai_logprobs(choice.logprobs),
                )
            )

    # a stream with choices but no finish_reason still gets the converter's
    # default; a stream without any choice reports no stop_reason
    stop_reason = _convert_openai_finish_reason(finish_reason) if saw_choice else None

    yield ChatCompletionResponseStreamChunk(
        event=ChatCompletionResponseEvent(
            event_type=ChatCompletionResponseEventType.complete,
            delta="",
            stop_reason=stop_reason,
        )
    )
--- a/llama_stack/providers/remote/inference/nvidia/_utils.py
+++ b/llama_stack/providers/remote/inference/nvidia/_utils.py
@ -0,0 +1,50 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 from typing import Tuple
 import httpx
 from ._config import NVIDIAConfig
 async def _get_health(url: str) -> Tuple[bool, bool]:
    """
    Ask {url}/v1/health/live and {url}/v1/health/ready whether the server
    is running and ready.

    Args:
        url (str): URL of the server
    Returns:
        Tuple[bool, bool]: (is_live, is_ready), each True when the matching
        endpoint answered with status code 200
    """
    async with httpx.AsyncClient() as http:
        live_response = await http.get(f"{url}/v1/health/live")
        ready_response = await http.get(f"{url}/v1/health/ready")

    is_live = live_response.status_code == 200
    is_ready = ready_response.status_code == 200
    return is_live, is_ready
 async def check_health(config: NVIDIAConfig) -> None:
    """
    Verify that a self-hosted NVIDIA NIM is running and ready.

    Nothing is checked when the configuration points at the hosted APIs.

    Args:
        config (NVIDIAConfig): configuration whose base_url is probed
    Raises:
        ConnectionError: If the server cannot be reached, is not running
            or is not ready
    """
    if config.is_hosted:
        return

    print("Checking NVIDIA NIM health...")
    try:
        is_live, is_ready = await _get_health(config.base_url)
    except httpx.ConnectError as e:
        raise ConnectionError(f"Failed to connect to NVIDIA NIM: {e}") from e

    if not is_live:
        raise ConnectionError("NVIDIA NIM is not running")
    if not is_ready:
        raise ConnectionError("NVIDIA NIM is not ready")
    # TODO(mf): should we wait for the server to be ready?
--- a/llama_stack/providers/utils/inference/model_registry.py
+++ b/llama_stack/providers/utils/inference/model_registry.py
@ -29,7 +29,6 @@ def build_model_alias(provider_model_id: str, model_descriptor: str) -> ModelAli
    return ModelAlias(
        provider_model_id=provider_model_id,
        aliases=[
            model_descriptor,
            get_huggingface_repo(model_descriptor),
        ],
        llama_model=model_descriptor,
@ -57,6 +56,10 @@ class ModelRegistryHelper(ModelsProtocolPrivate):
            self.alias_to_provider_id_map[alias_obj.provider_model_id] = (
                alias_obj.provider_model_id
            )
            # ensure we can go from llama model to provider model id
            self.alias_to_provider_id_map[alias_obj.llama_model] = (
                alias_obj.provider_model_id
            )
            self.provider_id_to_llama_model_map[alias_obj.provider_model_id] = (
                alias_obj.llama_model
            )