Merge branch 'main' into feat/litellm_sambanova_usage

2025-12-28 04:41:58 +00:00 · 2025-03-19 16:51:59 -05:00 · 2025-03-19 16:51:59 -05:00 · 02a4f9ac59
commit 02a4f9ac59
parent c2c27a7b2b ab777ef5cd
69 changed files with 1128 additions and 445 deletions
--- a/llama_stack/providers/remote/inference/nvidia/nvidia.py
+++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py
@ -6,6 +6,7 @@

 import logging
 import warnings
+from functools import lru_cache
 from typing import AsyncIterator, List, Optional, Union

 from openai import APIConnectionError, AsyncOpenAI, BadRequestError
@ -82,12 +83,42 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
        #     )

        self._config = config
-        # make sure the client lives longer than any async calls
-        self._client = AsyncOpenAI(
-            base_url=f"{self._config.url}/v1",
-            api_key=(self._config.api_key.get_secret_value() if self._config.api_key else "NO KEY"),
-            timeout=self._config.timeout,
-        )
+
+    @lru_cache  # noqa: B019
+    def _get_client(self, provider_model_id: str) -> AsyncOpenAI:
+        """
+        For hosted models, https://integrate.api.nvidia.com/v1 is the primary base_url. However,
+        some models are hosted on different URLs. This function returns the appropriate client
+        for the given provider_model_id.
+
+        Clients are kept in a per-instance dict keyed by base_url, so every model served from the
+        same base_url shares one client; lru_cache only short-circuits repeated model lookups.
+
+        :param provider_model_id: The provider model ID
+        :return: An OpenAI client
+        """
+
+        special_model_urls = {
+            "meta/llama-3.2-11b-vision-instruct": "https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-11b-vision-instruct",
+            "meta/llama-3.2-90b-vision-instruct": "https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-90b-vision-instruct",
+        }
+
+        base_url = f"{self._config.url}/v1"
+        if _is_nvidia_hosted(self._config) and provider_model_id in special_model_urls:
+            base_url = special_model_urls[provider_model_id]
+
+        # A cache on a nested function would be rebuilt (empty) on every call, so the
+        # base_url -> client mapping has to live on the instance instead.
+        clients = vars(self).setdefault("_clients_by_base_url", {})
+        if base_url not in clients:
+            # make sure the client lives longer than any async calls
+            clients[base_url] = AsyncOpenAI(
+                base_url=base_url,
+                api_key=(self._config.api_key.get_secret_value() if self._config.api_key else "NO KEY"),
+                timeout=self._config.timeout,
+            )
+
+        return clients[base_url]

    async def completion(
        self,
@ -105,9 +136,10 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):

        await check_health(self._config)  # this raises errors

+        provider_model_id = self.get_provider_model_id(model_id)
        request = convert_completion_request(
            request=CompletionRequest(
-                model=self.get_provider_model_id(model_id),
+                model=provider_model_id,
                content=content,
                sampling_params=sampling_params,
                response_format=response_format,
@ -118,7 +150,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
        )

        try:
-            response = await self._client.completions.create(**request)
+            response = await self._get_client(provider_model_id).completions.create(**request)
        except APIConnectionError as e:
            raise ConnectionError(f"Failed to connect to NVIDIA NIM at {self._config.url}: {e}") from e

@ -206,6 +238,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):

        await check_health(self._config)  # this raises errors

+        provider_model_id = self.get_provider_model_id(model_id)
        request = await convert_chat_completion_request(
            request=ChatCompletionRequest(
                model=self.get_provider_model_id(model_id),
@ -221,7 +254,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
        )

        try:
-            response = await self._client.chat.completions.create(**request)
+            response = await self._get_client(provider_model_id).chat.completions.create(**request)
        except APIConnectionError as e:
            raise ConnectionError(f"Failed to connect to NVIDIA NIM at {self._config.url}: {e}") from e

--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@ -90,15 +90,12 @@ def _convert_to_vllm_tool_calls_in_response(
    if not tool_calls:
        return []

-    call_function_arguments = None
-    for call in tool_calls:
-        call_function_arguments = json.loads(call.function.arguments)
-
    return [
        ToolCall(
            call_id=call.id,
            tool_name=call.function.name,
-            arguments=call_function_arguments,
+            arguments=json.loads(call.function.arguments),
+            arguments_json=call.function.arguments,
        )
        for call in tool_calls
    ]
@ -183,6 +180,7 @@ async def _process_vllm_chat_completion_stream_response(
                                call_id=tool_call_buf.call_id,
                                tool_name=tool_call_buf.tool_name,
                                arguments=args,
+                                arguments_json=args_str,
                            ),
                            parse_status=ToolCallParseStatus.succeeded,
                        ),