docs: Documentation update for NVIDIA Inference Provider (#3840)

# What does this PR do?
- Fix examples in the NVIDIA inference documentation to align with
current API requirements.

## Test Plan
N/A
Author: Jiayi Ni · 2025-10-20 09:51:43 -07:00 (committed by GitHub)
parent f675fdda0f · commit 165b8b07f4
2 changed files with 34 additions and 47 deletions
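
For reference, the updated examples below assume an OpenAI-compatible `client` pointed at a running Llama Stack server with the NVIDIA provider configured. The docs define the client setup elsewhere; the sketch below is only one possible stand-in, and the base URL, port, and key handling are assumptions rather than part of this change:

```python
import os

from openai import OpenAI

# Hypothetical setup: adjust the base URL and API key to match your
# Llama Stack deployment and NVIDIA NIM configuration.
client = OpenAI(
    base_url=os.environ.get("LLAMA_STACK_BASE_URL", "http://localhost:8321/v1/openai/v1"),
    api_key=os.environ.get("NVIDIA_API_KEY", "none"),
)

# Optional: list the models the stack exposes to confirm the provider-namespaced
# IDs (e.g. "nvidia/meta/llama-3.1-8b-instruct") used throughout the examples below.
for model in client.models.list():
    print(model.id)
```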

Changes to the NVIDIA inference provider documentation:

````diff
@@ -45,7 +45,7 @@ The following example shows how to create a chat completion for an NVIDIA NIM.
 ```python
 response = client.chat.completions.create(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model="nvidia/meta/llama-3.1-8b-instruct",
     messages=[
         {
             "role": "system",
````
````diff
@@ -67,37 +67,40 @@ print(f"Response: {response.choices[0].message.content}")
 The following example shows how to do tool calling for an NVIDIA NIM.
 ```python
-from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
-tool_definition = ToolDefinition(
-    tool_name="get_weather",
-    description="Get current weather information for a location",
-    parameters={
-        "location": ToolParamDefinition(
-            param_type="string",
-            description="The city and state, e.g. San Francisco, CA",
-            required=True,
-        ),
-        "unit": ToolParamDefinition(
-            param_type="string",
-            description="Temperature unit (celsius or fahrenheit)",
-            required=False,
-            default="celsius",
-        ),
+tool_definition = {
+    "type": "function",
+    "function": {
+        "name": "get_weather",
+        "description": "Get current weather information for a location",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "location": {
+                    "type": "string",
+                    "description": "The city and state, e.g. San Francisco, CA",
+                },
+                "unit": {
+                    "type": "string",
+                    "description": "Temperature unit (celsius or fahrenheit)",
+                    "default": "celsius",
+                },
+            },
+            "required": ["location"],
+        },
     },
-)
+}
 tool_response = client.chat.completions.create(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model="nvidia/meta/llama-3.1-8b-instruct",
     messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
     tools=[tool_definition],
 )
-print(f"Tool Response: {tool_response.choices[0].message.content}")
+print(f"Response content: {tool_response.choices[0].message.content}")
 if tool_response.choices[0].message.tool_calls:
     for tool_call in tool_response.choices[0].message.tool_calls:
-        print(f"Tool Called: {tool_call.tool_name}")
-        print(f"Arguments: {tool_call.arguments}")
+        print(f"Tool Called: {tool_call.function.name}")
+        print(f"Arguments: {tool_call.function.arguments}")
 ```
````
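
Since the tool definition and tool calls now follow the OpenAI function-calling shape, the usual next step is to execute the requested tool and send its result back in a second request. A hedged sketch of that round trip, reusing `client`, `tool_definition`, and `tool_response` from the example above (`get_weather` here is a stand-in implementation, not part of the documentation):

```python
import json


# Stand-in tool implementation, for illustration only.
def get_weather(location: str, unit: str = "celsius") -> dict:
    return {"location": location, "temperature": 22, "unit": unit}


messages = [{"role": "user", "content": "What's the weather like in San Francisco?"}]
assistant_message = tool_response.choices[0].message
if assistant_message.tool_calls:
    # Keep the assistant's tool-call message in the conversation history.
    messages.append(assistant_message)
    for tool_call in assistant_message.tool_calls:
        result = get_weather(**json.loads(tool_call.function.arguments))
        messages.append(
            {
                "role": "tool",
                "tool_call_id": tool_call.id,
                "content": json.dumps(result),
            }
        )
    follow_up = client.chat.completions.create(
        model="nvidia/meta/llama-3.1-8b-instruct",
        messages=messages,
        tools=[tool_definition],
    )
    print(follow_up.choices[0].message.content)
```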
### Structured Output Example
````diff
@@ -105,33 +108,26 @@ if tool_response.choices[0].message.tool_calls:
 The following example shows how to do structured output for an NVIDIA NIM.
 ```python
-from llama_stack.apis.inference import JsonSchemaResponseFormat, ResponseFormatType
 person_schema = {
     "type": "object",
     "properties": {
         "name": {"type": "string"},
-        "age": {"type": "integer"},
+        "age": {"type": "number"},
         "occupation": {"type": "string"},
     },
     "required": ["name", "age", "occupation"],
 }
-response_format = JsonSchemaResponseFormat(
-    type=ResponseFormatType.json_schema, json_schema=person_schema
-)
 structured_response = client.chat.completions.create(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model="nvidia/meta/llama-3.1-8b-instruct",
     messages=[
         {
             "role": "user",
             "content": "Create a profile for a fictional person named Alice who is 30 years old and is a software engineer. ",
         }
     ],
-    response_format=response_format,
+    extra_body={"nvext": {"guided_json": person_schema}},
 )
 print(f"Structured Response: {structured_response.choices[0].message.content}")
 ```
````
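
Because the `nvext` `guided_json` option constrains generation to `person_schema`, the returned content should be parseable JSON. A small consuming sketch (the validation step is illustrative, not from the docs):

```python
import json

# Parse the constrained output and check the schema's required keys.
profile = json.loads(structured_response.choices[0].message.content)
missing = [key for key in person_schema["required"] if key not in profile]
if missing:
    raise ValueError(f"Missing required fields: {missing}")
print(profile["name"], profile["age"], profile["occupation"])
```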
````diff
@@ -141,7 +137,7 @@ The following example shows how to create embeddings for an NVIDIA NIM.
 ```python
 response = client.embeddings.create(
-    model="nvidia/llama-3.2-nv-embedqa-1b-v2",
+    model="nvidia/nvidia/llama-3.2-nv-embedqa-1b-v2",
     input=["What is the capital of France?"],
     extra_body={"input_type": "query"},
 )
````
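
The `input_type` hint distinguishes query embeddings from passage embeddings for this retrieval model. A hedged usage sketch that embeds candidate passages and ranks them against the query embedding from the example above (the passages and scoring are illustrative only):

```python
import math

# Embed candidate passages with input_type="passage" and rank them by cosine
# similarity against the query embedding computed above as `response`.
passages = ["Paris is the capital of France.", "Berlin is the capital of Germany."]
passage_response = client.embeddings.create(
    model="nvidia/nvidia/llama-3.2-nv-embedqa-1b-v2",
    input=passages,
    extra_body={"input_type": "passage"},
)


def cosine(a: list[float], b: list[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    return dot / (math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b)))


query_vector = response.data[0].embedding
for passage, item in zip(passages, passage_response.data):
    print(f"{cosine(query_vector, item.embedding):.3f}  {passage}")
```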
````diff
@@ -163,15 +159,15 @@ image_path = {path_to_the_image}
 demo_image_b64 = load_image_as_base64(image_path)
 vlm_response = client.chat.completions.create(
-    model="nvidia/vila",
+    model="nvidia/meta/llama-3.2-11b-vision-instruct",
     messages=[
         {
             "role": "user",
             "content": [
                 {
-                    "type": "image",
-                    "image": {
-                        "data": demo_image_b64,
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/png;base64,{demo_image_b64}",
                     },
                 },
                 {
````
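
The vision example relies on a `load_image_as_base64` helper defined earlier on the same documentation page. A plausible sketch of that helper, assuming it simply base64-encodes the image file for use in the `data:` URL:

```python
import base64


# Hypothetical helper matching the name used above: read an image file and
# return its contents as a base64-encoded string.
def load_image_as_base64(image_path: str) -> str:
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")
```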

Changes to the NVIDIAInferenceAdapter docstring:

````diff
@@ -19,15 +19,6 @@ class NVIDIAInferenceAdapter(OpenAIMixin):
     """
     NVIDIA Inference Adapter for Llama Stack.
-    Note: The inheritance order is important here. OpenAIMixin must come before
-    ModelRegistryHelper to ensure that OpenAIMixin.check_model_availability()
-    is used instead of ModelRegistryHelper.check_model_availability(). It also
-    must come before Inference to ensure that OpenAIMixin methods are available
-    in the Inference interface.
-    - OpenAIMixin.check_model_availability() queries the NVIDIA API to check if a model exists
-    - ModelRegistryHelper.check_model_availability() just returns False and shows a warning
     """
     # source: https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html
````