Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-12-06 10:37:22 +00:00
docs: Documentation update for NVIDIA Inference Provider (#3840)
# What does this PR do?

- Fix examples in the NVIDIA inference documentation to align with current API requirements.

## Test Plan

N/A
This commit is contained in:
parent
f675fdda0f
commit
165b8b07f4
2 changed files with 34 additions and 47 deletions
The first changed file is the NVIDIA inference provider documentation; its examples move from Llama Stack-specific types to OpenAI-compatible request shapes.

````diff
@@ -45,7 +45,7 @@ The following example shows how to create a chat completion for an NVIDIA NIM.
 
 ```python
 response = client.chat.completions.create(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model="nvidia/meta/llama-3.1-8b-instruct",
     messages=[
         {
             "role": "system",
````
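For anyone re-running these snippets: the hunks never show how `client` is constructed. A minimal sketch, assuming an OpenAI-compatible client pointed at a Llama Stack server; the base URL and key below are placeholder assumptions, not part of this diff:

```python
# Sketch only: base_url and api_key are assumed values for a local stack.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8321/v1",  # hypothetical local Llama Stack endpoint
    api_key="not-needed",  # placeholder; local deployments often ignore the key
)
```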
````diff
@@ -67,37 +67,40 @@ print(f"Response: {response.choices[0].message.content}")
 
 The following example shows how to do tool calling for an NVIDIA NIM.
 
 ```python
-from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
-
-tool_definition = ToolDefinition(
-    tool_name="get_weather",
-    description="Get current weather information for a location",
-    parameters={
-        "location": ToolParamDefinition(
-            param_type="string",
-            description="The city and state, e.g. San Francisco, CA",
-            required=True,
-        ),
-        "unit": ToolParamDefinition(
-            param_type="string",
-            description="Temperature unit (celsius or fahrenheit)",
-            required=False,
-            default="celsius",
-        ),
-    },
-)
+tool_definition = {
+    "type": "function",
+    "function": {
+        "name": "get_weather",
+        "description": "Get current weather information for a location",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "location": {
+                    "type": "string",
+                    "description": "The city and state, e.g. San Francisco, CA",
+                },
+                "unit": {
+                    "type": "string",
+                    "description": "Temperature unit (celsius or fahrenheit)",
+                    "default": "celsius",
+                },
+            },
+            "required": ["location"],
+        },
+    },
+}
 
 tool_response = client.chat.completions.create(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model="nvidia/meta/llama-3.1-8b-instruct",
     messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
     tools=[tool_definition],
 )
 
-print(f"Tool Response: {tool_response.choices[0].message.content}")
+print(f"Response content: {tool_response.choices[0].message.content}")
 if tool_response.choices[0].message.tool_calls:
     for tool_call in tool_response.choices[0].message.tool_calls:
-        print(f"Tool Called: {tool_call.tool_name}")
-        print(f"Arguments: {tool_call.arguments}")
+        print(f"Tool Called: {tool_call.function.name}")
+        print(f"Arguments: {tool_call.function.arguments}")
 ```
 
 ### Structured Output Example
 
````
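Because the example now uses the OpenAI-style tool shape, `tool_call.function.arguments` is a JSON-encoded string rather than a dict. A short sketch of dispatching the returned call; the local `get_weather` implementation is hypothetical:

```python
import json


def get_weather(location: str, unit: str = "celsius") -> str:
    # Hypothetical stand-in for a real weather lookup.
    return f"20 degrees {unit} in {location}"


for tool_call in tool_response.choices[0].message.tool_calls or []:
    args = json.loads(tool_call.function.arguments)  # arguments arrive as a JSON string
    if tool_call.function.name == "get_weather":
        print(get_weather(**args))
```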
````diff
@@ -105,33 +108,26 @@ if tool_response.choices[0].message.tool_calls:
 
 The following example shows how to do structured output for an NVIDIA NIM.
 
 ```python
-from llama_stack.apis.inference import JsonSchemaResponseFormat, ResponseFormatType
-
 person_schema = {
     "type": "object",
     "properties": {
         "name": {"type": "string"},
-        "age": {"type": "integer"},
+        "age": {"type": "number"},
         "occupation": {"type": "string"},
     },
     "required": ["name", "age", "occupation"],
 }
 
-response_format = JsonSchemaResponseFormat(
-    type=ResponseFormatType.json_schema, json_schema=person_schema
-)
-
 structured_response = client.chat.completions.create(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model="nvidia/meta/llama-3.1-8b-instruct",
     messages=[
         {
             "role": "user",
             "content": "Create a profile for a fictional person named Alice who is 30 years old and is a software engineer. ",
         }
     ],
-    response_format=response_format,
+    extra_body={"nvext": {"guided_json": person_schema}},
 )
 
 print(f"Structured Response: {structured_response.choices[0].message.content}")
 ```
 
````
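With `guided_json`, the response content should be a JSON document conforming to `person_schema`, so it can be parsed directly. A minimal sketch:

```python
import json

profile = json.loads(structured_response.choices[0].message.content)
# guided_json should guarantee the required keys are present.
assert all(key in profile for key in person_schema["required"])
print(profile["name"], profile["age"], profile["occupation"])
```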
````diff
@@ -141,7 +137,7 @@ The following example shows how to create embeddings for an NVIDIA NIM.
 
 ```python
 response = client.embeddings.create(
-    model="nvidia/llama-3.2-nv-embedqa-1b-v2",
+    model="nvidia/nvidia/llama-3.2-nv-embedqa-1b-v2",
     input=["What is the capital of France?"],
     extra_body={"input_type": "query"},
 )
````
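The vector itself is in `response.data[0].embedding` (OpenAI embeddings shape). A usage sketch: embedding a document with `input_type="passage"` mirrors the retrieval convention implied by the `"query"` value above, and the cosine helper is illustrative only.

```python
import math

query_vec = response.data[0].embedding

# Retrieval convention: documents are embedded as "passage", queries as "query".
passage = client.embeddings.create(
    model="nvidia/nvidia/llama-3.2-nv-embedqa-1b-v2",
    input=["Paris is the capital of France."],
    extra_body={"input_type": "passage"},
)
passage_vec = passage.data[0].embedding


def cosine(a, b):
    # Plain-Python cosine similarity, for illustration only.
    dot = sum(x * y for x, y in zip(a, b))
    return dot / (math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b)))


print(f"similarity: {cosine(query_vec, passage_vec):.3f}")
```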
````diff
@@ -163,15 +159,15 @@ image_path = {path_to_the_image}
 demo_image_b64 = load_image_as_base64(image_path)
 
 vlm_response = client.chat.completions.create(
-    model="nvidia/vila",
+    model="nvidia/meta/llama-3.2-11b-vision-instruct",
     messages=[
         {
             "role": "user",
             "content": [
                 {
-                    "type": "image",
-                    "image": {
-                        "data": demo_image_b64,
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/png;base64,{demo_image_b64}",
                     },
                 },
                 {
````
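The hunk calls `load_image_as_base64` without showing its body in the excerpt. A minimal sketch of what such a helper looks like; the actual doc version may differ:

```python
import base64


def load_image_as_base64(path: str) -> str:
    # Read the image bytes and base64-encode them for the data: URL above.
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")
```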
The second changed file is the `NVIDIAInferenceAdapter` source, where the docstring note about base-class ordering is removed:

````diff
@@ -19,15 +19,6 @@ class NVIDIAInferenceAdapter(OpenAIMixin):
 
     """
     NVIDIA Inference Adapter for Llama Stack.
-
-    Note: The inheritance order is important here. OpenAIMixin must come before
-    ModelRegistryHelper to ensure that OpenAIMixin.check_model_availability()
-    is used instead of ModelRegistryHelper.check_model_availability(). It also
-    must come before Inference to ensure that OpenAIMixin methods are available
-    in the Inference interface.
-
-    - OpenAIMixin.check_model_availability() queries the NVIDIA API to check if a model exists
-    - ModelRegistryHelper.check_model_availability() just returns False and shows a warning
     """
 
     # source: https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html
````
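The deleted note concerned Python's method resolution order. For reviewers who want the mechanics, a self-contained sketch; the class names are stand-ins, not the adapter's real bases:

```python
class OpenAIMixinLike:
    def check_model_availability(self) -> bool:
        return True  # stands in for "query the NVIDIA API for the model"


class ModelRegistryHelperLike:
    def check_model_availability(self) -> bool:
        return False  # stands in for "warn and return False"


class Adapter(OpenAIMixinLike, ModelRegistryHelperLike):
    pass


# Under Python's MRO, the first listed base wins:
assert Adapter().check_model_availability() is True
```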