diff --git a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
index 096ff28ac..692b9125b 100644
--- a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
+++ b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
@@ -45,7 +45,7 @@ The following example shows how to create a chat completion for an NVIDIA NIM.

 ```python
 response = client.chat.completions.create(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model="nvidia/meta/llama-3.1-8b-instruct",
     messages=[
         {
             "role": "system",
@@ -67,37 +67,40 @@ print(f"Response: {response.choices[0].message.content}")

 The following example shows how to do tool calling for an NVIDIA NIM.

 ```python
-from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
-
-tool_definition = ToolDefinition(
-    tool_name="get_weather",
-    description="Get current weather information for a location",
-    parameters={
-        "location": ToolParamDefinition(
-            param_type="string",
-            description="The city and state, e.g. San Francisco, CA",
-            required=True,
-        ),
-        "unit": ToolParamDefinition(
-            param_type="string",
-            description="Temperature unit (celsius or fahrenheit)",
-            required=False,
-            default="celsius",
-        ),
+tool_definition = {
+    "type": "function",
+    "function": {
+        "name": "get_weather",
+        "description": "Get current weather information for a location",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "location": {
+                    "type": "string",
+                    "description": "The city and state, e.g. San Francisco, CA",
+                },
+                "unit": {
+                    "type": "string",
+                    "description": "Temperature unit (celsius or fahrenheit)",
+                    "default": "celsius",
+                },
+            },
+            "required": ["location"],
+        },
     },
-)
+}

 tool_response = client.chat.completions.create(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model="nvidia/meta/llama-3.1-8b-instruct",
     messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
     tools=[tool_definition],
 )

-print(f"Tool Response: {tool_response.choices[0].message.content}")
+print(f"Response content: {tool_response.choices[0].message.content}")

 if tool_response.choices[0].message.tool_calls:
     for tool_call in tool_response.choices[0].message.tool_calls:
-        print(f"Tool Called: {tool_call.tool_name}")
-        print(f"Arguments: {tool_call.arguments}")
+        print(f"Tool Called: {tool_call.function.name}")
+        print(f"Arguments: {tool_call.function.arguments}")
 ```

 ### Structured Output Example

@@ -105,33 +108,26 @@ if tool_response.choices[0].message.tool_calls:
 The following example shows how to do structured output for an NVIDIA NIM.

 ```python
-from llama_stack.apis.inference import JsonSchemaResponseFormat, ResponseFormatType
-
 person_schema = {
     "type": "object",
     "properties": {
         "name": {"type": "string"},
-        "age": {"type": "integer"},
+        "age": {"type": "number"},
         "occupation": {"type": "string"},
     },
     "required": ["name", "age", "occupation"],
 }

-response_format = JsonSchemaResponseFormat(
-    type=ResponseFormatType.json_schema, json_schema=person_schema
-)
-
 structured_response = client.chat.completions.create(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model="nvidia/meta/llama-3.1-8b-instruct",
     messages=[
         {
             "role": "user",
             "content": "Create a profile for a fictional person named Alice who is 30 years old and is a software engineer. ",
         }
     ],
-    response_format=response_format,
+    extra_body={"nvext": {"guided_json": person_schema}},
 )
-
 print(f"Structured Response: {structured_response.choices[0].message.content}")
 ```
@@ -141,7 +137,7 @@ The following example shows how to create embeddings for an NVIDIA NIM.

 ```python
 response = client.embeddings.create(
-    model="nvidia/llama-3.2-nv-embedqa-1b-v2",
+    model="nvidia/nvidia/llama-3.2-nv-embedqa-1b-v2",
     input=["What is the capital of France?"],
     extra_body={"input_type": "query"},
 )
@@ -163,15 +159,15 @@ image_path = {path_to_the_image}
 demo_image_b64 = load_image_as_base64(image_path)

 vlm_response = client.chat.completions.create(
-    model="nvidia/vila",
+    model="nvidia/meta/llama-3.2-11b-vision-instruct",
     messages=[
         {
             "role": "user",
             "content": [
                 {
-                    "type": "image",
-                    "image": {
-                        "data": demo_image_b64,
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/png;base64,{demo_image_b64}",
                     },
                 },
                 {
diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py
index eab665d63..5aba6bddc 100644
--- a/llama_stack/providers/remote/inference/nvidia/nvidia.py
+++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py
@@ -19,15 +19,6 @@ class NVIDIAInferenceAdapter(OpenAIMixin):
     """
     NVIDIA Inference Adapter for Llama Stack.
-
-    Note: The inheritance order is important here. OpenAIMixin must come before
-    ModelRegistryHelper to ensure that OpenAIMixin.check_model_availability()
-    is used instead of ModelRegistryHelper.check_model_availability(). It also
-    must come before Inference to ensure that OpenAIMixin methods are available
-    in the Inference interface.
-
-    - OpenAIMixin.check_model_availability() queries the NVIDIA API to check if a model exists
-    - ModelRegistryHelper.check_model_availability() just returns False and shows a warning
     """

     # source: https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html