diff --git a/docs/source/distributions/self_hosted_distro/nvidia.md b/docs/source/distributions/self_hosted_distro/nvidia.md
index 6e399e6ce..e845c3c48 100644
--- a/docs/source/distributions/self_hosted_distro/nvidia.md
+++ b/docs/source/distributions/self_hosted_distro/nvidia.md
@@ -157,7 +157,7 @@ docker run \
 If you've set up your local development environment, you can also build the image using your local virtual environment.
 
 ```bash
-INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
+INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
 llama stack build --distro nvidia --image-type venv
 llama stack run ./run.yaml \
   --port 8321 \
diff --git a/llama_stack/distributions/nvidia/doc_template.md b/llama_stack/distributions/nvidia/doc_template.md
index 3884e6b51..56e99e523 100644
--- a/llama_stack/distributions/nvidia/doc_template.md
+++ b/llama_stack/distributions/nvidia/doc_template.md
@@ -129,7 +129,7 @@ docker run \
 If you've set up your local development environment, you can also build the image using your local virtual environment.
 
 ```bash
-INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
+INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
 llama stack build --distro nvidia --image-type venv
 llama stack run ./run.yaml \
   --port 8321 \
diff --git a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
index 2505718e0..4a072215c 100644
--- a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
+++ b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
@@ -42,8 +42,8 @@ client.initialize()
 ### Create Completion
 
 ```python
-response = client.completion(
-    model_id="meta-llama/Llama-3.1-8b-Instruct",
+response = client.inference.completion(
+    model_id="meta-llama/Llama-3.1-8B-Instruct",
     content="Complete the sentence using one word: Roses are red, violets are :",
     stream=False,
     sampling_params={
@@ -56,8 +56,8 @@ print(f"Response: {response.content}")
 ### Create Chat Completion
 
 ```python
-response = client.chat_completion(
-    model_id="meta-llama/Llama-3.1-8b-Instruct",
+response = client.inference.chat_completion(
+    model_id="meta-llama/Llama-3.1-8B-Instruct",
     messages=[
         {
             "role": "system",
@@ -78,8 +78,10 @@ print(f"Response: {response.completion_message.content}")
 ### Create Embeddings
 
 ```python
-response = client.embeddings(
-    model_id="meta-llama/Llama-3.1-8b-Instruct", contents=["foo", "bar", "baz"]
+response = client.inference.embeddings(
+    model_id="nvidia/llama-3.2-nv-embedqa-1b-v2",
+    contents=["What is the capital of France?"],
+    task_type="query",
 )
 print(f"Embeddings: {response.embeddings}")
-```
+```
\ No newline at end of file
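
To sanity-check the patched snippets end to end, here is a minimal sketch that strings the corrected calls together. The method names, model ids, and arguments are taken verbatim from the hunks above; the `LlamaStackClient` constructor and the `localhost:8321` base URL are illustrative assumptions (they mirror the `--port 8321` run command), and a stack with the NVIDIA distribution must already be serving at that address.

```python
# Minimal sketch of the post-patch API, assuming llama_stack_client is
# installed and a Llama Stack server is running on localhost:8321.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed endpoint

# Chat completion via the namespaced client.inference API, using the
# corrected model id (capital "B" in 8B).
response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.1-8B-Instruct",
    messages=[
        {"role": "system", "content": "You must respond in one word."},
        {"role": "user", "content": "What color are roses?"},
    ],
    stream=False,
)
print(f"Response: {response.completion_message.content}")

# Embeddings now target a dedicated embedding model rather than the
# instruct model, and pass a task_type hint for retrieval queries.
embeddings = client.inference.embeddings(
    model_id="nvidia/llama-3.2-nv-embedqa-1b-v2",
    contents=["What is the capital of France?"],
    task_type="query",
)
print(f"Embeddings: {embeddings.embeddings}")
```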