From b12cd528efbbf96b77a53910a196fb2854e32332 Mon Sep 17 00:00:00 2001
From: Jiayi Ni
Date: Fri, 29 Aug 2025 16:23:52 -0700
Subject: [PATCH] docs: add VLM NIM example (#3277)

---
 .../self_hosted_distro/nvidia.md              |  1 +
 llama_stack/distributions/nvidia/run.yaml     |  5 ++
 .../remote/inference/nvidia/NVIDIA.md         | 60 +++++++++++++++++--
 .../remote/inference/nvidia/models.py         |  4 ++
 4 files changed, 64 insertions(+), 6 deletions(-)

diff --git a/docs/source/distributions/self_hosted_distro/nvidia.md b/docs/source/distributions/self_hosted_distro/nvidia.md
index e845c3c48..86d025ce7 100644
--- a/docs/source/distributions/self_hosted_distro/nvidia.md
+++ b/docs/source/distributions/self_hosted_distro/nvidia.md
@@ -50,6 +50,7 @@ The following models are available by default:
 - `meta/llama-3.2-11b-vision-instruct `
 - `meta/llama-3.2-90b-vision-instruct `
 - `meta/llama-3.3-70b-instruct `
+- `nvidia/vila `
 - `nvidia/llama-3.2-nv-embedqa-1b-v2 `
 - `nvidia/nv-embedqa-e5-v5 `
 - `nvidia/nv-embedqa-mistral-7b-v2 `
diff --git a/llama_stack/distributions/nvidia/run.yaml b/llama_stack/distributions/nvidia/run.yaml
index 8e915f586..9fd6b0404 100644
--- a/llama_stack/distributions/nvidia/run.yaml
+++ b/llama_stack/distributions/nvidia/run.yaml
@@ -134,6 +134,11 @@ models:
   provider_id: nvidia
   provider_model_id: meta/llama-3.3-70b-instruct
   model_type: llm
+- metadata: {}
+  model_id: nvidia/vila
+  provider_id: nvidia
+  provider_model_id: nvidia/vila
+  model_type: llm
 - metadata:
     embedding_dimension: 2048
     context_length: 8192
diff --git a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
index d96b29fef..d9c18533a 100644
--- a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
+++ b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
@@ -41,10 +41,10 @@
 client.initialize()
 
 ### Create Completion
 
-> Note on Completion API
->
-> The hosted NVIDIA Llama NIMs (e.g., `meta-llama/Llama-3.1-8B-Instruct`) with ```NVIDIA_BASE_URL="https://integrate.api.nvidia.com"``` does not support the ```completion``` method, while the locally deployed NIM does.
+The following example shows how to create a completion for an NVIDIA NIM.
 
+> [!NOTE]
+> The hosted NVIDIA Llama NIMs (for example `meta-llama/Llama-3.1-8B-Instruct`) served via `NVIDIA_BASE_URL="https://integrate.api.nvidia.com"` do not support the `completion` method, while locally deployed NIMs do.
 ```python
 response = client.inference.completion(
@@ -60,6 +60,8 @@ print(f"Response: {response.content}")
 
 ### Create Chat Completion
 
+The following example shows how to create a chat completion for an NVIDIA NIM.
+
 ```python
 response = client.inference.chat_completion(
     model_id="meta-llama/Llama-3.1-8B-Instruct",
@@ -82,6 +84,9 @@ print(f"Response: {response.completion_message.content}")
 ```
 
 ### Tool Calling Example ###
+
+The following example shows how to use tool calling with an NVIDIA NIM.
+
 ```python
 from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
 
@@ -117,6 +122,9 @@ if tool_response.completion_message.tool_calls:
 ```
 
 ### Structured Output Example
+
+The following example shows how to request structured output from an NVIDIA NIM.
+
 ```python
 from llama_stack.apis.inference import JsonSchemaResponseFormat, ResponseFormatType
 
@@ -149,8 +157,10 @@ print(f"Structured Response: {structured_response.completion_message.content}")
 ```
 
 ### Create Embeddings
-> Note on OpenAI embeddings compatibility
->
+
+The following example shows how to create embeddings for an NVIDIA NIM.
+
+> [!NOTE]
 > NVIDIA asymmetric embedding models (e.g., `nvidia/llama-3.2-nv-embedqa-1b-v2`) require an `input_type` parameter not present in the standard OpenAI embeddings API. The NVIDIA Inference Adapter automatically sets `input_type="query"` when using the OpenAI-compatible embeddings endpoint for NVIDIA. For passage embeddings, use the `embeddings` API with `task_type="document"`.
 
 ```python
@@ -160,4 +170,42 @@ response = client.inference.embeddings(
     model_id="nvidia/llama-3.2-nv-embedqa-1b-v2",
     contents=["What is the capital of France?"],
     task_type="query",
 )
 print(f"Embeddings: {response.embeddings}")
-```
\ No newline at end of file
+```
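+
+For passage (document) embeddings, a minimal sketch that mirrors the query example above, switching `task_type` to `"document"` as described in the note (the passage text here is only an illustration):
+
+```python
+# Embed a passage for indexing; task_type="document" selects the passage
+# side of the asymmetric embedding model, per the note above.
+doc_response = client.inference.embeddings(
+    model_id="nvidia/llama-3.2-nv-embedqa-1b-v2",
+    contents=["Paris is the capital of France."],
+    task_type="document",
+)
+print(f"Document embeddings: {doc_response.embeddings}")
+```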
+
+### Vision Language Models Example
+
+The following example shows how to run vision inference with an NVIDIA NIM.
+
+```python
+import base64
+
+
+def load_image_as_base64(image_path):
+    with open(image_path, "rb") as image_file:
+        img_bytes = image_file.read()
+    return base64.b64encode(img_bytes).decode("utf-8")
+
+
+image_path = {path_to_the_image}
+demo_image_b64 = load_image_as_base64(image_path)
+
+vlm_response = client.inference.chat_completion(
+    model_id="nvidia/vila",
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                    "image": {
+                        "data": demo_image_b64,
+                    },
+                },
+                {
+                    "type": "text",
+                    "text": "Please describe what you see in this image in detail.",
+                },
+            ],
+        }
+    ],
+)
+
+print(f"VLM Response: {vlm_response.completion_message.content}")
+```
diff --git a/llama_stack/providers/remote/inference/nvidia/models.py b/llama_stack/providers/remote/inference/nvidia/models.py
index 76e579da8..df07f46b6 100644
--- a/llama_stack/providers/remote/inference/nvidia/models.py
+++ b/llama_stack/providers/remote/inference/nvidia/models.py
@@ -55,6 +55,10 @@ MODEL_ENTRIES = [
         "meta/llama-3.3-70b-instruct",
         CoreModelId.llama3_3_70b_instruct.value,
     ),
+    ProviderModelEntry(
+        provider_model_id="nvidia/vila",
+        model_type=ModelType.llm,
+    ),
     # NeMo Retriever Text Embedding models -
     #
     # https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html