From b12cd528efbbf96b77a53910a196fb2854e32332 Mon Sep 17 00:00:00 2001
From: Jiayi Ni
Date: Fri, 29 Aug 2025 16:23:52 -0700
Subject: [PATCH] docs: add VLM NIM example (#3277)

---
 .../self_hosted_distro/nvidia.md              |  1 +
 llama_stack/distributions/nvidia/run.yaml     |  5 ++
 .../remote/inference/nvidia/NVIDIA.md         | 60 +++++++++++++++++--
 .../remote/inference/nvidia/models.py         |  4 ++
 4 files changed, 64 insertions(+), 6 deletions(-)

diff --git a/docs/source/distributions/self_hosted_distro/nvidia.md b/docs/source/distributions/self_hosted_distro/nvidia.md
index e845c3c48..86d025ce7 100644
--- a/docs/source/distributions/self_hosted_distro/nvidia.md
+++ b/docs/source/distributions/self_hosted_distro/nvidia.md
@@ -50,6 +50,7 @@ The following models are available by default:
 - `meta/llama-3.2-11b-vision-instruct `
 - `meta/llama-3.2-90b-vision-instruct `
 - `meta/llama-3.3-70b-instruct `
+- `nvidia/vila `
 - `nvidia/llama-3.2-nv-embedqa-1b-v2 `
 - `nvidia/nv-embedqa-e5-v5 `
 - `nvidia/nv-embedqa-mistral-7b-v2 `
diff --git a/llama_stack/distributions/nvidia/run.yaml b/llama_stack/distributions/nvidia/run.yaml
index 8e915f586..9fd6b0404 100644
--- a/llama_stack/distributions/nvidia/run.yaml
+++ b/llama_stack/distributions/nvidia/run.yaml
@@ -134,6 +134,11 @@ models:
   provider_id: nvidia
   provider_model_id: meta/llama-3.3-70b-instruct
   model_type: llm
+- metadata: {}
+  model_id: nvidia/vila
+  provider_id: nvidia
+  provider_model_id: nvidia/vila
+  model_type: llm
 - metadata:
     embedding_dimension: 2048
     context_length: 8192
diff --git a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
index d96b29fef..d9c18533a 100644
--- a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
+++ b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
@@ -41,10 +41,10 @@
 client.initialize()
 
 ### Create Completion
 
-> Note on Completion API
->
-> The hosted NVIDIA Llama NIMs (e.g., `meta-llama/Llama-3.1-8B-Instruct`) with ```NVIDIA_BASE_URL="https://integrate.api.nvidia.com"``` does not support the ```completion``` method, while the locally deployed NIM does.
+The following example shows how to create a completion for an NVIDIA NIM.
 
+> [!NOTE]
+> The hosted NVIDIA Llama NIMs (for example `meta-llama/Llama-3.1-8B-Instruct`) served via `NVIDIA_BASE_URL="https://integrate.api.nvidia.com"` do not support the `completion` method, while locally deployed NIMs do.
 ```python
 response = client.inference.completion(
@@ -60,6 +60,8 @@ print(f"Response: {response.content}")
 
 ### Create Chat Completion
 
+The following example shows how to create a chat completion for an NVIDIA NIM.
+
 ```python
 response = client.inference.chat_completion(
     model_id="meta-llama/Llama-3.1-8B-Instruct",
@@ -82,6 +84,9 @@ print(f"Response: {response.completion_message.content}")
 ```
 
 ### Tool Calling Example ###
+
+The following example shows how to use tool calling with an NVIDIA NIM.
+
 ```python
 from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
 
@@ -117,6 +122,9 @@ if tool_response.completion_message.tool_calls:
 ```
 
 ### Structured Output Example
+
+The following example shows how to request structured output from an NVIDIA NIM.
+
 ```python
 from llama_stack.apis.inference import JsonSchemaResponseFormat, ResponseFormatType
 
@@ -149,8 +157,10 @@ print(f"Structured Response: {structured_response.completion_message.content}")
 ```
 
 ### Create Embeddings
-> Note on OpenAI embeddings compatibility
->
+
+The following example shows how to create embeddings for an NVIDIA NIM.
+
+> [!NOTE]
 > NVIDIA asymmetric embedding models (e.g., `nvidia/llama-3.2-nv-embedqa-1b-v2`) require an `input_type` parameter not present in the standard OpenAI embeddings API. The NVIDIA Inference Adapter automatically sets `input_type="query"` when using the OpenAI-compatible embeddings endpoint for NVIDIA. For passage embeddings, use the `embeddings` API with `task_type="document"`.
 
 ```python
@@ -160,4 +170,42 @@ response = client.inference.embeddings(
     model_id="nvidia/llama-3.2-nv-embedqa-1b-v2",
     contents=["What is the capital of France?"],
     task_type="query",
 )
 print(f"Embeddings: {response.embeddings}")
-```
\ No newline at end of file
+```
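+
+For passage (document) embeddings, a minimal sketch that mirrors the query example above, switching `task_type` to `"document"` as described in the note (the passage text here is only an illustration):
+
+```python
+# Embed a passage for indexing; task_type="document" selects the passage
+# side of the asymmetric embedding model, per the note above.
+doc_response = client.inference.embeddings(
+    model_id="nvidia/llama-3.2-nv-embedqa-1b-v2",
+    contents=["Paris is the capital of France."],
+    task_type="document",
+)
+print(f"Document embeddings: {doc_response.embeddings}")
+```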
+
+### Vision Language Models Example
+
+The following example shows how to run vision inference with an NVIDIA NIM.
+
+```python
+import base64
+
+
+def load_image_as_base64(image_path):
+    with open(image_path, "rb") as image_file:
+        img_bytes = image_file.read()
+    return base64.b64encode(img_bytes).decode("utf-8")
+
+
+image_path = {path_to_the_image}
+demo_image_b64 = load_image_as_base64(image_path)
+
+vlm_response = client.inference.chat_completion(
+    model_id="nvidia/vila",
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                    "image": {
+                        "data": demo_image_b64,
+                    },
+                },
+                {
+                    "type": "text",
+                    "text": "Please describe what you see in this image in detail.",
+                },
+            ],
+        }
+    ],
+)
+
+print(f"VLM Response: {vlm_response.completion_message.content}")
+```
diff --git a/llama_stack/providers/remote/inference/nvidia/models.py b/llama_stack/providers/remote/inference/nvidia/models.py
index 76e579da8..df07f46b6 100644
--- a/llama_stack/providers/remote/inference/nvidia/models.py
+++ b/llama_stack/providers/remote/inference/nvidia/models.py
@@ -55,6 +55,10 @@ MODEL_ENTRIES = [
         "meta/llama-3.3-70b-instruct",
         CoreModelId.llama3_3_70b_instruct.value,
     ),
+    ProviderModelEntry(
+        provider_model_id="nvidia/vila",
+        model_type=ModelType.llm,
+    ),
     # NeMo Retriever Text Embedding models -
     #
     # https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html