Merge branch 'main' into add-watsonx-inference-adapter

2025-12-29 10:42:22 +00:00 · 2025-04-17 23:45:57 +05:30 · 2025-04-17 23:45:57 +05:30 · c407f3c340
commit c407f3c340
parent efe5b124f3 8bd6665775
28 changed files with 1786 additions and 1354 deletions
--- a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
+++ b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
@ -0,0 +1,85 @@
+# NVIDIA Inference Provider for LlamaStack
+
+This provider enables running inference using NVIDIA NIM.
+
+## Features
+- Endpoints for completions, chat completions, and embeddings for registered models
+
+## Getting Started
+
+### Prerequisites
+
+- LlamaStack with NVIDIA configuration
+- Access to NVIDIA NIM deployment
+- NIM for model to use for inference is deployed
+
+### Setup
+
+Build the NVIDIA environment:
+
+```bash
+llama stack build --template nvidia --image-type conda
+```
+
+### Basic Usage using the LlamaStack Python Client
+
+#### Initialize the client
+
+```python
+import os
+
+os.environ["NVIDIA_API_KEY"] = (
+    ""  # Required if using hosted NIM endpoint. If self-hosted, not required.
+)
+os.environ["NVIDIA_BASE_URL"] = "http://nim.test"  # NIM URL
+
+from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
+
+client = LlamaStackAsLibraryClient("nvidia")
+client.initialize()
+```
+
+### Create Completion
+
+```python
+response = client.completion(
+    model_id="meta-llama/Llama-3.1-8b-Instruct",
+    content="Complete the sentence using one word: Roses are red, violets are :",
+    stream=False,
+    sampling_params={
+        "max_tokens": 50,
+    },
+)
+print(f"Response: {response.content}")
+```
+
+### Create Chat Completion
+
+```python
+response = client.chat_completion(
+    model_id="meta-llama/Llama-3.1-8b-Instruct",
+    messages=[
+        {
+            "role": "system",
+            "content": "You must respond to each message with only one word",
+        },
+        {
+            "role": "user",
+            "content": "Complete the sentence using one word: Roses are red, violets are:",
+        },
+    ],
+    stream=False,
+    sampling_params={
+        "max_tokens": 50,
+    },
+)
+print(f"Response: {response.completion_message.content}")
+```
+
+### Create Embeddings
+```python
+response = client.embeddings(
+    model_id="meta-llama/Llama-3.1-8b-Instruct", contents=["foo", "bar", "baz"]
+)
+print(f"Embeddings: {response.embeddings}")
+```
--- a/llama_stack/providers/remote/inference/nvidia/models.py
+++ b/llama_stack/providers/remote/inference/nvidia/models.py
@ -48,6 +48,10 @@ MODEL_ENTRIES = [
        "meta/llama-3.2-90b-vision-instruct",
        CoreModelId.llama3_2_90b_vision_instruct.value,
    ),
+    build_hf_repo_model_entry(
+        "meta/llama-3.3-70b-instruct",
+        CoreModelId.llama3_3_70b_instruct.value,
+    ),
    # NeMo Retriever Text Embedding models -
    #
    # https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html