add NVIDIA NIM inference adapter

2025-12-16 12:42:36 +00:00 · 2024-10-22 14:31:11 -04:00 · 2024-10-22 14:31:11 -04:00 · 2dd8c4bcb6
commit 2dd8c4bcb6
parent ac93dd89cf
12 changed files with 1115 additions and 0 deletions
--- a/tests/nvidia/README.md
+++ b/tests/nvidia/README.md
@ -0,0 +1,26 @@
+# NVIDIA tests
+
+## Running tests
+
+**Install the required dependencies:**
+
+```bash
+pip install pytest pytest-asyncio pytest-httpx
+```
+
+There are three modes for testing:
+
+1. Unit tests - this mode checks the provider functionality and does not require a network connection or a running distribution.
+
+    ```bash
+    pytest tests/nvidia/unit
+    ```
+
+2. Integration tests against hosted preview APIs - this mode checks the provider functionality against a live system and requires an API key. To get an API key: (1) go to https://build.nvidia.com, (2) select a Llama model, e.g. https://build.nvidia.com/meta/llama-3_1-8b-instruct, and (3) click "Get API Key". Store the API key in the `NVIDIA_API_KEY` environment variable.
+
+    ```bash
+    export NVIDIA_API_KEY=...
+
+    pytest tests/nvidia/integration --base-url https://integrate.api.nvidia.com
+    ```
+
+3. Integration tests against a running distribution - this mode checks the provider functionality in the context of a running distribution. This involves running a local NIM (see https://build.nvidia.com/meta/llama-3_1-8b-instruct?snippet_tab=Docker) and then creating and configuring a distribution to use it. Details to come.
--- a/tests/nvidia/integration/conftest.py
+++ b/tests/nvidia/integration/conftest.py
@ -0,0 +1,67 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import os
+
+import pytest
+
+from llama_stack.apis.inference import Inference
+from llama_stack.providers.adapters.inference.nvidia import (
+    get_adapter_impl,
+    NVIDIAConfig,
+)
+
+
+def pytest_collection_modifyitems(config, items):
+    """
+    Mark every collected test as skipped when the hosted preview APIs are
+    targeted without credentials.
+
+    The skip is applied only if the --base-url option contains
+    "integrate.api.nvidia.com" and the NVIDIA_API_KEY environment variable is
+    unset or empty.
+    """
+    targets_hosted_api = "integrate.api.nvidia.com" in config.getoption("--base-url")
+    has_api_key = bool(os.environ.get("NVIDIA_API_KEY"))
+    if not targets_hosted_api or has_api_key:
+        return
+
+    skip_nvidia = pytest.mark.skip(
+        reason="NVIDIA_API_KEY environment variable must be set to access integrate.api.nvidia.com"
+    )
+    for item in items:
+        item.add_marker(skip_nvidia)
+
+
+def pytest_addoption(parser):
+    """Register the --base-url and --model command line options."""
+    # (flag, default value, help text), registered in this order
+    option_specs = (
+        ("--base-url", "http://localhost:8000", "Base URL for the tests"),
+        ("--model", "Llama-3-8B-Instruct", "Model option for the tests"),
+    )
+    for flag, default, help_text in option_specs:
+        parser.addoption(flag, action="store", default=default, help=help_text)
+
+
+@pytest.fixture
+def base_url(request):
+    """Return the value of the --base-url command line option."""
+    pytest_config = request.config
+    return pytest_config.getoption("--base-url")
+
+
+@pytest.fixture
+def model(request):
+    """Return the value of the --model command line option."""
+    pytest_config = request.config
+    return pytest_config.getoption("--model")
+
+
+@pytest.fixture
+def client(base_url: str) -> Inference:
+    """
+    Build the NVIDIA adapter for `base_url`, passing the NVIDIA_API_KEY
+    environment variable (None when unset) as the API key.
+
+    The result of get_adapter_impl is returned as-is, without awaiting it.
+    NOTE(review): the tests using this fixture do `client = await client`, so
+    the value appears to be an awaitable rather than an Inference instance -
+    confirm and consider adjusting the annotation.
+    """
+    adapter_config = NVIDIAConfig(
+        base_url=base_url,
+        api_key=os.environ.get("NVIDIA_API_KEY"),
+    )
+    return get_adapter_impl(adapter_config, {})
--- a/tests/nvidia/integration/test_inference.py
+++ b/tests/nvidia/integration/test_inference.py
@ -0,0 +1,117 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import itertools
+from typing import Generator, List, Tuple
+
+import pytest
+
+from llama_stack.apis.inference import (
+    ChatCompletionResponse,
+    CompletionMessage,
+    Inference,
+    Message,
+    StopReason,
+    SystemMessage,
+    ToolResponseMessage,
+    UserMessage,
+)
+from llama_stack.providers.adapters.inference.nvidia import (
+    get_adapter_impl,
+    NVIDIAConfig,
+)
+
+pytestmark = pytest.mark.asyncio
+
+
+# TODO(mf): test bad creds raises PermissionError
+# TODO(mf): test bad params, e.g. max_tokens=0 raises ValidationError
+# TODO(mf): test bad model name raises ValueError
+# TODO(mf): test short timeout raises TimeoutError
+# TODO(mf): new file, test cli model listing
+# TODO(mf): test streaming
+# TODO(mf): test tool calls w/ tool_choice
+
+
+def message_combinations(
+    length: int,
+) -> Generator[Tuple[List[Message], str], None, None]:
+    """
+    Yield every ordered sequence of message types containing 1 up to `length`
+    messages.
+
+    Each item is a (messages, id_str) pair: the instantiated messages and the
+    "-"-joined class names of the sequence.
+    """
+    message_types = [
+        UserMessage,
+        SystemMessage,
+        ToolResponseMessage,
+        CompletionMessage,
+    ]
+
+    def build(position: int, message_type) -> Message:
+        # position is 1-based; it numbers the content, call_id and tool_name
+        content = f"Message {position}"
+        if message_type == ToolResponseMessage:
+            return message_type(
+                content=content,
+                call_id=f"call_{position}",
+                tool_name=f"tool_{position}",
+            )
+        if message_type == CompletionMessage:
+            return message_type(content=content, stop_reason="end_of_message")
+        return message_type(content=content)
+
+    for count in range(1, length + 1):
+        for combo in itertools.product(message_types, repeat=count):
+            messages = [
+                build(position, message_type)
+                for position, message_type in enumerate(combo, start=1)
+            ]
+            id_str = "-".join(message_type.__name__ for message_type in combo)
+            yield messages, id_str
+
+
+@pytest.mark.parametrize("combo", message_combinations(3), ids=lambda x: x[1])
+async def test_chat_completion_messages(
+    client: Inference,
+    model: str,
+    combo: Tuple[List[Message], str],
+):
+    """
+    Send each message combination through a non-streaming chat completion and
+    check the shape of the response.
+    """
+    inference = await client
+    messages, _test_id = combo  # the second element only serves as the test id
+
+    response = await inference.chat_completion(
+        model=model,
+        messages=messages,
+        stream=False,
+    )
+
+    assert isinstance(response, ChatCompletionResponse)
+    completion = response.completion_message
+    # accuracy is not under test, so the text in completion.content is not checked
+    assert isinstance(completion.content, str)
+    assert completion.role == "assistant"
+    assert isinstance(completion.stop_reason, StopReason)
+    assert completion.tool_calls == []
+
+
+async def test_bad_base_url(
+    model: str,
+):
+    """
+    A chat completion sent to a bad base_url must raise a ConnectionError.
+    """
+    # presumably nothing is listening on this port - confirm for the CI hosts
+    bad_config = NVIDIAConfig(
+        base_url="http://localhost:32123",
+    )
+    inference = await get_adapter_impl(bad_config, {})
+
+    with pytest.raises(ConnectionError):
+        await inference.chat_completion(
+            model=model,
+            messages=[UserMessage(content="Hello")],
+            stream=False,
+        )
--- a/tests/nvidia/unit/conftest.py
+++ b/tests/nvidia/unit/conftest.py
@ -0,0 +1,73 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import os
+
+import pytest
+
+from llama_stack.apis.inference import Inference
+from llama_stack.providers.adapters.inference.nvidia import (
+    get_adapter_impl,
+    NVIDIAConfig,
+)
+from pytest_httpx import HTTPXMock
+
+pytestmark = pytest.mark.asyncio
+
+
+@pytest.fixture
+def base_url():
+    """
+    Base URL of the fake endpoint; the mock fixtures in this file register
+    their responses under it.
+    """
+    return "http://endpoint.mocked"
+
+
+@pytest.fixture
+def client(base_url: str) -> Inference:
+    """
+    Build the NVIDIA adapter for `base_url`, passing the NVIDIA_API_KEY
+    environment variable (None when unset) as the API key.
+
+    The result of get_adapter_impl is returned as-is, without awaiting it.
+    NOTE(review): the unit tests do `client = await client`, so the value
+    appears to be an awaitable rather than an Inference instance - confirm
+    and consider adjusting the annotation.
+    """
+    adapter_config = NVIDIAConfig(
+        base_url=base_url,
+        api_key=os.environ.get("NVIDIA_API_KEY"),
+    )
+    return get_adapter_impl(adapter_config, {})
+
+
+@pytest.fixture
+def mock_health(
+    httpx_mock: HTTPXMock,
+    base_url: str,
+) -> HTTPXMock:
+    """
+    Register a 200 response for the live and ready health endpoints under
+    `base_url`, then return the mock.
+    """
+    health_paths = ("/v1/health/live", "/v1/health/ready")
+    for health_path in health_paths:
+        httpx_mock.add_response(url=f"{base_url}{health_path}", status_code=200)
+    return httpx_mock
+
+
+@pytest.fixture
+def mock_chat_completion(httpx_mock: HTTPXMock, base_url: str) -> HTTPXMock:
+    """
+    Register a canned 200 chat completion response (assistant content
+    "WORKED") for `{base_url}/v1/chat/completions`, then return the mock.
+    """
+    only_choice = {
+        "index": 0,
+        "message": {"role": "assistant", "content": "WORKED"},
+        "finish_reason": "length",
+    }
+    mocked_body = {
+        "id": "mock-id",
+        "created": 1234567890,
+        "object": "chat.completion",
+        "model": "mock-model",
+        "choices": [only_choice],
+    }
+    httpx_mock.add_response(
+        url=f"{base_url}/v1/chat/completions",
+        json=mocked_body,
+        status_code=200,
+    )
+    return httpx_mock
--- a/tests/nvidia/unit/test_chat_completion.py
+++ b/tests/nvidia/unit/test_chat_completion.py
@ -0,0 +1,203 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import pytest
+from llama_models.llama3.api.datatypes import TokenLogProbs, ToolCall
+
+from llama_stack.apis.inference import Inference
+from pytest_httpx import HTTPXMock
+
+pytestmark = pytest.mark.asyncio
+
+
+async def test_content(
+    mock_health: HTTPXMock,
+    httpx_mock: HTTPXMock,
+    client: Inference,
+    base_url: str,
+) -> None:
+    """
+    The assistant content of the mocked HTTP response must come back unchanged
+    as the completion message content.
+    """
+    mocked_body = {
+        "id": "mock-id",
+        "created": 1234567890,
+        "object": "chat.completion",
+        "model": "mock-model",
+        "choices": [
+            {
+                "index": 0,
+                "message": {"role": "assistant", "content": "RESPONSE"},
+                "finish_reason": "length",
+            }
+        ],
+    }
+    httpx_mock.add_response(
+        url=f"{base_url}/v1/chat/completions",
+        json=mocked_body,
+        status_code=200,
+    )
+
+    inference = await client
+    response = await inference.chat_completion(
+        model="Llama-3-8B-Instruct",
+        messages=[{"role": "user", "content": "BOGUS"}],
+        stream=False,
+    )
+
+    assert response.completion_message.content == "RESPONSE"
+
+
+async def test_logprobs(
+    mock_health: HTTPXMock,
+    httpx_mock: HTTPXMock,
+    client: Inference,
+    base_url: str,
+) -> None:
+    """
+    The per-token `top_logprobs` entries of the mocked response must come back
+    as one TokenLogProbs per generated token.
+    """
+    token_logprobs = [
+        {
+            "token": "Hello",
+            "logprob": -0.1,
+            "bytes": [72, 101, 108, 108, 111],
+            "top_logprobs": [
+                {"token": "Hello", "logprob": -0.1},
+                {"token": "Hi", "logprob": -1.2},
+                {"token": "Greetings", "logprob": -2.1},
+            ],
+        },
+        {
+            "token": "there",
+            "logprob": -0.2,
+            "bytes": [116, 104, 101, 114, 101],
+            "top_logprobs": [
+                {"token": "there", "logprob": -0.2},
+                {"token": "here", "logprob": -1.3},
+                {"token": "where", "logprob": -2.2},
+            ],
+        },
+    ]
+    mocked_body = {
+        "id": "mock-id",
+        "object": "chat.completion",
+        "created": 1234567890,
+        "model": "mock-model",
+        "choices": [
+            {
+                "index": 0,
+                "message": {"role": "assistant", "content": "Hello there"},
+                "logprobs": {"content": token_logprobs},
+                "finish_reason": "length",
+            }
+        ],
+        "usage": {"prompt_tokens": 1, "completion_tokens": 1, "total_tokens": 2},
+    }
+    httpx_mock.add_response(
+        url=f"{base_url}/v1/chat/completions",
+        json=mocked_body,
+        status_code=200,
+    )
+
+    inference = await client
+    response = await inference.chat_completion(
+        model="Llama-3-8B-Instruct",
+        messages=[{"role": "user", "content": "Hello"}],
+        logprobs={"top_k": 3},
+        stream=False,
+    )
+
+    # one entry per generated token, keyed by the tokens listed in top_logprobs
+    expected = [
+        TokenLogProbs(
+            logprobs_by_token={
+                "Hello": -0.1,
+                "Hi": -1.2,
+                "Greetings": -2.1,
+            }
+        ),
+        TokenLogProbs(
+            logprobs_by_token={
+                "there": -0.2,
+                "here": -1.3,
+                "where": -2.2,
+            }
+        ),
+    ]
+    assert response.logprobs == expected
+
+
+async def test_tools(
+    mock_health: HTTPXMock,
+    httpx_mock: HTTPXMock,
+    client: Inference,
+    base_url: str,
+) -> None:
+    """
+    The tool calls in the mocked response must come back as ToolCall entries
+    on the completion message, in the same order.
+    """
+    # NOTE(review): `arguments` is a dict here; OpenAI-compatible servers
+    # commonly return it as a JSON-encoded string - confirm which form the
+    # adapter is expected to receive.
+    mocked_tool_calls = [
+        {
+            "id": "tool-id",
+            "type": "function",
+            "function": {
+                "name": "magic",
+                "arguments": {"input": 3},
+            },
+        },
+        {
+            "id": "tool-id!",
+            "type": "function",
+            "function": {
+                "name": "magic!",
+                "arguments": {"input": 42},
+            },
+        },
+    ]
+    mocked_body = {
+        "id": "mock-id",
+        "object": "chat.completion",
+        "created": 1234567890,
+        "model": "mock-model",
+        "choices": [
+            {
+                "index": 0,
+                "message": {
+                    "role": "assistant",
+                    "content": None,
+                    "tool_calls": mocked_tool_calls,
+                },
+                "logprobs": None,
+                "finish_reason": "tool_calls",
+            }
+        ],
+    }
+    httpx_mock.add_response(
+        url=f"{base_url}/v1/chat/completions",
+        json=mocked_body,
+        status_code=200,
+    )
+
+    inference = await client
+    response = await inference.chat_completion(
+        model="Llama-3-8B-Instruct",
+        messages=[{"role": "user", "content": "Hello"}],
+        stream=False,
+    )
+
+    expected = [
+        ToolCall(
+            call_id="tool-id",
+            tool_name="magic",
+            arguments={"input": 3},
+        ),
+        ToolCall(
+            call_id="tool-id!",
+            tool_name="magic!",
+            arguments={"input": 42},
+        ),
+    ]
+    assert response.completion_message.tool_calls == expected
+
+
+# TODO(mf): test stream=True for each case
--- a/tests/nvidia/unit/test_health.py
+++ b/tests/nvidia/unit/test_health.py
@ -0,0 +1,35 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import pytest
+
+from llama_stack.apis.inference import Inference
+from pytest_httpx import HTTPXMock
+
+pytestmark = pytest.mark.asyncio
+
+
+async def test_chat_completion(
+    mock_health: HTTPXMock,
+    mock_chat_completion: HTTPXMock,
+    client: Inference,
+    base_url: str,
+) -> None:
+    """
+    Test that health endpoints are checked when chat_completion is called.
+
+    The test body makes no explicit assertion.
+    NOTE(review): it presumably relies on pytest_httpx failing the test at
+    teardown when a registered response (here the health endpoints) was never
+    requested - confirm that this default is in effect.
+    """
+    inference = await client
+
+    await inference.chat_completion(
+        model="Llama-3-8B-Instruct",
+        messages=[{"role": "user", "content": "BOGUS"}],
+        stream=False,
+    )
+
+
+# TODO(mf): test stream=True for each case
+# TODO(mf): test completion
+# TODO(mf): test embedding
--- a/tests/nvidia/unit/test_import.py
+++ b/tests/nvidia/unit/test_import.py
@ -0,0 +1,11 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.providers.adapters.inference.nvidia import __all__
+
+
+def test_import():
+    """The package's `__all__` must be exactly get_adapter_impl and NVIDIAConfig."""
+    expected_exports = {"get_adapter_impl", "NVIDIAConfig"}
+    assert set(__all__) == expected_exports