add test cases

2025-12-31 07:33:51 +00:00 · 2025-03-08 11:43:13 +00:00 · 2025-03-08 11:43:13 +00:00 · 409383ae5f
commit 409383ae5f
parent 6a0c38f123
5 changed files with 425 additions and 29 deletions
--- a/llama_stack/providers/remote/post_training/nvidia/utils.py
+++ b/llama_stack/providers/remote/post_training/nvidia/utils.py
@ -0,0 +1,59 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+# Copyright (c) Meta Platforms, IAny, nc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import logging
+from typing import Tuple
+
+import httpx
+
+from .config import NvidiaPostTrainingConfig
+
+logger = logging.getLogger(__name__)
+
+
+async def _get_health(url: str) -> Tuple[bool, bool]:
+    """
+    Query {url}/v1/health/{live,ready} to check if the server is running and ready
+
+    Args:
+        url (str): URL of the server
+
+    Returns:
+        Tuple[bool, bool]: (is_live, is_ready)
+    """
+    async with httpx.AsyncClient() as client:
+        live = await client.get(f"{url}/v1/health/live")
+        ready = await client.get(f"{url}/v1/health/ready")
+        return live.status_code == 200, ready.status_code == 200
+
+
+async def check_health(config: NvidiaPostTrainingConfig) -> None:
+    """
+    Check if the server is running and ready
+
+    Args:
+        url (str): URL of the server
+
+    Raises:
+        RuntimeError: If the server is not running or ready
+    """
+    if not _is_nvidia_hosted(config):
+        logger.info("Checking NVIDIA NIM health...")
+        try:
+            is_live, is_ready = await _get_health(config.url)
+            if not is_live:
+                raise ConnectionError("NVIDIA NIM is not running")
+            if not is_ready:
+                raise ConnectionError("NVIDIA NIM is not ready")
+            # TODO(mf): should we wait for the server to be ready?
+        except httpx.ConnectError as e:
+            raise ConnectionError(f"Failed to connect to NVIDIA NIM: {e}") from e