From c71e2a0d872f8c12a2b3c9b9aba6ba4d13d1cfef Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Thu, 6 Mar 2025 18:26:53 +0000
Subject: [PATCH] add nvidia distribution

---
 .../providers/registry/post_training.py       | 18 +++++-----
 .../remote/post_training/__init__.py          |  5 +++
 .../remote/post_training/nvidia/__init__.py   |  2 +-
 .../remote/post_training/nvidia/config.py     | 14 +++++++-
 llama_stack/templates/nvidia/build.yaml       |  2 ++
 llama_stack/templates/nvidia/nvidia.py        | 33 +++++++++++++++++++
 llama_stack/templates/nvidia/run.yaml         |  5 +++
 7 files changed, 67 insertions(+), 12 deletions(-)
 create mode 100644 llama_stack/providers/remote/post_training/__init__.py

diff --git a/llama_stack/providers/registry/post_training.py b/llama_stack/providers/registry/post_training.py
index 286bbf866..b4b063144 100644
--- a/llama_stack/providers/registry/post_training.py
+++ b/llama_stack/providers/registry/post_training.py
@@ -6,7 +6,7 @@

 from typing import List

-from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec
+from llama_stack.providers.datatypes import AdapterSpec, Api, InlineProviderSpec, ProviderSpec, remote_provider_spec


 def available_providers() -> List[ProviderSpec]:
@@ -22,15 +22,13 @@ def available_providers() -> List[ProviderSpec]:
                 Api.datasets,
             ],
         ),
-        InlineProviderSpec(
+        remote_provider_spec(
             api=Api.post_training,
-            provider_type="remote::nvidia",
-            pip_packages=["torch", "numpy"],
-            module="llama_stack.providers.remote.post_training.nvidia",
-            config_class="llama_stack.providers.remote.post_training.nvidia.NvidiaPostTrainingConfig",
-            api_dependencies=[
-                Api.datasetio,
-                Api.datasets,
-            ],
+            adapter=AdapterSpec(
+                adapter_type="nvidia",
+                pip_packages=["requests"],
+                module="llama_stack.providers.remote.post_training.nvidia",
+                config_class="llama_stack.providers.remote.post_training.nvidia.NvidiaPostTrainingConfig",
+            ),
         ),
     ]
diff --git a/llama_stack/providers/remote/post_training/__init__.py b/llama_stack/providers/remote/post_training/__init__.py
new file mode 100644
index 000000000..756f351d8
--- /dev/null
+++ b/llama_stack/providers/remote/post_training/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/llama_stack/providers/remote/post_training/nvidia/__init__.py b/llama_stack/providers/remote/post_training/nvidia/__init__.py
index 580f8c62f..964e1fdaa 100644
--- a/llama_stack/providers/remote/post_training/nvidia/__init__.py
+++ b/llama_stack/providers/remote/post_training/nvidia/__init__.py
@@ -13,7 +13,7 @@
 from .config import NvidiaPostTrainingConfig

 # post_training api and the torchtune provider is still experimental and under heavy development
-async def get_provider_impl(
+async def get_adapter_impl(
     config: NvidiaPostTrainingConfig,
     deps: Dict[Api, ProviderSpec],
 ):
diff --git a/llama_stack/providers/remote/post_training/nvidia/config.py b/llama_stack/providers/remote/post_training/nvidia/config.py
index 34dd30464..4e7341001 100644
--- a/llama_stack/providers/remote/post_training/nvidia/config.py
+++ b/llama_stack/providers/remote/post_training/nvidia/config.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.

 import os
-from typing import Optional
+from typing import Any, Dict, Optional

 from pydantic import BaseModel, Field

@@ -58,3 +58,15 @@ class NvidiaPostTrainingConfig(BaseModel):
         default_factory=lambda: os.getenv("NVIDIA_OUTPUT_MODEL_DIR", "test-example-model@v1"),
         description="Directory to save the output model",
     )
+
+    @classmethod
+    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+        return {
+            "api_key": "${env.NVIDIA_API_KEY:}",
+            "user_id": "${env.NVIDIA_USER_ID:llama-stack-user}",
+            "dataset_namespace": "${env.NVIDIA_DATASET_NAMESPACE:default}",
+            "access_policies": "${env.NVIDIA_ACCESS_POLICIES:}",
+            "project_id": "${env.NVIDIA_PROJECT_ID:test-project}",
+            "customizer_url": "${env.NVIDIA_CUSTOMIZER_URL:}",
+            "output_model_dir": "${env.NVIDIA_OUTPUT_MODEL_DIR:test-example-model@v1}",
+        }
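Note on the hunk above: the `${env.NAME:default}` strings returned by sample_run_config are substitution placeholders that the stack resolves against the environment when a run config is loaded. The snippet below is a sketch of those semantics only; llama_stack ships its own resolver, and this standalone code is not the library implementation.

import os
import re

# Matches ${env.NAME:default}; the default may be empty, as in ${env.NVIDIA_API_KEY:}.
_ENV_PLACEHOLDER = re.compile(r"\$\{env\.([A-Za-z_][A-Za-z0-9_]*):([^}]*)\}")


def resolve_placeholders(value: str) -> str:
    """Substitute each placeholder with the env var's value or its inline default."""
    return _ENV_PLACEHOLDER.sub(lambda m: os.environ.get(m.group(1), m.group(2)), value)


# With NVIDIA_USER_ID unset, this prints "llama-stack-user".
print(resolve_placeholders("${env.NVIDIA_USER_ID:llama-stack-user}"))
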
diff --git a/llama_stack/templates/nvidia/build.yaml b/llama_stack/templates/nvidia/build.yaml
index 0aff58836..24749ffcd 100644
--- a/llama_stack/templates/nvidia/build.yaml
+++ b/llama_stack/templates/nvidia/build.yaml
@@ -20,6 +20,8 @@ distribution_spec:
     - inline::basic
     - inline::llm-as-judge
     - inline::braintrust
+    post_training:
+    - remote::nvidia
     tool_runtime:
     - inline::rag-runtime
 image_type: conda
diff --git a/llama_stack/templates/nvidia/nvidia.py b/llama_stack/templates/nvidia/nvidia.py
index 308c0e2a6..a28de0dc2 100644
--- a/llama_stack/templates/nvidia/nvidia.py
+++ b/llama_stack/templates/nvidia/nvidia.py
@@ -9,6 +9,7 @@ from pathlib import Path
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput, ToolGroupInput
 from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig
 from llama_stack.providers.remote.inference.nvidia.models import MODEL_ENTRIES
+from llama_stack.providers.remote.post_training.nvidia import NvidiaPostTrainingConfig
 from llama_stack.providers.remote.safety.nvidia import NVIDIASafetyConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry

@@ -18,6 +19,7 @@ def get_distribution_template() -> DistributionTemplate:
         "inference": ["remote::nvidia"],
         "vector_io": ["inline::faiss"],
         "safety": ["remote::nvidia"],
+        "post_training": ["remote::nvidia"],
         "agents": ["inline::meta-reference"],
         "telemetry": ["inline::meta-reference"],
         "eval": ["inline::meta-reference"],
@@ -31,6 +33,12 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="remote::nvidia",
         config=NVIDIAConfig.sample_run_config(),
     )
+
+    post_training_provider = Provider(
+        provider_id="nvidia",
+        provider_type="remote::nvidia",
+        config=NvidiaPostTrainingConfig.sample_run_config(),
+    )
     safety_provider = Provider(
         provider_id="nvidia",
         provider_type="remote::nvidia",
@@ -89,6 +97,31 @@ def get_distribution_template() -> DistributionTemplate:
                 "",
                 "NVIDIA API Key",
             ),
+            ## NeMo Customizer related variables
+            "NVIDIA_USER_ID": (
+                "llama-stack-user",
+                "NVIDIA User ID",
+            ),
+            "NVIDIA_DATASET_NAMESPACE": (
+                "default",
+                "NVIDIA Dataset Namespace",
+            ),
+            "NVIDIA_ACCESS_POLICIES": (
+                "{}",
+                "NVIDIA Access Policies",
+            ),
+            "NVIDIA_PROJECT_ID": (
+                "test-project",
+                "NVIDIA Project ID",
+            ),
+            "NVIDIA_CUSTOMIZER_URL": (
+                "https://customizer.api.nvidia.com",
+                "NVIDIA Customizer URL",
+            ),
+            "NVIDIA_OUTPUT_MODEL_DIR": (
+                "test-example-model@v1",
+                "NVIDIA Output Model Directory",
+            ),
             "GUARDRAILS_SERVICE_URL": (
                 "http://0.0.0.0:7331",
                 "URL for the NeMo Guardrails Service",
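Note on the template change above: post_training_provider packages NvidiaPostTrainingConfig.sample_run_config() as its config, the same `${env...}` placeholders defined in config.py. A quick way to inspect exactly what the provider contributes (assumes llama_stack with this patch applied is importable):

# Print the placeholders the NVIDIA post-training provider emits into
# generated run configs.
from llama_stack.providers.remote.post_training.nvidia import NvidiaPostTrainingConfig

for key, value in NvidiaPostTrainingConfig.sample_run_config().items():
    print(f"{key}: {value}")
# e.g. user_id: ${env.NVIDIA_USER_ID:llama-stack-user}
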
diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml
index 4026d2c5d..8a7a40266 100644
--- a/llama_stack/templates/nvidia/run.yaml
+++ b/llama_stack/templates/nvidia/run.yaml
@@ -8,6 +8,7 @@ apis:
 - safety
 - scoring
 - telemetry
+- post_training
 - tool_runtime
 - vector_io
 providers:
@@ -73,6 +74,10 @@ providers:
     provider_type: inline::braintrust
     config:
       openai_api_key: ${env.OPENAI_API_KEY:}
+  post_training:
+  - provider_id: nvidia-customizer
+    provider_type: remote::nvidia
+    config: {}
   tool_runtime:
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
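With the provider registered and the template updated, the distribution builds through the usual template flow (e.g. `llama stack build --template nvidia --image-type conda`, matching the image_type declared in build.yaml). Below is a minimal smoke test of the new config; it assumes the config class exposes fields mirroring the sample_run_config keys above, which is not fully shown in this patch.

# Sketch: construct the provider config from the environment, using the same
# variables the template registers in nvidia.py. Field names are assumed to
# mirror the sample_run_config keys; adjust if the class differs.
import os

os.environ.setdefault("NVIDIA_CUSTOMIZER_URL", "https://customizer.api.nvidia.com")
os.environ.setdefault("NVIDIA_DATASET_NAMESPACE", "default")

from llama_stack.providers.remote.post_training.nvidia import NvidiaPostTrainingConfig

config = NvidiaPostTrainingConfig()
print(config.customizer_url, config.dataset_namespace, config.output_model_dir)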