From 60bf0eb53207775919dbdf1e39d2d34096c1117d Mon Sep 17 00:00:00 2001 From: raspawar Date: Wed, 9 Apr 2025 12:31:46 +0000 Subject: [PATCH] datastore documentation --- .../remote/datasetio/nvidia/README.md | 75 +++++++++++++++++++ .../remote/datasetio/nvidia/config.py | 2 +- .../templates/nvidia/run-with-safety.yaml | 2 +- llama_stack/templates/nvidia/run.yaml | 2 +- 4 files changed, 78 insertions(+), 3 deletions(-) diff --git a/llama_stack/providers/remote/datasetio/nvidia/README.md b/llama_stack/providers/remote/datasetio/nvidia/README.md index e69de29bb..699894ca5 100644 --- a/llama_stack/providers/remote/datasetio/nvidia/README.md +++ b/llama_stack/providers/remote/datasetio/nvidia/README.md @@ -0,0 +1,75 @@ +# NVIDIA DatasetIO Provider for LlamaStack + +This provider enables dataset management using NVIDIA's NeMo Customizer service. + +## Features + +- Register datasets for fine-tuning LLMs +- Unregister datasets + +## Getting Started + +### Prerequisites + +- LlamaStack with NVIDIA configuration +- Access to a hosted NVIDIA NeMo Microservice +- API key for authentication with the NVIDIA service + +### Setup + +Build the NVIDIA environment: + +```bash +llama stack build --template nvidia --image-type conda +``` + +### Basic Usage with the LlamaStack Python Client + +#### Initialize the client + +```python +import os +os.environ["NVIDIA_API_KEY"] = "your-api-key" +os.environ["NVIDIA_DATASETS_URL"] = "http://nemo.test" +os.environ["NVIDIA_USER_ID"] = "llama-stack-user" +os.environ["NVIDIA_DATASET_NAMESPACE"] = "default" +os.environ["NVIDIA_PROJECT_ID"] = "test-project" +from llama_stack.distribution.library_client import LlamaStackAsLibraryClient +client = LlamaStackAsLibraryClient("nvidia") +client.initialize() +``` + +#### Register a dataset + +```python +client.datasets.register( +purpose="post-training/messages", +dataset_id="my-training-dataset", +source={ +"type": "uri", +"uri": "hf://datasets/default/sample-dataset" +}, +metadata={ +"format": "json", 
+"description": "Dataset for LLM fine-tuning", +"provider": "nvidia" +} +) +``` + +#### Get a list of all registered datasets + +```python +datasets = client.datasets.list() +for dataset in datasets: + print(f"Dataset ID: {dataset.identifier}") + print(f"Description: {dataset.metadata.get('description', '')}") + print(f"Source: {dataset.source.uri}") + print("---") +``` + +#### Unregister a dataset + +```python +client.datasets.unregister(dataset_id="my-training-dataset") +``` \ No newline at end of file diff --git a/llama_stack/providers/remote/datasetio/nvidia/config.py b/llama_stack/providers/remote/datasetio/nvidia/config.py index f80c6bb20..7f3dbdfbd 100644 --- a/llama_stack/providers/remote/datasetio/nvidia/config.py +++ b/llama_stack/providers/remote/datasetio/nvidia/config.py @@ -55,7 +55,7 @@ class NvidiaDatasetIOConfig(BaseModel): def sample_run_config(cls, **kwargs) -> Dict[str, Any]: return { "api_key": "${env.NVIDIA_API_KEY:}", - "user_id": "${env.NVIDIA_USER_ID:llama-stack-user}", "dataset_namespace": "${env.NVIDIA_DATASET_NAMESPACE:default}", "project_id": "${env.NVIDIA_PROJECT_ID:test-project}", + "datasets_url": "${env.NVIDIA_DATASETS_URL:http://nemo.test}", } diff --git a/llama_stack/templates/nvidia/run-with-safety.yaml b/llama_stack/templates/nvidia/run-with-safety.yaml index 31a454fa1..5f594604b 100644 --- a/llama_stack/templates/nvidia/run-with-safety.yaml +++ b/llama_stack/templates/nvidia/run-with-safety.yaml @@ -78,9 +78,9 @@ providers: provider_type: remote::nvidia config: api_key: ${env.NVIDIA_API_KEY:} - user_id: ${env.NVIDIA_USER_ID:llama-stack-user} dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:default} project_id: ${env.NVIDIA_PROJECT_ID:test-project} + datasets_url: ${env.NVIDIA_DATASETS_URL:http://nemo.test} scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml index 7488997d8..b55aacd64 100644 --- 
a/llama_stack/templates/nvidia/run.yaml +++ b/llama_stack/templates/nvidia/run.yaml @@ -73,9 +73,9 @@ providers: provider_type: remote::nvidia config: api_key: ${env.NVIDIA_API_KEY:} - user_id: ${env.NVIDIA_USER_ID:llama-stack-user} dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:default} project_id: ${env.NVIDIA_PROJECT_ID:test-project} + datasets_url: ${env.NVIDIA_DATASETS_URL:http://nemo.test} scoring: - provider_id: basic provider_type: inline::basic