From 26a093a1d43c11f6c22579fbf3de0dca17d265bb Mon Sep 17 00:00:00 2001
From: Chantal D Gama Rose
Date: Fri, 10 Jan 2025 21:57:35 +0000
Subject: [PATCH] updated nvidia distro with recent apis

---
 distributions/inline-nvidia/compose.yaml      |  4 +-
 distributions/inline-nvidia/run.yaml          | 54 +++++++++++++--
 .../remote_hosted_distro/nvidia.md            | 65 +++++++++++++++++++
 .../remote/inference/nvidia/config.py         |  9 ++-
 llama_stack/templates/nvidia/build.yaml       | 17 ++++-
 llama_stack/templates/nvidia/doc_template.md  |  2 +-
 llama_stack/templates/nvidia/nvidia.py        | 12 +++-
 llama_stack/templates/nvidia/run.yaml         | 51 ++++++++++++++-
 8 files changed, 198 insertions(+), 17 deletions(-)
 create mode 100644 docs/source/distributions/remote_hosted_distro/nvidia.md

diff --git a/distributions/inline-nvidia/compose.yaml b/distributions/inline-nvidia/compose.yaml
index f7320b968..644b7d23d 100644
--- a/distributions/inline-nvidia/compose.yaml
+++ b/distributions/inline-nvidia/compose.yaml
@@ -1,6 +1,6 @@
 services:
   nim:
-    image: nvcr.io/nim/meta/llama-3.1-8b-instruct:latest
+    image: ${DOCKER_IMAGE:-nvcr.io/nim/meta/llama-3.1-8b-instruct:latest}
     network_mode: "host"
    volumes:
    - nim-llm-cache:/opt/nim/.cache
@@ -55,4 +55,4 @@ services:
         window: 60s
 volumes:
   nim-llm-cache:
-    driver: local
\ No newline at end of file
+    driver: local
diff --git a/distributions/inline-nvidia/run.yaml b/distributions/inline-nvidia/run.yaml
index 81e9e7f1c..e96a0429c 100644
--- a/distributions/inline-nvidia/run.yaml
+++ b/distributions/inline-nvidia/run.yaml
@@ -1,20 +1,23 @@
 version: '2'
 image_name: nvidia
-docker_image: null
 conda_env: nvidia
 apis:
 - agents
+- datasetio
+- eval
 - inference
 - memory
 - safety
+- scoring
 - telemetry
+- tool_runtime
 providers:
   inference:
   - provider_id: nvidia
     provider_type: remote::nvidia
     config:
       url: http://localhost:8000
-      api_key: ${env.NVIDIA_API_KEY}
+      api_key: ${env.NVIDIA_API_KEY} # TODO: don't need api key, code adjustments needed
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -38,19 +41,60 @@ providers:
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
+    config:
+      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
+      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
+      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/nvidia/trace_store.db}
+  eval:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config: {}
+  datasetio:
+  - provider_id: huggingface
+    provider_type: remote::huggingface
+    config: {}
+  - provider_id: localfs
+    provider_type: inline::localfs
+    config: {}
+  scoring:
+  - provider_id: basic
+    provider_type: inline::basic
+    config: {}
+  - provider_id: llm-as-judge
+    provider_type: inline::llm-as-judge
+    config: {}
+  - provider_id: braintrust
+    provider_type: inline::braintrust
+    config:
+      openai_api_key: ${env.OPENAI_API_KEY:}
+  tool_runtime:
+  - provider_id: brave-search
+    provider_type: remote::brave-search
+    config:
+      api_key: ${env.BRAVE_SEARCH_API_KEY:}
+      max_results: 3
+  - provider_id: tavily-search
+    provider_type: remote::tavily-search
+    config:
+      api_key: ${env.TAVILY_SEARCH_API_KEY:}
+      max_results: 3
+  - provider_id: code-interpreter
+    provider_type: inline::code-interpreter
+    config: {}
+  - provider_id: memory-runtime
+    provider_type: inline::memory-runtime
     config: {}
 metadata_store:
-  namespace: null
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/registry.db
 models:
 - metadata: {}
   model_id: ${env.INFERENCE_MODEL}
   provider_id: nvidia
-  provider_model_id: null
+  model_type: llm
 shields: []
 memory_banks: []
 datasets: []
 scoring_fns: []
 eval_tasks: []
-
+tool_groups: []
diff --git a/docs/source/distributions/remote_hosted_distro/nvidia.md b/docs/source/distributions/remote_hosted_distro/nvidia.md
new file mode 100644
index 000000000..874bb8bb2
--- /dev/null
+++ b/docs/source/distributions/remote_hosted_distro/nvidia.md
@@ -0,0 +1,65 @@
+# NVIDIA Distribution
+
+The `llamastack/distribution-nvidia` distribution consists of the following provider configurations.
+
+| API | Provider(s) |
+|-----|-------------|
+| agents | `inline::meta-reference` |
+| datasetio | `remote::huggingface`, `inline::localfs` |
+| eval | `inline::meta-reference` |
+| inference | `remote::nvidia` |
+| memory | `inline::faiss` |
+| safety | `inline::llama-guard` |
+| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
+| telemetry | `inline::meta-reference` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime` |
+
+
+### Environment Variables
+
+The following environment variables can be configured:
+
+- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
+
+### Models
+
+The following models are available by default:
+
+- `${env.INFERENCE_MODEL} (None)`
+
+
+### Prerequisite: API Keys
+
+Make sure you have access to an NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/).
+
+
+## Running Llama Stack with NVIDIA
+
+You can do this via Conda (build the code yourself) or Docker, which uses a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+LLAMA_STACK_PORT=5001
+docker run \
+  -it \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ./run.yaml:/root/my-run.yaml \
+  llamastack/distribution-nvidia \
+  --yaml-config /root/my-run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
+```
+
+### Via Conda
+
+```bash
+llama stack build --template nvidia --image-type conda
+llama stack run ./run.yaml \
+  --port 5001 \
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL
+```
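The Docker and Conda commands above only start the stack server. As a quick client-side sanity check of the resulting endpoint, here is a minimal sketch (not part of this patch): it assumes the `llama-stack-client` Python package is installed, the server is listening on port 5001, and `INFERENCE_MODEL` is exported; exact method and field names may differ slightly between client versions.

```python
# Minimal client-side sanity check (illustrative only, not part of this patch).
# Assumes: pip install llama-stack-client, a server on localhost:5001, and
# INFERENCE_MODEL exported in the environment.
import os

from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5001")

# run.yaml registers ${env.INFERENCE_MODEL} with the nvidia provider,
# so it should appear in the model listing.
for model in client.models.list():
    print(model.identifier)

# Issue a simple chat completion against the remote::nvidia inference provider.
response = client.inference.chat_completion(
    model_id=os.environ["INFERENCE_MODEL"],
    messages=[{"role": "user", "content": "Write a haiku about GPUs."}],
)
print(response.completion_message.content)
```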
diff --git a/llama_stack/providers/remote/inference/nvidia/config.py b/llama_stack/providers/remote/inference/nvidia/config.py
index 9e81211bd..d062e65d2 100644
--- a/llama_stack/providers/remote/inference/nvidia/config.py
+++ b/llama_stack/providers/remote/inference/nvidia/config.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 import os
-from typing import Optional
+from typing import Any, Dict, Optional
 
 from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field, SecretStr
@@ -48,3 +48,10 @@ class NVIDIAConfig(BaseModel):
         default=60,
         description="Timeout for the HTTP requests",
     )
+
+    @classmethod
+    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+        return {
+            "url": "https://integrate.api.nvidia.com",
+            "api_key": "${env.NVIDIA_API_KEY}",
+        }
diff --git a/llama_stack/templates/nvidia/build.yaml b/llama_stack/templates/nvidia/build.yaml
index 9a735c220..813502ada 100644
--- a/llama_stack/templates/nvidia/build.yaml
+++ b/llama_stack/templates/nvidia/build.yaml
@@ -2,18 +2,29 @@ version: '2'
 name: nvidia
 distribution_spec:
   description: Use NVIDIA NIM for running LLM inference
-  docker_image: null
   providers:
     inference:
     - remote::nvidia
     memory:
     - inline::faiss
-    - remote::chromadb
-    - remote::pgvector
     safety:
     - inline::llama-guard
     agents:
     - inline::meta-reference
     telemetry:
     - inline::meta-reference
+    eval:
+    - inline::meta-reference
+    datasetio:
+    - remote::huggingface
+    - inline::localfs
+    scoring:
+    - inline::basic
+    - inline::llm-as-judge
+    - inline::braintrust
+    tool_runtime:
+    - remote::brave-search
+    - remote::tavily-search
+    - inline::code-interpreter
+    - inline::memory-runtime
 image_type: conda
diff --git a/llama_stack/templates/nvidia/doc_template.md b/llama_stack/templates/nvidia/doc_template.md
index 949018f8d..9d9006a27 100644
--- a/llama_stack/templates/nvidia/doc_template.md
+++ b/llama_stack/templates/nvidia/doc_template.md
@@ -57,4 +57,5 @@ llama stack build --template nvidia --image-type conda
 llama stack run ./run.yaml \
   --port 5001 \
-  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL
 ```
diff --git a/llama_stack/templates/nvidia/nvidia.py b/llama_stack/templates/nvidia/nvidia.py
index 22aa1f4b0..173db2d7f 100644
--- a/llama_stack/templates/nvidia/nvidia.py
+++ b/llama_stack/templates/nvidia/nvidia.py
@@ -8,7 +8,6 @@ from pathlib import Path
 
 from llama_stack.distribution.datatypes import ModelInput, Provider
 from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig
-from llama_stack.providers.remote.inference.nvidia.nvidia import _MODEL_ALIASES
 
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
 
@@ -16,10 +15,19 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin
 def get_distribution_template() -> DistributionTemplate:
     providers = {
         "inference": ["remote::nvidia"],
-        "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
+        "memory": ["inline::faiss"],
         "safety": ["inline::llama-guard"],
         "agents": ["inline::meta-reference"],
         "telemetry": ["inline::meta-reference"],
+        "eval": ["inline::meta-reference"],
+        "datasetio": ["remote::huggingface", "inline::localfs"],
+        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
+        "tool_runtime": [
+            "remote::brave-search",
+            "remote::tavily-search",
+            "inline::code-interpreter",
+            "inline::memory-runtime",
+        ],
     }
 
     inference_provider = Provider(
diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml
index f4953852f..84b0437ba 100644
--- a/llama_stack/templates/nvidia/run.yaml
+++ b/llama_stack/templates/nvidia/run.yaml
@@ -1,13 +1,16 @@
 version: '2'
 image_name: nvidia
-docker_image: null
 conda_env: nvidia
 apis:
 - agents
+- datasetio
+- eval
 - inference
 - memory
 - safety
+- scoring
 - telemetry
+- tool_runtime
 providers:
   inference:
   - provider_id: nvidia
@@ -38,18 +41,60 @@ providers:
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
+    config:
+      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
+      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
+      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/nvidia/trace_store.db}
+  eval:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config: {}
+  datasetio:
+  - provider_id: huggingface
+    provider_type: remote::huggingface
+    config: {}
+  - provider_id: localfs
+    provider_type: inline::localfs
+    config: {}
+  scoring:
+  - provider_id: basic
+    provider_type: inline::basic
+    config: {}
+  - provider_id: llm-as-judge
+    provider_type: inline::llm-as-judge
+    config: {}
+  - provider_id: braintrust
+    provider_type: inline::braintrust
+    config:
+      openai_api_key: ${env.OPENAI_API_KEY:}
+  tool_runtime:
+  - provider_id: brave-search
+    provider_type: remote::brave-search
+    config:
+      api_key: ${env.BRAVE_SEARCH_API_KEY:}
+      max_results: 3
+  - provider_id: tavily-search
+    provider_type: remote::tavily-search
+    config:
+      api_key: ${env.TAVILY_SEARCH_API_KEY:}
+      max_results: 3
+  - provider_id: code-interpreter
+    provider_type: inline::code-interpreter
+    config: {}
+  - provider_id: memory-runtime
+    provider_type: inline::memory-runtime
     config: {}
 metadata_store:
-  namespace: null
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/registry.db
 models:
 - metadata: {}
   model_id: ${env.INFERENCE_MODEL}
   provider_id: nvidia
-  provider_model_id: null
+  model_type: llm
 shields: []
 memory_banks: []
 datasets: []
 scoring_fns: []
 eval_tasks: []
+tool_groups: []
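For context on the `config.py` change above: the new `sample_run_config()` classmethod provides a default inference-provider `config:` block, presumably consumed by the template machinery when the NVIDIA `run.yaml` is generated. A purely illustrative sketch of calling it directly follows; the `yaml.safe_dump` step (which assumes PyYAML is installed) is only there to show the shape of the emitted block.

```python
# Illustrative only: print the provider config that sample_run_config() emits,
# i.e. the values that end up under the nvidia inference provider's `config:`
# key in a generated run.yaml.
import yaml

from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig

sample = NVIDIAConfig.sample_run_config()
print(yaml.safe_dump(sample, sort_keys=False))
# Expected (per this patch):
#   url: https://integrate.api.nvidia.com
#   api_key: ${env.NVIDIA_API_KEY}
```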