diff --git a/distributions/dependencies.json b/distributions/dependencies.json index d2ed12d3a..33b497a33 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -401,16 +401,13 @@ ], "nvidia": [ "aiosqlite", - "autoevals", "blobfile", "chardet", - "datasets", "faiss-cpu", "fastapi", "fire", "httpx", "matplotlib", - "mcp", "nltk", "numpy", "openai", diff --git a/docs/source/distributions/remote_hosted_distro/nvidia.md b/docs/source/distributions/remote_hosted_distro/nvidia.md index efa0a2d74..774d5ec1b 100644 --- a/docs/source/distributions/remote_hosted_distro/nvidia.md +++ b/docs/source/distributions/remote_hosted_distro/nvidia.md @@ -6,13 +6,13 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov | API | Provider(s) | |-----|-------------| | agents | `inline::meta-reference` | -| datasetio | `remote::huggingface`, `inline::localfs` | +| datasetio | `inline::localfs` | | eval | `inline::meta-reference` | | inference | `remote::nvidia` | -| safety | `inline::llama-guard` | -| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | +| safety | `remote::nvidia` | +| scoring | `inline::basic` | | telemetry | `inline::meta-reference` | -| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` | +| tool_runtime | `inline::rag-runtime` | | vector_io | `inline::faiss` | @@ -20,8 +20,10 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov The following environment variables can be configured: -- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`) - `NVIDIA_API_KEY`: NVIDIA API Key (default: ``) +- `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`) +- `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`) +- `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`) ### Models diff --git a/llama_stack/providers/registry/safety.py b/llama_stack/providers/registry/safety.py index 1364352e6..32c0b4e98 100644 --- a/llama_stack/providers/registry/safety.py +++ b/llama_stack/providers/registry/safety.py @@ -55,4 +55,13 @@ def available_providers() -> List[ProviderSpec]: config_class="llama_stack.providers.remote.safety.bedrock.BedrockSafetyConfig", ), ), + remote_provider_spec( + api=Api.safety, + adapter=AdapterSpec( + adapter_type="nvidia", + pip_packages=["requests"], + module="llama_stack.providers.remote.safety.nvidia", + config_class="llama_stack.providers.remote.safety.nvidia.NVIDIASafetyConfig", + ), + ), ] diff --git a/llama_stack/providers/remote/safety/nvidia/__init__.py b/llama_stack/providers/remote/safety/nvidia/__init__.py new file mode 100644 index 000000000..4677268c6 --- /dev/null +++ b/llama_stack/providers/remote/safety/nvidia/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + + +from typing import Any + +from .config import NVIDIASafetyConfig + + +async def get_adapter_impl(config: NVIDIASafetyConfig, _deps) -> Any: + from .nvidia import NVIDIASafetyAdapter + + impl = NVIDIASafetyAdapter(config) + await impl.initialize() + return impl diff --git a/llama_stack/providers/remote/safety/nvidia/config.py b/llama_stack/providers/remote/safety/nvidia/config.py new file mode 100644 index 000000000..3df80ed4f --- /dev/null +++ b/llama_stack/providers/remote/safety/nvidia/config.py @@ -0,0 +1,37 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +import os +from typing import Any, Dict, Optional + +from pydantic import BaseModel, Field + +from llama_stack.schema_utils import json_schema_type + + +@json_schema_type +class NVIDIASafetyConfig(BaseModel): + """ + Configuration for the NVIDIA Guardrail microservice endpoint. + + Attributes: + guardrails_service_url (str): A base url for accessing the NVIDIA guardrail endpoint, e.g. http://0.0.0.0:7331 + config_id (str): The ID of the guardrails configuration to use from the configuration store + (https://developer.nvidia.com/docs/nemo-microservices/guardrails/source/guides/configuration-store-guide.html) + + """ + + guardrails_service_url: str = Field( + default_factory=lambda: os.getenv("GUARDRAILS_SERVICE_URL", "http://0.0.0.0:7331"), + description="The url for accessing the guardrails service", + ) + config_id: Optional[str] = Field(default="self-check", description="Config ID to use from the config store") + + @classmethod + def sample_run_config(cls, **kwargs) -> Dict[str, Any]: + return { + "guardrails_service_url": "${env.GUARDRAILS_SERVICE_URL:http://localhost:7331}", + "config_id": "self-check", + } diff --git a/llama_stack/providers/remote/safety/nvidia/nvidia.py b/llama_stack/providers/remote/safety/nvidia/nvidia.py new file mode 100644 index 000000000..6da2a8344 --- /dev/null +++ b/llama_stack/providers/remote/safety/nvidia/nvidia.py @@ -0,0 +1,154 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import logging +from typing import Any, List, Optional + +import requests + +from llama_stack.apis.inference import Message +from llama_stack.apis.safety import RunShieldResponse, Safety, SafetyViolation, ViolationLevel +from llama_stack.apis.shields import Shield +from llama_stack.distribution.library_client import convert_pydantic_to_json_value +from llama_stack.providers.datatypes import ShieldsProtocolPrivate + +from .config import NVIDIASafetyConfig + +logger = logging.getLogger(__name__) + + +class NVIDIASafetyAdapter(Safety, ShieldsProtocolPrivate): + def __init__(self, config: NVIDIASafetyConfig) -> None: + """ + Initialize the NVIDIASafetyAdapter with a given safety configuration. + + Args: + config (NVIDIASafetyConfig): The configuration containing the guardrails service URL and config ID. + """ + print(f"Initializing NVIDIASafetyAdapter({config.guardrails_service_url})...") + self.config = config + + async def initialize(self) -> None: + pass + + async def shutdown(self) -> None: + pass + + async def register_shield(self, shield: Shield) -> None: + if not shield.provider_resource_id: + raise ValueError("Shield model not provided.") + + async def run_shield( + self, shield_id: str, messages: List[Message], params: Optional[dict[str, Any]] = None + ) -> RunShieldResponse: + """ + Run a safety shield check against the provided messages. + + Args: + shield_id (str): The unique identifier for the shield to be used. + messages (List[Message]): A list of Message objects representing the conversation history. + params (Optional[dict[str, Any]]): Additional parameters for the shield check. + + Returns: + RunShieldResponse: The response containing safety violation details if any. + + Raises: + ValueError: If the shield with the provided shield_id is not found. + """ + shield = await self.shield_store.get_shield(shield_id) + if not shield: + raise ValueError(f"Shield {shield_id} not found") + + self.shield = NeMoGuardrails(self.config, shield.shield_id) + return await self.shield.run(messages) + + +class NeMoGuardrails: + """ + A class that encapsulates NVIDIA's guardrails safety logic. + + Sends messages to the guardrails service and interprets the response to determine + if a safety violation has occurred. + """ + + def __init__( + self, + config: NVIDIASafetyConfig, + model: str, + threshold: float = 0.9, + temperature: float = 1.0, + ): + """ + Initialize a NeMoGuardrails instance with the provided parameters. + + Args: + config (NVIDIASafetyConfig): The safety configuration containing the config ID and guardrails URL. + model (str): The identifier or name of the model to be used for safety checks. + threshold (float, optional): The threshold for flagging violations. Defaults to 0.9. + temperature (float, optional): The temperature setting for the underlying model. Must be greater than 0. Defaults to 1.0. + + Raises: + ValueError: If temperature is less than or equal to 0. + AssertionError: If config_id is not provided in the configuration. + """ + self.config_id = config.config_id + self.model = model + assert self.config_id is not None, "Must provide config id" + if temperature <= 0: + raise ValueError("Temperature must be greater than 0") + + self.temperature = temperature + self.threshold = threshold + self.guardrails_service_url = config.guardrails_service_url + + async def run(self, messages: List[Message]) -> RunShieldResponse: + """ + Queries the /v1/guardrails/checks endpoint of the NeMo guardrails deployed API. + + Args: + messages (List[Message]): A list of Message objects to be checked for safety violations. + + Returns: + RunShieldResponse: If the response indicates a violation ("blocked" status), returns a + RunShieldResponse with a SafetyViolation; otherwise, returns a RunShieldResponse with violation set to None. + + Raises: + requests.HTTPError: If the POST request fails. + """ + headers = { + "Accept": "application/json", + } + request_data = { + "model": self.model, + "messages": convert_pydantic_to_json_value(messages), + "temperature": self.temperature, + "top_p": 1, + "frequency_penalty": 0, + "presence_penalty": 0, + "max_tokens": 160, + "stream": False, + "guardrails": { + "config_id": self.config_id, + }, + } + response = requests.post( + url=f"{self.guardrails_service_url}/v1/guardrail/checks", headers=headers, json=request_data + ) + response.raise_for_status() + if "Content-Type" in response.headers and response.headers["Content-Type"].startswith("application/json"): + response_json = response.json() + if response_json["status"] == "blocked": + user_message = "Sorry I cannot do this." + metadata = response_json["rails_status"] + + return RunShieldResponse( + violation=SafetyViolation( + user_message=user_message, + violation_level=ViolationLevel.ERROR, + metadata=metadata, + ) + ) + return RunShieldResponse(violation=None) diff --git a/llama_stack/templates/nvidia/build.yaml b/llama_stack/templates/nvidia/build.yaml index e9748721a..0c788ce86 100644 --- a/llama_stack/templates/nvidia/build.yaml +++ b/llama_stack/templates/nvidia/build.yaml @@ -1,13 +1,13 @@ version: '2' distribution_spec: - description: Use NVIDIA NIM for running LLM inference + description: Use NVIDIA NIM for running LLM inference and safety providers: inference: - remote::nvidia vector_io: - inline::faiss safety: - - inline::llama-guard + - remote::nvidia agents: - inline::meta-reference telemetry: @@ -15,16 +15,9 @@ distribution_spec: eval: - inline::meta-reference datasetio: - - remote::huggingface - inline::localfs scoring: - inline::basic - - inline::llm-as-judge - - inline::braintrust tool_runtime: - - remote::brave-search - - remote::tavily-search - - inline::code-interpreter - inline::rag-runtime - - remote::model-context-protocol image_type: conda diff --git a/llama_stack/templates/nvidia/nvidia.py b/llama_stack/templates/nvidia/nvidia.py index cc5e96333..308c0e2a6 100644 --- a/llama_stack/templates/nvidia/nvidia.py +++ b/llama_stack/templates/nvidia/nvidia.py @@ -6,9 +6,10 @@ from pathlib import Path -from llama_stack.distribution.datatypes import Provider, ToolGroupInput +from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput, ToolGroupInput from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig from llama_stack.providers.remote.inference.nvidia.models import MODEL_ENTRIES +from llama_stack.providers.remote.safety.nvidia import NVIDIASafetyConfig from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry @@ -16,19 +17,13 @@ def get_distribution_template() -> DistributionTemplate: providers = { "inference": ["remote::nvidia"], "vector_io": ["inline::faiss"], - "safety": ["inline::llama-guard"], + "safety": ["remote::nvidia"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], "eval": ["inline::meta-reference"], - "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], - "tool_runtime": [ - "remote::brave-search", - "remote::tavily-search", - "inline::code-interpreter", - "inline::rag-runtime", - "remote::model-context-protocol", - ], + "datasetio": ["inline::localfs"], + "scoring": ["inline::basic"], + "tool_runtime": ["inline::rag-runtime"], } inference_provider = Provider( @@ -36,30 +31,35 @@ def get_distribution_template() -> DistributionTemplate: provider_type="remote::nvidia", config=NVIDIAConfig.sample_run_config(), ) + safety_provider = Provider( + provider_id="nvidia", + provider_type="remote::nvidia", + config=NVIDIASafetyConfig.sample_run_config(), + ) + inference_model = ModelInput( + model_id="${env.INFERENCE_MODEL}", + provider_id="nvidia", + ) + safety_model = ModelInput( + model_id="${env.SAFETY_MODEL}", + provider_id="nvidia", + ) available_models = { "nvidia": MODEL_ENTRIES, } default_tool_groups = [ - ToolGroupInput( - toolgroup_id="builtin::websearch", - provider_id="tavily-search", - ), ToolGroupInput( toolgroup_id="builtin::rag", provider_id="rag-runtime", ), - ToolGroupInput( - toolgroup_id="builtin::code_interpreter", - provider_id="code-interpreter", - ), ] default_models = get_model_registry(available_models) return DistributionTemplate( name="nvidia", distro_type="remote_hosted", - description="Use NVIDIA NIM for running LLM inference", + description="Use NVIDIA NIM for running LLM inference and safety", container_image=None, template_path=Path(__file__).parent / "doc_template.md", providers=providers, @@ -72,15 +72,34 @@ def get_distribution_template() -> DistributionTemplate: default_models=default_models, default_tool_groups=default_tool_groups, ), + "run-with-safety.yaml": RunConfigSettings( + provider_overrides={ + "inference": [ + inference_provider, + safety_provider, + ] + }, + default_models=[inference_model, safety_model], + default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}", provider_id="nvidia")], + default_tool_groups=default_tool_groups, + ), }, run_config_env_vars={ - "LLAMASTACK_PORT": ( - "5001", - "Port for the Llama Stack distribution server", - ), "NVIDIA_API_KEY": ( "", "NVIDIA API Key", ), + "GUARDRAILS_SERVICE_URL": ( + "http://0.0.0.0:7331", + "URL for the NeMo Guardrails Service", + ), + "INFERENCE_MODEL": ( + "Llama3.1-8B-Instruct", + "Inference model", + ), + "SAFETY_MODEL": ( + "meta/llama-3.1-8b-instruct", + "Name of the model to use for safety", + ), }, ) diff --git a/llama_stack/templates/nvidia/run-with-safety.yaml b/llama_stack/templates/nvidia/run-with-safety.yaml new file mode 100644 index 000000000..04da1bcda --- /dev/null +++ b/llama_stack/templates/nvidia/run-with-safety.yaml @@ -0,0 +1,101 @@ +version: '2' +image_name: nvidia +apis: +- agents +- datasetio +- eval +- inference +- safety +- scoring +- telemetry +- tool_runtime +- vector_io +providers: + inference: + - provider_id: nvidia + provider_type: remote::nvidia + config: + url: ${env.NVIDIA_BASE_URL:https://integrate.api.nvidia.com} + api_key: ${env.NVIDIA_API_KEY:} + - provider_id: nvidia + provider_type: remote::nvidia + config: + guardrails_service_url: ${env.GUARDRAILS_SERVICE_URL:http://localhost:7331} + config_id: self-check + vector_io: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/faiss_store.db + safety: + - provider_id: nvidia + provider_type: remote::nvidia + config: + guardrails_service_url: ${env.GUARDRAILS_SERVICE_URL:http://localhost:7331} + config_id: self-check + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/agents_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/nvidia/trace_store.db} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/meta_reference_eval.db + datasetio: + - provider_id: localfs + provider_type: inline::localfs + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/localfs_datasetio.db + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + tool_runtime: + - provider_id: rag-runtime + provider_type: inline::rag-runtime + config: {} +metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/registry.db +models: +- metadata: {} + model_id: ${env.INFERENCE_MODEL} + provider_id: nvidia + model_type: llm +- metadata: {} + model_id: ${env.SAFETY_MODEL} + provider_id: nvidia + model_type: llm +shields: +- shield_id: ${env.SAFETY_MODEL} + provider_id: nvidia +vector_dbs: [] +datasets: [] +scoring_fns: [] +benchmarks: [] +tool_groups: +- toolgroup_id: builtin::rag + provider_id: rag-runtime +server: + port: 8321 diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml index 213e22cb2..3abdd82a7 100644 --- a/llama_stack/templates/nvidia/run.yaml +++ b/llama_stack/templates/nvidia/run.yaml @@ -26,10 +26,11 @@ providers: namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/faiss_store.db safety: - - provider_id: llama-guard - provider_type: inline::llama-guard + - provider_id: nvidia + provider_type: remote::nvidia config: - excluded_categories: [] + guardrails_service_url: ${env.GUARDRAILS_SERVICE_URL:http://localhost:7331} + config_id: self-check agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -54,13 +55,6 @@ providers: namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/meta_reference_eval.db datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs config: @@ -72,33 +66,10 @@ providers: - provider_id: basic provider_type: inline::basic config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:} tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:} - max_results: 3 - - provider_id: code-interpreter - provider_type: inline::code-interpreter - config: {} - provider_id: rag-runtime provider_type: inline::rag-runtime config: {} - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} metadata_store: type: sqlite db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/registry.db @@ -227,11 +198,7 @@ datasets: [] scoring_fns: [] benchmarks: [] tool_groups: -- toolgroup_id: builtin::websearch - provider_id: tavily-search - toolgroup_id: builtin::rag provider_id: rag-runtime -- toolgroup_id: builtin::code_interpreter - provider_id: code-interpreter server: port: 8321 diff --git a/pyproject.toml b/pyproject.toml index 4a5befbd0..a006d69f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -269,6 +269,7 @@ exclude = [ "^llama_stack/providers/remote/inference/together/", "^llama_stack/providers/remote/inference/vllm/", "^llama_stack/providers/remote/safety/bedrock/", + "^llama_stack/providers/remote/safety/nvidia/", "^llama_stack/providers/remote/safety/sample/", "^llama_stack/providers/remote/tool_runtime/bing_search/", "^llama_stack/providers/remote/tool_runtime/brave_search/",