mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-06-28 02:53:30 +00:00
# What does this PR do? Adds custom model registration functionality to NVIDIAInferenceAdapter which lets inference happen on: - post-training model - non-llama models in API Catalogue (behind https://integrate.api.nvidia.com and endpoints compatible with AsyncOpenAI) ## Example Usage: ```python from llama_stack.apis.models import Model, ModelType from llama_stack.distribution.library_client import LlamaStackAsLibraryClient client = LlamaStackAsLibraryClient("nvidia") _ = client.initialize() client.models.register( model_id=model_name, model_type=ModelType.llm, provider_id="nvidia" ) response = client.inference.chat_completion( model_id=model_name, messages=[{"role":"system","content":"You are a helpful assistant."},{"role":"user","content":"Write a limerick about the wonders of GPU computing."}], ) ``` ## Test Plan ```bash pytest tests/unit/providers/nvidia/test_supervised_fine_tuning.py ========================================================== test session starts =========================================================== platform linux -- Python 3.10.0, pytest-8.3.5, pluggy-1.5.0 rootdir: /home/ubuntu/llama-stack configfile: pyproject.toml plugins: anyio-4.9.0 collected 6 items tests/unit/providers/nvidia/test_supervised_fine_tuning.py ...... [100%] ============================================================ warnings summary ============================================================ ../miniconda/envs/nvidia-1/lib/python3.10/site-packages/pydantic/fields.py:1076 /home/ubuntu/miniconda/envs/nvidia-1/lib/python3.10/site-packages/pydantic/fields.py:1076: PydanticDeprecatedSince20: Using extra keyword arguments on `Field` is deprecated and will be removed. Use `json_schema_extra` instead. (Extra keys: 'contentEncoding'). Deprecated in Pydantic V2.0 to be removed in V3.0. 
See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/ warn( -- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html ====================================================== 6 passed, 1 warning in 1.51s ====================================================== ``` [//]: # (## Documentation) Updated Readme.md cc: @dglogo, @sumitb, @mattf
139 lines
4.8 KiB
Python
139 lines
4.8 KiB
Python
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
# All rights reserved.
|
|
#
|
|
# This source code is licensed under the terms described in the LICENSE file in
|
|
# the root directory of this source tree.
|
|
|
|
from pathlib import Path
|
|
|
|
from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput, ToolGroupInput
|
|
from llama_stack.providers.remote.eval.nvidia import NVIDIAEvalConfig
|
|
from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig
|
|
from llama_stack.providers.remote.inference.nvidia.models import MODEL_ENTRIES
|
|
from llama_stack.providers.remote.safety.nvidia import NVIDIASafetyConfig
|
|
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry
|
|
|
|
|
|
def get_distribution_template() -> DistributionTemplate:
    """Build the ``nvidia`` distribution template.

    The template declares which provider backs each API, registers the
    NVIDIA model entries, and defines two run configurations:

    * ``run.yaml`` -- overrides the inference and eval providers and
      registers the models derived from ``MODEL_ENTRIES``.
    * ``run-with-safety.yaml`` -- additionally overrides the safety provider
      and registers the inference/safety models and a shield, all taken
      from the ``INFERENCE_MODEL`` / ``SAFETY_MODEL`` environment
      placeholders.

    Returns:
        The assembled ``DistributionTemplate`` named ``"nvidia"``.
    """
    # Provider types backing each API of this distribution.
    providers = {
        "inference": ["remote::nvidia"],
        "vector_io": ["inline::faiss"],
        "safety": ["remote::nvidia"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
        "eval": ["remote::nvidia"],
        "post_training": ["remote::nvidia"],
        "datasetio": ["inline::localfs"],
        "scoring": ["inline::basic"],
        "tool_runtime": ["inline::rag-runtime"],
    }

    # Explicit provider entries; each one takes its config from the sample
    # run config of the corresponding NVIDIA config class.
    inference_provider = Provider(
        provider_id="nvidia",
        provider_type="remote::nvidia",
        config=NVIDIAConfig.sample_run_config(),
    )
    safety_provider = Provider(
        provider_id="nvidia",
        provider_type="remote::nvidia",
        config=NVIDIASafetyConfig.sample_run_config(),
    )
    eval_provider = Provider(
        provider_id="nvidia",
        provider_type="remote::nvidia",
        config=NVIDIAEvalConfig.sample_run_config(),
    )

    # Models for the safety-enabled run config, resolved from env placeholders.
    inference_model = ModelInput(
        model_id="${env.INFERENCE_MODEL}",
        provider_id="nvidia",
    )
    safety_model = ModelInput(
        model_id="${env.SAFETY_MODEL}",
        provider_id="nvidia",
    )

    available_models = {
        "nvidia": MODEL_ENTRIES,
    }
    default_tool_groups = [
        ToolGroupInput(
            toolgroup_id="builtin::rag",
            provider_id="rag-runtime",
        ),
    ]

    default_models = get_model_registry(available_models)
    return DistributionTemplate(
        name="nvidia",
        distro_type="self_hosted",
        description="Use NVIDIA NIM for running LLM inference, evaluation and safety",
        container_image=None,
        template_path=Path(__file__).parent / "doc_template.md",
        providers=providers,
        available_models_by_provider=available_models,
        run_configs={
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider],
                    "eval": [eval_provider],
                },
                default_models=default_models,
                default_tool_groups=default_tool_groups,
            ),
            "run-with-safety.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider],
                    # The safety provider belongs to the "safety" API. It
                    # shares provider_id "nvidia" with the inference provider,
                    # so listing it under "inference" would add a second
                    # "nvidia" entry there carrying a safety config.
                    "safety": [safety_provider],
                    "eval": [eval_provider],
                },
                default_models=[inference_model, safety_model],
                default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}", provider_id="nvidia")],
                default_tool_groups=default_tool_groups,
            ),
        },
        # Each entry is a (default value, description) pair.
        run_config_env_vars={
            "NVIDIA_API_KEY": (
                "",
                "NVIDIA API Key",
            ),
            "NVIDIA_APPEND_API_VERSION": (
                "True",
                "Whether to append the API version to the base_url",
            ),
            # NeMo Customizer related variables
            "NVIDIA_DATASET_NAMESPACE": (
                "default",
                "NVIDIA Dataset Namespace",
            ),
            "NVIDIA_PROJECT_ID": (
                "test-project",
                "NVIDIA Project ID",
            ),
            "NVIDIA_CUSTOMIZER_URL": (
                "https://customizer.api.nvidia.com",
                "NVIDIA Customizer URL",
            ),
            "NVIDIA_OUTPUT_MODEL_DIR": (
                "test-example-model@v1",
                "NVIDIA Output Model Directory",
            ),
            "GUARDRAILS_SERVICE_URL": (
                "http://0.0.0.0:7331",
                "URL for the NeMo Guardrails Service",
            ),
            # NOTE(review): same default host/port as GUARDRAILS_SERVICE_URL
            # above -- confirm this is intended.
            "NVIDIA_EVALUATOR_URL": (
                "http://0.0.0.0:7331",
                "URL for the NeMo Evaluator Service",
            ),
            "INFERENCE_MODEL": (
                "Llama3.1-8B-Instruct",
                "Inference model",
            ),
            "SAFETY_MODEL": (
                "meta/llama-3.1-8b-instruct",
                "Name of the model to use for safety",
            ),
        },
    )
|