mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-06-27 18:50:41 +00:00
# What does this PR do? adds nvidia template for creating a distribution using inference adapter for NVIDIA NIMs. ## Test Plan Please describe: Build llama stack distribution for nvidia using the template, docker and conda. ```bash (.venv) local-cdgamarose@a4u8g-0006:~/llama-stack$ llama-stack-client configure --endpoint http://localhost:5000 Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:5000 (.venv) local-cdgamarose@a4u8g-0006:~/llama-stack$ llama-stack-client models list ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓ ┃ identifier ┃ provider_id ┃ provider_resource_id ┃ metadata ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩ │ Llama3.1-8B-Instruct │ nvidia │ meta/llama-3.1-8b-instruct │ {} │ │ meta-llama/Llama-3.2-3B-Instruct │ nvidia │ meta/llama-3.2-3b-instruct │ {} │ └──────────────────────────────────┴─────────────┴────────────────────────────┴──────────┘ (.venv) local-cdgamarose@a4u8g-0006:~/llama-stack$ llama-stack-client inference chat-completion --message "hello, write me a 2 sentence poem" ChatCompletionResponse( completion_message=CompletionMessage( content='Here is a 2 sentence poem:\n\nThe sun sets slow and paints the sky, \nA gentle hue of pink that makes me sigh.', role='assistant', stop_reason='end_of_turn', tool_calls=[] ), logprobs=None ) ``` ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [x] Ran pre-commit to handle lint / formatting issues. - [x] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [x] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. --------- Co-authored-by: Matthew Farrellee <matt@cs.wisc.edu>
57 lines
2 KiB
Python
57 lines
2 KiB
Python
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
# All rights reserved.
|
|
#
|
|
# This source code is licensed under the terms described in the LICENSE file in
|
|
# the root directory of this source tree.
|
|
|
|
import os
|
|
from typing import Any, Dict, Optional
|
|
|
|
from llama_models.schema_utils import json_schema_type
|
|
from pydantic import BaseModel, Field, SecretStr
|
|
|
|
|
|
@json_schema_type
|
|
class NVIDIAConfig(BaseModel):
|
|
"""
|
|
Configuration for the NVIDIA NIM inference endpoint.
|
|
|
|
Attributes:
|
|
url (str): A base url for accessing the NVIDIA NIM, e.g. http://localhost:8000
|
|
api_key (str): The access key for the hosted NIM endpoints
|
|
|
|
There are two ways to access NVIDIA NIMs -
|
|
0. Hosted: Preview APIs hosted at https://integrate.api.nvidia.com
|
|
1. Self-hosted: You can run NVIDIA NIMs on your own infrastructure
|
|
|
|
By default the configuration is set to use the hosted APIs. This requires
|
|
an API key which can be obtained from https://ngc.nvidia.com/.
|
|
|
|
By default the configuration will attempt to read the NVIDIA_API_KEY environment
|
|
variable to set the api_key. Please do not put your API key in code.
|
|
|
|
If you are using a self-hosted NVIDIA NIM, you can set the url to the
|
|
URL of your running NVIDIA NIM and do not need to set the api_key.
|
|
"""
|
|
|
|
url: str = Field(
|
|
default_factory=lambda: os.getenv(
|
|
"NVIDIA_BASE_URL", "https://integrate.api.nvidia.com"
|
|
),
|
|
description="A base url for accessing the NVIDIA NIM",
|
|
)
|
|
api_key: Optional[SecretStr] = Field(
|
|
default_factory=lambda: os.getenv("NVIDIA_API_KEY"),
|
|
description="The NVIDIA API key, only needed of using the hosted service",
|
|
)
|
|
timeout: int = Field(
|
|
default=60,
|
|
description="Timeout for the HTTP requests",
|
|
)
|
|
|
|
@classmethod
|
|
def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
|
|
return {
|
|
"url": "https://integrate.api.nvidia.com",
|
|
"api_key": "${env.NVIDIA_API_KEY}",
|
|
}
|