Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-08-03 17:29:01 +00:00)

Merge branch 'main' into nvidia-eval-integration
Commit 2117af25a7

27 changed files with 748 additions and 159 deletions
6  .github/workflows/providers-build.yml  (vendored)

@@ -86,15 +86,15 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v4
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

       - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0
         with:
           python-version: '3.10'

       - name: Install uv
-        uses: astral-sh/setup-uv@v5
+        uses: astral-sh/setup-uv@0c5e2b8115b80b4c7c5ddf6ffdd634974642d182 # v5.4.1
         with:
           python-version: "3.10"

29  docs/_static/js/detect_theme.js  (vendored)

@@ -1,9 +1,32 @@
 document.addEventListener("DOMContentLoaded", function () {
   const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches;
   const htmlElement = document.documentElement;
-  if (prefersDark) {
-    htmlElement.setAttribute("data-theme", "dark");
+
+  // Check if theme is saved in localStorage
+  const savedTheme = localStorage.getItem("sphinx-rtd-theme");
+
+  if (savedTheme) {
+    // Use the saved theme preference
+    htmlElement.setAttribute("data-theme", savedTheme);
+    document.body.classList.toggle("dark", savedTheme === "dark");
   } else {
-    htmlElement.setAttribute("data-theme", "light");
+    // Fall back to system preference
+    const theme = prefersDark ? "dark" : "light";
+    htmlElement.setAttribute("data-theme", theme);
+    document.body.classList.toggle("dark", theme === "dark");
+    // Save initial preference
+    localStorage.setItem("sphinx-rtd-theme", theme);
   }
+
+  // Listen for theme changes from the existing toggle
+  const observer = new MutationObserver(function(mutations) {
+    mutations.forEach(function(mutation) {
+      if (mutation.attributeName === "data-theme") {
+        const currentTheme = htmlElement.getAttribute("data-theme");
+        localStorage.setItem("sphinx-rtd-theme", currentTheme);
+      }
+    });
+  });
+
+  observer.observe(htmlElement, { attributes: true });
 });

22  docs/_static/llama-stack-spec.html  (vendored)

@@ -5221,17 +5221,25 @@
             "default": 10
           },
           "model": {
-            "type": "string"
+            "type": "string",
+            "description": "The model identifier to use for the agent"
           },
           "instructions": {
-            "type": "string"
+            "type": "string",
+            "description": "The system instructions for the agent"
+          },
+          "name": {
+            "type": "string",
+            "description": "Optional name for the agent, used in telemetry and identification"
           },
           "enable_session_persistence": {
             "type": "boolean",
-            "default": false
+            "default": false,
+            "description": "Optional flag indicating whether session data has to be persisted"
           },
           "response_format": {
-            "$ref": "#/components/schemas/ResponseFormat"
+            "$ref": "#/components/schemas/ResponseFormat",
+            "description": "Optional response format configuration"
           }
         },
         "additionalProperties": false,
@@ -5239,7 +5247,8 @@
           "model",
           "instructions"
         ],
-        "title": "AgentConfig"
+        "title": "AgentConfig",
+        "description": "Configuration for an agent."
       },
       "AgentTool": {
         "oneOf": [
@@ -8891,8 +8900,7 @@
         },
         "additionalProperties": false,
         "required": [
-          "role",
-          "content"
+          "role"
         ],
         "title": "OpenAIAssistantMessageParam",
         "description": "A message containing the model's (assistant) response in an OpenAI-compatible chat completion request."

12  docs/_static/llama-stack-spec.yaml  (vendored)

@@ -3686,18 +3686,29 @@ components:
         default: 10
       model:
         type: string
+        description: >-
+          The model identifier to use for the agent
       instructions:
         type: string
+        description: The system instructions for the agent
+      name:
+        type: string
+        description: >-
+          Optional name for the agent, used in telemetry and identification
       enable_session_persistence:
         type: boolean
         default: false
+        description: >-
+          Optional flag indicating whether session data has to be persisted
       response_format:
         $ref: '#/components/schemas/ResponseFormat'
+        description: Optional response format configuration
     additionalProperties: false
     required:
       - model
       - instructions
     title: AgentConfig
+    description: Configuration for an agent.
   AgentTool:
     oneOf:
       - type: string
@@ -6097,7 +6108,6 @@ components:
     additionalProperties: false
     required:
       - role
-      - content
     title: OpenAIAssistantMessageParam
     description: >-
      A message containing the model's (assistant) response in an OpenAI-compatible

@@ -24,7 +24,7 @@ The key files in the app are `ExampleLlamaStackLocalInference.kt`, `ExampleLlama
 Add the following dependency in your `build.gradle.kts` file:
 ```
 dependencies {
-  implementation("com.llama.llamastack:llama-stack-client-kotlin:0.1.4.2")
+  implementation("com.llama.llamastack:llama-stack-client-kotlin:0.2.2")
 }
 ```
 This will download jar files in your gradle cache in a directory like `~/.gradle/caches/modules-2/files-2.1/com.llama.llamastack/`
@@ -37,11 +37,7 @@ For local inferencing, it is required to include the ExecuTorch library into you
 
 Include the ExecuTorch library by:
 1. Download the `download-prebuilt-et-lib.sh` script file from the [llama-stack-client-kotlin-client-local](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release/llama-stack-client-kotlin-client-local/download-prebuilt-et-lib.sh) directory to your local machine.
-2. Move the script to the top level of your Android app where the app directory resides:
-<p align="center">
-<img src="https://github.com/meta-llama/llama-stack-client-kotlin/blob/latest-release/doc/img/example_android_app_directory.png" style="width:300px">
-</p>
-
+2. Move the script to the top level of your Android app where the `app` directory resides.
 3. Run `sh download-prebuilt-et-lib.sh` to create an `app/libs` directory and download the `executorch.aar` in that path. This generates an ExecuTorch library for the XNNPACK delegate.
 4. Add the `executorch.aar` dependency in your `build.gradle.kts` file:
 ```
@@ -52,6 +48,8 @@ dependencies {
 }
 ```
 
+See other dependencies for the local RAG in Android app [README](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release/examples/android_app#quick-start).
+
 ## Llama Stack APIs in Your Android App
 Breaking down the demo app, this section will show the core pieces that are used to initialize and run inference with Llama Stack using the Kotlin library.
 
@@ -60,7 +58,7 @@ Start a Llama Stack server on localhost. Here is an example of how you can do th
 ```
 conda create -n stack-fireworks python=3.10
 conda activate stack-fireworks
-pip install --no-cache llama-stack==0.1.4
+pip install --no-cache llama-stack==0.2.2
 llama stack build --template fireworks --image-type conda
 export FIREWORKS_API_KEY=<SOME_KEY>
 llama stack run fireworks --port 5050

@@ -1,89 +0,0 @@
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
-# NVIDIA Distribution
-
-The `llamastack/distribution-nvidia` distribution consists of the following provider configurations.
-
-| API | Provider(s) |
-|-----|-------------|
-| agents | `inline::meta-reference` |
-| datasetio | `inline::localfs` |
-| eval | `remote::nvidia` |
-| inference | `remote::nvidia` |
-| post_training | `remote::nvidia` |
-| safety | `remote::nvidia` |
-| scoring | `inline::basic` |
-| telemetry | `inline::meta-reference` |
-| tool_runtime | `inline::rag-runtime` |
-| vector_io | `inline::faiss` |
-
-### Environment Variables
-
-The following environment variables can be configured:
-
-- `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
-- `NVIDIA_USER_ID`: NVIDIA User ID (default: `llama-stack-user`)
-- `NVIDIA_DATASET_NAMESPACE`: NVIDIA Dataset Namespace (default: `default`)
-- `NVIDIA_ACCESS_POLICIES`: NVIDIA Access Policies (default: `{}`)
-- `NVIDIA_PROJECT_ID`: NVIDIA Project ID (default: `test-project`)
-- `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`)
-- `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`)
-- `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
-- `NVIDIA_EVALUATOR_URL`: URL for the NeMo Evaluator Service (default: `http://0.0.0.0:7331`)
-- `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
-- `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)
-
-### Models
-
-The following models are available by default:
-
-- `meta/llama3-8b-instruct (aliases: meta-llama/Llama-3-8B-Instruct)`
-- `meta/llama3-70b-instruct (aliases: meta-llama/Llama-3-70B-Instruct)`
-- `meta/llama-3.1-8b-instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
-- `meta/llama-3.1-70b-instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)`
-- `meta/llama-3.1-405b-instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
-- `meta/llama-3.2-1b-instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
-- `meta/llama-3.2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
-- `meta/llama-3.2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
-- `meta/llama-3.2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
-- `nvidia/llama-3.2-nv-embedqa-1b-v2`
-- `nvidia/nv-embedqa-e5-v5`
-- `nvidia/nv-embedqa-mistral-7b-v2`
-- `snowflake/arctic-embed-l`
-
-### Prerequisite: API Keys
-
-Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/).
-
-## Running Llama Stack with NVIDIA
-
-You can do this via Conda (build code) or Docker which has a pre-built image.
-
-### Via Docker
-
-This method allows you to get started quickly without having to build the distribution code.
-
-```bash
-LLAMA_STACK_PORT=8321
-docker run \
-  -it \
-  --pull always \
-  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run.yaml:/root/my-run.yaml \
-  llamastack/distribution-nvidia \
-  --yaml-config /root/my-run.yaml \
-  --port $LLAMA_STACK_PORT \
-  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
-```
-
-### Via Conda
-
-```bash
-llama stack build --template nvidia --image-type conda
-llama stack run ./run.yaml \
-  --port 8321 \
-  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
-  --env INFERENCE_MODEL=$INFERENCE_MODEL
-```

@@ -46,20 +46,91 @@ The following models are available by default:
 - `meta/llama-3.2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
 - `meta/llama-3.2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
 - `meta/llama-3.2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
+- `meta/llama-3.3-70b-instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
 - `nvidia/llama-3.2-nv-embedqa-1b-v2`
 - `nvidia/nv-embedqa-e5-v5`
 - `nvidia/nv-embedqa-mistral-7b-v2`
 - `snowflake/arctic-embed-l`
 
-### Prerequisite: API Keys
-
+## Prerequisites
+### NVIDIA API Keys
 
-Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/).
+Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.
 
+### Deploy NeMo Microservices Platform
+The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please reference the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/documentation/latest/nemo-microservices/latest-early_access/set-up/deploy-as-platform/index.html) for platform prerequisites and instructions to install and deploy the platform.
+
+## Supported Services
+Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.
+
+### Inference: NVIDIA NIM
+NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
+1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (requires an API key)
+2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.
+
+The deployed platform includes the NIM Proxy microservice, which is the service that provides access to your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment.
+
+### Datasetio API: NeMo Data Store
+The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposes APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use the client to interact with Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.
+
+See the [NVIDIA Datasetio docs](/llama_stack/providers/remote/datasetio/nvidia/README.md) for supported features and example usage.
+
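As a rough illustration of the `HfApi` compatibility described in the added text above (an editor sketch under assumptions, not part of this commit; the endpoint, repo id, and file path are placeholders):

```python
import os

from huggingface_hub import HfApi

# Point the Hugging Face Hub client at the NeMo Data Store endpoint instead of huggingface.co.
# NVIDIA_DATASETS_URL is assumed to be set as described above.
hf_api = HfApi(endpoint=os.environ["NVIDIA_DATASETS_URL"])

# Create a dataset repository in the Data Store and upload a local file into it.
hf_api.create_repo(repo_id="default/my-dataset", repo_type="dataset", exist_ok=True)
hf_api.upload_file(
    path_or_fileobj="./validation.jsonl",  # placeholder local file
    path_in_repo="validation.jsonl",
    repo_id="default/my-dataset",
    repo_type="dataset",
)
```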
+### Eval API: NeMo Evaluator
+The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.
+
+See the [NVIDIA Eval docs](/llama_stack/providers/remote/eval/nvidia/README.md) for supported features and example usage.
+
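To make the Benchmark-to-Evaluation-Config mapping concrete, here is a hedged sketch of registering a benchmark through the Llama Stack client; the server URL and identifiers are placeholders, and the exact client method signature may differ across client versions:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # placeholder server URL

# Registering a benchmark on the Llama Stack side is what creates the
# corresponding Evaluation Config in NeMo Evaluator, as described above.
client.benchmarks.register(
    benchmark_id="my-custom-benchmark",     # placeholder benchmark identifier
    dataset_id="my-dataset",                # placeholder: a previously registered dataset
    scoring_functions=["basic::equality"],  # placeholder scoring function
)
```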
+### Post-Training API: NeMo Customizer
+The NeMo Customizer microservice supports fine-tuning models. You can reference [this list of supported models](/llama_stack/providers/remote/post_training/nvidia/models.py) that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.
+
+See the [NVIDIA Post-Training docs](/llama_stack/providers/remote/post_training/nvidia/README.md) for supported features and example usage.
+
+### Safety API: NeMo Guardrails
+The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.
+
+See the NVIDIA Safety docs for supported features and example usage.
+
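Taken together, each service above is selected purely through an environment variable. A minimal sketch of wiring them up before creating a library client, mirroring the pattern used in the provider READMEs added later in this commit (all URLs are placeholders for a self-hosted deployment):

```python
import os

# Placeholder endpoints for a self-hosted NeMo Microservices deployment.
os.environ["NVIDIA_BASE_URL"] = "http://nim.test"  # NIM Proxy (inference)
os.environ["NVIDIA_DATASETS_URL"] = "http://data-store.test"  # NeMo Data Store
os.environ["NVIDIA_EVALUATOR_URL"] = "http://nemo.test"  # NeMo Evaluator
os.environ["NVIDIA_CUSTOMIZER_URL"] = "http://nemo.test"  # NeMo Customizer
os.environ["GUARDRAILS_SERVICE_URL"] = "http://nemo.test"  # NeMo Guardrails

from llama_stack.distribution.library_client import LlamaStackAsLibraryClient

client = LlamaStackAsLibraryClient("nvidia")
client.initialize()
```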
+## Deploying models
+In order to use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.
+
+Note: For improved inference speeds, we need to use NIM with `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.
+```sh
+# URL to NeMo NIM Proxy service
+export NEMO_URL="http://nemo.test"
+
+curl --location "$NEMO_URL/v1/deployment/model-deployments" \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "name": "llama-3.2-1b-instruct",
+    "namespace": "meta",
+    "config": {
+      "model": "meta/llama-3.2-1b-instruct",
+      "nim_deployment": {
+        "image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
+        "image_tag": "1.8.3",
+        "pvc_size": "25Gi",
+        "gpu": 1,
+        "additional_envs": {
+          "NIM_GUIDED_DECODING_BACKEND": "fast_outlines"
+        }
+      }
+    }
+  }'
+```
+This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/documentation/latest/nemo-microservices/latest-early_access/get-started/tutorials/deploy-nims.html#) for more information on how to deploy a NIM and verify it's available for inference.
+
+You can also remove a deployed NIM to free up GPU resources, if needed.
+```sh
+export NEMO_URL="http://nemo.test"
+
+curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct"
+```
+
 ## Running Llama Stack with NVIDIA
 
-You can do this via Conda (build code) or Docker which has a pre-built image.
+You can do this via Conda or venv (build code), or Docker which has a pre-built image.
 
 ### Via Docker
 
@@ -81,9 +152,27 @@ docker run \
 ### Via Conda
 
 ```bash
+INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
 llama stack build --template nvidia --image-type conda
 llama stack run ./run.yaml \
   --port 8321 \
-  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
   --env INFERENCE_MODEL=$INFERENCE_MODEL
 ```
+
+### Via venv
+
+If you've set up your local development environment, you can also build the image using your local virtual environment.
+
+```bash
+INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
+llama stack build --template nvidia --image-type venv
+llama stack run ./run.yaml \
+  --port 8321 \
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL
+```
+
+### Example Notebooks
+You can reference the Jupyter notebooks in `docs/notebooks/nvidia/` for example usage of these APIs.
+- [Llama_Stack_NVIDIA_E2E_Flow.ipynb](/docs/notebooks/nvidia/Llama_Stack_NVIDIA_E2E_Flow.ipynb) contains an end-to-end workflow for running inference, customizing, and evaluating models using your deployed NeMo Microservices platform.

@@ -41,7 +41,7 @@ The following environment variables can be configured:
 
 ## Setting up vLLM server
 
-In the following sections, we'll use either AMD and NVIDIA GPUs to serve as hardware accelerators for the vLLM
+In the following sections, we'll use AMD, NVIDIA or Intel GPUs to serve as hardware accelerators for the vLLM
 server, which acts as both the LLM inference provider and the safety provider. Note that vLLM also
 [supports many other hardware accelerators](https://docs.vllm.ai/en/latest/getting_started/installation.html) and
 that we only use GPUs here for demonstration purposes.
@@ -162,6 +162,55 @@ docker run \
   --port $SAFETY_PORT
 ```
 
+### Setting up vLLM server on Intel GPU
+
+Refer to [vLLM Documentation for XPU](https://docs.vllm.ai/en/v0.8.2/getting_started/installation/gpu.html?device=xpu) to get a vLLM endpoint. In addition to the vLLM-side setup, which guides you through installing vLLM from source or self-building a vLLM Docker container, Intel provides a prebuilt vLLM container to use on systems with Intel GPUs supported by the PyTorch XPU backend:
+- [intel/vllm](https://hub.docker.com/r/intel/vllm)
+
+Here is a sample script to start a vLLM server locally via Docker using the Intel-provided container:
+
+```bash
+export INFERENCE_PORT=8000
+export INFERENCE_MODEL=meta-llama/Llama-3.2-1B-Instruct
+export ZE_AFFINITY_MASK=0
+
+docker run \
+    --pull always \
+    --device /dev/dri \
+    -v /dev/dri/by-path:/dev/dri/by-path \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
+    --env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \
+    -p $INFERENCE_PORT:$INFERENCE_PORT \
+    --ipc=host \
+    intel/vllm:xpu \
+    --gpu-memory-utilization 0.7 \
+    --model $INFERENCE_MODEL \
+    --port $INFERENCE_PORT
+```
+
+If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
+
+```bash
+export SAFETY_PORT=8081
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+export ZE_AFFINITY_MASK=1
+
+docker run \
+    --pull always \
+    --device /dev/dri \
+    -v /dev/dri/by-path:/dev/dri/by-path \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
+    --env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \
+    -p $SAFETY_PORT:$SAFETY_PORT \
+    --ipc=host \
+    intel/vllm:xpu \
+    --gpu-memory-utilization 0.7 \
+    --model $SAFETY_MODEL \
+    --port $SAFETY_PORT
+```
+
 ## Running Llama Stack
 
 Now you are ready to run Llama Stack with vLLM as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.

@@ -225,8 +225,18 @@ class AgentConfigCommon(BaseModel):
 
 @json_schema_type
 class AgentConfig(AgentConfigCommon):
+    """Configuration for an agent.
+
+    :param model: The model identifier to use for the agent
+    :param instructions: The system instructions for the agent
+    :param name: Optional name for the agent, used in telemetry and identification
+    :param enable_session_persistence: Optional flag indicating whether session data has to be persisted
+    :param response_format: Optional response format configuration
+    """
+
     model: str
     instructions: str
+    name: Optional[str] = None
     enable_session_persistence: Optional[bool] = False
     response_format: Optional[ResponseFormat] = None
 
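A short sketch of how the documented fields, including the new optional `name`, fit together when constructing an `AgentConfig`; the import path and the model identifier are assumptions for illustration:

```python
from llama_stack.apis.agents import AgentConfig

# Only `model` and `instructions` are required; `name` is the new optional field
# that is surfaced in telemetry spans as `agent_name`.
config = AgentConfig(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model identifier
    instructions="You are a helpful assistant",
    name="docs-example-agent",  # optional; used in telemetry and identification
    enable_session_persistence=False,
)
```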
@@ -526,9 +526,9 @@ class OpenAIAssistantMessageParam(BaseModel):
     """
 
     role: Literal["assistant"] = "assistant"
-    content: OpenAIChatCompletionMessageContent
+    content: Optional[OpenAIChatCompletionMessageContent] = None
     name: Optional[str] = None
-    tool_calls: Optional[List[OpenAIChatCompletionToolCall]] = Field(default_factory=list)
+    tool_calls: Optional[List[OpenAIChatCompletionToolCall]] = None
 
 
 @json_schema_type
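To illustrate why `content` becomes optional here (matching the spec change above that drops `content` from the required list), a brief hedged sketch; the import path is an assumption:

```python
from llama_stack.apis.inference import OpenAIAssistantMessageParam

# An assistant message that carries no text content (for example, one that only
# reports tool calls) now validates instead of failing on a missing `content`.
msg = OpenAIAssistantMessageParam(role="assistant")
assert msg.content is None
assert msg.tool_calls is None  # default changed from [] to None in this commit
```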
@@ -235,10 +235,14 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
         )
 
     except (Exception, RuntimeError) as exc:
+        import traceback
+
         cprint(
             f"Error building stack: {exc}",
             color="red",
         )
+        cprint("Stack trace:", color="red")
+        traceback.print_exc()
         sys.exit(1)
     if run_config is None:
         cprint(
@@ -350,7 +354,7 @@ def _run_stack_build_command_from_build_config(
         build_config,
         build_file_path,
         image_name,
-        template_or_config=template_name or config_path,
+        template_or_config=template_name or config_path or str(build_file_path),
     )
     if return_code != 0:
         raise RuntimeError(f"Failed to build image {image_name}")

@@ -37,6 +37,17 @@ def tool_chat_page():
         label="Available ToolGroups", options=builtin_tools_list, selection_mode="multi", on_change=reset_agent
     )
 
+    if "builtin::rag" in toolgroup_selection:
+        vector_dbs = llama_stack_api.client.vector_dbs.list() or []
+        if not vector_dbs:
+            st.info("No vector databases available for selection.")
+        vector_dbs = [vector_db.identifier for vector_db in vector_dbs]
+        selected_vector_dbs = st.multiselect(
+            label="Select Document Collections to use in RAG queries",
+            options=vector_dbs,
+            on_change=reset_agent,
+        )
+
     st.subheader("MCP Servers")
     mcp_selection = st.pills(
         label="Available MCP Servers", options=mcp_tools_list, selection_mode="multi", on_change=reset_agent
@@ -56,6 +67,27 @@ def tool_chat_page():
     st.subheader(f"Active Tools: 🛠 {len(active_tool_list)}")
     st.json(active_tool_list)
 
+    st.subheader("Chat Configurations")
+    max_tokens = st.slider(
+        "Max Tokens",
+        min_value=0,
+        max_value=4096,
+        value=512,
+        step=1,
+        help="The maximum number of tokens to generate",
+        on_change=reset_agent,
+    )
+
+    for i, tool_name in enumerate(toolgroup_selection):
+        if tool_name == "builtin::rag":
+            tool_dict = dict(
+                name="builtin::rag",
+                args={
+                    "vector_db_ids": list(selected_vector_dbs),
+                },
+            )
+            toolgroup_selection[i] = tool_dict
+
     @st.cache_resource
     def create_agent():
         return Agent(
@@ -63,9 +95,7 @@ def tool_chat_page():
             model=model,
             instructions="You are a helpful assistant. When you use a tool always respond with a summary of the result.",
             tools=toolgroup_selection,
-            sampling_params={
-                "strategy": {"type": "greedy"},
-            },
+            sampling_params={"strategy": {"type": "greedy"}, "max_tokens": max_tokens},
         )
 
     agent = create_agent()

@@ -178,6 +178,8 @@ class ChatAgent(ShieldRunnerMixin):
             span.set_attribute("request", request.model_dump_json())
             turn_id = str(uuid.uuid4())
             span.set_attribute("turn_id", turn_id)
+            if self.agent_config.name:
+                span.set_attribute("agent_name", self.agent_config.name)
 
             await self._initialize_tools(request.toolgroups)
             async for chunk in self._run_turn(request, turn_id):
@@ -190,6 +192,8 @@ class ChatAgent(ShieldRunnerMixin):
             span.set_attribute("session_id", request.session_id)
             span.set_attribute("request", request.model_dump_json())
             span.set_attribute("turn_id", request.turn_id)
+            if self.agent_config.name:
+                span.set_attribute("agent_name", self.agent_config.name)
 
             await self._initialize_tools()
             async for chunk in self._run_turn(request):
@@ -498,6 +502,8 @@ class ChatAgent(ShieldRunnerMixin):
         stop_reason = None
 
         async with tracing.span("inference") as span:
+            if self.agent_config.name:
+                span.set_attribute("agent_name", self.agent_config.name)
             async for chunk in await self.inference_api.chat_completion(
                 self.agent_config.model,
                 input_messages,

85  llama_stack/providers/remote/inference/nvidia/NVIDIA.md  (new file)

@@ -0,0 +1,85 @@
+# NVIDIA Inference Provider for LlamaStack
+
+This provider enables running inference using NVIDIA NIM.
+
+## Features
+- Endpoints for completions, chat completions, and embeddings for registered models
+
+## Getting Started
+
+### Prerequisites
+
+- LlamaStack with NVIDIA configuration
+- Access to an NVIDIA NIM deployment
+- A NIM for the model to use for inference is deployed
+
+### Setup
+
+Build the NVIDIA environment:
+
+```bash
+llama stack build --template nvidia --image-type conda
+```
+
+### Basic Usage using the LlamaStack Python Client
+
+#### Initialize the client
+
+```python
+import os
+
+os.environ["NVIDIA_API_KEY"] = (
+    ""  # Required if using hosted NIM endpoint. If self-hosted, not required.
+)
+os.environ["NVIDIA_BASE_URL"] = "http://nim.test"  # NIM URL
+
+from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
+
+client = LlamaStackAsLibraryClient("nvidia")
+client.initialize()
+```
+
+### Create Completion
+
+```python
+response = client.completion(
+    model_id="meta-llama/Llama-3.1-8b-Instruct",
+    content="Complete the sentence using one word: Roses are red, violets are :",
+    stream=False,
+    sampling_params={
+        "max_tokens": 50,
+    },
+)
+print(f"Response: {response.content}")
+```
+
+### Create Chat Completion
+
+```python
+response = client.chat_completion(
+    model_id="meta-llama/Llama-3.1-8b-Instruct",
+    messages=[
+        {
+            "role": "system",
+            "content": "You must respond to each message with only one word",
+        },
+        {
+            "role": "user",
+            "content": "Complete the sentence using one word: Roses are red, violets are:",
+        },
+    ],
+    stream=False,
+    sampling_params={
+        "max_tokens": 50,
+    },
+)
+print(f"Response: {response.completion_message.content}")
+```
+
+### Create Embeddings
+```python
+response = client.embeddings(
+    model_id="meta-llama/Llama-3.1-8b-Instruct", contents=["foo", "bar", "baz"]
+)
+print(f"Embeddings: {response.embeddings}")
+```

@@ -48,6 +48,10 @@ MODEL_ENTRIES = [
         "meta/llama-3.2-90b-vision-instruct",
         CoreModelId.llama3_2_90b_vision_instruct.value,
     ),
+    build_hf_repo_model_entry(
+        "meta/llama-3.3-70b-instruct",
+        CoreModelId.llama3_3_70b_instruct.value,
+    ),
     # NeMo Retriever Text Embedding models -
     #
     # https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html

@@ -374,7 +374,8 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
             options["max_tokens"] = self.config.max_tokens
 
         input_dict: dict[str, Any] = {}
-        if isinstance(request, ChatCompletionRequest) and request.tools is not None:
+        # Only include the 'tools' param if there is any. It can break things if an empty list is sent to the vLLM.
+        if isinstance(request, ChatCompletionRequest) and request.tools:
             input_dict = {"tools": _convert_to_vllm_tools_in_request(request.tools)}
 
         if isinstance(request, ChatCompletionRequest):

@@ -16,7 +16,11 @@ _MODEL_ENTRIES = [
     build_hf_repo_model_entry(
         "meta/llama-3.1-8b-instruct",
         CoreModelId.llama3_1_8b_instruct.value,
-    )
+    ),
+    build_hf_repo_model_entry(
+        "meta/llama-3.2-1b-instruct",
+        CoreModelId.llama3_2_1b_instruct.value,
+    ),
 ]
 
@@ -27,11 +27,12 @@ from .models import _MODEL_ENTRIES
 
 # Map API status to JobStatus enum
 STATUS_MAPPING = {
-    "running": "in_progress",
-    "completed": "completed",
-    "failed": "failed",
-    "cancelled": "cancelled",
-    "pending": "scheduled",
+    "running": JobStatus.in_progress.value,
+    "completed": JobStatus.completed.value,
+    "failed": JobStatus.failed.value,
+    "cancelled": JobStatus.cancelled.value,
+    "pending": JobStatus.scheduled.value,
+    "unknown": JobStatus.scheduled.value,
 }
 
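For context, a hedged sketch (not the provider's actual code) of how a mapping like this is typically consumed when translating a remote job status into the `JobStatus` enum; the enum below is a stand-in so the snippet is self-contained:

```python
from enum import Enum


class JobStatus(Enum):
    """Stand-in for Llama Stack's JobStatus enum, shown only to keep this sketch self-contained."""

    completed = "completed"
    in_progress = "in_progress"
    failed = "failed"
    scheduled = "scheduled"
    cancelled = "cancelled"


STATUS_MAPPING = {
    "running": JobStatus.in_progress.value,
    "completed": JobStatus.completed.value,
    "failed": JobStatus.failed.value,
    "cancelled": JobStatus.cancelled.value,
    "pending": JobStatus.scheduled.value,
    "unknown": JobStatus.scheduled.value,
}

# Translate an API-reported status string into the enum; unrecognized values fall
# back to "scheduled", matching the new "unknown" entry added in this commit.
api_status = "unknown"  # hypothetical status string returned by the remote service
job_status = JobStatus(STATUS_MAPPING.get(api_status, JobStatus.scheduled.value))
print(job_status)  # JobStatus.scheduled
```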
77  llama_stack/providers/remote/safety/nvidia/README.md  (new file)

@@ -0,0 +1,77 @@
+# NVIDIA Safety Provider for LlamaStack
+
+This provider enables safety checks and guardrails for LLM interactions using NVIDIA's NeMo Guardrails service.
+
+## Features
+
+- Run safety checks for messages
+
+## Getting Started
+
+### Prerequisites
+
+- LlamaStack with NVIDIA configuration
+- Access to the NVIDIA NeMo Guardrails service
+- A NIM for the model to use for safety checks is deployed
+
+### Setup
+
+Build the NVIDIA environment:
+
+```bash
+llama stack build --template nvidia --image-type conda
+```
+
+### Basic Usage using the LlamaStack Python Client
+
+#### Initialize the client
+
+```python
+import os
+
+os.environ["NVIDIA_API_KEY"] = "your-api-key"
+os.environ["NVIDIA_GUARDRAILS_URL"] = "http://guardrails.test"
+
+from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
+
+client = LlamaStackAsLibraryClient("nvidia")
+client.initialize()
+```
+
+#### Create a safety shield
+
+```python
+from llama_stack.apis.safety import Shield
+from llama_stack.apis.inference import Message
+
+# Create a safety shield
+shield = Shield(
+    shield_id="your-shield-id",
+    provider_resource_id="safety-model-id",  # The model to use for safety checks
+    description="Safety checks for content moderation",
+)
+
+# Register the shield
+await client.safety.register_shield(shield)
+```
+
+#### Run safety checks
+
+```python
+# Messages to check
+messages = [Message(role="user", content="Your message to check")]
+
+# Run safety check
+response = await client.safety.run_shield(
+    shield_id="your-shield-id",
+    messages=messages,
+)
+
+# Check for violations
+if response.violation:
+    print(f"Safety violation detected: {response.violation.user_message}")
+    print(f"Violation level: {response.violation.violation_level}")
+    print(f"Metadata: {response.violation.metadata}")
+else:
+    print("No safety violations detected")
+```

@@ -25,14 +25,84 @@ The following models are available by default:
@@ -54,9 +124,27 @@ docker run \

(These two hunks apply the same documentation changes shown above to the NVIDIA distribution's doc template — the copy containing the `{% endif %}` Jinja block — at different line offsets: the Prerequisites/NVIDIA API Keys restructure, the "Deploy NeMo Microservices Platform", "Supported Services", and "Deploying models" sections, the Conda command fixes, and the new "Via venv" and "Example Notebooks" sections.)

@@ -170,6 +170,16 @@ models:
   provider_id: nvidia
   provider_model_id: meta/llama-3.2-90b-vision-instruct
   model_type: llm
+- metadata: {}
+  model_id: meta/llama-3.3-70b-instruct
+  provider_id: nvidia
+  provider_model_id: meta/llama-3.3-70b-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.3-70B-Instruct
+  provider_id: nvidia
+  provider_model_id: meta/llama-3.3-70b-instruct
+  model_type: llm
 - metadata:
     embedding_dimension: 2048
     context_length: 8192

@@ -28,7 +28,7 @@ The following environment variables can be configured:
@@ -149,6 +149,55 @@ docker run \

(These two hunks apply the same vLLM documentation changes shown above to a second copy of that page at different line offsets: the "AMD, NVIDIA or Intel GPUs" wording and the new "Setting up vLLM server on Intel GPU" section with its inference and safety Docker scripts.)

@@ -115,6 +115,70 @@ def test_agent_simple(llama_stack_client_with_mocked_inference, agent_config):
     assert "I can't" in logs_str
 
 
+def test_agent_name(llama_stack_client, text_model_id):
+    agent_name = f"test-agent-{uuid4()}"
+
+    try:
+        agent = Agent(
+            llama_stack_client,
+            model=text_model_id,
+            instructions="You are a helpful assistant",
+            name=agent_name,
+        )
+    except TypeError:
+        agent = Agent(
+            llama_stack_client,
+            model=text_model_id,
+            instructions="You are a helpful assistant",
+        )
+        return
+
+    session_id = agent.create_session(f"test-session-{uuid4()}")
+
+    agent.create_turn(
+        messages=[
+            {
+                "role": "user",
+                "content": "Give me a sentence that contains the word: hello",
+            }
+        ],
+        session_id=session_id,
+        stream=False,
+    )
+
+    all_spans = []
+    for span in llama_stack_client.telemetry.query_spans(
+        attribute_filters=[
+            {"key": "session_id", "op": "eq", "value": session_id},
+        ],
+        attributes_to_return=["input", "output", "agent_name", "agent_id", "session_id"],
+    ):
+        all_spans.append(span.attributes)
+
+    agent_name_spans = []
+    for span in llama_stack_client.telemetry.query_spans(
+        attribute_filters=[],
+        attributes_to_return=["agent_name"],
+    ):
+        if "agent_name" in span.attributes:
+            agent_name_spans.append(span.attributes)
+
+    agent_logs = []
+    for span in llama_stack_client.telemetry.query_spans(
+        attribute_filters=[
+            {"key": "agent_name", "op": "eq", "value": agent_name},
+        ],
+        attributes_to_return=["input", "output", "agent_name"],
+    ):
+        if "output" in span.attributes and span.attributes["output"] != "no shields":
+            agent_logs.append(span.attributes)
+
+    assert len(agent_logs) == 1
+    assert agent_logs[0]["agent_name"] == agent_name
+    assert "Give me a sentence that contains the word: hello" in agent_logs[0]["input"]
+    assert "hello" in agent_logs[0]["output"].lower()
+
+
 def test_tool_config(llama_stack_client_with_mocked_inference, agent_config):
     common_params = dict(
         model="meta-llama/Llama-3.2-3B-Instruct",

@@ -31,6 +31,7 @@ def data_url_from_file(file_path: str) -> str:
    return data_url


@pytest.mark.skip(reason="flaky. Couldn't find 'llamastack/simpleqa' on the Hugging Face Hub")
@pytest.mark.parametrize(
    "purpose, source, provider_id, limit",
    [
38 tests/unit/distribution/test_build_path.py Normal file
@@ -0,0 +1,38 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from pathlib import Path

from llama_stack.cli.stack._build import (
    _run_stack_build_command_from_build_config,
)
from llama_stack.distribution.datatypes import BuildConfig, DistributionSpec
from llama_stack.distribution.utils.image_types import LlamaStackImageType


def test_container_build_passes_path(monkeypatch, tmp_path):
    called_with = {}

    def spy_build_image(cfg, build_file_path, image_name, template_or_config):
        called_with["path"] = template_or_config
        return 0

    monkeypatch.setattr(
        "llama_stack.cli.stack._build.build_image",
        spy_build_image,
        raising=True,
    )

    cfg = BuildConfig(
        image_type=LlamaStackImageType.CONTAINER.value,
        distribution_spec=DistributionSpec(providers={}, description=""),
    )

    _run_stack_build_command_from_build_config(cfg, image_name="dummy")

    assert "path" in called_with
    assert isinstance(called_with["path"], str)
    assert Path(called_with["path"]).exists()
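
To exercise just this new test locally, something like the following should work (an assumption: a development checkout with the unit-test dependencies installed via `uv`; CI may invoke pytest differently):

```bash
# Run only the new build-path unit test with verbose output.
uv run pytest tests/unit/distribution/test_build_path.py -v
```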
@@ -26,7 +26,12 @@ from openai.types.chat.chat_completion_chunk import (
)
from openai.types.model import Model as OpenAIModel

-from llama_stack.apis.inference import ToolChoice, ToolConfig
+from llama_stack.apis.inference import (
+    ChatCompletionRequest,
+    ToolChoice,
+    ToolConfig,
+    UserMessage,
+)
from llama_stack.apis.models import Model
from llama_stack.models.llama.datatypes import StopReason
from llama_stack.providers.remote.inference.vllm.config import VLLMInferenceAdapterConfig

@@ -232,3 +237,14 @@ def test_chat_completion_doesnt_block_event_loop(caplog):
    # above.
    asyncio_warnings = [record.message for record in caplog.records if record.name == "asyncio"]
    assert not asyncio_warnings


@pytest.mark.asyncio
async def test_get_params_empty_tools(vllm_inference_adapter):
    request = ChatCompletionRequest(
        tools=[],
        model="test_model",
        messages=[UserMessage(content="test")],
    )
    params = await vllm_inference_adapter._get_params(request)
    assert "tools" not in params
@@ -200,35 +200,48 @@ class TestNvidiaPostTraining(unittest.TestCase):
        )

    def test_get_training_job_status(self):
-        self.mock_make_request.return_value = {
-            "created_at": "2024-12-09T04:06:28.580220",
-            "updated_at": "2024-12-09T04:21:19.852832",
-            "status": "completed",
-            "steps_completed": 1210,
-            "epochs_completed": 2,
-            "percentage_done": 100.0,
-            "best_epoch": 2,
-            "train_loss": 1.718016266822815,
-            "val_loss": 1.8661999702453613,
-        }
-
-        job_id = "cust-JGTaMbJMdqjJU8WbQdN9Q2"
-
-        status = self.run_async(self.adapter.get_training_job_status(job_uuid=job_id))
-
-        assert isinstance(status, NvidiaPostTrainingJobStatusResponse)
-        assert status.status.value == "completed"
-        assert status.steps_completed == 1210
-        assert status.epochs_completed == 2
-        assert status.percentage_done == 100.0
-        assert status.best_epoch == 2
-        assert status.train_loss == 1.718016266822815
-        assert status.val_loss == 1.8661999702453613
-
-        self.mock_make_request.assert_called_once()
-        self._assert_request(
-            self.mock_make_request, "GET", f"/v1/customization/jobs/{job_id}/status", expected_params={"job_id": job_id}
-        )
+        customizer_status_to_job_status = [
+            ("running", "in_progress"),
+            ("completed", "completed"),
+            ("failed", "failed"),
+            ("cancelled", "cancelled"),
+            ("pending", "scheduled"),
+            ("unknown", "scheduled"),
+        ]
+
+        for customizer_status, expected_status in customizer_status_to_job_status:
+            with self.subTest(customizer_status=customizer_status, expected_status=expected_status):
+                self.mock_make_request.return_value = {
+                    "created_at": "2024-12-09T04:06:28.580220",
+                    "updated_at": "2024-12-09T04:21:19.852832",
+                    "status": customizer_status,
+                    "steps_completed": 1210,
+                    "epochs_completed": 2,
+                    "percentage_done": 100.0,
+                    "best_epoch": 2,
+                    "train_loss": 1.718016266822815,
+                    "val_loss": 1.8661999702453613,
+                }
+
+                job_id = "cust-JGTaMbJMdqjJU8WbQdN9Q2"
+
+                status = self.run_async(self.adapter.get_training_job_status(job_uuid=job_id))
+
+                assert isinstance(status, NvidiaPostTrainingJobStatusResponse)
+                assert status.status.value == expected_status
+                assert status.steps_completed == 1210
+                assert status.epochs_completed == 2
+                assert status.percentage_done == 100.0
+                assert status.best_epoch == 2
+                assert status.train_loss == 1.718016266822815
+                assert status.val_loss == 1.8661999702453613
+
+                self._assert_request(
+                    self.mock_make_request,
+                    "GET",
+                    f"/v1/customization/jobs/{job_id}/status",
+                    expected_params={"job_id": job_id},
+                )

    def test_get_training_jobs(self):
        job_id = "cust-JGTaMbJMdqjJU8WbQdN9Q2"