mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-08-03 09:21:45 +00:00
Merge branch 'main' into nvidia-eval-integration
This commit is contained in:
commit
2117af25a7
27 changed files with 748 additions and 159 deletions
6
.github/workflows/providers-build.yml
vendored
6
.github/workflows/providers-build.yml
vendored
|
@ -86,15 +86,15 @@ jobs:
|
|||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0
|
||||
with:
|
||||
python-version: '3.10'
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v5
|
||||
uses: astral-sh/setup-uv@0c5e2b8115b80b4c7c5ddf6ffdd634974642d182 # v5.4.1
|
||||
with:
|
||||
python-version: "3.10"
|
||||
|
||||
|
|
29
docs/_static/js/detect_theme.js
vendored
29
docs/_static/js/detect_theme.js
vendored
|
@ -1,9 +1,32 @@
|
|||
document.addEventListener("DOMContentLoaded", function () {
|
||||
const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches;
|
||||
const htmlElement = document.documentElement;
|
||||
if (prefersDark) {
|
||||
htmlElement.setAttribute("data-theme", "dark");
|
||||
|
||||
// Check if theme is saved in localStorage
|
||||
const savedTheme = localStorage.getItem("sphinx-rtd-theme");
|
||||
|
||||
if (savedTheme) {
|
||||
// Use the saved theme preference
|
||||
htmlElement.setAttribute("data-theme", savedTheme);
|
||||
document.body.classList.toggle("dark", savedTheme === "dark");
|
||||
} else {
|
||||
htmlElement.setAttribute("data-theme", "light");
|
||||
// Fall back to system preference
|
||||
const theme = prefersDark ? "dark" : "light";
|
||||
htmlElement.setAttribute("data-theme", theme);
|
||||
document.body.classList.toggle("dark", theme === "dark");
|
||||
// Save initial preference
|
||||
localStorage.setItem("sphinx-rtd-theme", theme);
|
||||
}
|
||||
|
||||
// Listen for theme changes from the existing toggle
|
||||
const observer = new MutationObserver(function(mutations) {
|
||||
mutations.forEach(function(mutation) {
|
||||
if (mutation.attributeName === "data-theme") {
|
||||
const currentTheme = htmlElement.getAttribute("data-theme");
|
||||
localStorage.setItem("sphinx-rtd-theme", currentTheme);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
observer.observe(htmlElement, { attributes: true });
|
||||
});
|
||||
|
|
22
docs/_static/llama-stack-spec.html
vendored
22
docs/_static/llama-stack-spec.html
vendored
|
@ -5221,17 +5221,25 @@
|
|||
"default": 10
|
||||
},
|
||||
"model": {
|
||||
"type": "string"
|
||||
"type": "string",
|
||||
"description": "The model identifier to use for the agent"
|
||||
},
|
||||
"instructions": {
|
||||
"type": "string"
|
||||
"type": "string",
|
||||
"description": "The system instructions for the agent"
|
||||
},
|
||||
"name": {
|
||||
"type": "string",
|
||||
"description": "Optional name for the agent, used in telemetry and identification"
|
||||
},
|
||||
"enable_session_persistence": {
|
||||
"type": "boolean",
|
||||
"default": false
|
||||
"default": false,
|
||||
"description": "Optional flag indicating whether session data has to be persisted"
|
||||
},
|
||||
"response_format": {
|
||||
"$ref": "#/components/schemas/ResponseFormat"
|
||||
"$ref": "#/components/schemas/ResponseFormat",
|
||||
"description": "Optional response format configuration"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
|
@ -5239,7 +5247,8 @@
|
|||
"model",
|
||||
"instructions"
|
||||
],
|
||||
"title": "AgentConfig"
|
||||
"title": "AgentConfig",
|
||||
"description": "Configuration for an agent."
|
||||
},
|
||||
"AgentTool": {
|
||||
"oneOf": [
|
||||
|
@ -8891,8 +8900,7 @@
|
|||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"role",
|
||||
"content"
|
||||
"role"
|
||||
],
|
||||
"title": "OpenAIAssistantMessageParam",
|
||||
"description": "A message containing the model's (assistant) response in an OpenAI-compatible chat completion request."
|
||||
|
|
12
docs/_static/llama-stack-spec.yaml
vendored
12
docs/_static/llama-stack-spec.yaml
vendored
|
@ -3686,18 +3686,29 @@ components:
|
|||
default: 10
|
||||
model:
|
||||
type: string
|
||||
description: >-
|
||||
The model identifier to use for the agent
|
||||
instructions:
|
||||
type: string
|
||||
description: The system instructions for the agent
|
||||
name:
|
||||
type: string
|
||||
description: >-
|
||||
Optional name for the agent, used in telemetry and identification
|
||||
enable_session_persistence:
|
||||
type: boolean
|
||||
default: false
|
||||
description: >-
|
||||
Optional flag indicating whether session data has to be persisted
|
||||
response_format:
|
||||
$ref: '#/components/schemas/ResponseFormat'
|
||||
description: Optional response format configuration
|
||||
additionalProperties: false
|
||||
required:
|
||||
- model
|
||||
- instructions
|
||||
title: AgentConfig
|
||||
description: Configuration for an agent.
|
||||
AgentTool:
|
||||
oneOf:
|
||||
- type: string
|
||||
|
@ -6097,7 +6108,6 @@ components:
|
|||
additionalProperties: false
|
||||
required:
|
||||
- role
|
||||
- content
|
||||
title: OpenAIAssistantMessageParam
|
||||
description: >-
|
||||
A message containing the model's (assistant) response in an OpenAI-compatible
|
||||
|
|
|
@ -24,7 +24,7 @@ The key files in the app are `ExampleLlamaStackLocalInference.kt`, `ExampleLlama
|
|||
Add the following dependency in your `build.gradle.kts` file:
|
||||
```
|
||||
dependencies {
|
||||
implementation("com.llama.llamastack:llama-stack-client-kotlin:0.1.4.2")
|
||||
implementation("com.llama.llamastack:llama-stack-client-kotlin:0.2.2")
|
||||
}
|
||||
```
|
||||
This will download jar files in your gradle cache in a directory like `~/.gradle/caches/modules-2/files-2.1/com.llama.llamastack/`
|
||||
|
@ -37,11 +37,7 @@ For local inferencing, it is required to include the ExecuTorch library into you
|
|||
|
||||
Include the ExecuTorch library by:
|
||||
1. Download the `download-prebuilt-et-lib.sh` script file from the [llama-stack-client-kotlin-client-local](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release/llama-stack-client-kotlin-client-local/download-prebuilt-et-lib.sh) directory to your local machine.
|
||||
2. Move the script to the top level of your Android app where the app directory resides:
|
||||
<p align="center">
|
||||
<img src="https://github.com/meta-llama/llama-stack-client-kotlin/blob/latest-release/doc/img/example_android_app_directory.png" style="width:300px">
|
||||
</p>
|
||||
|
||||
2. Move the script to the top level of your Android app where the `app` directory resides.
|
||||
3. Run `sh download-prebuilt-et-lib.sh` to create an `app/libs` directory and download the `executorch.aar` in that path. This generates an ExecuTorch library for the XNNPACK delegate.
|
||||
4. Add the `executorch.aar` dependency in your `build.gradle.kts` file:
|
||||
```
|
||||
|
@ -52,6 +48,8 @@ dependencies {
|
|||
}
|
||||
```
|
||||
|
||||
See other dependencies for the local RAG in Android app [README](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release/examples/android_app#quick-start).
|
||||
|
||||
## Llama Stack APIs in Your Android App
|
||||
Breaking down the demo app, this section will show the core pieces that are used to initialize and run inference with Llama Stack using the Kotlin library.
|
||||
|
||||
|
@ -60,7 +58,7 @@ Start a Llama Stack server on localhost. Here is an example of how you can do th
|
|||
```
|
||||
conda create -n stack-fireworks python=3.10
|
||||
conda activate stack-fireworks
|
||||
pip install --no-cache llama-stack==0.1.4
|
||||
pip install --no-cache llama-stack==0.2.2
|
||||
llama stack build --template fireworks --image-type conda
|
||||
export FIREWORKS_API_KEY=<SOME_KEY>
|
||||
llama stack run fireworks --port 5050
|
||||
|
|
|
@ -1,89 +0,0 @@
|
|||
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
|
||||
# NVIDIA Distribution
|
||||
|
||||
The `llamastack/distribution-nvidia` distribution consists of the following provider configurations.
|
||||
|
||||
| API | Provider(s) |
|
||||
|-----|-------------|
|
||||
| agents | `inline::meta-reference` |
|
||||
| datasetio | `inline::localfs` |
|
||||
| eval | `remote::nvidia` |
|
||||
| inference | `remote::nvidia` |
|
||||
| post_training | `remote::nvidia` |
|
||||
| safety | `remote::nvidia` |
|
||||
| scoring | `inline::basic` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `inline::rag-runtime` |
|
||||
| vector_io | `inline::faiss` |
|
||||
|
||||
|
||||
### Environment Variables
|
||||
|
||||
The following environment variables can be configured:
|
||||
|
||||
- `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
|
||||
- `NVIDIA_USER_ID`: NVIDIA User ID (default: `llama-stack-user`)
|
||||
- `NVIDIA_DATASET_NAMESPACE`: NVIDIA Dataset Namespace (default: `default`)
|
||||
- `NVIDIA_ACCESS_POLICIES`: NVIDIA Access Policies (default: `{}`)
|
||||
- `NVIDIA_PROJECT_ID`: NVIDIA Project ID (default: `test-project`)
|
||||
- `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`)
|
||||
- `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`)
|
||||
- `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
|
||||
- `NVIDIA_EVALUATOR_URL`: URL for the NeMo Evaluator Service (default: `http://0.0.0.0:7331`)
|
||||
- `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
|
||||
- `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)
|
||||
|
||||
### Models
|
||||
|
||||
The following models are available by default:
|
||||
|
||||
- `meta/llama3-8b-instruct (aliases: meta-llama/Llama-3-8B-Instruct)`
|
||||
- `meta/llama3-70b-instruct (aliases: meta-llama/Llama-3-70B-Instruct)`
|
||||
- `meta/llama-3.1-8b-instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
|
||||
- `meta/llama-3.1-70b-instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)`
|
||||
- `meta/llama-3.1-405b-instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
|
||||
- `meta/llama-3.2-1b-instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
|
||||
- `meta/llama-3.2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
|
||||
- `meta/llama-3.2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
|
||||
- `meta/llama-3.2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
|
||||
- `nvidia/llama-3.2-nv-embedqa-1b-v2 `
|
||||
- `nvidia/nv-embedqa-e5-v5 `
|
||||
- `nvidia/nv-embedqa-mistral-7b-v2 `
|
||||
- `snowflake/arctic-embed-l `
|
||||
|
||||
|
||||
### Prerequisite: API Keys
|
||||
|
||||
Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/).
|
||||
|
||||
|
||||
## Running Llama Stack with NVIDIA
|
||||
|
||||
You can do this via Conda (build code) or Docker which has a pre-built image.
|
||||
|
||||
### Via Docker
|
||||
|
||||
This method allows you to get started quickly without having to build the distribution code.
|
||||
|
||||
```bash
|
||||
LLAMA_STACK_PORT=8321
|
||||
docker run \
|
||||
-it \
|
||||
--pull always \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ./run.yaml:/root/my-run.yaml \
|
||||
llamastack/distribution-nvidia \
|
||||
--yaml-config /root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env NVIDIA_API_KEY=$NVIDIA_API_KEY
|
||||
```
|
||||
|
||||
### Via Conda
|
||||
|
||||
```bash
|
||||
llama stack build --template nvidia --image-type conda
|
||||
llama stack run ./run.yaml \
|
||||
--port 8321 \
|
||||
--env NVIDIA_API_KEY=$NVIDIA_API_KEY
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL
|
||||
```
|
|
@ -46,20 +46,91 @@ The following models are available by default:
|
|||
- `meta/llama-3.2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
|
||||
- `meta/llama-3.2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
|
||||
- `meta/llama-3.2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
|
||||
- `meta/llama-3.3-70b-instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
|
||||
- `nvidia/llama-3.2-nv-embedqa-1b-v2 `
|
||||
- `nvidia/nv-embedqa-e5-v5 `
|
||||
- `nvidia/nv-embedqa-mistral-7b-v2 `
|
||||
- `snowflake/arctic-embed-l `
|
||||
|
||||
|
||||
### Prerequisite: API Keys
|
||||
## Prerequisites
|
||||
### NVIDIA API Keys
|
||||
|
||||
Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/).
|
||||
Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.
|
||||
|
||||
### Deploy NeMo Microservices Platform
|
||||
The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please reference the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/documentation/latest/nemo-microservices/latest-early_access/set-up/deploy-as-platform/index.html) for platform prerequisites and instructions to install and deploy the platform.
|
||||
|
||||
## Supported Services
|
||||
Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.
|
||||
|
||||
### Inference: NVIDIA NIM
|
||||
NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
|
||||
1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (Requires an API key)
|
||||
2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.
|
||||
|
||||
The deployed platform includes the NIM Proxy microservice, which is the service that provides to access your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment.
|
||||
|
||||
### Datasetio API: NeMo Data Store
|
||||
The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposts APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use the client to interact with Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.
|
||||
|
||||
See the [NVIDIA Datasetio docs](/llama_stack/providers/remote/datasetio/nvidia/README.md) for supported features and example usage.
|
||||
|
||||
### Eval API: NeMo Evaluator
|
||||
The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.
|
||||
|
||||
See the [NVIDIA Eval docs](/llama_stack/providers/remote/eval/nvidia/README.md) for supported features and example usage.
|
||||
|
||||
### Post-Training API: NeMo Customizer
|
||||
The NeMo Customizer microservice supports fine-tuning models. You can reference [this list of supported models](/llama_stack/providers/remote/post_training/nvidia/models.py) that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.
|
||||
|
||||
See the [NVIDIA Post-Training docs](/llama_stack/providers/remote/post_training/nvidia/README.md) for supported features and example usage.
|
||||
|
||||
### Safety API: NeMo Guardrails
|
||||
The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.
|
||||
|
||||
See the NVIDIA Safety docs for supported features and example usage.
|
||||
|
||||
## Deploying models
|
||||
In order to use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.
|
||||
|
||||
Note: For improved inference speeds, we need to use NIM with `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.
|
||||
```sh
|
||||
# URL to NeMo NIM Proxy service
|
||||
export NEMO_URL="http://nemo.test"
|
||||
|
||||
curl --location "$NEMO_URL/v1/deployment/model-deployments" \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"name": "llama-3.2-1b-instruct",
|
||||
"namespace": "meta",
|
||||
"config": {
|
||||
"model": "meta/llama-3.2-1b-instruct",
|
||||
"nim_deployment": {
|
||||
"image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
|
||||
"image_tag": "1.8.3",
|
||||
"pvc_size": "25Gi",
|
||||
"gpu": 1,
|
||||
"additional_envs": {
|
||||
"NIM_GUIDED_DECODING_BACKEND": "fast_outlines"
|
||||
}
|
||||
}
|
||||
}
|
||||
}'
|
||||
```
|
||||
This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/documentation/latest/nemo-microservices/latest-early_access/get-started/tutorials/deploy-nims.html#) for more information on how to deploy a NIM and verify it's available for inference.
|
||||
|
||||
You can also remove a deployed NIM to free up GPU resources, if needed.
|
||||
```sh
|
||||
export NEMO_URL="http://nemo.test"
|
||||
|
||||
curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct"
|
||||
```
|
||||
|
||||
## Running Llama Stack with NVIDIA
|
||||
|
||||
You can do this via Conda (build code) or Docker which has a pre-built image.
|
||||
You can do this via Conda or venv (build code), or Docker which has a pre-built image.
|
||||
|
||||
### Via Docker
|
||||
|
||||
|
@ -81,9 +152,27 @@ docker run \
|
|||
### Via Conda
|
||||
|
||||
```bash
|
||||
INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
|
||||
llama stack build --template nvidia --image-type conda
|
||||
llama stack run ./run.yaml \
|
||||
--port 8321 \
|
||||
--env NVIDIA_API_KEY=$NVIDIA_API_KEY
|
||||
--env NVIDIA_API_KEY=$NVIDIA_API_KEY \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL
|
||||
```
|
||||
|
||||
### Via venv
|
||||
|
||||
If you've set up your local development environment, you can also build the image using your local virtual environment.
|
||||
|
||||
```bash
|
||||
INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
|
||||
llama stack build --template nvidia --image-type venv
|
||||
llama stack run ./run.yaml \
|
||||
--port 8321 \
|
||||
--env NVIDIA_API_KEY=$NVIDIA_API_KEY \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL
|
||||
```
|
||||
|
||||
### Example Notebooks
|
||||
You can reference the Jupyter notebooks in `docs/notebooks/nvidia/` for example usage of these APIs.
|
||||
- [Llama_Stack_NVIDIA_E2E_Flow.ipynb](/docs/notebooks/nvidia/Llama_Stack_NVIDIA_E2E_Flow.ipynb) contains an end-to-end workflow for running inference, customizing, and evaluating models using your deployed NeMo Microservices platform.
|
||||
|
|
|
@ -41,7 +41,7 @@ The following environment variables can be configured:
|
|||
|
||||
## Setting up vLLM server
|
||||
|
||||
In the following sections, we'll use either AMD and NVIDIA GPUs to serve as hardware accelerators for the vLLM
|
||||
In the following sections, we'll use AMD, NVIDIA or Intel GPUs to serve as hardware accelerators for the vLLM
|
||||
server, which acts as both the LLM inference provider and the safety provider. Note that vLLM also
|
||||
[supports many other hardware accelerators](https://docs.vllm.ai/en/latest/getting_started/installation.html) and
|
||||
that we only use GPUs here for demonstration purposes.
|
||||
|
@ -162,6 +162,55 @@ docker run \
|
|||
--port $SAFETY_PORT
|
||||
```
|
||||
|
||||
### Setting up vLLM server on Intel GPU
|
||||
|
||||
Refer to [vLLM Documentation for XPU](https://docs.vllm.ai/en/v0.8.2/getting_started/installation/gpu.html?device=xpu) to get a vLLM endpoint. In addition to vLLM side setup which guides towards installing vLLM from sources orself-building vLLM Docker container, Intel provides prebuilt vLLM container to use on systems with Intel GPUs supported by PyTorch XPU backend:
|
||||
- [intel/vllm](https://hub.docker.com/r/intel/vllm)
|
||||
|
||||
Here is a sample script to start a vLLM server locally via Docker using Intel provided container:
|
||||
|
||||
```bash
|
||||
export INFERENCE_PORT=8000
|
||||
export INFERENCE_MODEL=meta-llama/Llama-3.2-1B-Instruct
|
||||
export ZE_AFFINITY_MASK=0
|
||||
|
||||
docker run \
|
||||
--pull always \
|
||||
--device /dev/dri \
|
||||
-v /dev/dri/by-path:/dev/dri/by-path \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
|
||||
--env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \
|
||||
-p $INFERENCE_PORT:$INFERENCE_PORT \
|
||||
--ipc=host \
|
||||
intel/vllm:xpu \
|
||||
--gpu-memory-utilization 0.7 \
|
||||
--model $INFERENCE_MODEL \
|
||||
--port $INFERENCE_PORT
|
||||
```
|
||||
|
||||
If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
|
||||
|
||||
```bash
|
||||
export SAFETY_PORT=8081
|
||||
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
|
||||
export ZE_AFFINITY_MASK=1
|
||||
|
||||
docker run \
|
||||
--pull always \
|
||||
--device /dev/dri \
|
||||
-v /dev/dri/by-path:/dev/dri/by-path \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
|
||||
--env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \
|
||||
-p $SAFETY_PORT:$SAFETY_PORT \
|
||||
--ipc=host \
|
||||
intel/vllm:xpu \
|
||||
--gpu-memory-utilization 0.7 \
|
||||
--model $SAFETY_MODEL \
|
||||
--port $SAFETY_PORT
|
||||
```
|
||||
|
||||
## Running Llama Stack
|
||||
|
||||
Now you are ready to run Llama Stack with vLLM as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
|
||||
|
|
|
@ -225,8 +225,18 @@ class AgentConfigCommon(BaseModel):
|
|||
|
||||
@json_schema_type
|
||||
class AgentConfig(AgentConfigCommon):
|
||||
"""Configuration for an agent.
|
||||
|
||||
:param model: The model identifier to use for the agent
|
||||
:param instructions: The system instructions for the agent
|
||||
:param name: Optional name for the agent, used in telemetry and identification
|
||||
:param enable_session_persistence: Optional flag indicating whether session data has to be persisted
|
||||
:param response_format: Optional response format configuration
|
||||
"""
|
||||
|
||||
model: str
|
||||
instructions: str
|
||||
name: Optional[str] = None
|
||||
enable_session_persistence: Optional[bool] = False
|
||||
response_format: Optional[ResponseFormat] = None
|
||||
|
||||
|
|
|
@ -526,9 +526,9 @@ class OpenAIAssistantMessageParam(BaseModel):
|
|||
"""
|
||||
|
||||
role: Literal["assistant"] = "assistant"
|
||||
content: OpenAIChatCompletionMessageContent
|
||||
content: Optional[OpenAIChatCompletionMessageContent] = None
|
||||
name: Optional[str] = None
|
||||
tool_calls: Optional[List[OpenAIChatCompletionToolCall]] = Field(default_factory=list)
|
||||
tool_calls: Optional[List[OpenAIChatCompletionToolCall]] = None
|
||||
|
||||
|
||||
@json_schema_type
|
||||
|
|
|
@ -235,10 +235,14 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
|
|||
)
|
||||
|
||||
except (Exception, RuntimeError) as exc:
|
||||
import traceback
|
||||
|
||||
cprint(
|
||||
f"Error building stack: {exc}",
|
||||
color="red",
|
||||
)
|
||||
cprint("Stack trace:", color="red")
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
if run_config is None:
|
||||
cprint(
|
||||
|
@ -350,7 +354,7 @@ def _run_stack_build_command_from_build_config(
|
|||
build_config,
|
||||
build_file_path,
|
||||
image_name,
|
||||
template_or_config=template_name or config_path,
|
||||
template_or_config=template_name or config_path or str(build_file_path),
|
||||
)
|
||||
if return_code != 0:
|
||||
raise RuntimeError(f"Failed to build image {image_name}")
|
||||
|
|
|
@ -37,6 +37,17 @@ def tool_chat_page():
|
|||
label="Available ToolGroups", options=builtin_tools_list, selection_mode="multi", on_change=reset_agent
|
||||
)
|
||||
|
||||
if "builtin::rag" in toolgroup_selection:
|
||||
vector_dbs = llama_stack_api.client.vector_dbs.list() or []
|
||||
if not vector_dbs:
|
||||
st.info("No vector databases available for selection.")
|
||||
vector_dbs = [vector_db.identifier for vector_db in vector_dbs]
|
||||
selected_vector_dbs = st.multiselect(
|
||||
label="Select Document Collections to use in RAG queries",
|
||||
options=vector_dbs,
|
||||
on_change=reset_agent,
|
||||
)
|
||||
|
||||
st.subheader("MCP Servers")
|
||||
mcp_selection = st.pills(
|
||||
label="Available MCP Servers", options=mcp_tools_list, selection_mode="multi", on_change=reset_agent
|
||||
|
@ -56,6 +67,27 @@ def tool_chat_page():
|
|||
st.subheader(f"Active Tools: 🛠 {len(active_tool_list)}")
|
||||
st.json(active_tool_list)
|
||||
|
||||
st.subheader("Chat Configurations")
|
||||
max_tokens = st.slider(
|
||||
"Max Tokens",
|
||||
min_value=0,
|
||||
max_value=4096,
|
||||
value=512,
|
||||
step=1,
|
||||
help="The maximum number of tokens to generate",
|
||||
on_change=reset_agent,
|
||||
)
|
||||
|
||||
for i, tool_name in enumerate(toolgroup_selection):
|
||||
if tool_name == "builtin::rag":
|
||||
tool_dict = dict(
|
||||
name="builtin::rag",
|
||||
args={
|
||||
"vector_db_ids": list(selected_vector_dbs),
|
||||
},
|
||||
)
|
||||
toolgroup_selection[i] = tool_dict
|
||||
|
||||
@st.cache_resource
|
||||
def create_agent():
|
||||
return Agent(
|
||||
|
@ -63,9 +95,7 @@ def tool_chat_page():
|
|||
model=model,
|
||||
instructions="You are a helpful assistant. When you use a tool always respond with a summary of the result.",
|
||||
tools=toolgroup_selection,
|
||||
sampling_params={
|
||||
"strategy": {"type": "greedy"},
|
||||
},
|
||||
sampling_params={"strategy": {"type": "greedy"}, "max_tokens": max_tokens},
|
||||
)
|
||||
|
||||
agent = create_agent()
|
||||
|
|
|
@ -178,6 +178,8 @@ class ChatAgent(ShieldRunnerMixin):
|
|||
span.set_attribute("request", request.model_dump_json())
|
||||
turn_id = str(uuid.uuid4())
|
||||
span.set_attribute("turn_id", turn_id)
|
||||
if self.agent_config.name:
|
||||
span.set_attribute("agent_name", self.agent_config.name)
|
||||
|
||||
await self._initialize_tools(request.toolgroups)
|
||||
async for chunk in self._run_turn(request, turn_id):
|
||||
|
@ -190,6 +192,8 @@ class ChatAgent(ShieldRunnerMixin):
|
|||
span.set_attribute("session_id", request.session_id)
|
||||
span.set_attribute("request", request.model_dump_json())
|
||||
span.set_attribute("turn_id", request.turn_id)
|
||||
if self.agent_config.name:
|
||||
span.set_attribute("agent_name", self.agent_config.name)
|
||||
|
||||
await self._initialize_tools()
|
||||
async for chunk in self._run_turn(request):
|
||||
|
@ -498,6 +502,8 @@ class ChatAgent(ShieldRunnerMixin):
|
|||
stop_reason = None
|
||||
|
||||
async with tracing.span("inference") as span:
|
||||
if self.agent_config.name:
|
||||
span.set_attribute("agent_name", self.agent_config.name)
|
||||
async for chunk in await self.inference_api.chat_completion(
|
||||
self.agent_config.model,
|
||||
input_messages,
|
||||
|
|
85
llama_stack/providers/remote/inference/nvidia/NVIDIA.md
Normal file
85
llama_stack/providers/remote/inference/nvidia/NVIDIA.md
Normal file
|
@ -0,0 +1,85 @@
|
|||
# NVIDIA Inference Provider for LlamaStack
|
||||
|
||||
This provider enables running inference using NVIDIA NIM.
|
||||
|
||||
## Features
|
||||
- Endpoints for completions, chat completions, and embeddings for registered models
|
||||
|
||||
## Getting Started
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- LlamaStack with NVIDIA configuration
|
||||
- Access to NVIDIA NIM deployment
|
||||
- NIM for model to use for inference is deployed
|
||||
|
||||
### Setup
|
||||
|
||||
Build the NVIDIA environment:
|
||||
|
||||
```bash
|
||||
llama stack build --template nvidia --image-type conda
|
||||
```
|
||||
|
||||
### Basic Usage using the LlamaStack Python Client
|
||||
|
||||
#### Initialize the client
|
||||
|
||||
```python
|
||||
import os
|
||||
|
||||
os.environ["NVIDIA_API_KEY"] = (
|
||||
"" # Required if using hosted NIM endpoint. If self-hosted, not required.
|
||||
)
|
||||
os.environ["NVIDIA_BASE_URL"] = "http://nim.test" # NIM URL
|
||||
|
||||
from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
|
||||
|
||||
client = LlamaStackAsLibraryClient("nvidia")
|
||||
client.initialize()
|
||||
```
|
||||
|
||||
### Create Completion
|
||||
|
||||
```python
|
||||
response = client.completion(
|
||||
model_id="meta-llama/Llama-3.1-8b-Instruct",
|
||||
content="Complete the sentence using one word: Roses are red, violets are :",
|
||||
stream=False,
|
||||
sampling_params={
|
||||
"max_tokens": 50,
|
||||
},
|
||||
)
|
||||
print(f"Response: {response.content}")
|
||||
```
|
||||
|
||||
### Create Chat Completion
|
||||
|
||||
```python
|
||||
response = client.chat_completion(
|
||||
model_id="meta-llama/Llama-3.1-8b-Instruct",
|
||||
messages=[
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You must respond to each message with only one word",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Complete the sentence using one word: Roses are red, violets are:",
|
||||
},
|
||||
],
|
||||
stream=False,
|
||||
sampling_params={
|
||||
"max_tokens": 50,
|
||||
},
|
||||
)
|
||||
print(f"Response: {response.completion_message.content}")
|
||||
```
|
||||
|
||||
### Create Embeddings
|
||||
```python
|
||||
response = client.embeddings(
|
||||
model_id="meta-llama/Llama-3.1-8b-Instruct", contents=["foo", "bar", "baz"]
|
||||
)
|
||||
print(f"Embeddings: {response.embeddings}")
|
||||
```
|
|
@ -48,6 +48,10 @@ MODEL_ENTRIES = [
|
|||
"meta/llama-3.2-90b-vision-instruct",
|
||||
CoreModelId.llama3_2_90b_vision_instruct.value,
|
||||
),
|
||||
build_hf_repo_model_entry(
|
||||
"meta/llama-3.3-70b-instruct",
|
||||
CoreModelId.llama3_3_70b_instruct.value,
|
||||
),
|
||||
# NeMo Retriever Text Embedding models -
|
||||
#
|
||||
# https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html
|
||||
|
|
|
@ -374,7 +374,8 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
|
|||
options["max_tokens"] = self.config.max_tokens
|
||||
|
||||
input_dict: dict[str, Any] = {}
|
||||
if isinstance(request, ChatCompletionRequest) and request.tools is not None:
|
||||
# Only include the 'tools' param if there is any. It can break things if an empty list is sent to the vLLM.
|
||||
if isinstance(request, ChatCompletionRequest) and request.tools:
|
||||
input_dict = {"tools": _convert_to_vllm_tools_in_request(request.tools)}
|
||||
|
||||
if isinstance(request, ChatCompletionRequest):
|
||||
|
|
|
@ -16,7 +16,11 @@ _MODEL_ENTRIES = [
|
|||
build_hf_repo_model_entry(
|
||||
"meta/llama-3.1-8b-instruct",
|
||||
CoreModelId.llama3_1_8b_instruct.value,
|
||||
)
|
||||
),
|
||||
build_hf_repo_model_entry(
|
||||
"meta/llama-3.2-1b-instruct",
|
||||
CoreModelId.llama3_2_1b_instruct.value,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
|
|
|
@ -27,11 +27,12 @@ from .models import _MODEL_ENTRIES
|
|||
|
||||
# Map API status to JobStatus enum
|
||||
STATUS_MAPPING = {
|
||||
"running": "in_progress",
|
||||
"completed": "completed",
|
||||
"failed": "failed",
|
||||
"cancelled": "cancelled",
|
||||
"pending": "scheduled",
|
||||
"running": JobStatus.in_progress.value,
|
||||
"completed": JobStatus.completed.value,
|
||||
"failed": JobStatus.failed.value,
|
||||
"cancelled": JobStatus.cancelled.value,
|
||||
"pending": JobStatus.scheduled.value,
|
||||
"unknown": JobStatus.scheduled.value,
|
||||
}
|
||||
|
||||
|
||||
|
|
77
llama_stack/providers/remote/safety/nvidia/README.md
Normal file
77
llama_stack/providers/remote/safety/nvidia/README.md
Normal file
|
@ -0,0 +1,77 @@
|
|||
# NVIDIA Safety Provider for LlamaStack
|
||||
|
||||
This provider enables safety checks and guardrails for LLM interactions using NVIDIA's NeMo Guardrails service.
|
||||
|
||||
## Features
|
||||
|
||||
- Run safety checks for messages
|
||||
|
||||
## Getting Started
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- LlamaStack with NVIDIA configuration
|
||||
- Access to NVIDIA NeMo Guardrails service
|
||||
- NIM for model to use for safety check is deployed
|
||||
|
||||
### Setup
|
||||
|
||||
Build the NVIDIA environment:
|
||||
|
||||
```bash
|
||||
llama stack build --template nvidia --image-type conda
|
||||
```
|
||||
|
||||
### Basic Usage using the LlamaStack Python Client
|
||||
|
||||
#### Initialize the client
|
||||
|
||||
```python
|
||||
import os
|
||||
|
||||
os.environ["NVIDIA_API_KEY"] = "your-api-key"
|
||||
os.environ["NVIDIA_GUARDRAILS_URL"] = "http://guardrails.test"
|
||||
|
||||
from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
|
||||
|
||||
client = LlamaStackAsLibraryClient("nvidia")
|
||||
client.initialize()
|
||||
```
|
||||
|
||||
#### Create a safety shield
|
||||
|
||||
```python
|
||||
from llama_stack.apis.safety import Shield
|
||||
from llama_stack.apis.inference import Message
|
||||
|
||||
# Create a safety shield
|
||||
shield = Shield(
|
||||
shield_id="your-shield-id",
|
||||
provider_resource_id="safety-model-id", # The model to use for safety checks
|
||||
description="Safety checks for content moderation",
|
||||
)
|
||||
|
||||
# Register the shield
|
||||
await client.safety.register_shield(shield)
|
||||
```
|
||||
|
||||
#### Run safety checks
|
||||
|
||||
```python
|
||||
# Messages to check
|
||||
messages = [Message(role="user", content="Your message to check")]
|
||||
|
||||
# Run safety check
|
||||
response = await client.safety.run_shield(
|
||||
shield_id="your-shield-id",
|
||||
messages=messages,
|
||||
)
|
||||
|
||||
# Check for violations
|
||||
if response.violation:
|
||||
print(f"Safety violation detected: {response.violation.user_message}")
|
||||
print(f"Violation level: {response.violation.violation_level}")
|
||||
print(f"Metadata: {response.violation.metadata}")
|
||||
else:
|
||||
print("No safety violations detected")
|
||||
```
|
|
@ -25,14 +25,84 @@ The following models are available by default:
|
|||
{% endif %}
|
||||
|
||||
|
||||
### Prerequisite: API Keys
|
||||
## Prerequisites
|
||||
### NVIDIA API Keys
|
||||
|
||||
Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/).
|
||||
Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.
|
||||
|
||||
### Deploy NeMo Microservices Platform
|
||||
The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please reference the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/documentation/latest/nemo-microservices/latest-early_access/set-up/deploy-as-platform/index.html) for platform prerequisites and instructions to install and deploy the platform.
|
||||
|
||||
## Supported Services
|
||||
Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.
|
||||
|
||||
### Inference: NVIDIA NIM
|
||||
NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
|
||||
1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (Requires an API key)
|
||||
2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.
|
||||
|
||||
The deployed platform includes the NIM Proxy microservice, which is the service that provides to access your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment.
|
||||
|
||||
### Datasetio API: NeMo Data Store
|
||||
The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposts APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use the client to interact with Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.
|
||||
|
||||
See the [NVIDIA Datasetio docs](/llama_stack/providers/remote/datasetio/nvidia/README.md) for supported features and example usage.
|
||||
|
||||
### Eval API: NeMo Evaluator
|
||||
The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.
|
||||
|
||||
See the [NVIDIA Eval docs](/llama_stack/providers/remote/eval/nvidia/README.md) for supported features and example usage.
|
||||
|
||||
### Post-Training API: NeMo Customizer
|
||||
The NeMo Customizer microservice supports fine-tuning models. You can reference [this list of supported models](/llama_stack/providers/remote/post_training/nvidia/models.py) that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.
|
||||
|
||||
See the [NVIDIA Post-Training docs](/llama_stack/providers/remote/post_training/nvidia/README.md) for supported features and example usage.
|
||||
|
||||
### Safety API: NeMo Guardrails
|
||||
The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.
|
||||
|
||||
See the NVIDIA Safety docs for supported features and example usage.
|
||||
|
||||
## Deploying models
|
||||
In order to use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.
|
||||
|
||||
Note: For improved inference speeds, we need to use NIM with `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.
|
||||
```sh
|
||||
# URL to NeMo NIM Proxy service
|
||||
export NEMO_URL="http://nemo.test"
|
||||
|
||||
curl --location "$NEMO_URL/v1/deployment/model-deployments" \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"name": "llama-3.2-1b-instruct",
|
||||
"namespace": "meta",
|
||||
"config": {
|
||||
"model": "meta/llama-3.2-1b-instruct",
|
||||
"nim_deployment": {
|
||||
"image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
|
||||
"image_tag": "1.8.3",
|
||||
"pvc_size": "25Gi",
|
||||
"gpu": 1,
|
||||
"additional_envs": {
|
||||
"NIM_GUIDED_DECODING_BACKEND": "fast_outlines"
|
||||
}
|
||||
}
|
||||
}
|
||||
}'
|
||||
```
|
||||
This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/documentation/latest/nemo-microservices/latest-early_access/get-started/tutorials/deploy-nims.html#) for more information on how to deploy a NIM and verify it's available for inference.
|
||||
|
||||
You can also remove a deployed NIM to free up GPU resources, if needed.
|
||||
```sh
|
||||
export NEMO_URL="http://nemo.test"
|
||||
|
||||
curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct"
|
||||
```
|
||||
|
||||
## Running Llama Stack with NVIDIA
|
||||
|
||||
You can do this via Conda (build code) or Docker which has a pre-built image.
|
||||
You can do this via Conda or venv (build code), or Docker which has a pre-built image.
|
||||
|
||||
### Via Docker
|
||||
|
||||
|
@ -54,9 +124,27 @@ docker run \
|
|||
### Via Conda
|
||||
|
||||
```bash
|
||||
INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
|
||||
llama stack build --template nvidia --image-type conda
|
||||
llama stack run ./run.yaml \
|
||||
--port 8321 \
|
||||
--env NVIDIA_API_KEY=$NVIDIA_API_KEY
|
||||
--env NVIDIA_API_KEY=$NVIDIA_API_KEY \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL
|
||||
```
|
||||
|
||||
### Via venv
|
||||
|
||||
If you've set up your local development environment, you can also build the image using your local virtual environment.
|
||||
|
||||
```bash
|
||||
INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
|
||||
llama stack build --template nvidia --image-type venv
|
||||
llama stack run ./run.yaml \
|
||||
--port 8321 \
|
||||
--env NVIDIA_API_KEY=$NVIDIA_API_KEY \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL
|
||||
```
|
||||
|
||||
### Example Notebooks
|
||||
You can reference the Jupyter notebooks in `docs/notebooks/nvidia/` for example usage of these APIs.
|
||||
- [Llama_Stack_NVIDIA_E2E_Flow.ipynb](/docs/notebooks/nvidia/Llama_Stack_NVIDIA_E2E_Flow.ipynb) contains an end-to-end workflow for running inference, customizing, and evaluating models using your deployed NeMo Microservices platform.
|
||||
|
|
|
@ -170,6 +170,16 @@ models:
|
|||
provider_id: nvidia
|
||||
provider_model_id: meta/llama-3.2-90b-vision-instruct
|
||||
model_type: llm
|
||||
- metadata: {}
|
||||
model_id: meta/llama-3.3-70b-instruct
|
||||
provider_id: nvidia
|
||||
provider_model_id: meta/llama-3.3-70b-instruct
|
||||
model_type: llm
|
||||
- metadata: {}
|
||||
model_id: meta-llama/Llama-3.3-70B-Instruct
|
||||
provider_id: nvidia
|
||||
provider_model_id: meta/llama-3.3-70b-instruct
|
||||
model_type: llm
|
||||
- metadata:
|
||||
embedding_dimension: 2048
|
||||
context_length: 8192
|
||||
|
|
|
@ -28,7 +28,7 @@ The following environment variables can be configured:
|
|||
|
||||
## Setting up vLLM server
|
||||
|
||||
In the following sections, we'll use either AMD and NVIDIA GPUs to serve as hardware accelerators for the vLLM
|
||||
In the following sections, we'll use AMD, NVIDIA or Intel GPUs to serve as hardware accelerators for the vLLM
|
||||
server, which acts as both the LLM inference provider and the safety provider. Note that vLLM also
|
||||
[supports many other hardware accelerators](https://docs.vllm.ai/en/latest/getting_started/installation.html) and
|
||||
that we only use GPUs here for demonstration purposes.
|
||||
|
@ -149,6 +149,55 @@ docker run \
|
|||
--port $SAFETY_PORT
|
||||
```
|
||||
|
||||
### Setting up vLLM server on Intel GPU
|
||||
|
||||
Refer to [vLLM Documentation for XPU](https://docs.vllm.ai/en/v0.8.2/getting_started/installation/gpu.html?device=xpu) to get a vLLM endpoint. In addition to vLLM side setup which guides towards installing vLLM from sources orself-building vLLM Docker container, Intel provides prebuilt vLLM container to use on systems with Intel GPUs supported by PyTorch XPU backend:
|
||||
- [intel/vllm](https://hub.docker.com/r/intel/vllm)
|
||||
|
||||
Here is a sample script to start a vLLM server locally via Docker using Intel provided container:
|
||||
|
||||
```bash
|
||||
export INFERENCE_PORT=8000
|
||||
export INFERENCE_MODEL=meta-llama/Llama-3.2-1B-Instruct
|
||||
export ZE_AFFINITY_MASK=0
|
||||
|
||||
docker run \
|
||||
--pull always \
|
||||
--device /dev/dri \
|
||||
-v /dev/dri/by-path:/dev/dri/by-path \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
|
||||
--env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \
|
||||
-p $INFERENCE_PORT:$INFERENCE_PORT \
|
||||
--ipc=host \
|
||||
intel/vllm:xpu \
|
||||
--gpu-memory-utilization 0.7 \
|
||||
--model $INFERENCE_MODEL \
|
||||
--port $INFERENCE_PORT
|
||||
```
|
||||
|
||||
If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
|
||||
|
||||
```bash
|
||||
export SAFETY_PORT=8081
|
||||
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
|
||||
export ZE_AFFINITY_MASK=1
|
||||
|
||||
docker run \
|
||||
--pull always \
|
||||
--device /dev/dri \
|
||||
-v /dev/dri/by-path:/dev/dri/by-path \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
|
||||
--env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \
|
||||
-p $SAFETY_PORT:$SAFETY_PORT \
|
||||
--ipc=host \
|
||||
intel/vllm:xpu \
|
||||
--gpu-memory-utilization 0.7 \
|
||||
--model $SAFETY_MODEL \
|
||||
--port $SAFETY_PORT
|
||||
```
|
||||
|
||||
## Running Llama Stack
|
||||
|
||||
Now you are ready to run Llama Stack with vLLM as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
|
||||
|
|
|
@ -115,6 +115,70 @@ def test_agent_simple(llama_stack_client_with_mocked_inference, agent_config):
|
|||
assert "I can't" in logs_str
|
||||
|
||||
|
||||
def test_agent_name(llama_stack_client, text_model_id):
|
||||
agent_name = f"test-agent-{uuid4()}"
|
||||
|
||||
try:
|
||||
agent = Agent(
|
||||
llama_stack_client,
|
||||
model=text_model_id,
|
||||
instructions="You are a helpful assistant",
|
||||
name=agent_name,
|
||||
)
|
||||
except TypeError:
|
||||
agent = Agent(
|
||||
llama_stack_client,
|
||||
model=text_model_id,
|
||||
instructions="You are a helpful assistant",
|
||||
)
|
||||
return
|
||||
|
||||
session_id = agent.create_session(f"test-session-{uuid4()}")
|
||||
|
||||
agent.create_turn(
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Give me a sentence that contains the word: hello",
|
||||
}
|
||||
],
|
||||
session_id=session_id,
|
||||
stream=False,
|
||||
)
|
||||
|
||||
all_spans = []
|
||||
for span in llama_stack_client.telemetry.query_spans(
|
||||
attribute_filters=[
|
||||
{"key": "session_id", "op": "eq", "value": session_id},
|
||||
],
|
||||
attributes_to_return=["input", "output", "agent_name", "agent_id", "session_id"],
|
||||
):
|
||||
all_spans.append(span.attributes)
|
||||
|
||||
agent_name_spans = []
|
||||
for span in llama_stack_client.telemetry.query_spans(
|
||||
attribute_filters=[],
|
||||
attributes_to_return=["agent_name"],
|
||||
):
|
||||
if "agent_name" in span.attributes:
|
||||
agent_name_spans.append(span.attributes)
|
||||
|
||||
agent_logs = []
|
||||
for span in llama_stack_client.telemetry.query_spans(
|
||||
attribute_filters=[
|
||||
{"key": "agent_name", "op": "eq", "value": agent_name},
|
||||
],
|
||||
attributes_to_return=["input", "output", "agent_name"],
|
||||
):
|
||||
if "output" in span.attributes and span.attributes["output"] != "no shields":
|
||||
agent_logs.append(span.attributes)
|
||||
|
||||
assert len(agent_logs) == 1
|
||||
assert agent_logs[0]["agent_name"] == agent_name
|
||||
assert "Give me a sentence that contains the word: hello" in agent_logs[0]["input"]
|
||||
assert "hello" in agent_logs[0]["output"].lower()
|
||||
|
||||
|
||||
def test_tool_config(llama_stack_client_with_mocked_inference, agent_config):
|
||||
common_params = dict(
|
||||
model="meta-llama/Llama-3.2-3B-Instruct",
|
||||
|
|
|
@ -31,6 +31,7 @@ def data_url_from_file(file_path: str) -> str:
|
|||
return data_url
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="flaky. Couldn't find 'llamastack/simpleqa' on the Hugging Face Hub")
|
||||
@pytest.mark.parametrize(
|
||||
"purpose, source, provider_id, limit",
|
||||
[
|
||||
|
|
38
tests/unit/distribution/test_build_path.py
Normal file
38
tests/unit/distribution/test_build_path.py
Normal file
|
@ -0,0 +1,38 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from llama_stack.cli.stack._build import (
|
||||
_run_stack_build_command_from_build_config,
|
||||
)
|
||||
from llama_stack.distribution.datatypes import BuildConfig, DistributionSpec
|
||||
from llama_stack.distribution.utils.image_types import LlamaStackImageType
|
||||
|
||||
|
||||
def test_container_build_passes_path(monkeypatch, tmp_path):
|
||||
called_with = {}
|
||||
|
||||
def spy_build_image(cfg, build_file_path, image_name, template_or_config):
|
||||
called_with["path"] = template_or_config
|
||||
return 0
|
||||
|
||||
monkeypatch.setattr(
|
||||
"llama_stack.cli.stack._build.build_image",
|
||||
spy_build_image,
|
||||
raising=True,
|
||||
)
|
||||
|
||||
cfg = BuildConfig(
|
||||
image_type=LlamaStackImageType.CONTAINER.value,
|
||||
distribution_spec=DistributionSpec(providers={}, description=""),
|
||||
)
|
||||
|
||||
_run_stack_build_command_from_build_config(cfg, image_name="dummy")
|
||||
|
||||
assert "path" in called_with
|
||||
assert isinstance(called_with["path"], str)
|
||||
assert Path(called_with["path"]).exists()
|
|
@ -26,7 +26,12 @@ from openai.types.chat.chat_completion_chunk import (
|
|||
)
|
||||
from openai.types.model import Model as OpenAIModel
|
||||
|
||||
from llama_stack.apis.inference import ToolChoice, ToolConfig
|
||||
from llama_stack.apis.inference import (
|
||||
ChatCompletionRequest,
|
||||
ToolChoice,
|
||||
ToolConfig,
|
||||
UserMessage,
|
||||
)
|
||||
from llama_stack.apis.models import Model
|
||||
from llama_stack.models.llama.datatypes import StopReason
|
||||
from llama_stack.providers.remote.inference.vllm.config import VLLMInferenceAdapterConfig
|
||||
|
@ -232,3 +237,14 @@ def test_chat_completion_doesnt_block_event_loop(caplog):
|
|||
# above.
|
||||
asyncio_warnings = [record.message for record in caplog.records if record.name == "asyncio"]
|
||||
assert not asyncio_warnings
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_get_params_empty_tools(vllm_inference_adapter):
|
||||
request = ChatCompletionRequest(
|
||||
tools=[],
|
||||
model="test_model",
|
||||
messages=[UserMessage(content="test")],
|
||||
)
|
||||
params = await vllm_inference_adapter._get_params(request)
|
||||
assert "tools" not in params
|
||||
|
|
|
@ -200,35 +200,48 @@ class TestNvidiaPostTraining(unittest.TestCase):
|
|||
)
|
||||
|
||||
def test_get_training_job_status(self):
|
||||
self.mock_make_request.return_value = {
|
||||
"created_at": "2024-12-09T04:06:28.580220",
|
||||
"updated_at": "2024-12-09T04:21:19.852832",
|
||||
"status": "completed",
|
||||
"steps_completed": 1210,
|
||||
"epochs_completed": 2,
|
||||
"percentage_done": 100.0,
|
||||
"best_epoch": 2,
|
||||
"train_loss": 1.718016266822815,
|
||||
"val_loss": 1.8661999702453613,
|
||||
}
|
||||
customizer_status_to_job_status = [
|
||||
("running", "in_progress"),
|
||||
("completed", "completed"),
|
||||
("failed", "failed"),
|
||||
("cancelled", "cancelled"),
|
||||
("pending", "scheduled"),
|
||||
("unknown", "scheduled"),
|
||||
]
|
||||
|
||||
job_id = "cust-JGTaMbJMdqjJU8WbQdN9Q2"
|
||||
for customizer_status, expected_status in customizer_status_to_job_status:
|
||||
with self.subTest(customizer_status=customizer_status, expected_status=expected_status):
|
||||
self.mock_make_request.return_value = {
|
||||
"created_at": "2024-12-09T04:06:28.580220",
|
||||
"updated_at": "2024-12-09T04:21:19.852832",
|
||||
"status": customizer_status,
|
||||
"steps_completed": 1210,
|
||||
"epochs_completed": 2,
|
||||
"percentage_done": 100.0,
|
||||
"best_epoch": 2,
|
||||
"train_loss": 1.718016266822815,
|
||||
"val_loss": 1.8661999702453613,
|
||||
}
|
||||
|
||||
status = self.run_async(self.adapter.get_training_job_status(job_uuid=job_id))
|
||||
job_id = "cust-JGTaMbJMdqjJU8WbQdN9Q2"
|
||||
|
||||
assert isinstance(status, NvidiaPostTrainingJobStatusResponse)
|
||||
assert status.status.value == "completed"
|
||||
assert status.steps_completed == 1210
|
||||
assert status.epochs_completed == 2
|
||||
assert status.percentage_done == 100.0
|
||||
assert status.best_epoch == 2
|
||||
assert status.train_loss == 1.718016266822815
|
||||
assert status.val_loss == 1.8661999702453613
|
||||
status = self.run_async(self.adapter.get_training_job_status(job_uuid=job_id))
|
||||
|
||||
self.mock_make_request.assert_called_once()
|
||||
self._assert_request(
|
||||
self.mock_make_request, "GET", f"/v1/customization/jobs/{job_id}/status", expected_params={"job_id": job_id}
|
||||
)
|
||||
assert isinstance(status, NvidiaPostTrainingJobStatusResponse)
|
||||
assert status.status.value == expected_status
|
||||
assert status.steps_completed == 1210
|
||||
assert status.epochs_completed == 2
|
||||
assert status.percentage_done == 100.0
|
||||
assert status.best_epoch == 2
|
||||
assert status.train_loss == 1.718016266822815
|
||||
assert status.val_loss == 1.8661999702453613
|
||||
|
||||
self._assert_request(
|
||||
self.mock_make_request,
|
||||
"GET",
|
||||
f"/v1/customization/jobs/{job_id}/status",
|
||||
expected_params={"job_id": job_id},
|
||||
)
|
||||
|
||||
def test_get_training_jobs(self):
|
||||
job_id = "cust-JGTaMbJMdqjJU8WbQdN9Q2"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue